Airflow scheduler executes DAG earlier than its start_date

I am trying to run a DAG with start_date=datetime.strptime('3/2/2020 8:20:00', '%m/%d/%Y %H:%M:%S') and schedule_interval='20/5 8 * * *', but the DAG gets executed before the actual start_date.
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator


def print_hello():
    return 'Hello world!'


default_args = {
    'owner': 'admin',
    'retries': 0,
    'email_on_retry': False,
    'depends_on_past': False,
    'email_on_failure': False,
    'priority_weight': 10,
    'email': [''],
    'queue': 'bash_queue',
    'start_date': datetime.strptime('3/2/2020 8:20:00', '%m/%d/%Y %H:%M:%S'),
}

dag = DAG(
    'somename',
    default_args=default_args,
    catchup=False,
    schedule_interval='20/5 8 * * *'
)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)
hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)

dummy_operator >> hello_operator
Can anyone help me understand this odd behavior of Airflow?
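As a side note, the minute field '20/5' is commonly interpreted as '20-59/5', i.e. 08:20, 08:25, ... 08:55 each day. A small sketch to enumerate those times, using the third-party croniter package and the explicit range/step form (both of which are assumptions, not part of the original DAG):
from datetime import datetime
from croniter import croniter

# Hypothetical check, independent of Airflow: print the first few fire times
# implied by the schedule, starting from the DAG's start_date.
start = datetime(2020, 3, 2, 8, 20)
itr = croniter('20-59/5 8 * * *', start)
for _ in range(5):
    print(itr.get_next(datetime))  # 2020-03-02 08:25:00, 08:30:00, ...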

Related

T5 while doing hyperparameter search shows "ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds"

I am working with Hugging Face transformers, training a pretrained byt5-small on my data. I am also trying to do a hyperparameter search using the Trainer API with Optuna as the backend, but the following error appears every time. Please help me out; the whole code is below.
transformers version = 4.23.1
from transformers import HfArgumentParser, TensorFlowBenchmark, TensorFlowBenchmarkArguments
import pandas as pd
from transformers import T5ForConditionalGeneration, ByT5Tokenizer
from transformers import TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer
import datasets
import transformers
from tqdm import tqdm
from numba import cuda
device = cuda.get_current_device()
device.reset()
train_df = pd.read_csv("/home/bhavuk/project1/data/train_split.csv")
eval_df = pd.read_csv("/home/bhavuk/project1/data/eval_split.csv")
test_df = pd.read_csv("/home/bhavuk/project1/data/test_split.csv")
train_df = train_df.dropna()
eval_df = eval_df.dropna()
test_df = test_df.dropna(subset=["Hypothesis","Reference"])
train_df.shape, eval_df.shape[0], test_df.shape[0]
args_dict = {
    "output_dir": './byt5-small-hp-search',
    "overwrite_output_dir": True,
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-1,
    "warmup_steps": 2,
    "logging_steps": 100,
    "evaluation_strategy": "steps",
    "eval_steps": 250,
    "num_train_epochs": 4,
    "do_train": True,
    "do_eval": True,
    "fp16": False,
    "max_steps": 100000,
    "load_best_model_at_end": True,
    "logging_dir": './logs',
    "save_total_limit": 2,
    "weight_decay": 0.1,
    "label_smoothing_factor": 0.1
}
parser = HfArgumentParser((TrainingArguments))
training_args = parser.parse_dict(args_dict)
args = training_args[0]
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "dropout_rate": trial.suggest_float("dropout_rate", 0.1, 0.6, step=0.1),
        "weight_decay": trial.suggest_float("weight_decay", 0.1, 0.3, step=0.1),
        "label_smoothing_factor": trial.suggest_float("label_smoothing_factor", 0.1, 0.3, step=0.1)
    }
config = '/home/bhavuk/project1/notebooks/models--google--byt5-small/snapshots/ce8f3a48ed7676af36476a01fb01f95ea529599c/config.json'
def model_init(trial):
    return T5ForConditionalGeneration.from_pretrained(
        'google/byt5-small',
        config=config,
        dropout_rate=0.1
    )

tokenizer = ByT5Tokenizer.from_pretrained(
    "google/byt5-small",
    cache_dir=".",
    max_length=512
)
class GPReviewDataset(Dataset):
    def __init__(self, Text, Label):
        self.Text = Text
        self.Label = Label
        # self.tokenizer = tokenizer
        # self.max_len = max_len

    def __len__(self):
        return len(self.Text)

    def __getitem__(self, item):
        Text = str(self.Text[item])
        Label = self.Label[item]
        inputs = tokenizer(Text, padding="max_length", truncation=True, max_length=512)
        outputs = tokenizer(Label, padding="max_length", truncation=True, max_length=512)
        return {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": outputs.input_ids,
            "decoder_attention_mask": outputs.attention_mask,
            # "labels": lbz
        }
ds_train = GPReviewDataset(
    Text=train_df.Hypothesis.to_numpy(),
    Label=train_df.Reference.to_numpy()
)

ds_test = GPReviewDataset(
    Text=eval_df.Hypothesis.to_numpy(),
    Label=eval_df.Reference.to_numpy()
    # tokenizer=tokenizer,
    # max_len=max_len
)

train_dataset = ds_train
valid_dataset = ds_test
trainer = Trainer(
    model=None,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    model_init=model_init
)

best_trial = trainer.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=20
)
ERROR:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/home/bhavuk/project1/notebooks/byT5small_hp_search_2.ipynb Cell 14 in <cell line: 1>()
----> 1 best_trial = trainer.hyperparameter_search(
2 direction="minimize",
3 backend="optuna",
4 hp_space=optuna_hp_space,
5 n_trials=20
6 )
File ~/anaconda3/envs/cvenv/lib/python3.9/site-packages/transformers/trainer.py:2368, in Trainer.hyperparameter_search(self, hp_space, compute_objective, n_trials, direction, backend, hp_name, **kwargs)
2360 self.compute_objective = default_compute_objective if compute_objective is None else compute_objective
2362 backend_dict = {
2363 HPSearchBackend.OPTUNA: run_hp_search_optuna,
2364 HPSearchBackend.RAY: run_hp_search_ray,
2365 HPSearchBackend.SIGOPT: run_hp_search_sigopt,
2366 HPSearchBackend.WANDB: run_hp_search_wandb,
2367 }
-> 2368 best_run = backend_dict[backend](self, n_trials, direction, **kwargs)
2370 self.hp_search_backend = None
2371 return best_run
File ~/anaconda3/envs/cvenv/lib/python3.9/site-packages/transformers/integrations.py:189, in run_hp_search_optuna(trainer, n_trials, direction, **kwargs)
187 n_jobs = kwargs.pop("n_jobs", 1)
188 study = optuna.create_study(direction=direction, **kwargs)
...
return forward_call(*input, **kwargs)
File "/home/bhavuk/anaconda3/envs/cvenv/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 937, in forward
raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

Azure Event Hub no longer receiving messages: Event Hub has requests but no messages

For some reason my Azure Event Hub is no longer receiving messages. It was working fine last night.
I am using Databricks Data Generator to send data to Azure Event Hubs with the following code:
import dbldatagen as dg
from pyspark.sql.types import IntegerType, StringType, FloatType
import json
from pyspark.sql.types import StructType, StructField, IntegerType, DecimalType, StringType, TimestampType, Row
from pyspark.sql.functions import *
import pyspark.sql.functions as F
num_rows = 1 * 10000 # number of rows to generate
num_partitions = 2 # number of Spark dataframe partitions
delay_reasons = ["Air Carrier", "Extreme Weather", "National Aviation System", "Security", "Late Aircraft"]
# will have implied column `id` for ordinal of row
flightdata_defn = (
    dg.DataGenerator(spark, name="flight_delay_data", rows=num_rows, partitions=num_partitions)
    # .withColumn("body", StringType(), False)
    .withColumn("flightNumber", "int", minValue=1000, uniqueValues=10000, random=True)
    .withColumn("airline", "string", minValue=1, maxValue=500, prefix="airline", random=True, distribution="normal")
    .withColumn("original_departure", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute", random=True)
    .withColumn("delay_minutes", "int", minValue=20, maxValue=600, distribution=dg.distributions.Gamma(1.0, 2.0))
    .withColumn("delayed_departure", "timestamp", expr="cast(original_departure as bigint) + (delay_minutes * 60)", baseColumn=["original_departure", "delay_minutes"])
    .withColumn("reason", "string", values=delay_reasons, random=True)
)
df_flight_data = flightdata_defn.build(withStreaming=True, options={'rowsPerSecond': 100})
streamingDelays = (
    df_flight_data
    .groupBy(
        # df_flight_data.body,
        df_flight_data.flightNumber,
        df_flight_data.airline,
        df_flight_data.original_departure,
        df_flight_data.delay_minutes,
        df_flight_data.delayed_departure,
        df_flight_data.reason,
        window(df_flight_data.original_departure, "1 hour")
    )
    .count()
)
writeConnectionString = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString)
checkpointLocation = "///checkpoint"
ehWriteConf = {
'eventhubs.connectionString' : writeConnectionString
}
# Write body data from a DataFrame to EventHubs. Events are distributed across partitions using round-robin model.
ds = streamingDelays \
.select(F.to_json(F.struct("*")).alias("body")) \
.writeStream.format("eventhubs") \
.options(**ehWriteConf) \
.outputMode("complete") \
.option("checkpointLocation", "...") \
.start()
# display(streamingDelays)
From the image you will notice that I'm barely receiving any requests, and absolutely no messages. However, just yesterday I was getting both requests and messages.
I created a new Event Hub, but the behavior is the same. I'm sure it's something very simple that I'm missing...
I should also mention that my Databricks notebook appears to get stuck at 'Stream initializing...'.
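One way to narrow this down (a hedged suggestion, not part of the original setup) is to bypass Spark entirely and send a single test event with the azure-eventhub Python SDK using the same namespace connection string; if that event shows up in the portal metrics, the problem is more likely on the Databricks/writeStream side than in the Event Hub itself:
from azure.eventhub import EventHubProducerClient, EventData

# Minimal smoke test; the connection string and hub name are placeholders for
# the values used in the notebook.
producer = EventHubProducerClient.from_connection_string(
    conn_str="<event hubs namespace connection string>",
    eventhub_name="<event hub name>",
)
with producer:
    batch = producer.create_batch()
    batch.add(EventData('{"ping": "test"}'))
    producer.send_batch(batch)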

Issues with using data from upload in other components in Dash

When writing a program in Dash, I have been having issues: when using the Upload component, I am struggling to properly use the uploaded data in other components.
My goal is to use the uploaded data (a CSV file) to add options to two identical Dropdown components, those options being the names of the columns of the imported file.
A graph is then to be generated from the values selected in the dropdowns, which serve as the axes for the graph.
Any help would be appreciated.
import base64
import datetime
import io
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.express as px
import pandas as pd
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
df = pd.DataFrame()
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.layout = html.Div([
    html.Div(children='this is an attempt to do stuff right'),
    dcc.Dropdown(id='Drop1'),
    dcc.Dropdown(id='Drop2'),
    dcc.Dropdown(id='graphtype', options=[
        {'label': 'Bar', 'value': 'Bar'},
        {'label': 'Scatter', 'value': 'Scatter'},
        {'label': 'Histogram', 'value': 'Hist'}
    ]),
    dcc.Upload(
        id='upload-data',
        children=html.Div([
            'Drag and Drop or ',
            html.A('Select Files')
        ]),
        style={
            'width': '100%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        # Allow multiple files to be uploaded
        multiple=True
    ),
    html.Div(id='output-data-upload'),
    dcc.Graph(id='output-graph')
])
def parse_contents(contents, filename, date):
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    try:
        if 'csv' in filename:
            # Assume that the user uploaded a CSV file
            df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
        elif 'xls' in filename:
            # Assume that the user uploaded an Excel file
            df = pd.read_excel(io.BytesIO(decoded))
    except Exception as e:
        print(e)
        return html.Div([
            'There was an error processing this file.'
        ])
    return html.Div([
        html.H5(filename),
        html.H6(datetime.datetime.fromtimestamp(date)),
        dash_table.DataTable(
            data=df.to_dict('records'),
            columns=[{'name': i, 'id': i} for i in df.columns]
        ),
        html.Hr(),  # horizontal line
        # For debugging, display the raw contents provided by the web browser
        html.Div('Raw Content'),
        html.Pre(contents[0:200] + '...', style={
            'whiteSpace': 'pre-wrap',
            'wordBreak': 'break-all'
        })
    ])
@app.callback(Output('output-data-upload', 'children'),
              [Input('upload-data', 'contents')],
              [State('upload-data', 'filename'),
               State('upload-data', 'last_modified')])
def update_output(list_of_contents, list_of_names, list_of_dates):
    if list_of_contents is not None:
        children = [
            parse_contents(c, n, d) for c, n, d in
            zip(list_of_contents, list_of_names, list_of_dates)]
        print(children)
        return children
if __name__ == '__main__':
    app.run_server(debug=True)
The Upload component page in the Dash docs should give you all you need. If you upload a CSV file you can use:
df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
and from there just use it as a normal pandas data frame.
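To get the column names into the two dropdowns, you can add a callback that reads the uploaded contents and returns the options. A minimal sketch, assuming the IDs from your layout ('upload-data', 'Drop1', 'Drop2'), multiple=True on the Upload, and a CSV file:
from dash.exceptions import PreventUpdate

@app.callback([Output('Drop1', 'options'), Output('Drop2', 'options')],
              [Input('upload-data', 'contents')])
def set_dropdown_options(list_of_contents):
    if list_of_contents is None:
        raise PreventUpdate
    # With multiple=True, `contents` is a list; use the first uploaded file.
    content_type, content_string = list_of_contents[0].split(',')
    decoded = base64.b64decode(content_string)
    df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
    options = [{'label': col, 'value': col} for col in df.columns]
    return options, options
A further callback can then take Input('Drop1', 'value'), Input('Drop2', 'value') and Input('graphtype', 'value') and return a Plotly figure to Output('output-graph', 'figure').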

Display selected features after Gridsearch

I'm using GridSearchCV to perform feature selection (SelectKBest) for a linear regression. The results show that 10 features are selected (using .best_params_), but I'm unsure how to display which features these are.
The code is pasted below. I'm using a pipeline because the next models will also need hyperparameter selection. x_train is a dataframe with 12 columns that I cannot share due to data restrictions.
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [('feature_selection', SelectKBest(mutual_info_regression, k=3)),
         ('regr', LinearRegression())]
pipe = Pipeline(steps)
search_space = [{'feature_selection__k': [1,2,3,4,5,6,7,8,9,10,11,12]}]
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error', cv=5, verbose=0)
clf = clf.fit(x_train, y_train)
print(clf.best_params_)
You can access the information about the feature_selection step like this:
<GridSearch_model_variable>.best_estimator_.named_steps[<feature_selection_step>]
So, in your case, it would be like this:
print(clf.best_estimator_.named_steps['feature_selection'])
#Output: SelectKBest(k=8, score_func=<function mutual_info_regression at 0x13d37b430>)
Next you can use the get_support function to get the boolean map of the selected features:
print(clf.best_estimator_.named_steps['feature_selection'].get_support())
# Output: array([ True, False, True, False, True, True, True, False, False,
#                 True, True, False, True])
Now provide this map over the original columns:
data_columns = X.columns # List of columns in your dataset
# This is the original list of columns
print(data_columns)
# Output: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
#          'PTRATIO', 'B', 'LSTAT']
# Now print the select columns
print(data_columns[clf.best_estimator_.named_steps['feature_selection'].get_support()])
# Output: ['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT']
So you can see that out of the 13 features only 8 were selected (in my data, k=8 was the best case).
Here is the full code with boston dataset:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
boston_dataset = load_boston()
X = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
y = boston_dataset.target
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [('feature_selection', SelectKBest(mutual_info_regression, k=3)),
         ('regr', LinearRegression())]
pipe = Pipeline(steps)
search_space = [{'feature_selection__k': [1,2,3,4,5,6,7,8,9,10,11,12]}]
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error', cv=5, verbose=0)
clf = clf.fit(X, y)
print(clf.best_params_)
data_columns = X.columns
selected_features = data_columns[clf.best_estimator_.named_steps['feature_selection'].get_support()]
print(selected_features)
# Output : Index(['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')
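If you also want the score behind each selected column, the fitted SelectKBest step exposes a scores_ array; a small follow-on using the variables from the snippet above:
selector = clf.best_estimator_.named_steps['feature_selection']
mask = selector.get_support()
# Pair each selected column with its mutual-information score
for name, score in zip(data_columns[mask], selector.scores_[mask]):
    print(f"{name}: {score:.3f}")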
References:
https://stackoverflow.com/a/33378303/8160718
https://stackoverflow.com/a/38788087/8160718

Lambda times out after 3 sec

Lambda runs and times out after 3 seconds, causing my script to make 3 snapshots instead of one. Then it does not continue with the rest of the script. When I run the script on an EC2 instance, the script works fine. I was also wondering whether it would be better to separate this script into two scripts.
import boto3
import botocore.exceptions
from botocore.config import Config  # needed for the client config below
from datetime import date

SOURCE_REGION = 'us-east-1'
DEST_REGION = 'us-west-2'

ec2_source = boto3.client('ec2', region_name=SOURCE_REGION,
                          config=Config(connect_timeout=5, read_timeout=60, retries={'max_attempts': 2}))
ec2_destination = boto3.client('ec2', region_name=DEST_REGION,
                               config=Config(connect_timeout=5, read_timeout=60, retries={'max_attempts': 2}))

def lambda_handler(event, context):
    volumes = ec2_source.describe_volumes(Filters=[{'Name': 'tag:Backup', 'Values': ['Yes']}])['Volumes']
    for volume in volumes:
        print('Getting:', volume['VolumeId'])
        response = ec2_source.create_snapshot(
            Description='Prod_' + volume['VolumeId'],
            VolumeId=volume['VolumeId'],
        )
        SourceVolumeID = response['SnapshotId']
        ec2_source.create_tags(Resources=[SourceVolumeID],
                               Tags=volume['Tags'])
        SourceTagID = response['SnapshotId']
        ec2_source.delete_tags(Resources=[SourceTagID],
                               Tags=[{'Key': 'Backup', 'Value': 'Yes'}])
        SourceTagID = response['SnapshotId']
        ec2_source.create_tags(Resources=[SourceTagID],
                               Tags=[{'Key': 'Backup', 'Value': 'Complete'}])
        SourceTagID = response['SnapshotId']
        ec2_source.create_tags(Resources=[SourceTagID],
                               Tags=[{'Key': 'Done', 'Value': 'Copied'}])
        try:
            waiter = ec2_source.get_waiter('snapshot_completed')
            waiter.wait(
                SnapshotIds=[SourceTagID],
                DryRun=False,
                WaiterConfig={'Delay': 10, 'MaxAttempts': 123}
            )
        except botocore.exceptions.WaiterError as e:
            print(e)  # exceptions in Python 3 have no .message attribute

    snaps = ec2_source.describe_snapshots(OwnerIds=['self'], Filters=[{'Name': 'tag:Backup', 'Values': ['Complete']}])['Snapshots']
    Tday = date.today()
    Tday_snaps = [s for s in snaps if s['StartTime'].date() == Tday]
    for Tday_s in Tday_snaps:
        print('Copying:', Tday_s['SnapshotId'])
        DestinationSnapshot = ec2_destination.copy_snapshot(
            SourceSnapshotId=Tday_s['SnapshotId'],
            SourceRegion=SOURCE_REGION,
            Description=Tday_s['VolumeId']
        )
        DestinationTagID = DestinationSnapshot['SnapshotId']
        ec2_destination.create_tags(Resources=[DestinationTagID],
                                    Tags=Tday_s['Tags'])
        DestinationTagID = DestinationSnapshot['SnapshotId']
        ec2_destination.delete_tags(Resources=[DestinationTagID],
                                    Tags=[{'Key': 'Backup', 'Value': 'Complete'}])
The default timeout for a Lambda function is 3 seconds. To update it, go to the Lambda console, select your Lambda, scroll down to "Basic settings", and update the timeout as you need.
If you're using the aws cli, you can run something like:
aws lambda update-function-configuration --function-name functionName --timeout 45
to change the Lambda function functionName to have a 45 second timeout.
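The same change can also be made programmatically; a minimal sketch with boto3 (functionName is the placeholder name from the CLI example above):
import boto3

lambda_client = boto3.client('lambda')
# Raise the function timeout to 45 seconds
lambda_client.update_function_configuration(
    FunctionName='functionName',
    Timeout=45,
)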
