Lambda times out after 3 sec - aws-lambda

Lambda runs and times out after 3 seconds, causing my script to make 3 snapshots instead of one. It then does not continue with the rest of the script. When I run the script on an EC2 instance, the script works fine. I was also wondering whether it would be better to separate this script into two scripts.
import boto3
import botocore.exceptions
from botocore.config import Config
from datetime import date

SOURCE_REGION = 'us-east-1'
DEST_REGION = 'us-west-2'

ec2_source = boto3.client('ec2', region_name=SOURCE_REGION,
                          config=Config(connect_timeout=5, read_timeout=60, retries={'max_attempts': 2}))
ec2_destination = boto3.client('ec2', region_name=DEST_REGION,
                               config=Config(connect_timeout=5, read_timeout=60, retries={'max_attempts': 2}))

def lambda_handler(event, context):
    # Snapshot every volume tagged Backup=Yes in the source region
    volumes = ec2_source.describe_volumes(Filters=[{'Name': 'tag:Backup', 'Values': ['Yes']}])['Volumes']
    for volume in volumes:
        print('Getting:', volume['VolumeId'])
        response = ec2_source.create_snapshot(
            Description='Prod_' + volume['VolumeId'],
            VolumeId=volume['VolumeId'],
        )
        SourceVolumeID = response['SnapshotId']
        ec2_source.create_tags(Resources=[SourceVolumeID],
                               Tags=volume['Tags'])
        SourceTagID = response['SnapshotId']
        ec2_source.delete_tags(Resources=[SourceTagID],
                               Tags=[{'Key': 'Backup', 'Value': 'Yes'}])
        SourceTagID = response['SnapshotId']
        ec2_source.create_tags(Resources=[SourceTagID],
                               Tags=[{'Key': 'Backup', 'Value': 'Complete'}])
        SourceTagID = response['SnapshotId']
        ec2_source.create_tags(Resources=[SourceTagID],
                               Tags=[{'Key': 'Done', 'Value': 'Copied'}])
        try:
            # Wait for the snapshot to finish before copying it
            waiter = ec2_source.get_waiter('snapshot_completed')
            waiter.wait(
                SnapshotIds=[SourceTagID],
                DryRun=False,
                WaiterConfig={'Delay': 10, 'MaxAttempts': 123}
            )
        except botocore.exceptions.WaiterError as e:
            print(e)

    # Copy today's completed snapshots to the destination region
    snaps = ec2_source.describe_snapshots(OwnerIds=['self'],
                                          Filters=[{'Name': 'tag:Backup', 'Values': ['Complete']}])['Snapshots']
    Tday = date.today()
    Tday_snaps = [s for s in snaps if s['StartTime'].date() == Tday]
    for Tday_s in Tday_snaps:
        print('Copying:', Tday_s['SnapshotId'])
        DestinationSnapshot = ec2_destination.copy_snapshot(
            SourceSnapshotId=Tday_s['SnapshotId'],
            SourceRegion=SOURCE_REGION,
            Description=Tday_s['VolumeId']
        )
        DestinationTagID = DestinationSnapshot['SnapshotId']
        ec2_destination.create_tags(Resources=[DestinationTagID],
                                    Tags=Tday_s['Tags'])
        DestinationTagID = DestinationSnapshot['SnapshotId']
        ec2_destination.delete_tags(Resources=[DestinationTagID],
                                    Tags=[{'Key': 'Backup', 'Value': 'Complete'}])

The default timeout for a Lambda function is 3 seconds. To update it, go to the Lambda console, select your Lambda, scroll down to "Basic Settings", and edit the timeout as you need.
If you're using the AWS CLI, you can run something like:
aws lambda update-function-configuration --function-name functionName --timeout 45
to change the Lambda function functionName to have a 45-second timeout.
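The same change can also be made from Python via boto3; a minimal sketch (functionName is a placeholder, as in the CLI example):
import boto3

lambda_client = boto3.client('lambda')

# Raise the function's timeout to 45 seconds
lambda_client.update_function_configuration(
    FunctionName='functionName',
    Timeout=45,
)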

Related

Azure Event Hub no longer receiving messages: Event Hub has requests but no messages

For some reason my Azure Event Hub is no longer receiving messages. It was working fine last night.
I am using Databricks Data Generator to send data to Azure Event Hubs with the following code:
import dbldatagen as dg
from pyspark.sql.types import IntegerType, StringType, FloatType
import json
from pyspark.sql.types import StructType, StructField, IntegerType, DecimalType, StringType, TimestampType, Row
from pyspark.sql.functions import *
import pyspark.sql.functions as F

num_rows = 1 * 10000   # number of rows to generate
num_partitions = 2     # number of Spark dataframe partitions
delay_reasons = ["Air Carrier", "Extreme Weather", "National Aviation System", "Security", "Late Aircraft"]

# will have implied column `id` for ordinal of row
flightdata_defn = (dg.DataGenerator(spark, name="flight_delay_data", rows=num_rows, partitions=num_partitions)
    #.withColumn("body", StringType(), False)
    .withColumn("flightNumber", "int", minValue=1000, uniqueValues=10000, random=True)
    .withColumn("airline", "string", minValue=1, maxValue=500, prefix="airline", random=True, distribution="normal")
    .withColumn("original_departure", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute", random=True)
    .withColumn("delay_minutes", "int", minValue=20, maxValue=600, distribution=dg.distributions.Gamma(1.0, 2.0))
    .withColumn("delayed_departure", "timestamp", expr="cast(original_departure as bigint) + (delay_minutes * 60) ", baseColumn=["original_departure", "delay_minutes"])
    .withColumn("reason", "string", values=delay_reasons, random=True)
)

df_flight_data = flightdata_defn.build(withStreaming=True, options={'rowsPerSecond': 100})

streamingDelays = (
    df_flight_data
    .groupBy(
        #df_flight_data.body,
        df_flight_data.flightNumber,
        df_flight_data.airline,
        df_flight_data.original_departure,
        df_flight_data.delay_minutes,
        df_flight_data.delayed_departure,
        df_flight_data.reason,
        window(df_flight_data.original_departure, "1 hour")
    )
    .count()
)

writeConnectionString = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString)
checkpointLocation = "///checkpoint"
ehWriteConf = {
    'eventhubs.connectionString': writeConnectionString
}

# Write body data from a DataFrame to EventHubs. Events are distributed across partitions using round-robin model.
ds = streamingDelays \
    .select(F.to_json(F.struct("*")).alias("body")) \
    .writeStream.format("eventhubs") \
    .options(**ehWriteConf) \
    .outputMode("complete") \
    .option("checkpointLocation", "...") \
    .start()

# display(streamingDelays)
From the Event Hub metrics you will notice that I'm barely receiving any requests, and absolutely no messages. However, just yesterday I was getting both requests and messages.
I created a new Event Hub, but I'm still seeing the same behaviour.
I'm sure it's something very simple that I'm missing...
I should mention that my Databricks notebook appears to get stuck at 'Stream initializing...'.

Issues with using data from upload in other components in Dash

When writing a program in Dash, I have been having issues using the data from the Upload component in other components.
My goal is to use the uploaded data (a CSV file) to add options to 2 identical Dropdown components, those options being the names of the columns of the imported file.
A graph is then to be generated using the values selected in the dropdowns as the axes for the graph.
Any help would be appreciated.
import base64
import datetime
import io
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.express as px
import pandas as pd

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

df = pd.DataFrame()

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div([
    html.Div(children='this is an attempt to do stuff right'),
    dcc.Dropdown(id='Drop1'),
    dcc.Dropdown(id='Drop2'),
    dcc.Dropdown(id='graphtype', options=[
        {'label': 'Bar', 'value': 'Bar'},
        {'label': 'Scatter', 'value': 'Scatter'},
        {'label': 'Histogram', 'value': 'Hist'}
    ]),
    dcc.Upload(
        id='upload-data',
        children=html.Div([
            'Drag and Drop or ',
            html.A('Select Files')
        ]),
        style={
            'width': '100%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        # Allow multiple files to be uploaded
        multiple=True
    ),
    html.Div(id='output-data-upload'),
    dcc.Graph(id='output-graph')
])

def parse_contents(contents, filename, date):
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    try:
        if 'csv' in filename:
            # Assume that the user uploaded a CSV file
            df = pd.read_csv(
                io.StringIO(decoded.decode('utf-8')))
        elif 'xls' in filename:
            # Assume that the user uploaded an excel file
            df = pd.read_excel(io.BytesIO(decoded))
    except Exception as e:
        print(e)
        return html.Div([
            'There was an error processing this file.'
        ])
    return html.Div([
        html.H5(filename),
        html.H6(datetime.datetime.fromtimestamp(date)),
        dash_table.DataTable(
            data=df.to_dict('records'),
            columns=[{'name': i, 'id': i} for i in df.columns]
        ),
        html.Hr(),  # horizontal line
        # For debugging, display the raw contents provided by the web browser
        html.Div('Raw Content'),
        html.Pre(contents[0:200] + '...', style={
            'whiteSpace': 'pre-wrap',
            'wordBreak': 'break-all'
        })
    ])

@app.callback(Output('output-data-upload', 'children'),
              [Input('upload-data', 'contents')],
              [State('upload-data', 'filename'),
               State('upload-data', 'last_modified')])
def update_output(list_of_contents, list_of_names, list_of_dates):
    if list_of_contents is not None:
        children = [
            parse_contents(c, n, d) for c, n, d in
            zip(list_of_contents, list_of_names, list_of_dates)]
        print(children)
        return children

if __name__ == '__main__':
    app.run_server(debug=True)
The Upload component page in the Dash docs should give you all you need. If you upload a CSV file you can use:
df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
and from there just use it as a normal pandas DataFrame.
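To wire that up to the two dropdowns in the question, a minimal sketch of a second callback (assuming the component IDs and imports from the layout above, and that the first uploaded file is a CSV) could decode the upload and reuse the column names as options:
@app.callback(
    [Output('Drop1', 'options'), Output('Drop2', 'options')],
    [Input('upload-data', 'contents')])
def set_dropdown_options(list_of_contents):
    if list_of_contents is None:
        return [], []
    # Decode the first uploaded file and read it as a CSV
    content_type, content_string = list_of_contents[0].split(',')
    decoded = base64.b64decode(content_string)
    df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
    # Each column name becomes a dropdown option
    options = [{'label': col, 'value': col} for col in df.columns]
    return options, options
A further callback can then take the selected Drop1 and Drop2 values as Inputs and return a figure for output-graph.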

Display selected features after Gridsearch

I'm using GridSearchCV to perform feature selection (SelectKBest) for a linear regression. The results show that 10 features are selected (using .best_params_), but I'm unsure how to display which features these are.
The code is pasted below. I'm using a pipeline because the next models will also need hyperparameter selection. x_train is a dataframe with 12 columns that I cannot share due to data restrictions.
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [('feature_selection', SelectKBest(mutual_info_regression, k=3)),
         ('regr', LinearRegression())]
pipe = Pipeline(steps)
search_space = [{'feature_selection__k': [1,2,3,4,5,6,7,8,9,10,11,12]}]
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error', cv=5, verbose=0)
clf = clf.fit(x_train, y_train)
print(clf.best_params_)
You can access the information about the feature_selection step like this:
<GridSearch_model_variable>.best_estimator_.named_steps[<feature_selection_step>]
So, in your case, it would be like this:
print(clf.best_estimator_.named_steps['feature_selection'])
#Output: SelectKBest(k=8, score_func=<function mutual_info_regression at 0x13d37b430>)
Next you can use the get_support function to get the boolean map of the selected features:
print(clf.best_estimator_.named_steps['feature_selection'].get_support())
# Output: array([ True, False, True, False, True, True, True, False, False,
#                 True, True, False, True])
Now provide this map over the original columns:
data_columns = X.columns # List of columns in your dataset
# This is the original list of columns
print(data_columns)
# Output: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
#          'PTRATIO', 'B', 'LSTAT']
# Now print the select columns
print(data_columns[clf.best_estimator_.named_steps['feature_selection'].get_support()])
# Output: ['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT']
So you can see that out of 13 features only 8 were selected (as in my data k=8 was the best case).
Here is the full code with boston dataset:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
boston_dataset = load_boston()
X = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
y = boston_dataset.target
cv_folds = KFold(n_splits=5, shuffle=False)
steps = [('feature_selection', SelectKBest(mutual_info_regression, k=3)),
         ('regr', LinearRegression())]
pipe = Pipeline(steps)
search_space = [{'feature_selection__k': [1,2,3,4,5,6,7,8,9,10,11,12]}]
clf = GridSearchCV(pipe, search_space, scoring='neg_mean_squared_error', cv=5, verbose=0)
clf = clf.fit(X, y)
print(clf.best_params_)
data_columns = X.columns
selected_features = data_columns[clf.best_estimator_.named_steps['feature_selection'].get_support()]
print(selected_features)
# Output : Index(['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')
References:
https://stackoverflow.com/a/33378303/8160718
https://stackoverflow.com/a/38788087/8160718

airflow scheduler executes dag earlier than its start date

I am trying to execute a DAG with start_date=datetime.strptime('3/2/2020 8:20:00', '%m/%d/%Y %H:%M:%S') and schedule_interval='20/5 8 * * *', but the DAG gets executed before the actual start_date.
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

def print_hello():
    return 'Hello world!'

default_args = {
    'owner': 'admin',
    'retries': 0,
    'email_on_retry': False,
    'depends_on_past': False,
    'email_on_failure': False,
    'priority_weight': 10,
    'email': [''],
    'queue': 'bash_queue',
    'start_date': datetime.strptime('3/2/2020 8:20:00', '%m/%d/%Y %H:%M:%S'),
}

dag = DAG('somename',
          default_args=default_args,
          catchup=False,
          schedule_interval='20/5 8 * * *'
          )

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)
hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)

dummy_operator >> hello_operator
Can anyone help me to understand this odd behavior of Airflow?

How to get commit history of users in gitlab in rails

I am trying to create an app which will fetch the GitLab/GitHub commit history of the user, which I can then show in a side panel where it can be checked on or off depending on my criteria. I want to know if there is a way to fetch the currently logged-in user's GitLab/GitHub commit history. I tried to use the gem
https://github.com/libgit2/rugged
but couldn't find a way to implement what I need. If anyone knows how to implement this it would be very helpful. Thanks.
Update
I have now managed to get the user by using:
Gitlab.endpoint = 'https://gitlab.com/api/v4'
Gitlab.private_token = 'token'
g = Gitlab.client(
  endpoint: 'https://gitlab.com/api/v4',
  private_token: 'token',
  httparty: {
    headers: { 'Cookie' => 'gitlab_canary=true' }
  }
)
By using the command g.user I am able to get the user, but I need to get the commits that the user has made in GitLab.
Use the GitLab API GET /projects/:id/repository/commits to fetch all the commits of a repository (see the GitLab API docs). Check the attached code for more details.
Basically, the git log --author="user_name" command can give you the git commit history for a specific user; you can even use an email address, or just the first or last name, in the above command.
Once you have authenticated with GitLab you can run the following command from Ruby:
cmd = 'git log --author="user_name"'
system(cmd)
Sample code from Ruby to connect to GitLab using a private token, which is not ideal, but just an example:
require 'json'
require 'curb'
require 'net/http'

begin
  def parseCoverageReport(report_text)
    coverage_perc = report_text.match /All files\s+\|\s+(\d+\.?\d+).*\n/
    if not coverage_perc then
      coverage_perc = report_text.match /^TOTAL\s+\d+\s+\d+\s+(\d+)%$/
    end
    if coverage_perc then
      # if we found coverage value in job trace
      # puts "coverage_perc[1]: #{coverage_perc[1]}"
      coverage_perc[1].to_i
    end
  end

  gen_config = YAML.load_file("config/general.yml")
  gitlab_config = YAML.load_file("config/gitlab.yml")

  SCHEDULER.every gen_config[:job_trigger_interval], :first_in => 0 do |job|
    table = {
      title: "Projects",
      hrows: Array.new.push({cols: [
        {value: "Project name"},
        {value: "Open Merge Requests"},
        {value: "Code coverage"}
      ]}),
      rows: Array.new
    }
    instances = gitlab_config['instances']
    instances.each do |instance|
      gitlab_url = gitlab_config['instances'][instance.first]['url']
      # gitlab_token = gitlab_config['instances'][instance.first]['api_key']
      gitlab_token = ENV[gitlab_config['instances'][instance.first]['api_key']]
      red_threshold = gitlab_config['instances'][instance.first]['red_threshold']
      orange_threshold = gitlab_config['instances'][instance.first]['orange_threshold']
      cov_red_threshold = gitlab_config['instances'][instance.first]['cov_red_threshold']
      cov_orange_threshold = gitlab_config['instances'][instance.first]['cov_orange_threshold']
      projects = gitlab_config['instances'][instance.first]['projects']
      projects.each do |name, project|
        merge_reqs = JSON.parse(Curl.get("#{gitlab_url}/api/v4/projects/#{project['id']}/merge_requests?state=opened&private_token=#{gitlab_token}&per_page=200").body_str)
        git_project = JSON.parse(Curl.get("#{gitlab_url}/api/v4/projects/#{project['id']}?private_token=#{gitlab_token}").body_str)
        opened_mrs = merge_reqs.select { |merge_reqs| %w[opened].include? merge_reqs['state'] }
        repo_name = git_project['name']
        repo_url = git_project['web_url']
        status = case
                 when opened_mrs.size >= red_threshold then 'danger'
                 when opened_mrs.size >= orange_threshold then 'warning'
                 else
                   'ok'
                 end
        mrs_count = "#{opened_mrs.size}"
        send_event("#{name}_mr", { current: mrs_count, status: status })
        color = case
                when opened_mrs.size >= red_threshold then 'red'
                when opened_mrs.size >= orange_threshold then 'orange'
                else
                  'green'
                end
        font_color = color == 'orange' ? 'black' : 'white'
        cov_color = color
        font_cov_color = 'white'
        code_coverage = "---"
        code_coverage_tag = "---"
        cov_job_url = ''
        jobs = JSON.parse(Curl.get("#{gitlab_url}/api/v4/projects/#{project['id']}/jobs?scope=success&private_token=#{gitlab_token}&per_page=30").body_str)
        code_cov_job = jobs.find { |gitlab_job| !gitlab_job['coverage'].nil? }
        if not code_cov_job then
          # if no job has 'coverage' feature set up in Gitlab try to parse
          # 'coverage' from jobs trace manually
          jobs.each do |job|
            trace_report = Curl.get("#{gitlab_url}/api/v4/projects/#{project['id']}/jobs/#{job['id']}/trace?private_token=#{gitlab_token}").body_str
            code_cov_percentage = parseCoverageReport(trace_report)
            if code_cov_percentage then
              code_cov_job = job
              code_cov_job['coverage'] = code_cov_percentage
              break
            end
          end
        end
        if code_cov_job then
          # found code coverage data => process them
          code_coverage = code_cov_job['coverage'].to_i
          cov_job_url = code_cov_job['web_url'].to_s
          # update code coverage SprintProgress widgets in the same job
          widget_title = "code_coverage_progress_#{project['id']}"
          send_event(widget_title, {
            title: "Code Coverage - #{git_project['name']}",
            sprintboard_url: cov_job_url,
            min: 0,
            max: 100,
            value: code_coverage,
            moreinfo: ''
          })
          cov_color = case
                      when code_coverage <= cov_red_threshold then 'red'
                      when code_coverage <= cov_orange_threshold then 'orange'
                      else
                        'green'
                      end
          code_coverage = "#{code_coverage}%"
          code_coverage_tag = "<a href='#{cov_job_url}' target='_blank'>#{code_coverage.to_s}</a>"
        end
        repo_name_a_tag = "<a href='#{repo_url}' target='_blank'>#{repo_name}</a>"
        open_mrs_size = "<a href='#{repo_url}/merge_requests' target='_blank'>#{opened_mrs.size}</a>"
        table[:rows].push({
          cols: [
            { value: repo_name_a_tag, style: "color: #{font_color}; background-color: #{color}" },
            { value: open_mrs_size, style: "color: #{font_color}; background-color: #{color}" },
            { value: code_coverage_tag, style: "color: #{cov_color == 'orange' ? 'black' : 'white'}; background-color: #{cov_color}" }
          ]
        })
      end
    end
    send_event('open_merge_requests_table', table)
  end
rescue Errno::ENOENT
  puts "No config file found for gitlab - not starting the Gitlab job"
end
In the above Ruby example, please have a look at the following code snippet:
merge_reqs = JSON.parse(Curl.get("#{gitlab_url}/api/v4/projects/#{project['id']}/merge_requests?state=opened&private_token=#{gitlab_token}&per_page=200").body_str)
git_project = JSON.parse(Curl.get("#{gitlab_url}/api/v4/projects/#{project['id']}?private_token=#{gitlab_token}").body_str)
opened_mrs = merge_reqs.select { |merge_reqs| %w[opened].include? merge_reqs['state'] }
repo_name = git_project['name']
repo_url = git_project['web_url']
Here what I am trying to do is connect to our GitLab instance using a private_token and then, for a specific project id (which you can get from the GitLab UI), check for the open merge requests. I also get the git_project, from which I get the name and web_url (which was my use case).
For your use case you will have to get the project_id (from the GitLab UI) and then use an appropriate method to get the commits; see the GitLab docs. A rough sketch is shown below.
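To illustrate that commits endpoint, here is a minimal sketch (in Python with requests purely for illustration; the same request can be made from Ruby with Curl as in the example above). The project id, token, and email address are placeholders:
import requests

GITLAB_URL = 'https://gitlab.com/api/v4'   # same endpoint as in the question
PRIVATE_TOKEN = 'token'                    # placeholder personal access token
PROJECT_ID = 123                           # placeholder project id from the GitLab UI

# List commits for the project (GET /projects/:id/repository/commits)
resp = requests.get(
    f'{GITLAB_URL}/projects/{PROJECT_ID}/repository/commits',
    headers={'PRIVATE-TOKEN': PRIVATE_TOKEN},
    params={'per_page': 100},
)
commits = resp.json()

# The endpoint returns commits for the whole project, so filter client-side
# on the author fields present in each commit object
user_commits = [c for c in commits if c['author_email'] == 'user@example.com']
for c in user_commits:
    print(c['short_id'], c['title'], c['created_at'])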
