I am asking for your help.
I have an existing DAG that cleans up airflow_db in our production Airflow DB. Now I need to adjust the code and add another connection for our test environment. I was advised to create a for loop like:
for conn_id in {"prod_db", "test_db"}:
with DAG( ... conn_id = conn_id, ...)
But I am not sure how to do that; after I made the change, the syntax is incorrect. Do you have an example or hint of how to do that?
for conn_id in {"prod_db","test_db"}:
with DAG(
conn_id = conn_id,
dag_id="airflow_db_cleanup",
default_args={
"owner": "our team,
"email": mail_mail_mail,
"email_on_failure": True,
"email_on_retry": False,
"start_date": datetime(2022, 1, 1, 0, 0, 0),
"retries": 1,
"retry_delay": timedelta(minutes=1),
},
schedule_interval=timedelta(days=1),
max_active_runs=1,
) as dag,
dag.doc_md = doc
etc. Then various functions that create sessions follow.
Here is the operator code:
def get_max_db_entry_age_in_days(dag_run_conf):
max_db_entry_age_in_days = None
default_max_db_entry_age_in_days = int(Variable.get("max_db_entry_age_in_days", 90))
if dag_run_conf:
max_db_entry_age_in_days = dag_run_conf.get("maxDBEntryAgeInDays", None)
logging.info("maxDBEntryAgeInDays from dag_run.conf: " + str(dag_run_conf))
if max_db_entry_age_in_days is None:
logging.info(
"maxDBEntryAgeInDays conf variable isn't included. Using Default '"
+ str(default_max_db_entry_age_in_days)
+ "'"
)
max_db_entry_age_in_days = default_max_db_entry_age_in_days
return max_db_entry_age_in_days
def cleanup_function(**context):
dag_run = context.get("dag_run")
dag_run_conf = getattr(dag_run, "conf", None)
max_db_entry_age_in_days = get_max_db_entry_age_in_days(dag_run_conf)
execution_date = context.get("execution_date")
max_date = execution_date + timedelta(-max_db_entry_age_in_days)
dry_run = Variable.get("dry_run", "True") == "True"
airflow_db_model = context["params"].get("airflow_db_model")
age_check_column = context["params"].get("age_check_column")
keep_last_run = context["params"].get("keep_last_run")
dag_id = context["params"].get("dag_id")
cascade_delete = context["params"].get("cascade_delete")
with create_session(conn_id) as session:
logging.info("Configurations:")
logging.info("max_date: " + str(max_date))
logging.info("session: " + str(session))
logging.info("airflow_db_model: " + str(airflow_db_model))
logging.info("age_check_column: " + str(age_check_column))
logging.info("keep_last_run: " + str(keep_last_run))
logging.info("dag_id: " + str(dag_id))
logging.info("")
logging.info("Running Cleanup Process...")
query = get_main_query(
session, airflow_db_model, age_check_column, keep_last_run, dag_id, max_date
)
entries_to_delete_count = query.count()
logging.info(f"Query : {query}")
logging.info(
f"Will be deleting {entries_to_delete_count} {airflow_db_model.__name__}"
)
logging.info(list(query.values(dag_id)))
if cascade_delete:
for child in cascade_delete:
child_model = child.get("child_model")
child_id = child.get("child_id")
parent_id = child.get("parent_id")
delete_query = get_cascade_delete_query(
session, query, child_model, child_id, parent_id
)
child_rows = delete_query.count()
logging.info(
f"Going to delete {child_rows} rows of {child_model.__name__}."
)
logging.info(list(delete_query.values(child_id)))
if not dry_run:
delete_query.delete(synchronize_session=False)
if dry_run:
logging.info("Dry run finished")
else:
logging.info("Performing main delete...")
query.delete(synchronize_session=False)
session.commit()
logging.info("Cleanup process finished")
for db_object in DATABASE_OBJECTS:
cleanup_op = PythonOperator(
task_id="cleanup_" + str(db_object["airflow_db_model"].__name__),
python_callable=cleanup_function,
params=db_object,
provide_context=True,
dag=dag,
)
def remove_inactive_dags_from_ui(**context):
max_db_age_inactive_dags = int(Variable.get("max_db_age_inactive_dags", 7))
dry_run = Variable.get("dry_run", "True") == "True"
threshold_date = context["execution_date"].subtract(days=max_db_age_inactive_dags)
with create_session(conn_id) as session:
query = (
session.query(DagModel)
.filter(
and_(
DagModel.is_active.is_(False),
DagModel.is_subdag.is_(False),
DagModel.last_parsed_time <= threshold_date,
)
)
.options(load_only(DagModel.dag_id))
)
inactive_dags = query.all()
logging.info(f"Will be deleted {query.count()} dags")
for inactive_dag in inactive_dags:
logging.info(f"Deleting dag {inactive_dag.dag_id} from UI")
if not dry_run:
delete_dag(inactive_dag.dag_id, session=session)
if dry_run:
logging.info("Dry run finished")
else:
session.commit()
logging.info("Cleanup process finished")
remove_inactive_dags_task = PythonOperator(
task_id="remove_inactive_dags_from_ui",
python_callable=remove_inactive_dags_from_ui,
provide_context=True,
dag=dag,
)
Creating a loop of DAGs or tasks is bad practice inside Airflow; I'm not sure it's even possible.
The correct approach would be to have two tasks that run the same cleanup code, passing a different connection through each time. You could even run these tasks in parallel so both DBs are cleared down at the same time rather than one after the other.
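For illustration, here is a minimal sketch of that approach, assuming Airflow 2-style imports; cleanup_function here is a stripped-down stand-in for the question's function, adapted to receive the connection ID via op_kwargs instead of a module-level conn_id:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator

def cleanup_function(conn_id, **context):
    # placeholder for the cleanup logic from the question;
    # conn_id now arrives as an explicit argument
    print(f"Cleaning up metadata DB via connection {conn_id}")

with DAG(
    dag_id="airflow_db_cleanup",
    start_date=datetime(2022, 1, 1),
    schedule_interval=timedelta(days=1),
    max_active_runs=1,
) as dag:
    # one task per target database; with no dependency set between
    # them, the scheduler can run both at the same time
    for conn_id in ("prod_db", "test_db"):
        PythonOperator(
            task_id=f"cleanup_{conn_id}",
            python_callable=cleanup_function,
            op_kwargs={"conn_id": conn_id},
        )
Note the loop creates tasks inside a single DAG rather than looping over DAG objects, so each task gets a unique task_id and the connection is passed explicitly instead of through a shared global.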
Related
I am using a Toshiba FlashAir card to wirelessly upload photos from my drone to my web server via a Lua script installed on the FlashAir card. The card works fine most of the time, but sometimes the photos don't upload. When I connect to the card via my Mac, I get this message: "CONFIG" is damaged and can't be opened.
Not sure why this is happening. The ftpUpload.lua script that does the uploading is listed below:
require("Settings")
local blackList = {}
local dateTable = {}
local parseThreads = 0
local ftpstring = "ftp://"..user..":"..passwd.."#"..server.."/"..serverDir.."/"
local lockdir = "/uploaded/" -- trailing slash required, and folder must already exist
local fileTable = {}
--Options
local sortByFileName = false -- If true sorts alphabetically by file else sorts by modified date
local jpegsOnly = true --Excludes .RAW files if set to true
local function exists(path)
if lfs.attributes(path) then
return true
else
return false
end
end
local function is_uploaded(path)
local hash = fa.hash("md5", path, "")
return exists(lockdir .. hash)
end
local function set_uploaded(path)
local hash = fa.hash("md5", path, "")
local file = io.open(lockdir .. hash, "w")
file:close()
end
local function delete(path)
-- Both of the following methods cause the next photo to be lost / not stored.
fa.remove(path)
-- fa.request("http://127.0.0.1/upload.cgi?DEL="..path)
end
--Format file date to something meaningful
local function FatDateTimeToTable(datetime_fat)
local function getbits(x, from, to)
local mask = bit32.lshift(1, to - from + 1) - 1
local shifted = bit32.rshift(x, from)
return bit32.band(shifted, mask)
end
local fatdate = bit32.rshift(datetime_fat, 16)
local day = getbits(fatdate, 0, 4)
local month = getbits(fatdate, 5, 8)
local year = getbits(fatdate, 9, 15) + 1980
local fattime = getbits(datetime_fat, 0, 15)
local sec = getbits(fattime, 0, 4) * 2
local min = getbits(fattime, 5, 10)
local hour = getbits(fattime, 11, 15)
return {
year = string.format('%02d', year),
month = string.format('%02d', month),
day = string.format('%02d', day),
hour = string.format('%02d', hour),
min = string.format('%02d', min),
sec = string.format('%02d', sec),
}
end
--Looks for value in table
local function exists_in_table(tbl, val)
for i = 1, #tbl do
if (tbl[i] == val) then
return true
end
end
return false
end
local function create_thumbs()
print("Creating Thumbs!")
for i = 1, #dateTable do
local message = "d="..dateTable[i]
local headers = {
["Content-Length"] = string.len(message),
["Content-Type"] = "application/x-www-form-urlencoded"
}
local b, c, h = fa.request{
url = "http://"..server.."/"..serverDir.."/ct.php",
method = "POST",
headers = headers,
body = message
}
end
print("COMPLETE!")
end
local function upload_file(folder, file, subfolder)
local path = folder .. "/" .. file
-- Open the log file
local outfile = io.open(logfile, "a")
outfile:write(file .. " ... ")
local url = ftpstring..subfolder.."/"..file
local response = fa.ftp("put", url, path)
--print("Uploading", url)
--Check to see if it worked, and log the result!
if response ~= nil then
print("Success!")
outfile:write(" Success!\n")
set_uploaded(path)
if delete_after_upload == true then
print("Deleting " .. file)
outfile:write("Deleting " .. file .. "\n")
sleep(1000)
delete(path)
sleep(1000)
end
else
print(" Fail ")
outfile:write(" Fail\n")
end
--Close our log file
outfile:close()
end
local function sort_upload_table()
if (sortByFileName) then
print("Sorting filenames alphabetically")
table.sort(fileTable, function(a,b)
return a.file>b.file
end)
else
print("Sorting filenames by modded date")
table.sort(fileTable, function(a,b)
return a.sortDate>b.sortDate
end)
end
end
local function run_upload()
print("Starting upload")
for i = 1, #fileTable do
local ft = fileTable[i]
print("Uploading:", ft.folder, ft.file, ft.dateString)
upload_file(ft.folder, ft.file, ft.dateString)
end
create_thumbs()
end
local function walk_directory(folder)
parseThreads = parseThreads+1
for file in lfs.dir(folder) do
local path = folder .. "/" .. file
local skip = string.sub( file, 1, 2 ) == "._"
local attr = lfs.attributes(path)
local dt={}
if (not skip) then
print( "Found "..attr.mode..": " .. path )
if attr.mode == "file" then
local dateString = ""
if (attr.modification~=nil) then
dt = FatDateTimeToTable(attr.modification)
dateString = dt.day.."-"..dt.month.."-"..dt.year
end
if not is_uploaded(path) then
if (not exists_in_table(blackList, dateString)) then
local s,f = true, true
if (jpegsOnly) then
s,f = string.find(string.lower(file), ".jpg")
end
if (s and f) then
fileTable[#fileTable+1] =
{
folder = folder,
file = file,
dateString = dateString,
sortDate=dt.year..dt.month..dt.day..dt.hour..dt.min..dt.sec
}
--upload_file(folder, file, dateString)
end
else
print("Skipping ".. dateString.." - Blacklisted")
end
else
print(path .. " previously uploaded, skipping")
end
elseif attr.mode == "directory" then
print("Entering " .. path)
walk_directory(path)
end
end
end
parseThreads = parseThreads-1
if (parseThreads == 0) then
--create_thumbs()
sort_upload_table()
run_upload()
end
end
local function create_folders(folder)
if (#dateTable==0) then
print("ERROR: DID NOT FIND ANY DATES!")
return
end
for i = 1, #dateTable do
local message = "d="..dateTable[i]
local headers = {
["Content-Length"] = string.len(message),
["Content-Type"] = "application/x-www-form-urlencoded"
}
local b, c, h = fa.request{
url = "http://"..server.."/"..serverDir.."/cd.php",
method = "POST",
headers = headers,
body = message
}
if (b~=nil) then
b = string.gsub(b, "\n", "")
b = string.gsub(b, "\r", "")
end
if (b and b == "success") then
print("SUCCESS FROM SERVER FOR FOLDER:"..dateTable[i].."<<")
else
print("FAILED RESPONSE FROM SERVER FOR FOLDER:"..dateTable[i].."<<")
print("ADDING TO BLACKLIST")
blackList[#blackList+1] = dateTable[i]
end
end
print("OK FTP Starting...")
walk_directory(folder)
end
local function get_folders(folder)
parseThreads = parseThreads+1
local tableCount = 1
--Get the date range from the file
for file in lfs.dir(folder) do
local path = folder .. "/" .. file
local skip = string.sub( file, 1, 2 ) == "._"
local attr = lfs.attributes(path)
if (not skip) then
if (attr.mode == "file") then
print( "Datesearch Found "..attr.mode..": " .. path )
local dateString = ""
if (attr.modification~=nil) then
local dt = FatDateTimeToTable(attr.modification)
dateString = dt.day.."-"..dt.month.."-"..dt.year
end
if (not exists_in_table(dateTable, dateString)) then
dateTable[#dateTable+1] = dateString
end
elseif attr.mode == "directory" then
print("Datesearch Entering " .. path)
get_folders(path)
end
end
end
parseThreads = parseThreads-1
if (parseThreads == 0) then
create_folders(folder)
end
end
-- wait for wifi to connect
while string.sub(fa.ReadStatusReg(),13,13) ~= "a" do
print("Wifi not connected. Waiting...")
sleep(1000)
end
sleep(30*1000)
get_folders(folder)
I'm trying to run a list of commands. The problem is that the list can be very long, so having multiple commands run simultaneously would be great.
How do I do this with the multiprocessing module?
import os
import subprocess

list_of_commands = [cmd foo, cmd bar, ...]
main_log_file = open( os.getcwd() + '/Error.log', 'w+')
Count = 0
for Job in list_of_commands:
Count += 1
child = subprocess.Popen(Job, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
streamdata = child.communicate()[0]
errcode = child.returncode
if errcode == 0:
print ( 'Job', Count, 'Success' )
elif errcode == 1:
print ( 'Job', Count, 'Completed With Errors' )
elif errcode == 2:
print ( 'Job', Count, 'Error' )
main_log_file.write ( streamdata.decode('ascii') + str(errcode) + '\n' )
main_log_file.close()
Order of execution doesn't matter
You can use the concurrent.futures.ThreadPoolExecutor map function to run a constant number of parallel executions.
import concurrent.futures
import subprocess

WORKERS = 5 # amount of concurrent executions you want

def launcher(job):
    child = subprocess.Popen(job, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    streamdata = child.communicate()[0]
    ...  # your existing return-code checks and logging go here

with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as pool:
    pool.map(launcher, list_of_commands)
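Since the question asks about the multiprocessing module specifically, here is a near-equivalent sketch using multiprocessing.Pool; the command list is a hypothetical placeholder:
import multiprocessing
import subprocess

def launcher(job):
    # must be a module-level function so the pool can pickle it
    child = subprocess.Popen(job, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    streamdata = child.communicate()[0]
    return job, child.returncode, streamdata

if __name__ == "__main__":
    list_of_commands = [["echo", "foo"], ["echo", "bar"]]  # placeholder commands
    with multiprocessing.Pool(processes=5) as pool:
        for job, errcode, output in pool.map(launcher, list_of_commands):
            print(job, 'exited with', errcode)
For subprocess-heavy work like this, the thread-based version above is usually enough: the child processes already run in parallel, and the Python threads mostly just wait on them.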
I have a list of documents to process, and for each record I want to attach some metadata to the document "member" inside the "corpus" data structure that tm, the R package, generates (from reading in text files).
This for-loop works, but it is very slow; performance seems to degrade as a function f ~ 1/n_docs.
for (i in seq(from= 1, to=length(corpus), by=1)){
if(opts$options$verbose == TRUE || i %% 50 == 0){
print(paste(i, " ", substr(corpus[[i]], 1, 140), sep = " "))
}
DublinCore(corpus[[i]], "title") = csv[[i,10]]
DublinCore(corpus[[i]], "Publisher" ) = csv[[i,16]] #institutions
}
This may do something to the corpus variable, but I don't know what.
But when I put it inside tm_map() (similar to the lapply() function), it runs much faster, but the changes are not made persistent:
i = 0
corpus = tm_map(corpus, function(x){
i <<- i + 1
if(opts$options$verbose == TRUE){
print(paste(i, " ", substr(x, 1, 140), sep = " "))
}
meta(x, tag = "Heading") = csv[[i,10]]
meta(x, tag = "publisher" ) = csv[[i,16]]
})
The corpus variable has empty metadata fields after exiting the tm_map() call, but they should be filled; I have a few other things to do with the collection.
The R documentation for the meta() function says this:
Examples:
data("crude")
meta(crude[[1]])
DublinCore(crude[[1]])
meta(crude[[1]], tag = "Topics")
meta(crude[[1]], tag = "Comment") <- "A short comment."
meta(crude[[1]], tag = "Topics") <- NULL
DublinCore(crude[[1]], tag = "creator") <- "Ano Nymous"
DublinCore(crude[[1]], tag = "Format") <- "XML"
DublinCore(crude[[1]])
meta(crude[[1]])
meta(crude)
meta(crude, type = "corpus")
meta(crude, "labels") <- 21:40
meta(crude)
I tried many of these calls (with the variable corpus instead of crude), but they do not seem to work.
Someone else once seemed to have had the same problem with a similar data set (forum post from 2009, no response).
Here's a bit of benchmarking...
With the for loop :
expr.for <- function() {
for (i in seq(from= 1, to=length(corpus), by=1)){
DublinCore(corpus[[i]], "title") = LETTERS[round(runif(26))]
DublinCore(corpus[[i]], "Publisher" ) = LETTERS[round(runif(26))]
}
}
microbenchmark(expr.for())
# Unit: milliseconds
# expr min lq median uq max
# 1 expr.for() 21.50504 22.40111 23.56246 23.90446 70.12398
With tm_map :
corpus <- crude
expr.map <- function() {
tm_map(corpus, function(x) {
meta(x, "title") = LETTERS[round(runif(26))]
meta(x, "Publisher" ) = LETTERS[round(runif(26))]
x
})
}
microbenchmark(expr.map())
# Unit: milliseconds
# expr min lq median uq max
# 1 expr.map() 5.575842 5.700616 5.796284 5.886589 8.753482
So the tm_map version, as you noticed, seems to be about 4 times faster.
In your question you say that the changes in the tm_map version are not persistent; that is because you don't return x at the end of your anonymous function. The end of it should be:
meta(x, tag = "Heading") = csv[[i,10]]
meta(x, tag = "publisher" ) = csv[[i,16]]
x
I'm doing Jeffrey Breen's R-Hadoop tutorial (October 2012).
At the moment I am trying to populate HDFS and then run the commands Jeffrey published in his tutorial in RStudio. Unfortunately, I ran into some trouble:
UPDATE: I now moved the data folder to:
/home/cloudera/data/hadoop/wordcount (and same for airline-Data)
Now when I run populate.hdfs.sh I get the following output:
[cloudera@localhost ~]$ /home/cloudera/TutorialBreen/bin/populate.hdfs.sh
mkdir: cannot create directory /user/cloudera: File exists
mkdir: cannot create directory /user/cloudera/wordcount: File exists
mkdir: cannot create directory /user/cloudera/wordcount/data: File exists
mkdir: cannot create directory /user/cloudera/airline: File exists
mkdir: cannot create directory /user/cloudera/airline/data: File exists
put: Target /user/cloudera/airline/data/20040325.csv already exists
And then I tried the commands in RStudio as shown in the tutorial but I get errors at the end. Can someone show me what I did wrong?
> if (LOCAL)
+ {
+ rmr.options.set(backend = 'local')
+ hdfs.data.root = 'data/local/airline'
+ hdfs.data = file.path(hdfs.data.root, '20040325-jfk-lax.csv')
+ hdfs.out.root = 'out/airline'
+ hdfs.out = file.path(hdfs.out.root, 'out')
+ if (!file.exists(hdfs.out))
+ dir.create(hdfs.out.root, recursive=T)
+ } else {
+ rmr.options.set(backend = 'hadoop')
+ hdfs.data.root = 'airline'
+ hdfs.data = file.path(hdfs.data.root, 'data')
+ hdfs.out.root = hdfs.data.root
+ hdfs.out = file.path(hdfs.out.root, 'out')
+ }
> asa.csvtextinputformat = make.input.format( format = function(con, nrecs) {
+ line = readLines(con, nrecs)
+ values = unlist( strsplit(line, "\\,") )
+ if (!is.null(values)) {
+ names(values) = c('Year','Month','DayofMonth','DayOfWeek','DepTime','CRSDepTime',
+ 'ArrTime','CRSArrTime','UniqueCarrier','FlightNum','TailNum',
+ 'ActualElapsedTime','CRSElapsedTime','AirTime','ArrDelay',
+ 'DepDelay','Origin','Dest','Distance','TaxiIn','TaxiOut',
+ 'Cancelled','CancellationCode','Diverted','CarrierDelay',
+ 'WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay')
+ return( keyval(NULL, values) )
+ }
+ }, mode='text' )
> mapper.year.market.enroute_time = function(key, val) {
+ if ( !identical(as.character(val['Year']), 'Year')
+ & identical(as.numeric(val['Cancelled']), 0)
+ & identical(as.numeric(val['Diverted']), 0) ) {
+ if (val['Origin'] < val['Dest'])
+ market = paste(val['Origin'], val['Dest'], sep='-')
+ else
+ market = paste(val['Dest'], val['Origin'], sep='-')
+ output.key = c(val['Year'], market)
+ output.val = c(val['CRSElapsedTime'], val['ActualElapsedTime'], val['AirTime'])
+ return( keyval(output.key, output.val) )
+ }
+ }
> reducer.year.market.enroute_time = function(key, val.list) {
+ if ( require(plyr) )
+ val.df = ldply(val.list, as.numeric)
+ else { # this is as close as my deficient *apply skills can come w/o plyr
+ val.list = lapply(val.list, as.numeric)
+ val.df = data.frame( do.call(rbind, val.list) )
+ }
+ colnames(val.df) = c('crs', 'actual','air')
+ output.key = key
+ output.val = c( nrow(val.df), mean(val.df$crs, na.rm=T),
+ mean(val.df$actual, na.rm=T),
+ mean(val.df$air, na.rm=T) )
+ return( keyval(output.key, output.val) )
+ }
> mr.year.market.enroute_time = function (input, output) {
+ mapreduce(input = input,
+ output = output,
+ input.format = asa.csvtextinputformat,
+ output.format='csv', # note to self: 'csv' for data, 'text' for debugging
+ map = mapper.year.market.enroute_time,
+ reduce = reducer.year.market.enroute_time,
+ backend.parameters = list(
+ hadoop = list(D = "mapred.reduce.tasks=2")
+ ),
+ verbose=T)
+ }
> out = mr.year.market.enroute_time(hdfs.data, hdfs.out)
Error in file(f, if (format$mode == "text") "r" else "rb") :
cannot open the connection
In addition: Warning message:
In file(f, if (format$mode == "text") "r" else "rb") :
cannot open file 'data/local/airline/20040325-jfk-lax.csv': No such file or directory
> if (LOCAL)
+ {
+ results.df = as.data.frame( from.dfs(out, structured=T) )
+ colnames(results.df) = c('year', 'market', 'flights', 'scheduled', 'actual', 'in.air')
+ print(head(results.df))
+ }
Error in to.dfs.path(input) : object 'out' not found
Thank you so much!
First of all, it looks like the command:
/usr/bin/hadoop fs -mkdir /user/cloudera/wordcount/data
is being split into multiple lines. Make sure you're entering it as-is.
Also, it is saying that the local directory data/hadoop/wordcount does not exist. Verify that you're running this command from the correct directory and that your local data is where you expect it to be.
I need to build a WHERE clause at runtime, but I need to OR the conditions together. Is this possible?
Here is my code. Basically, "filter" is a bitwise flags enum, so filter could match more than one of the following values; hence I need to build up the where clause.
If I execute the WHEREs separately, then imagine I filter on Untested first and it returns 0 records; I then can't apply a WHERE for Tested, because there are now 0 records.
I will put some pseudo-code below:
string myWhere = "";
if ((filter & Filters.Tested) == Filters.Tested)
{
    if (myWhere != "") myWhere += " || ";
    myWhere += "Status == \"Tested\"";
}
if ((filter & Filters.Untested) == Filters.Untested)
{
    if (myWhere != "") myWhere += " || ";
    myWhere += "Status == \"Untested\"";
}
if ((filter & Filters.Failed) == Filters.Failed)
{
    if (myWhere != "") myWhere += " || ";
    myWhere += "Status == \"Failed\"";
}
// dataApplications = a List of items that include Tested,Failed and Untested.
// dataApplication.Where ( myWhere) --- Confused here!
Is this possible?
I don't want to include lots of "if"s, because there are many combinations: no filter, filter = Tested only, filter = Untested and Tested, and lots more.
If you have this:
IEnumerable<MyType> res = from p in myquery select p;
you can define a list of predicates and apply them like this:
var conditions = new List<Func<MyType, bool>>();
conditions.Add(p => p.PropertyOne == 1);
conditions.Add(p => p.PropertyTwo == 2);
res = res.Where(p => conditions.Any(q => q(p)));
And now the trick for making lists of Funcs over anonymous objects (you can easily adapt it to "extract" the type of the anonymous objects):
static List<Func<T, bool>> MakeList<T>(IEnumerable<T> elements)
{
return new List<Func<T, bool>>();
}
You call it by passing the result of a LINQ query, like so:
var res = from p in elements select new { Id = p.Id, Val = p.Value };
var conditions = MakeList(res);
var statusTexts = new List<string>(); // Add desired status texts
dataApplication.Where(item =>
statusTexts.Any(status => item.Status == status))
Use HashSet<> for the statuses; then .Contains will be O(1) instead of the usual O(n) for List<>:
var statuses = new HashSet<string>() {"a", "b", "c"};
var list = new[] {
new { Id = 1, status = "a"},
new { Id = 2, status = "b"},
new { Id = 3, status = "z"}
};
var filtered = list.Where(l => statuses.Contains(l.status));