I'm trying to create a gold table notebook in Databricks, but at the current rate it would take 9 days to fully reprocess the historical data (43 GB, 35k parquet files). I tried scaling up the cluster, but throughput doesn't go above 5000 records/second. The bottleneck seems to be the applyInPandas() function. I'm wondering whether I could replace pandas with something else to make the gold notebook execute faster.
The silver table has 60 columns (read_id, reader_id, tracker_timestamp, event_type, ebook_id, page_id, agent_ip, agent_device_type, ...). Each row represents a read event of an ebook, e.g. 'page turn', 'click on image', 'click on link', ... All of the events that occurred in a single session share the same read_id. In the gold table I'm trying to group those events into sessions and count how many times each event occurred in the session. So instead of 100+ rows per read session in the silver table, I would end up with just a single aggregated row in the gold table.
Input is the silver delta table:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
from pyspark.sql.functions import pandas_udf
input = (spark
    .readStream
    .format("delta")
    .option("withEventTimeOrder", "true")
    .option("maxFilesPerTrigger", 100)
    .load(f"path_to_silver_bucket")
)
I use withWatermark and the session_window function to make sure I group all of the events from a single read session (a read session automatically ends 30 minutes after the last reader activity):
group = (input
    .withWatermark("tracker_timestamp", "10 minutes")
    .groupBy("read_id", F.session_window(input.tracker_timestamp, "30 minutes")))
In the next step I use the applyInPandas function like so:
sessions = group.applyInPandas(processing_function, schema=processing_function_output_schema)
Definition of the processing_function used in applyInPandas:
def processing_function(df):
    surf_time_ms = df.query('event_type == "surf"')['duration'].sum()
    immerse_time_ms = df.query('event_type == "immersion"')['duration'].sum()
    min_timestamp = df['tracker_timestamp'].min()
    max_timestamp = df['tracker_timestamp'].max()
    shares = len(df.query('event_type == "share"'))
    leads = len(df.query('event_type == "lead_store"'))
    is_read = len(df.query('event_type == "surf"')) > 0
    distinct_pages = df['page_id'].nunique()
    data = {
        "read_id": df['read_id'].values[0],
        "surf_time_ms": surf_time_ms,
        "immerse_time_ms": immerse_time_ms,
        "min_timestamp": min_timestamp,
        "max_timestamp": max_timestamp,
        "shares": shares,
        "leads": leads,
        "is_read": is_read,
        "number_of_events": len(df),
        "distinct_pages": distinct_pages
    }
    for field in not_calculated_string_fields:
        data[field] = df[field].values[0]
    new_df = pd.DataFrame(data=data, index=['read_id'])
    for x in all_events:
        new_df[f"count_{x}"] = df.query(f"type == '{x}'").count()
    for x in duration_events:
        duration = df.query(f"event_type == '{x}'")['duration']
        duration_sum = duration.sum()
        new_df[f"duration_{x}_ms"] = duration_sum
        if duration_sum > 0:
            new_df[f"mean_duration_{x}_ms"] = duration.mean()
        else:
            new_df[f"mean_duration_{x}_ms"] = 0
    return new_df
And finally, I'm writing the calculated row to the gold table like so:
for_partitioning = (sessions
    .withColumn("tenant", F.col("story_tenant"))
    .withColumn("year", F.year(F.col("min_timestamp")))
    .withColumn("month", F.month(F.col("min_timestamp"))))

checkpoint_path = "checkpoint-path"
gold_path = f"gold-bucket"

(for_partitioning
    .writeStream
    .format('delta')
    .partitionBy('year', 'month', 'tenant')
    .option("mergeSchema", "true")
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .start(gold_path))
Can anybody think of a more efficient way to do a UDF in PySpark than applyInPandas for the above example? I simply cannot afford to wait 9 days to reprocess 43GB of data...
I've tried playing around with different input and output options (e.g. .option("maxFilesPerTrigger", 100)) but the real problem seems to be applyInPandas.
You could rewrite your processing_function into native Spark aggregations if you really wanted; that keeps all the work on the JVM instead of shipping every group to Python as a pandas DataFrame, which is what makes applyInPandas slow here.
"read_id": df['read_id'].values[0]
F.first('read_id').alias('read_id')
"surf_time_ms": df.query('event_type == "surf"')['duration'].sum()
F.sum(F.when(F.col('event_type') == 'surf', F.col('duration'))).alias('surf_time_ms')
"immerse_time_ms": df.query('event_type == "immersion"')['duration'].sum()
F.sum(F.when(F.col('event_type') == 'immersion', F.col('duration'))).alias('immerse_time_ms')
"min_timestamp": df['tracker_timestamp'].min()
F.min('tracker_timestamp').alias('min_timestamp')
"max_timestamp": df['tracker_timestamp'].max()
F.max('tracker_timestamp').alias('max_timestamp')
"shares": len(df.query('event_type == "share"'))
F.count(F.when(F.col('event_type') == 'share', F.lit(1))).alias('shares')
"leads": len(df.query('event_type == "lead_store"'))
F.count(F.when(F.col('event_type') == 'lead_store', F.lit(1))).alias('leads')
"is_read": len(df.query('event_type == "surf"')) > 0
(F.count(F.when(F.col('event_type') == 'surf', F.lit(1))) > 0).alias('is_read')
"number_of_events": len(df)
F.count(F.lit(1)).alias('number_of_events')
"distinct_pages": df['page_id'].nunique()
F.countDistinct('page_id').alias('distinct_pages')
for field in not_calculated_string_fields:
    data[field] = df[field].values[0]
*[F.first(field).alias(field) for field in not_calculated_string_fields]
for x in all_events:
    new_df[f"count_{x}"] = df.query(f"type == '{x}'").count()
The above can probably be skipped? As far as my tests go, the new columns get NaN values, because .count() returns a per-column Series instead of a single value.
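A tiny standalone pandas example (hypothetical data, mirroring the index=['read_id'] construction above) of why that assignment comes out as NaN:

import pandas as pd

df = pd.DataFrame({"event_type": ["surf", "share"], "duration": [100, 200]})

counts = df.query("event_type == 'surf'").count()
# counts is a per-column Series (event_type: 1, duration: 1), not a scalar

new_df = pd.DataFrame({"read_id": ["abc"]}, index=["read_id"])
new_df["count_surf"] = counts                                        # aligns on index -> NaN
new_df["count_surf_fixed"] = len(df.query("event_type == 'surf'"))   # scalar -> 1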
for x in duration_events:
    duration = df.query(f"event_type == '{x}'")['duration']
    duration_sum = duration.sum()
    new_df[f"duration_{x}_ms"] = duration_sum
    if duration_sum > 0:
        new_df[f"mean_duration_{x}_ms"] = duration.mean()
    else:
        new_df[f"mean_duration_{x}_ms"] = 0
*[F.sum(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"duration_{x}_ms") for x in duration_events]
*[F.mean(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"mean_duration_{x}_ms") for x in duration_events]
So, instead of
def processing_function(df):
    ...
    ...

sessions = group.applyInPandas(processing_function, schema=processing_function_output_schema)
you could use efficient native Spark:
sessions = group.agg(
    F.first('read_id').alias('read_id'),
    F.sum(F.when(F.col('event_type') == 'surf', F.col('duration'))).alias('surf_time_ms'),
    F.sum(F.when(F.col('event_type') == 'immersion', F.col('duration'))).alias('immerse_time_ms'),
    F.min('tracker_timestamp').alias('min_timestamp'),
    F.max('tracker_timestamp').alias('max_timestamp'),
    F.count(F.when(F.col('event_type') == 'share', F.lit(1))).alias('shares'),
    F.count(F.when(F.col('event_type') == 'lead_store', F.lit(1))).alias('leads'),
    (F.count(F.when(F.col('event_type') == 'surf', F.lit(1))) > 0).alias('is_read'),
    F.count(F.lit(1)).alias('number_of_events'),
    F.countDistinct('page_id').alias('distinct_pages'),
    *[F.first(field).alias(field) for field in not_calculated_string_fields],
    # skipped count_{x}
    *[F.sum(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"duration_{x}_ms") for x in duration_events],
    *[F.mean(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"mean_duration_{x}_ms") for x in duration_events],
)
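To show how this slots into the rest of the streaming job, here is a rough sketch that reuses the names from the question (input, checkpoint_path, gold_path, not_calculated_string_fields, story_tenant); only the applyInPandas step changes. Note that read_id already comes out of the groupBy keys, so the F.first('read_id') aggregation from the list above can be dropped to avoid a duplicate column:

import pyspark.sql.functions as F

# Same watermark and session window as in the question, with a native agg instead of applyInPandas.
sessions = (
    input
    .withWatermark("tracker_timestamp", "10 minutes")
    .groupBy("read_id", F.session_window("tracker_timestamp", "30 minutes"))
    .agg(
        F.min("tracker_timestamp").alias("min_timestamp"),
        F.max("tracker_timestamp").alias("max_timestamp"),
        F.count(F.lit(1)).alias("number_of_events"),
        *[F.first(f).alias(f) for f in not_calculated_string_fields],
        # ... plus the remaining aggregations listed above ...
    )
)

# The write side stays exactly as in the question.
(sessions
    .withColumn("tenant", F.col("story_tenant"))
    .withColumn("year", F.year("min_timestamp"))
    .withColumn("month", F.month("min_timestamp"))
    .writeStream
    .format("delta")
    .partitionBy("year", "month", "tenant")
    .option("mergeSchema", "true")
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .start(gold_path))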
I'm not sure if I'm barking up the wrong tree, but I want to create a function that checks whether an account exists based on any combination of optional parameters, so it can be reused to pull data for whichever value you'd like to check on.
Basically the query should be:
where loginName = p1 or authenticatorId = p2 or employeeId = p3, but with the parameters all being optional; however, at least one will always be provided.
This is what I tried so far:
public async Task<bool> CheckAccountExistsAsync(string loginName = "", string authenticatorId = "", string eId = "")
{
    if (string.IsNullOrWhiteSpace(loginName) && string.IsNullOrWhiteSpace(authenticatorId) && string.IsNullOrWhiteSpace(eId))
        throw new InvalidOperationException("You must pass at least one parameter");

    return await _context.Accounts.AnyAsync(a =>
        (string.IsNullOrWhiteSpace(loginName) || a.LoginName == loginName)
        || (string.IsNullOrWhiteSpace(authenticatorId) || a.AuthenticatorId == authenticatorId)
        || (string.IsNullOrWhiteSpace(eId) || a.EmployeeId == eId));
}
The problem with this approach is that if I only pass loginName, the generated query omits the condition completely:
SELECT CASE
    WHEN EXISTS (
        SELECT 1
        FROM [Accounts] AS [a]) THEN CAST(1 AS bit)
    ELSE CAST(0 AS bit)
END
I'm sure I'm missing something, is there a better approach?
The pattern you are using is the one for optional AND conditions, e.g.
return await _context.Accounts.AnyAsync(a =>
    (string.IsNullOrWhiteSpace(loginName) || a.LoginName == loginName)
    && (string.IsNullOrWhiteSpace(authenticatorId) || a.AuthenticatorId == authenticatorId)
    && (string.IsNullOrWhiteSpace(eId) || a.EmployeeId == eId));
For an optional OR you have to flip it: each sub-condition requires that the parameter is present AND matches, plus an additional check for the case where all optional parameters are missing, e.g.
return await _context.Accounts.AnyAsync(a =>
    (string.IsNullOrWhiteSpace(loginName)
        && string.IsNullOrWhiteSpace(authenticatorId)
        && string.IsNullOrWhiteSpace(eId))
    || (!string.IsNullOrWhiteSpace(loginName) && a.LoginName == loginName)
    || (!string.IsNullOrWhiteSpace(authenticatorId) && a.AuthenticatorId == authenticatorId)
    || (!string.IsNullOrWhiteSpace(eId) && a.EmployeeId == eId));
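Putting that predicate back into the method from the question gives roughly the following (a sketch; it keeps the question's guard clause, so the "all parameters missing" branch of the predicate is omitted as redundant):

public async Task<bool> CheckAccountExistsAsync(string loginName = "", string authenticatorId = "", string eId = "")
{
    if (string.IsNullOrWhiteSpace(loginName) && string.IsNullOrWhiteSpace(authenticatorId) && string.IsNullOrWhiteSpace(eId))
        throw new InvalidOperationException("You must pass at least one parameter");

    // Each OR branch only participates when its parameter was actually supplied,
    // so an empty parameter can no longer make the whole predicate true.
    return await _context.Accounts.AnyAsync(a =>
        (!string.IsNullOrWhiteSpace(loginName) && a.LoginName == loginName)
        || (!string.IsNullOrWhiteSpace(authenticatorId) && a.AuthenticatorId == authenticatorId)
        || (!string.IsNullOrWhiteSpace(eId) && a.EmployeeId == eId));
}

// Usage: any subset of parameters works, e.g.
// bool exists = await CheckAccountExistsAsync(loginName: "jdoe");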
I have some simple elasticsearch-dsl-py code. Everything works fine, even the tags, except number_of_fragments: the response keeps returning multiple highlight fragments for a single word.
def name_autocomplete(q):
    multi_match = MultiMatch(query=q, fields=ngram_fields, fuzziness='AUTO', tie_breaker=1)
    s = s.query(multi_match)[0:5] \
        .highlight_options(pre_tags='<b>', post_tags='</b>',
                           number_of_fragments=1,  # This is not working
                           order='score') \
        .highlight('name.ngram') \
        .execute()
    response_dict = s.to_dict()
    return Response(response_dict)
What am I doing wrong?
ADDED:
Analyzer:
edge_ngram_completion = analyzer(
    "edge_ngram_completion",
    tokenizer=tokenizer('trigram', 'nGram', min_gram=3, max_gram=4),
    filter=["lowercase"]
)
Document:
class TerminalDocument(Document):
    """Document Elasticsearch document."""

    id = fields.IntegerField(attr='id')
    name = fields.TextField(
        fields={
            'suggest': fields.TextField(analyzer=ascii_fold),
            'ngram': fields.TextField(analyzer=edge_ngram_completion)
        }
    )
Example of highlighting for foobar:
"<b>foo</b>something<b>bar</b>"
Please suggest a better (more Ruby-ish) way to summarise multiple entries in one go, given the code below:
def summary_totals
  pay_paid, pay_unpaid = 0, 0
  rec_paid, rec_unpaid = 0, 0
  net_paid, net_unpaid = 0, 0

  summary_entries.each do |proj_summary|
    pay_paid += proj_summary.payable.paid || 0
    pay_unpaid += proj_summary.payable.unpaid || 0
    rec_paid += proj_summary.receivable.paid || 0
    rec_unpaid += proj_summary.receivable.unpaid || 0
    net_paid += proj_summary.net.paid || 0
    net_unpaid += proj_summary.net.unpaid || 0
  end

  pay = PaidUnpaidEntry.new(pay_paid, pay_unpaid)
  rec = PaidUnpaidEntry.new(rec_paid, rec_unpaid)
  net = PaidUnpaidEntry.new(net_paid, net_unpaid)
  ProjectPaymentsSummary.new(pay, rec, net)
end
Update: all you need to do is rewrite the each loop (which sums up six variables) in better Ruby style.
"Better" could be subjective, but I guess you want to use inject to do the summing. The symbol argument to inject can be used to make it nice and concise. If you pass the result directly to your constructors, there's no need for the local variables, eg:
pay = PaidUnpaidEntry.new(
  summary_entries.map { |e| e.payable.paid }.inject(:+),
  summary_entries.map { |e| e.payable.unpaid }.inject(:+)
)
# etc
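Applied to the whole method, that might look something like this (a sketch that keeps the original's || 0 nil handling and assumes the same PaidUnpaidEntry / ProjectPaymentsSummary constructors as in the question):

def summary_totals
  # Sum one attribute (e.g. payable.paid) across all entries, treating nil as 0.
  sum = ->(field, kind) { summary_entries.map { |e| e.send(field).send(kind) || 0 }.inject(0, :+) }

  ProjectPaymentsSummary.new(
    PaidUnpaidEntry.new(sum.call(:payable, :paid),    sum.call(:payable, :unpaid)),
    PaidUnpaidEntry.new(sum.call(:receivable, :paid), sum.call(:receivable, :unpaid)),
    PaidUnpaidEntry.new(sum.call(:net, :paid),        sum.call(:net, :unpaid))
  )
end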
I'm using the pg gem to talk to PostgreSQL from Ruby. Is there a better way to check if there are no results than using res.ntuples == 0?
conn = PGconn.connect config
cmd = "select * from labels inner join labels_mail using(label_id) " +
      "where labels_mail.mail_id = $1 and labels.name = $2"
res = conn.exec(cmd, [mail_id, mailbox])
if res.ntuples == 0 # <=== is there a better way to check this?
  cmd = "insert into labels_mail (mail_id, label_id) values ($1, $2)"
  conn.exec(cmd, [mail_id, label_id(mailbox)])
end
ntuples, a typo? (In the pg gem, num_tuples is just an alias for ntuples.) It is more concise to use zero? than == 0:
if res.num_tuples.zero?
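Applied to the snippet from the question (a sketch using the same variables), the check becomes:

res = conn.exec(cmd, [mail_id, mailbox])
if res.ntuples.zero?  # num_tuples would work identically; it is an alias for ntuples
  cmd = "insert into labels_mail (mail_id, label_id) values ($1, $2)"
  conn.exec(cmd, [mail_id, label_id(mailbox)])
end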