applyInPandas() aggregation runs slowly on big delta table - performance

I'm trying to create a gold table notebook in Databricks, but at the current rate it would take 9 days to fully reprocess the historical data (43 GB, 35k parquet files). I tried scaling up the cluster, but throughput doesn't go above 5,000 records/second. The bottleneck seems to be the applyInPandas() function, so I'm wondering whether I could replace pandas with something else to make the gold notebook execute faster.
The silver table has 60 columns (read_id, reader_id, tracker_timestamp, event_type, ebook_id, page_id, agent_ip, agent_device_type, ...). Each row represents a read event of an ebook, e.g. 'page turn', 'click on image', 'click on link', etc. All of the events that occurred in a single session share the same read_id. In the gold table I'm trying to group those events into sessions and calculate the number of times each event occurred in a single session. So instead of 100+ rows of data per read session in the silver table, I would end up with a single aggregated row in the gold table.
Input is the silver delta table:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
from pyspark.sql.functions import pandas_udf
input = (spark
    .readStream
    .format("delta")
    .option("withEventTimeOrder", "true")
    .option("maxFilesPerTrigger", 100)
    .load(f"path_to_silver_bucket")
)
I use withWatermark and the session_window function to make sure I group all of the events from a single read session (a read session automatically ends 30 minutes after the last reader activity):
group = input.withWatermark("tracker_timestamp", "10 minutes").groupBy("read_id", F.session_window(input.tracker_timestamp, "30 minutes"))
In the next step I use the applyInPandas function like so:
sessions = group.applyInPandas(processing_function, schema=processing_function_output_schema)
Definition of the processing_function used in applyInPandas:
def processing_function(df):
    surf_time_ms = df.query('event_type == "surf"')['duration'].sum()
    immerse_time_ms = df.query('event_type == "immersion"')['duration'].sum()
    min_timestamp = df['tracker_timestamp'].min()
    max_timestamp = df['tracker_timestamp'].max()
    shares = len(df.query('event_type == "share"'))
    leads = len(df.query('event_type == "lead_store"'))
    is_read = len(df.query('event_type == "surf"')) > 0
    distinct_pages = df['page_id'].nunique()
    data = {
        "read_id": df['read_id'].values[0],
        "surf_time_ms": surf_time_ms,
        "immerse_time_ms": immerse_time_ms,
        "min_timestamp": min_timestamp,
        "max_timestamp": max_timestamp,
        "shares": shares,
        "leads": leads,
        "is_read": is_read,
        "number_of_events": len(df),
        "distinct_pages": distinct_pages
    }
    for field in not_calculated_string_fields:
        data[field] = df[field].values[0]
    new_df = pd.DataFrame(data=data, index=['read_id'])
    for x in all_events:
        new_df[f"count_{x}"] = df.query(f"type == '{x}'").count()
    for x in duration_events:
        duration = df.query(f"event_type == '{x}'")['duration']
        duration_sum = duration.sum()
        new_df[f"duration_{x}_ms"] = duration_sum
        if duration_sum > 0:
            new_df[f"mean_duration_{x}_ms"] = duration.mean()
        else:
            new_df[f"mean_duration_{x}_ms"] = 0
    return new_df
And finally, I'm writing the calculated row to the gold table like so:
for_partitioning = (sessions
    .withColumn("tenant", F.col("story_tenant"))
    .withColumn("year", F.year(F.col("min_timestamp")))
    .withColumn("month", F.month(F.col("min_timestamp"))))

checkpoint_path = "checkpoint-path"
gold_path = f"gold-bucket"

(for_partitioning
    .writeStream
    .format('delta')
    .partitionBy('year', 'month', 'tenant')
    .option("mergeSchema", "true")
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .start(gold_path))
Can anybody think of a more efficient way to do a UDF in PySpark than applyInPandas for the above example? I simply cannot afford to wait 9 days to reprocess 43GB of data...
I've tried playing around with different input and output options (e.g. .option("maxFilesPerTrigger", 100)) but the real problem seems to be applyInPandas.

You could rewrite your processing_function into native Spark if you really wanted.
"read_id": df['read_id'].values[0]
F.first('read_id').alias('read_id')
"surf_time_ms": df.query('event_type == "surf"')['duration'].sum()
F.sum(F.when(F.col('event_type') == 'surf', F.col('duration'))).alias('surf_time_ms')
"immerse_time_ms": df.query('event_type == "immersion"')['duration'].sum()
F.sum(F.when(F.col('event_type') == 'immersion', F.col('duration'))).alias('immerse_time_ms')
"min_timestamp": df['tracker_timestamp'].min()
F.min('tracker_timestamp').alias('min_timestamp')
"max_timestamp": df['tracker_timestamp'].max()
F.max('tracker_timestamp').alias('max_timestamp')
"shares": len(df.query('event_type == "share"'))
F.count(F.when(F.col('event_type') == 'share', F.lit(1))).alias('shares')
"leads": len(df.query('event_type == "lead_store"'))
F.count(F.when(F.col('event_type') == 'lead_store', F.lit(1))).alias('leads')
"is_read": len(df.query('event_type == "surf"')) > 0
(F.count(F.when(F.col('event_type') == 'surf', F.lit(1))) > 0).alias('is_read')
"number_of_events": len(df)
F.count(F.lit(1)).alias('number_of_events')
"distinct_pages": df['page_id'].nunique()
F.countDistinct('page_id').alias('distinct_pages')
for field in not_calculated_string_fields:
data[field] = df[field].values[0]
*[F.first(field).alias(field) for field in not_calculated_string_fields]
for x in all_events:
new_df[f"count_{x}"] = df.query(f"type == '{x}'").count()
The above can probably be skipped: as far as my tests go, the new columns end up with NaN values, because .count() returns a Series object instead of a single value.
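If you do want those per-event counts, the same conditional-count pattern works natively; a sketch, assuming the query was meant to filter on event_type rather than type:
*[F.count(F.when(F.col('event_type') == x, F.lit(1))).alias(f"count_{x}") for x in all_events]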
for x in duration_events:
duration = df.query(f"event_type == '{x}'")['duration']
duration_sum = duration.sum()
new_df[f"duration_{x}_ms"] = duration_sum
if duration_sum > 0:
new_df[f"mean_duration_{x}_ms"] = duration.mean()
else:
new_df[f"mean_duration_{x}_ms"] = 0
*[F.sum(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"duration_{x}_ms") for x in duration_events]
*[F.mean(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"mean_duration_{x}_ms") for x in duration_events]
So, instead of
def processing_function(df):
...
...
sessions = group.applyInPandas(processing_function, schema=processing_function_output_schema)
you could use efficient native Spark:
sessions = group.agg(
    F.first('read_id').alias('read_id'),
    F.sum(F.when(F.col('event_type') == 'surf', F.col('duration'))).alias('surf_time_ms'),
    F.sum(F.when(F.col('event_type') == 'immersion', F.col('duration'))).alias('immerse_time_ms'),
    F.min('tracker_timestamp').alias('min_timestamp'),
    F.max('tracker_timestamp').alias('max_timestamp'),
    F.count(F.when(F.col('event_type') == 'share', F.lit(1))).alias('shares'),
    F.count(F.when(F.col('event_type') == 'lead_store', F.lit(1))).alias('leads'),
    (F.count(F.when(F.col('event_type') == 'surf', F.lit(1))) > 0).alias('is_read'),
    F.count(F.lit(1)).alias('number_of_events'),
    F.countDistinct('page_id').alias('distinct_pages'),
    *[F.first(field).alias(field) for field in not_calculated_string_fields],
    # skipped count_{x}
    *[F.sum(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"duration_{x}_ms") for x in duration_events],
    *[F.mean(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"mean_duration_{x}_ms") for x in duration_events],
)
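This aggregation runs entirely in Spark's native execution engine, so there is no Arrow serialization and no Python worker invoked per session group, which is where applyInPandas spends most of its time. The grouping columns (read_id and the session_window struct) are carried through to the aggregated output, so if you need the session bounds you can pull them out afterwards; a minimal sketch, not tested against your schema:
sessions = (sessions
    .withColumn("session_start", F.col("session_window.start"))
    .withColumn("session_end", F.col("session_window.end")))
The rest of the pipeline (the partitioning columns and the writeStream into the gold table) can stay exactly as it is.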

Related

Improving code speed with multiprocessing

I'm self-studying Python and this is my first piece of code.
I'm working on analyzing logs from our servers. Usually I need to analyze a full day of logs. I created a script (this is an example with simplified logic) just to check the speed. With plain sequential code, analyzing 20 million rows takes about 12-13 minutes. I need to process 200 million rows in 5 minutes.
What I tried:
Multiprocessing (I hit an issue with shared memory and think I fixed it). The result: 300K rows = 20 seconds, no matter how many processes I use. (PS: I also need to control the processor count in advance.)
Threading (I found it gives no speedup: 300K rows = 2 seconds, but the normal code is the same, 300K = 2 seconds).
Asyncio (I thought the script was slow because it needs to read many files). The result is the same as threading: 300K = 2 seconds.
In the end I think all three of my scripts are incorrect and didn't work properly.
PS: I try to avoid using specific Python modules (like pandas) because that would make it more difficult to run on different servers; it's better to stick to the standard library.
Please help me check the first one: multiprocessing.
import csv
import os
from multiprocessing import Process, Queue, Value, Manager

file = {"hcs.log", "hcs1.log", "hcs2.log", "hcs3.log"}

def argument(m, a, n):
    proc_num = os.getpid()
    a_temp_m = a["vod_miss"]
    a_temp_h = a["vod_hit"]
    with open(os.getcwd() + '/' + m, newline='') as hcs_1:
        hcs_2 = csv.reader(hcs_1, delimiter=' ')
        for j in hcs_2:
            if j[3].find('MISS') != -1:
                a_temp_m[n] = a_temp_m[n] + 1
            elif j[3].find('HIT') != -1:
                a_temp_h[n] = a_temp_h[n] + 1
    a["vod_miss"][n] = a_temp_m[n]
    a["vod_hit"][n] = a_temp_h[n]

if __name__ == '__main__':
    procs = []
    manager = Manager()
    vod_live_cuts = manager.dict()
    i = "vod_hit"
    ii = "vod_miss"
    cpu = 1
    n = 1
    vod_live_cuts[i] = manager.list([0] * cpu)
    vod_live_cuts[ii] = manager.list([0] * cpu)
    for m in file:
        proc = Process(target=argument, args=(m, vod_live_cuts, (n-1)))
        procs.append(proc)
        proc.start()
        if n >= cpu:
            n = 1
            proc.join()
        else:
            n += 1
    [proc.join() for proc in procs]
    [proc.close() for proc in procs]
I expect each file passed to def argument to be processed by an independent process, and finally all results to be saved in the dict vod_live_cuts. For each process I added an independent list to the dict; I thought that would help with cross-process use of this parameter, but maybe it's the wrong way :(
Using IPC is costly, so only use "shared objects" for saving the final result, not for intermediate results while parsing the file.
Limiting the number of processes is done with a multiprocessing.Pool. The following code uses it to reach the maximum hard-disk speed; you only need to post-process the results.
You can only parse data as fast as your HDD can read it (typically 30-80 MB/s), so if you need to improve performance further you should use an SSD or RAID0 for higher disk speed; you cannot get much faster than this without changing your hardware.
import csv
import os
from multiprocessing import Process, Queue, Value, Manager, Pool

file = {"hcs.log", "hcs1.log", "hcs2.log", "hcs3.log"}

def argument(m, a):
    proc_num = os.getpid()
    a_temp_m_n = 0  # make it local to process
    a_temp_h_n = 0  # as shared lists use IPC
    with open(os.getcwd() + '/' + m, newline='') as hcs_1:
        hcs_2 = csv.reader(hcs_1, delimiter=' ')
        for j in hcs_2:
            if j[3].find('MISS') != -1:
                a_temp_m_n = a_temp_m_n + 1
            elif j[3].find('HIT') != -1:
                a_temp_h_n = a_temp_h_n + 1
    a["vod_miss"].append(a_temp_m_n)
    a["vod_hit"].append(a_temp_h_n)

if __name__ == '__main__':
    manager = Manager()
    vod_live_cuts = manager.dict()
    i = "vod_hit"
    ii = "vod_miss"
    cpu = 1
    vod_live_cuts[i] = manager.list()
    vod_live_cuts[ii] = manager.list()
    with Pool(cpu) as pool:
        tasks = []
        for m in file:
            task = pool.apply_async(argument, args=(m, vod_live_cuts))
            tasks.append(task)
        for task in tasks:
            task.get()
    print(list(vod_live_cuts[i]))
    print(list(vod_live_cuts[ii]))
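The post-processing mentioned above is just reducing the per-file counts collected in the manager lists; a minimal sketch using the same variable names:
# combine the per-file results gathered by the worker processes
total_miss = sum(vod_live_cuts["vod_miss"])
total_hit = sum(vod_live_cuts["vod_hit"])
print(f"MISS: {total_miss}, HIT: {total_hit}")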

How do I get historical candlestick data or kline from Phemex Public API?

I need to be able to extract historical candlestick data (Open, Close, High, Low, and Volume) at differing intervals (1m, 3m, 5m, 1H, etc.) and at specified timestamps from Phemex.
Other exchanges, such as Binance or FTX, seem to provide REST/WebSocket APIs for this, yet I can't find one for Phemex. Mind helping me resolve this issue? Thank you so much.
Steps I have taken, yet found no resolution:
Went to https://phemex.com/user-guides/api-overview
Went to https://github.com/phemex/phemex-api-docs/blob/master/Public-Contract-API-en.md
None of the items listed in 'Market Data API List' seem to do the task
This code will fetch the candles with ccxt and save them to a CSV file. Hope this helps :)
import csv
import time

import ccxt
import numpy as np

exchange = ccxt.phemex({
    'options': {'defaultType': 'swap'},
    'enableRateLimit': True
})
# Load the markets
markets = exchange.load_markets()
curent_time = int(time.time() * 1000)
one_min = 60000

def get_all_candels(symbol, start_time, stop_time):
    counter = 0
    candel_counter = 0
    data_set = []
    t = 0
    while t < stop_time:
        if data_set == []:
            block = exchange.fetch_ohlcv(symbol, '1m', start_time)
            for candle in block:
                if candle == []:
                    break
                data_set.append(candle)
            last_time_in_block = block[-1][0]
            counter += 1
            candel_counter += len(block)
            print(f'{counter} - {block[0]} - {candel_counter} - {last_time_in_block}')
        if data_set != []:
            t = last_time_in_block + one_min
            block = exchange.fetch_ohlcv(symbol, '1m', t)
            if block == []:
                break
            for candle in block:
                if candle == []:
                    break
                data_set.append(candle)
            last_time_in_block = block[-1][0]
            candel_counter += len(block)
            counter += 1
            print(f'{counter} - {block[0]} - {candel_counter} - {last_time_in_block}')
            time.sleep(1)
    return data_set

data_set = get_all_candels('BTCUSD', 1574726400000, curent_time)
print(np.shape(data_set))

with open('raw.csv', 'w', newline='') as csv_file:
    column_names = ['time', 'open', 'high', 'low', 'close', 'volume']
    csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
    csv_writer.writeheader()
    for candel in data_set:
        csv_writer.writerow({
            'time': candel[0],
            'open': candel[1],
            'high': candel[2],
            'low': candel[3],
            'close': candel[4],
            'volume': candel[5]
        })

Technical Analysis (MACD) for crypto trading

Background:
I have been writing a crypto trading bot for fun and profit.
So far, it connects to an exchange and gets streaming price data.
I am using this price to create a technical indicator (MACD).
Generally for MACD, it is recommended to use closing prices for 26, 12 and 9 days.
However, for my trading strategy, I plan to use data for 26, 12 and 9 minutes.
Question:
I am getting multiple (say 10) price ticks in a minute.
Do I simply average them and round the time up to the next minute (so they all fall in the same minute bucket)? Or is there a better way to handle this?
Many Thanks!
This is how I handled it. Streaming data comes in at a sub-second period. The code checks for a new low and high during the streaming period and builds the candle. Probably ugly since I'm not a trained developer, but it works.
Adjust "...round('20s')" and "if dur > 15:" to whatever candle period you want.
def on_message(self, msg):
    df = pd.json_normalize(msg, record_prefix=msg['type'])
    df['date'] = df['time']
    df['price'] = df['price'].astype(float)
    df['low'] = df['low'].astype(float)
    for i in range(0, len(self.df)):
        if i == (len(self.df) - 1):
            self.rounded_time = self.df['date'][i]
            self.rounded_time = pd.to_datetime(self.rounded_time).round('20s')
            self.lhigh = self.df['price'][i]
            self.lhighcandle = self.candle['high'][i]
            self.llow = self.df['price'][i]
            self.lowcandle = self.candle['low'][i]
            self.close = self.df['price'][i]
    if self.lhigh > self.lhighcandle:
        nhigh = self.lhigh
    else:
        nhigh = self.lhighcandle
    if self.llow < self.lowcandle:
        nlow = self.llow
    else:
        nlow = self.lowcandle
    newdata = pd.DataFrame.from_dict({
        'date': self.df['date'],
        'tkr': tkr,
        'open': self.df.price.iloc[0],
        'high': nhigh,
        'low': nlow,
        'close': self.close,
        'vol': self.df['last_size']})
    self.candle = self.candle.append(newdata, ignore_index=True).fillna(0)
    if ctime > self.rounded_time:
        closeit = True
        self.en = time.time()
    if closeit:
        dur = (self.en - self.st)
        if dur > 15:
            self.st = time.time()
            out = self.candle[-1:]
            out.to_sql(tkr, cnx, if_exists='append')
            dat = ['tkr', 0, 0, 100000, 0, 0]
            self.candle = pd.DataFrame([dat], columns=['tkr', 'open', 'high', 'low', 'close', 'vol'])
As far as I know, most or all technical indicator formulas rely on same-sized bars to produce accurate and meaningful results. You'll have to do some data transformation. Here's an example of an aggregation technique that uses quantization to get all your bars into uniform sizes. It will convert small bar sizes to larger bar sizes; e.g. second to minute bars.
// C#, see link above for more info
quoteHistory
    .OrderBy(x => x.Date)
    .GroupBy(x => x.Date.RoundDown(newPeriod))
    .Select(x => new Quote
    {
        Date = x.Key,
        Open = x.First().Open,
        High = x.Max(t => t.High),
        Low = x.Min(t => t.Low),
        Close = x.Last().Close,
        Volume = x.Sum(t => t.Volume)
    });
See Stock.Indicators for .NET for indicators and related tools.
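For a Python equivalent of the same quantization idea, pandas can resample raw ticks into fixed-size bars directly; a minimal sketch, assuming the ticks sit in a DataFrame with a DatetimeIndex and hypothetical 'price' and 'size' columns:
import pandas as pd

def to_minute_bars(ticks: pd.DataFrame) -> pd.DataFrame:
    # open/high/low/close of the tick prices in each 1-minute bucket
    bars = ticks['price'].resample('1min').ohlc()
    # total traded size per bucket as the bar volume
    bars['volume'] = ticks['size'].resample('1min').sum()
    # drop buckets that contained no ticks
    return bars.dropna(subset=['open'])
Once the ticks are in uniform 1-minute bars, the 26/12/9-minute MACD can be computed on the bar close column just like the daily version.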

AJAX call taking a long time (2-4 secs)

I'm really stumped on an issue. On my website, whenever you pan the map, an AJAX call is fired, doing a query in the database. The problem is, the AJAX call takes somewhere between 2-10 seconds, which is unacceptable.
Link to website
There are about 500k records in my database. I notice that the more records I add, the slower it gets. That makes sense, right? But why is it exceptionally slow and so inconsistent?
I am using Digital Ocean. When I check my control panel, the CPU/RAM/disk are all operating at very low levels.
The AJAX call code:
def filter_closed_listings(request):
    zoom = int(request.GET.get('zoomLevel'))
    minLat = request.GET.get('minLat')
    maxLat = request.GET.get('maxLat')
    minLng = request.GET.get('minLng')
    maxLng = request.GET.get('maxLng')
    sold = request.GET.get('sold')
    property_type = request.GET.get('pType')
    building_style = request.GET.get('bType')
    showActiveListings = request.GET.get('showActiveListings')
    showSoldListings = request.GET.get('showSoldListings')
    showEndedListings = request.GET.get('showEndedListings')
    bed = request.GET.get('bed')
    bath = request.GET.get('bath')
    low_price = request.GET.get('low')
    high_price = request.GET.get('high')
    includePandL = request.GET.get('includePandL')
    transaction_type = request.GET.get('transactionType')
    ended_time_difference = datetime.date.today() + datetime.timedelta(-int(sold))  # Date check
    print(ended_time_difference)
    # Initial filter with map bounds and date check
    data = Listing.objects.filter(la__gt=minLat).filter(la__lt=maxLat).filter(lo__gt=minLng).filter(lo__lt=maxLng)
    data = data.filter(transaction_type=transaction_type)
    # 0 is 'Any'. Therefore a filter is done if selection is not 'Any'.
    if not property_type == '0':
        if property_type == '1':
            data = data.filter(Q(property_type='Condo Apt') | Q(property_type='Comm Element Condo'))
        elif property_type == '2':
            data = data.filter(property_type='Detached')
        elif property_type == '3':
            data = data.filter(property_type='Semi-Detached')
        elif property_type == '4':
            data = data.filter(Q(property_type='Att/Row/Twnhouse') | Q(property_type='Condo Townhouse'))
        else:
            data = data.exclude(Q(property_type='Condo Apt') | Q(property_type='Comm Element Condo') | Q(property_type='Detached') | Q(property_type='Semi-Detached') | Q(property_type='Att/Row/Twnhouse') | Q(property_type='Condo Townhouse'))
    if showActiveListings == 'n':
        data = data.filter(Q(status='Ter') | Q(status='Exp') | Q(status='Sld') | Q(status='Lsd'))
    if showSoldListings == 'n':
        if transaction_type == 'Sale':
            data = data.exclude(status='Sld')
        else:
            data = data.exclude(status='Lsd')
    else:
        if transaction_type == 'Sale':
            data = data.exclude(Q(status='Sld') & Q(sold_date__lt=ended_time_difference))
        else:
            data = data.exclude(Q(status='Lsd') & Q(sold_date__lt=ended_time_difference))
    if showEndedListings == 'n':
        data = data.exclude(Q(status='Ter') | Q(status='Exp'))
    else:
        data = data.exclude(Q(status='Exp') & Q(expiry_date__lt=ended_time_difference)).exclude(Q(status='Ter') & Q(terminated_date__lt=ended_time_difference))
    if includePandL == 'false':
        data = data.exclude(Q(property_type='Parking Space') | Q(property_type='Locker'))
    # Bedrooms check
    if not bed == 'undefined':
        bed = bed.split(',')
        queries = [Q(bedrooms_ag=b) for b in bed]
        query = queries.pop()
        if bed.pop() == '6':
            query = Q(bedrooms_ag__gte=5)
        for i in queries:
            query |= i
        #print(query)
        data = data.filter(query)
    # Bathrooms check
    if not bath == 'undefined':
        bath = bath.split(',')
        queries = [Q(washrooms=b) for b in bath]
        query = queries.pop()
        if bath.pop() == '6':
            query = Q(washrooms__gte=5)
        for i in queries:
            query |= i
        #print(query)
        data = data.filter(query)
    # Filters low price and high price
    if high_price == '0' and low_price == '0':
        pass
    elif high_price == '0':
        data = data.filter(Q(Q(list_price__gte=low_price) & Q(list_price__lte=999999999) & Q(sold_price__isnull=True)) | Q(Q(sold_price__gte=low_price) & Q(sold_price__lte=999999999) & Q(sold_price__isnull=False)))  # Nested Q object. Use list price if sold price is null.
    else:
        data = data.filter(Q(Q(list_price__gte=low_price) & Q(list_price__lte=high_price) & Q(sold_price__isnull=True)) | Q(Q(sold_price__gte=low_price) & Q(sold_price__lte=high_price) & Q(sold_price__isnull=False)))
    if data.count() > 500:
        return JsonResponse({'over500': True})
    data = data.values('id', 'la', 'lo')
    # Determines lat/lng precision based on zoom level
    for i in data:
        if zoom >= 13:
            i['lo'] = round(i['lo'], 4)
            i['la'] = round(i['la'], 4)
        elif zoom > 9 and zoom < 13:
            i['lo'] = round(i['lo'], 2)
            i['la'] = round(i['la'], 2)
        else:
            i['lo'] = round(i['lo'], 1)
            i['la'] = round(i['la'], 1)
    return JsonResponse({'results': list(data)})  # This uses list() which is bad design. But only 3 fields are passed and highly unlikely to change so this can stay like this.

Python: Pool creates multiple processes, but the results are never produced

All of the functions are placed in a class, including the function that creates the processes and the function that does the actual work; another file calls the functions of this class.
from multiprocessing import Pool

def initData(self, type):
    # create six processes to deal with the data
    if type == 'train':
        data = pd.read_csv('./data/train_merged_8.csv')
    elif type == 'test':
        data = pd.read_csv('./data/test_merged_2.csv')
    modelvec = allWord2Vec('no').getModel()
    modelvec_all = allWord2Vec('all').getModel()
    modelvec_stop = allWord2Vec('stop').getModel()
    p = Pool(6)
    count = 0
    for i in data.index:
        count += 1
        p.apply_async(self.valueCal, args=(i, data, modelvec, modelvec_all, modelvec_stop))
        if count % 1000 == 0:
            print(str(count // 100) + 'h rows of data has been dealed')
    p.close()
    p.join

def valueCal(self, i, data, modelvec, modelvec_all, modelvec_stop):
    # the function run in each process
    list_con = []
    q1 = str(data.get_value(i, 'question1')).split()
    q2 = str(data.get_value(i, 'question2')).split()
    f1 = self.getF1_union(q1, q2)
    f2 = self.getF2_inter(q1, q2)
    f3 = self.getF3_sum(q1, q2)
    f4_q1 = len(q1)
    f4_q2 = len(q2)
    f4_rate = f4_q1 / f4_q2
    q1 = [','.join(str(ve)) for ve in q1]
    q2 = [','.join(str(ve)) for ve in q2]
    list_con.append('|'.join(q1))
    list_con.append('|'.join(q2))
    list_con.append(f1)
    list_con.append(f2)
    list_con.append(f3)
    list_con.append(f4_q1)
    list_con.append(f4_q2)
    list_con.append(f4_rate)
    f = open('./data/test.txt', 'a')
    f.write('\t'.join(list_con) + '\n')
    f.close()
The output appears very quickly, like below, but I haven't seen the file being created. When I check the task manager, there are indeed six processes created, and they consume a lot of CPU. Yet when the program finishes, the file still hasn't been created.
How can I solve this problem?
10h rows of data have been dealed
20h rows of data have been dealed
30h rows of data have been dealed
40h rows of data have been dealed
