ONNX Runtime Inference | session.run() multiprocessing

ONNX Runtime Inference | session.run() multiprocessing - parallel-processing

Goal: run Inference in parallel on multiple CPU cores
I'm experimenting with Inference using simple_onnxruntime_inference.ipynb.
Individually:
outputs = session.run([output_name], {input_name: x})
Many:
outputs = session.run(["output1", "output2"], {"input1": indata1, "input2": indata2})
Sequentially:
%%time
outputs = [session.run([output_name], {input_name: inputs[i]})[0] for i in range(test_data_num)]
This Multiprocessing tutorial offers many approaches for parallelising any tasks.
However, I want to know which approach would be best for session.run(), with or without outputs being passed.
How do I run inference on all inputs, and collect all outputs, in parallel?
Code:
import onnxruntime
import multiprocessing as mp

# Load the model once; the session is shared by every run() call below.
session = onnxruntime.InferenceSession('bert.opt.quant.onnx')
i = 0
# First Input
input_name = session.get_inputs()[i].name
print("Input Name :", input_name)
# First Output
output_name = session.get_outputs()[i].name
print("Output Name :", output_name)
pool = mp.Pool(mp.cpu_count())
# NOTE(review): `inputs` and `test_data_num` are not defined in this snippet;
# presumably they come from an earlier notebook cell.
# PARALLELISE THIS LINE
outputs = [session.run([], {input_name: inputs[i]})[0] for i in range(test_data_num)]
# outputs = pool.starmap(func, zip(iter_1, iter_2))
pool.close()
# The inference results live in `outputs`; `results` was never defined.
print(outputs)
Update: this solution suggests using starmap() and zip() in order to pass a function name and 2 separate iterables.
Replacing line with this:
outputs = pool.starmap(session.run, zip([output_name], [ {input_name: inputs[i]}[0] for i in range(test_data_num) ]))
Traceback:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-45-0aab302a55eb> in <module>
25 #%%time
26 #outputs = [session.run([output_name], {input_name: inputs[i]})[0] for i in range(test_data_num)]
---> 27 outputs = pool.starmap(session.run, zip([output_name], [ {input_name: inputs[i]}[0] for i in range(test_data_num) ]))
28
29 pool.close()
<ipython-input-45-0aab302a55eb> in <listcomp>(.0)
25 #%%time
26 #outputs = [session.run([output_name], {input_name: inputs[i]})[0] for i in range(test_data_num)]
---> 27 outputs = pool.starmap(session.run, zip([output_name], [ {input_name: inputs[i]}[0] for i in range(test_data_num) ]))
28
29 pool.close()
KeyError: 0

def run_inference(i):
    """Run the model on the i-th sample and return its first output array.

    Relies on the module-level ``session``, ``input_name`` and ``inputs``.
    """
    output_name = session.get_outputs()[0].name
    # session.run returns a list of outputs; [0] unwraps the single array.
    return session.run([output_name], {input_name: inputs[i]})[0]


# pool.map accepts any iterable, so the index range is passed directly
# instead of being copied into a throwaway list first.
outputs = pool.map(run_inference, range(test_data_num))
Anyone feel free to critique

Related

How to test a model before fine-tuning in Pytorch Lightning?

Doing things on Google Colab.
transformers: 4.10.2
pytorch-lightning: 1.2.7
import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl
dataset_for_loader = [
{'data':torch.tensor([0,1]), 'labels':torch.tensor(0)},
{'data':torch.tensor([2,3]), 'labels':torch.tensor(1)},
{'data':torch.tensor([4,5]), 'labels':torch.tensor(2)},
{'data':torch.tensor([6,7]), 'labels':torch.tensor(3)},
]
loader = DataLoader(dataset_for_loader, batch_size=2)
for idx, batch in enumerate(loader):
print(f'# batch {idx}')
print(batch)
category_list = [
'dokujo-tsushin',
'it-life-hack',
'kaden-channel',
'livedoor-homme',
'movie-enter',
'peachy',
'smax',
'sports-watch',
'topic-news'
]
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
max_length = 128
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    # The ./text directory holds many articles, grouped by category.
    # Each file is plain text whose content begins on the fourth line.
    for path in glob.glob(f'./text/{category}/{category}*'):
        # Use a context manager so every file handle is closed promptly.
        with open(path) as article:
            lines = article.read().splitlines()
        text = '\n'.join(lines[3:])
        encoding = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        # The category index doubles as the class label.
        encoding['labels'] = label
        encoding = {k: torch.tensor(v) for k, v in encoding.items()}
        dataset_for_loader.append(encoding)
# Constant "random" source so that the shuffle below is reproducible.
SEED=lambda:0.0
# random.shuffle(dataset_for_loader) # shuffle randomly
# NOTE(review): the two-argument form of random.shuffle is deprecated since
# Python 3.9 and removed in 3.11 — confirm the target Python version.
random.shuffle(dataset_for_loader,SEED)
# Split: first 60% for training, next 20% for validation, remainder for test.
n = len(dataset_for_loader)
n_train = int(0.6*n)
n_val = int(0.2*n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_train+n_val]
dataset_test = dataset_for_loader[n_train+n_val:]
dataloader_train = DataLoader(
dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)
class BertForSequenceClassification_pl(pl.LightningModule):
    """Lightning module wrapping ``BertForSequenceClassification``."""

    def __init__(self, model_name, num_labels, lr):
        """Load the pretrained classifier and record the hyperparameters."""
        super().__init__()
        # Makes the constructor arguments available via self.hparams
        # (configure_optimizers reads self.hparams.lr).
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        """Log and return the loss of one training batch."""
        train_loss = self.bert_sc(**batch).loss
        self.log('train_loss', train_loss)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Log the loss of one validation batch."""
        self.log('val_loss', self.bert_sc(**batch).loss)

    def test_step(self, batch, batch_idx):
        """Log the accuracy of one test batch."""
        # Labels are removed first so the model is called without them.
        targets = batch.pop('labels')
        predictions = self.bert_sc(**batch).logits.argmax(-1)
        hits = (predictions == targets).sum().item()
        self.log('accuracy', hits / targets.size(0))

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
# Keep only the single checkpoint (weights only) with the lowest 'val_loss',
# stored under model/.
checkpoint = pl.callbacks.ModelCheckpoint(
monitor='val_loss',
mode='min',
save_top_k=1,
save_weights_only=True,
dirpath='model/',
)
# Train on one GPU for at most 10 epochs, using the checkpoint callback above.
trainer = pl.Trainer(
gpus=1,
max_epochs=10,
callbacks = [checkpoint]
)
# Nine output labels; learning rate 1e-5.
model = BertForSequenceClassification_pl(
MODEL_NAME, num_labels=9, lr=1e-5
)
### (a) ###
# I think this is where I am doing fine-tuning
trainer.fit(model, dataloader_train, dataloader_val)
# this is to score after fine-tuning
# `test` is indexed as a list of result dicts; the first holds 'accuracy'.
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
But I am not really sure how to do a test before fine-tuning, in order to compare two models before and after fine-tuning, in order to show how effective fine-tuning is.
Inserting the following two lines to ### (a) ###:
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
I got this result:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-c8b2c67f2d5c> in <module>()
9
10 # 6-19
---> 11 test = trainer.test(test_dataloaders=dataloader_test)
12 print(f'Accuracy: {test[0]["accuracy"]:.2f}')
13
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in test(self, model, test_dataloaders, ckpt_path, verbose, datamodule)
896 self.verbose_test = verbose
897
--> 898 self._set_running_stage(RunningStage.TESTING, model or self.lightning_module)
899
900 # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _set_running_stage(self, stage, model_ref)
563 the trainer and the model
564 """
--> 565 model_ref.running_stage = stage
566 self._running_stage = stage
567
AttributeError: 'NoneType' object has no attribute 'running_stage'
I noticed that Trainer.fit() can take None as arguments other than model, so I tried this:
trainer.fit(model)
test=trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
The result:
MisconfigurationException: No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined.
Thanks.

The Trainer needs to call its .fit() in order to set up a lot of things and then only you can do .test() or other methods.
You are right about putting a .fit() just before .test(), but the fit call needs to be a valid one: you have to feed a dataloader/datamodule to it. Since you don't want to do any training/validation in this fit call, just pass limit_[train/val]_batches=0 when constructing the Trainer.
trainer = Trainer(gpus=..., ..., limit_train_batches=0, limit_val_batches=0)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # without fine-tuning
The fit call here will just set things up for you and skip training/validation, and then the testing follows. Next time, run the same code but without the limit_[train/val]_batches arguments; this will do the fine-tuning for you:
trainer = Trainer(gpus=..., ...)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # with fine-tuning
Clarifying a bit about .fit() taking None for all but model: it's not quite true - you must provide either a DataLoader or a DataModule.

How to build an empirical codon substitution matrix from a multiple sequence alignment

I have been trying to build an empirical codon substitution matrix given a multiple sequence alignment in fasta format using Biopython.
It appears to be relatively straightforward for single nucleotide substitution matrices using the AlignInfo module when the aligned sequences have the same length. Here is what I managed to do using python2.7:
#!/usr/bin/env python
import os
import argparse
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio import SubsMat
import sys
version = "0.0.1 (23.04.20)"
name = "Aln2SubMatrix.py"
parser=argparse.ArgumentParser(description="Outputs a codon substitution matrix given a multi-alignment in FastaFormat. Will raise error if alignments contain dots (\".\"), so replace those with dashes (\"-\") beforehand (e.g. using sed)")
parser.add_argument('-i','--input', action = "store", dest = "input", required = True, help = "(aligned) input fasta")
parser.add_argument('-o','--output', action = "store", dest = "output", help = "Output filename (default = <Input-file>.codonSubmatrix")
args=parser.parse_args()
if not args.output:
args.output = args.input + ".codonSubmatrix" #if no outputname was specified set outputname based on inputname
def main():
    """Write the replacement matrix of the input alignment to the output file.

    Reads the FASTA alignment at ``args.input``, tallies replacements with
    ``AlignInfo.SummaryInfo.replacement_dictionary`` and writes the resulting
    ``SubsMat.SeqMat`` to ``args.output``.
    """
    # ``with`` closes both files even if reading the alignment fails.
    with open(args.input, "r") as infile, open(args.output, "w") as outfile:
        align = AlignIO.read(infile, "fasta")
        summary_align = AlignInfo.SummaryInfo(align)
        replace_info = summary_align.replacement_dictionary()
        mat = SubsMat.SeqMat(replace_info)
        # Same text as ``print >> outfile, mat``, but also valid on Python 3.
        outfile.write(str(mat) + "\n")
    sys.stderr.write("\nfinished\n")
main()
Using a multiple sequence alignment file in fasta format with sequences of the same length (aln.fa), the output is a half-matrix corresponding to the number of nucleotide substitutions observed in the alignment (note that gaps (-) are allowed):
python Aln2SubMatrix.py -i aln.fa
- 0
a 860 232
c 596 75 129
g 571 186 75 173
t 892 58 146 59 141
- a c g t
What I am aiming to do is to compute similar empirical substitution matrix but for all nucleotide triplets (codons) present in a multiple sequence alignment.
I have tried to tweak the _pair_replacement function of the AlignInfo module in order to accept nucleotide triplets by changing:
line 305 to 308
for residue_num in range(len(seq1)):
residue1 = seq1[residue_num]
try:
residue2 = seq2[residue_num]
to
for residue_num in range(0, len(seq1), 3):
residue1 = seq1[residue_num:residue_num+3]
try:
residue2 = seq2[residue_num:residue_num+3]
At this stage it can retrieve the codons from the alignment but complains about the alphabet (the module only accepts single character alphabet?).
Note that
(i) I would like to get a substitution matrix that accounts for the three possible reading frames
Any help is highly appreciated.

RuntimeError: module must have its parameters and buffers on device cuda:1 (device_ids[0]) but found one of them on device: cuda:2

I have 4 GPUs (0,1,2,3) and I want to run one Jupyter notebook on GPU 2 and another one on GPU 0. Thus, after executing,
export CUDA_VISIBLE_DEVICES=0,1,2,3
for the GPU 2 notebook I do,
device = torch.device( f'cuda:{2}' if torch.cuda.is_available() else 'cpu')
device, torch.cuda.device_count(), torch.cuda.is_available(), torch.cuda.current_device(), torch.cuda.get_device_properties(1)
and after creating a new model or loading one,
model = nn.DataParallel( model, device_ids = [ 0, 1, 2, 3])
model = model.to( device)
Then, when I start training the model, I get,
RuntimeError Traceback (most recent call last)
<ipython-input-18-849ffcb53e16> in <module>
46 with torch.set_grad_enabled( phase == 'train'):
47 # [N, Nclass, H, W]
---> 48 prediction = model(X)
49 # print( prediction.shape, y.shape)
50 loss_matrix = criterion( prediction, y)
~/.local/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
~/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
144 raise RuntimeError("module must have its parameters and buffers "
145 "on device {} (device_ids[0]) but found one of "
--> 146 "them on device: {}".format(self.src_device_obj, t.device))
147
148 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cuda:2

DataParallel requires every input tensor be provided on the first device in its device_ids list.
It basically uses that device as a staging area before scattering to the other GPUs and it's the device where final outputs are gathered before returning from forward. If you want device 2 to be the primary device then you just need to put it at the front of the list as follows
model = nn.DataParallel(model, device_ids = [2, 0, 1, 3])
model.to(f'cuda:{model.device_ids[0]}')
After which all tensors provided to model should be on the first device as well.
x = ... # input tensor
x = x.to(f'cuda:{model.device_ids[0]}')
y = model(x)

This error happens in PyTorch when the model and the data are not both on the same CUDA device.
Try code like this to move both the model and the data to CUDA:
# Modules and tensors are moved with .to(); there is no .toDevice() method.
model = model.to('cuda')
images = images.to('cuda')

For me even the following works:
# Fall back to the CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    # Wrap the network only when several GPUs are visible.
    print("Let's use", gpu_count, "GPUs!")
    network = nn.DataParallel(network)
network.to(device)
tnsr = tnsr.to(device)

How to turn off Auto Mixed Precision in validation time?

I try to run the MAC network (https://github.com/stanfordnlp/mac-network/tree/gqa) with Auto Mixed Precision.
def addOptimizerOp(self):
    """Create the Adam optimizer wrapped by the mixed-precision graph rewrite.

    Also creates the ``globalStep`` variable and, when ``config.subsetOpt`` is
    set, a second Adam optimizer stored as ``self.subsetOptimizer``.

    Returns:
        The optimizer returned by ``enable_mixed_precision_graph_rewrite``.
    """
    with tf.variable_scope("trainAddOptimizer"):
        # Step counter starts at 0 and is excluded from training.
        self.globalStep = tf.Variable(
            0, dtype=tf.int32, trainable=False, name="globalStep"
        )
        adam = tf.train.AdamOptimizer(learning_rate=self.lr)
        wrapped = tf.train.experimental.enable_mixed_precision_graph_rewrite(adam)
        if config.subsetOpt:
            # Separate optimizer with a scaled learning rate.
            subset_lr = self.lr * config.subsetOptMult
            self.subsetOptimizer = tf.train.AdamOptimizer(learning_rate=subset_lr)
    return wrapped
In the first epoch, training is OK. However, when the model runs evaluation on the validation set, I get this error.
Training epoch 1...
2019-08-05 14:51:13.625899: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:13.709959: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1723] Converted 1504/6920 nodes to float16 precision using 150 cast(s) to float16 (excluding Const and Variable casts)
2019-08-05 14:51:16.930248: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-08-05 14:51:17.331687: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudnn.so.7
2019-08-05 14:51:29.378905: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:29.380633: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1241] No whitelist ops found, nothing to do
eb 1, 10000,(160010 / 943000), t = 0.12 (0.00+0.11), lr 0.0003, l = 2.8493, a = 0.4250, avL = 2.5323, avA = 0.4188, g = 3.7617, emL = 2.3097, emA = 0.4119; gqaExperiment
Restoring EMA weights
2019-08-05 14:51:31.132804: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:31.136122: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1241] No whitelist ops found, nothing to do
2019-08-05 14:51:32.322369: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:32.341609: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1723] Converted 661/1848 nodes to float16 precision using 38 cast(s) to float16 (excluding Const and Variable casts)
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1356, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1341, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1429, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: TensorArray dtype is float but Op is trying to write dtype half.
[[{{node macModel/tower0/encoder/birnnLayer/bidirectional_rnn/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3}}]]
[[macModel/tower0/MACnetwork/MACCell_3/write/inter2attselfAttention/Softmax/_1661]]
(1) Invalid argument: TensorArray dtype is float but Op is trying to write dtype half.
[[{{node macModel/tower0/encoder/birnnLayer/bidirectional_rnn/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 866, in <module>
main()
File "main.py", line 728, in main
evalRes = runEvaluation(sess, model, data["main"], dataOps, epoch, getPreds = getPreds, prevRes = evalRes)
File "main.py", line 248, in runEvaluation
minLoss = prevRes["train"]["minLoss"] if prevRes else float("inf"))
File "main.py", line 594, in runEpoch
res = model.runBatch(sess, batch, imagesBatch, train, getPreds, getAtt)
File "/content/model.py", line 948, in runBatch
feed_dict = feed)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
I think the errors may come from not turning off the auto mixed precision on evaluation time or using the same session and model for train and evaluation. I tried "tf.train.experimental.disable_mixed_precision_graph_rewrite()" but I do not know how to use it in the right way.
How can I fix it? Thanks all.
def main():
with open(config.configFile(), "a+") as outFile:
json.dump(vars(config), outFile)
# set gpus
if config.gpus != "":
config.gpusNum = len(config.gpus.split(","))
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpus
tf.logging.set_verbosity(tf.logging.ERROR)
# process data
print(bold("Preprocess data..."))
start = time.time()
preprocessor = Preprocesser()
data, embeddings, answerDict, questionDict = preprocessor.preprocessData()
print("took {} seconds".format(bcolored("{:.2f}".format(time.time() - start), "blue")))
nextElement = None
dataOps = None
# build model
print(bold("Building model..."))
start = time.time()
model = MACnet(embeddings, answerDict, questionDict, nextElement)
print("took {} seconds".format(bcolored("{:.2f}".format(time.time() - start), "blue")))
# initializer
init = tf.global_variables_initializer()
# savers
savers = setSavers(model)
saver, emaSaver = savers["saver"], savers["emaSaver"]
# sessionConfig
sessionConfig = setSession()
with tf.Session(config = sessionConfig) as sess:
# ensure no more ops are added after model is built
sess.graph.finalize()
# restore / initialize weights, initialize epoch variable
epoch = loadWeights(sess, saver, init)
trainRes, evalRes = None, None
if config.train:
start0 = time.time()
bestEpoch = epoch
bestRes = None
prevRes = None
# epoch in [restored + 1, epochs]
for epoch in range(config.restoreEpoch + 1, config.epochs + 1):
print(bcolored("Training epoch {}...".format(epoch), "green"))
start = time.time()
# train
# calle = lambda: model.runEpoch(), collectRuntimeStats, writer
trainingData, alterData = chooseTrainingData(data)
trainRes = runEpoch(sess, model, trainingData, dataOps, train = True, epoch = epoch,
saver = saver, alterData = alterData,
maxAcc = trainRes["maxAcc"] if trainRes else 0.0,
minLoss = trainRes["minLoss"] if trainRes else float("inf"),)
# save weights
saver.save(sess, config.weightsFile(epoch))
if config.saveSubset:
subsetSaver.save(sess, config.subsetWeightsFile(epoch))
# load EMA weights
if config.useEMA:
print(bold("Restoring EMA weights"))
emaSaver.restore(sess, config.weightsFile(epoch))
# evaluation
getPreds = config.getPreds or (config.analysisType != "")
evalRes = runEvaluation(sess, model, data["main"], dataOps, epoch, getPreds = getPreds, prevRes = evalRes)
extraEvalRes = runEvaluation(sess, model, data["extra"], dataOps, epoch,
evalTrain = not config.extraVal, getPreds = getPreds)
# restore standard weights
if config.useEMA:
print(bold("Restoring standard weights"))
saver.restore(sess, config.weightsFile(epoch))
print("")
epochTime = time.time() - start
print("took {:.2f} seconds".format(epochTime))
# print results
printDatasetResults(trainRes, evalRes, extraEvalRes)

Dask multiprocessing fails with embarrassingly parallel for loop including call to MongoDB when number of iterations is high enough

I'm trying to run a kind of simulation in Python for loop in parallel using Dask multiprocessing. Parallelization works fine when number of iterations is fairly low but fails when the amount increases. The issue occurs on Win7 (4 cores, 10 Gb RAM), Win10 (8 cores, 8 Gb RAM) and Azure VM running Windows Server 2016 (16 cores, 32 Gb RAM). The slowest one, Win7, can go through most iterations before failing. The issue can be mitigated by adding long enough sleep time at the end of each function included in the process, but the required amount of sleeping results in very low performance, similar to running sequentially.
I hope someone will be able to help me out here. Thanks in advance for comments and answers!
The following simple code contains some phases of the for loop and repeats the error.
import json

import numpy as np  # required by np.random.randint below
import pandas as pd
from pymongo import MongoClient

# Create random DataFrame: 100 rows x 11 integer columns named A..K
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 11)), columns=list('ABCDEFGHIJK'))
# Save to Mongo (database "errordemo", collection "errordemo")
client = MongoClient()
db = client.errordemo
res = db.errordemo.insert_many(json.loads(df.to_json(orient='records')))
db.client.close()
class ToBeRunParallel:
    """Holder for the simulation step that is executed in parallel."""

    def __init__(self):
        pass

    def functionToBeRunParallel(self, i):
        """Read the collection, randomise it and return one sampled row sum.

        ``i`` is the iteration index; it is not used inside the method.
        """
        # Read data from mongo (a fresh client is opened on every call)
        with MongoClient() as mongo:
            records = mongo.errordemo.errordemo.find({}, {'_id': 0})
            frame = pd.DataFrame.from_records(records)
        # Randomize data
        randomised = frame.apply(pd.to_numeric).apply(rand, volatility=0.1)
        # Sum rows
        row_totals = randomised.sum(axis=1)
        # Select randomly one of the resulting values and return
        return row_totals.sample().values[0]
Call the function functionToBeRunParallel either in the console or in Jupyter (both fail). 'errordemo' is a local module containing the class ToBeRunParallel. When running on the Azure VM, the code succeeds with 500 loops and fails with 5,000.
import errordemo
from dask import delayed, compute, multiprocessing
# Determine how many times to loop
rng = range(15000)
# Create instance
err = errordemo.ToBeRunParallel()
# Build one delayed task per iteration
resList = [delayed(err.functionToBeRunParallel)(i) for i in rng]
# Compute in parallel
result = compute(*resList, get=multiprocessing.get)
The error stack in Jupyter is as follows.
---------------------------------------------------------------------------
AutoReconnect Traceback (most recent call last)
<ipython-input-3-9f535dd4c621> in <module>()
----> 1 get_ipython().run_cell_magic('time', '', '# Determine how many times to loop\nrng = range(50000)\n\n# Define empty result lists\nresList = []\n\n# Create instance\nerr = errordemo.ToBeRunParallel()\n\n# Loop in parallel using Dask\nfor i in rng:\n sampleValue = delayed(err.functionToBeRunParallel)(i)\n resList.append(sampleValue)\n \n# Compute in parallel \nresult = compute(*resList, get=dask.multiprocessing.get)')
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2113 magic_arg_s = self.var_expand(line, stack_depth)
2114 with self.builtin_trap:
-> 2115 result = fn(magic_arg_s, cell)
2116 return result
2117
<decorator-gen-60> in time(self, line, cell, local_ns)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\magics\execution.py in time(self, line, cell, local_ns)
1178 else:
1179 st = clock2()
-> 1180 exec(code, glob, local_ns)
1181 end = clock2()
1182 out = None
<timed exec> in <module>()
C:\ProgramData\Anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
200 dsk = collections_to_dsk(variables, optimize_graph, **kwargs)
201 keys = [var._keys() for var in variables]
--> 202 results = get(dsk, keys, **kwargs)
203
204 results_iter = iter(results)
C:\ProgramData\Anaconda3\lib\site-packages\dask\multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, **kwargs)
85 result = get_async(pool.apply_async, len(pool._pool), dsk3, keys,
86 get_id=_process_get_id,
---> 87 dumps=dumps, loads=loads, **kwargs)
88 finally:
89 if cleanup:
C:\ProgramData\Anaconda3\lib\site-packages\dask\async.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, dumps, loads, **kwargs)
498 _execute_task(task, data) # Re-execute locally
499 else:
--> 500 raise(remote_exception(res, tb))
501 state['cache'][key] = res
502 finish_task(dsk, key, state, results, keyorder.get)
AutoReconnect: localhost:27017: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted
Traceback
---------
File "C:\ProgramData\Anaconda3\lib\site-packages\dask\async.py", line 266, in execute_task
result = _execute_task(task, data)
File "C:\ProgramData\Anaconda3\lib\site-packages\dask\async.py", line 247, in _execute_task
return func(*args2)
File "C:\Git_repository\footie\Pipeline\errordemo.py", line 20, in functionToBeRunParallel
dataFromMongo = pd.DataFrame.from_records(db.errordemo.find({}, {'_id': 0}))
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py", line 981, in from_records
first_row = next(data)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\cursor.py", line 1090, in next
if len(self.__data) or self._refresh():
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\cursor.py", line 1012, in _refresh
self.__read_concern))
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\cursor.py", line 850, in __send_message
**kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\mongo_client.py", line 844, in _send_message_with_response
exhaust)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\mongo_client.py", line 855, in _reset_on_error
return func(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\server.py", line 99, in send_message_with_response
with self.get_socket(all_credentials, exhaust) as sock_info:
File "C:\ProgramData\Anaconda3\lib\contextlib.py", line 82, in __enter__
return next(self.gen)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\server.py", line 163, in get_socket
with self.pool.get_socket(all_credentials, checkout) as sock_info:
File "C:\ProgramData\Anaconda3\lib\contextlib.py", line 82, in __enter__
return next(self.gen)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\pool.py", line 582, in get_socket
sock_info = self._get_socket_no_auth()
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\pool.py", line 618, in _get_socket_no_auth
sock_info, from_pool = self.connect(), False
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\pool.py", line 555, in connect
_raise_connection_failure(self.address, error)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\pool.py", line 65, in _raise_connection_failure
raise AutoReconnect(msg)
UPDATE:
Following this post, I created a decorator to catch AutoReconnect exception like shown below. Together with parameters for MongoClient the looping works, but it's still very slow, double the time it should take. (timing on the Azure VM):
500 iterations: 3.74s
50,000 iterations: 12min 12s
def safe_mongocall(call):
    """Decorate ``call`` so that it is retried on ``errors.AutoReconnect``.

    The wrapped callable is attempted up to five times, with a short random
    pause (under 10 ms) after each failed attempt.  If every attempt fails,
    an error message is printed and ``None`` is returned.
    """
    import functools  # local import keeps this snippet self-contained

    @functools.wraps(call)  # keep the wrapped function's name and docstring
    def _safe_mongocall(*args, **kwargs):
        for _ in range(5):
            try:
                return call(*args, **kwargs)
            except errors.AutoReconnect:
                sleep(random.random() / 100)
        print('Error: Failed operation!')
    return _safe_mongocall
#safe_mongocall
def functionToBeRunParallel(self, i):
    """Read the collection, randomise it and return one sampled row sum.

    ``i`` is the iteration index; it is not used inside the method.
    """
    # Read data from mongo; the client connects lazily (connect=False) and
    # drops idle pooled connections after 100 ms.
    with MongoClient(connect=False, maxPoolSize=None, maxIdleTimeMS=100) as mongo:
        records = mongo.errordemo.errordemo.find({}, {'_id': 0})
        frame = pd.DataFrame.from_records(records)
    # Randomize data
    randomised = frame.apply(pd.to_numeric).apply(rand, volatility=0.1)
    # Sum rows
    row_totals = randomised.sum(axis=1)
    # Select randomly one of the resulting values and return
    return row_totals.sample().values[0]

The actual issue is exhaustion of TCP/IP ports, hence the solution is to avoid exhausting them. Following the article by Microsoft, I added the following registry keys and values to HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\Tcpip\Parameters:
MaxUserPort: 65534
TcpTimedWaitDelay: 30

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

ONNX Runtime Inference | session.run() multiprocessing - parallel-processing

def run_inference(i): output_name = session.get_outputs()[0].name return session.run([output_name], {input_name: inputs[i]})[0] # [0] bc array in list outputs = pool.map(run_inference, [i for i in range(test_data_num)]) Anyone feel free to critique

Related

How to test a model before fine-tuning in Pytorch Lightning?

How to build an empirical codon substitution matrix from a multiple sequence alignment

RuntimeError: module must have its parameters and buffers on device cuda:1 (device_ids[0]) but found one of them on device: cuda:2

How to turn off Auto Mixed Precision in validation time?

Dask multiprocessing fails with embarrassingly parallel for loop including call to MongoDB when number of iterations is high enough

Categories

Resources