Problem with debugging TorchScript RuntimeError: Dimension out of range - debugging

I've created a simple NLP model in PyTorch, trained it and it works as expected in Python.
Then I've exported it to the TorchScript with jit.trace. And loading it back into Python works fine and model works as expected.
But when I try to execute it in rust with tch-rs (Rust bindings for the C++ api of PyTorch), the following error occurs and I have no idea how to debug it:
Error: Torch("The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
File \"code/__torch__/___torch_mangle_469.py\", line 17, in forward
dropout = self.dropout
bert = self.bert
_0 = (dropout).forward((bert).forward(input_id, mask, ), )
~~~~~~~~~~~~~ <--- HERE
_1 = (relu).forward((linear).forward(_0, ), )
return _1
File \"code/__torch__/transformers/models/bert/modeling_bert/___torch_mangle_465.py\", line 19, in forward
batch_size = ops.prim.NumToTensor(torch.size(input_id, 0))
_0 = int(batch_size)
seq_length = ops.prim.NumToTensor(torch.size(input_id, 1))
~~~~~~~~~~ <--- HERE
_1 = int(seq_length)
_2 = int(seq_length)
Traceback of TorchScript, original code (most recent call last):
/user/.conda/envs/tch/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py(954): forward
/user/.conda/envs/tch/lib/python3.10/site-packages/torch/nn/modules/module.py(1176): _slow_forward
/user/.conda/envs/tch/lib/python3.10/site-packages/torch/nn/modules/module.py(1192): _call_impl
/var/folders/zs/vmmy3w4n0ns1c0kj91skmfnm0000gn/T/ipykernel_10987/868892765.py(17): forward
/user/.conda/envs/tch/lib/python3.10/site-packages/torch/nn/modules/module.py(1176): _slow_forward
/user/.conda/envs/tch/lib/python3.10/site-packages/torch/nn/modules/module.py(1192): _call_impl
/user/.conda/envs/tch/lib/python3.10/site-packages/torch/jit/_trace.py(957): trace_module
/user/.conda/envs/tch/lib/python3.10/site-packages/torch/jit/_trace.py(753): trace
/var/folders/zs/vmmy3w4n0ns1c0kj91skmfnm0000gn/T/ipykernel_10987/749605851.py(1): <module>
/user/.conda/envs/tch/lib/python3.10/site-packages/IPython/core/interactiveshell.py(3430): run_code
/user/.conda/envs/tch/lib/python3.10/site-packages/IPython/core/interactiveshell.py(3341): run_ast_nodes
/user/.conda/envs/tch/lib/python3.10/site-packages/IPython/core/interactiveshell.py(3168): run_cell_async
/user/.conda/envs/tch/lib/python3.10/site-packages/IPython/core/async_helpers.py(129): _pseudo_sync_runner
/user/.conda/envs/tch/lib/python3.10/site-packages/IPython/core/interactiveshell.py(2970): _run_cell
/user/.conda/envs/tch/lib/python3.10/site-packages/IPython/core/interactiveshell.py(2941): run_cell
/user/.conda/envs/tch/lib/python3.10/site-packages/ipykernel/zmqshell.py(531): run_cell
/user/.conda/envs/tch/lib/python3.10/site-packages/ipykernel/ipkernel.py(380): do_execute
/user/.conda/envs/tch/lib/python3.10/site-packages/ipykernel/kernelbase.py(700): execute_request
/user/.conda/envs/tch/lib/python3.10/site-packages/ipykernel/kernelbase.py(383): dispatch_shell
/user/.conda/envs/tch/lib/python3.10/site-packages/ipykernel/kernelbase.py(496): process_one
/user/.conda/envs/tch/lib/python3.10/site-packages/ipykernel/kernelbase.py(510): dispatch_queue
/user/.conda/envs/tch/lib/python3.10/asyncio/events.py(80): _run
/user/.conda/envs/tch/lib/python3.10/asyncio/base_events.py(1868): _run_once
/user/.conda/envs/tch/lib/python3.10/asyncio/base_events.py(597): run_forever
/user/.conda/envs/tch/lib/python3.10/site-packages/tornado/platform/asyncio.py(212): start
/user/.conda/envs/tch/lib/python3.10/site-packages/ipykernel/kernelapp.py(701): start
/user/.conda/envs/tch/lib/python3.10/site-packages/traitlets/config/application.py(990): launch_instance
/user/.conda/envs/tch/lib/python3.10/site-packages/ipykernel_launcher.py(12): <module>
/user/.conda/envs/tch/lib/python3.10/runpy.py(75): _run_code
/user/.conda/envs/tch/lib/python3.10/runpy.py(191): _run_module_as_main
RuntimeError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
")
And here is the simple model that I try to execute:
from torch import nn
from transformers import BertModel
class BertClassifier(nn.Module):
def __init__(self, dropout=0.5):
super(BertClassifier, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-cased')
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(768, 5)
self.relu = nn.ReLU()
def forward(self, input_id, mask):
_, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
dropout_output = self.dropout(pooled_output)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
return final_layer
I'm new to ML and I can't find any docs on how to debug TorchScript runtime errors so I appreciate any help in solving this problem

Related

HF pre-trained model for inference on multiple GPUs

I am trying to run inference from a pre-trained HF model on 4 GPUs. Do I need PyTorch Dataloader, or I can use any python function? I used pandas chunk split. My idea is to perform data parallelisation, split the data among 4 GPUs and send the pipeline HF_PIPELINE over each GPUs. Below is my my script. Could you suggest what I am doing wrong?
chunksize= 100
filename = 'dataset.csv'
def run_inference(world_size):
dist.init_process_group("gloo", rank=rank, world_size=world_size)
df_chunk_list = []
with pd.read_csv(filename, chunksize=chunksize) as reader:
for chunk in reader:
print(chunk.head(2))
chunk['prediction'] = list(map(HF_PIPELINE, chunk['text']))
df_chunk_list.append(chunk['prediction'])
print(df_chunk_list)
def main():
world_size = torch.cuda.device_count()
mp.spawn(run_inference,
args=(world_size),
nprocs=world_size,
join=True)
if __name__=="__main__":
main())
However I am getting this error
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 115, in _main
self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'run_inference' on <module '__main__' (built-in)>
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 115, in _main
self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'run_inference' on <module '__main__' (built-in)>

Repost: Discord.py & Sympy(mostly discord.py I think)

I'm trying to make a math discord bot.
I am pretty sure that my Sympy code is correct and it is just discord.py being funky.
Code:
#client.command()
async def solve(ctx, equation):
x, y, z, t = symbols('x y z t')
k, m, n = symbols('k m n', integer=True)
f, g, h = symbols('f g h', cls=Function)
equation = equation.split("=")
eqn = Eq(parse_expr(equation[0]), parse_expr(equation[1]))
await ctx.send(f"```{solve(eqn)}```")
Please assume that I have all imports necessary.
I am getting this error:
Ignoring exception in command solve:
Traceback (most recent call last):
File "C:\Users\justi_zts5a0w\PycharmProjects\discord.py\venv\lib\site-packages\discord\ext\commands\core.py", line 85, in wrapped
ret = await coro(*args, **kwargs)
File "math.py", line 42, in solve
eqn = Eq(parse_expr(equation[0]), parse_expr(equation[1]))
File "C:\Users\justi_zts5a0w\PycharmProjects\discord.py\venv\lib\site-packages\sympy\parsing\sympy_parser.py", line 1008, in parse_expr
return eval_expr(code, local_dict, global_dict)
File "C:\Users\justi_zts5a0w\PycharmProjects\discord.py\venv\lib\site-packages\sympy\parsing\sympy_parser.py", line 902, in eval_expr
expr = eval(
File "<string>", line 1
Integer (2 )+Integer (3 )Symbol ('x' )
^
SyntaxError: invalid syntax
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\justi_zts5a0w\PycharmProjects\discord.py\venv\lib\site-packages\discord\ext\commands\bot.py", line 903, in invoke
await ctx.command.invoke(ctx)
File "C:\Users\justi_zts5a0w\PycharmProjects\discord.py\venv\lib\site-packages\discord\ext\commands\core.py", line 859, in invoke
await injected(*ctx.args, **ctx.kwargs)
File "C:\Users\justi_zts5a0w\PycharmProjects\discord.py\venv\lib\site-packages\discord\ext\commands\core.py", line 94, in wrapped
raise CommandInvokeError(exc) from exc
discord.ext.commands.errors.CommandInvokeError: Command raised an exception: SyntaxError: invalid syntax (<string>, line 1)
Judging from the error you are getting, the problem is not from discord.py, it comes from sympy. And judging at the output of the code it has to do with the way you are introducing the parameter equation.
I did the following calls in SymPy Live and it outputted well:
>>> from sympy.parsing.sympy_parser import parse_expr
>>> equation = 'x+1 = 0'
>>> equation = equation.split('=')
>>> eqn = Eq(parse_expr(equation[0]), parse_expr(equation[1]))
>>> print(eqn)
Eq(x + 1, 0)
In your code, however, it is telling you that the parse_expr cannot evaluate the expression correctly due to the syntax being introduced. Make sure your input equation has a valid format.

FileNotFoundError: No such file: 'someones_epi.nii.gzip'

I am trying to load an MRI, I keep getting the following error:
Traceback (most recent call last):
File "F:/Study/Projects/BTSaG/Programs/t3.py", line 2, in <module> epi_img = nib.load('someones_epi.nii.gzip')
File "C:\Users\AnkitaShinde\AppData\Local\Programs\Python\Python35-32\lib\site-packages\nibabel\loadsave.py", line 38, in load raise FileNotFoundError("No such file: '%s'" % filename)
FileNotFoundError: No such file: 'someones_epi.nii.gzip'
The code is used is as follows:
import nibabel as nib
epi_img = nib.load('someones_epi.nii.gzip')
epi_img_data = epi_img.get_data()
epi_img_data.shape(53, 61, 33)
import matplotlib.pyplot as plt
def show_slices(slices):
""" Function to display row of image slices """
fig, axes = plt.subplots(1, len(slices))
for i, slice in enumerate(slices):
axes[i].imshow(slice.T, cmap="gray", origin="lower")
slice_0 = epi_img_data[26, :, :]
slice_1 = epi_img_data[:, 30, :]
slice_2 = epi_img_data[:, :, 16]
show_slices([slice_0, slice_1, slice_2])
plt.suptitle("Center slices for EPI image")
I have also updated the loadsave.py file in nibabel but it didn't work. Please help.
Edit:
The earlier error was resolved. Now another error has been encountered.
Traceback (most recent call last):File "F:\Study\Projects\BTSaG\Programs\t3.py", line 2, in <module> epi_img = nib.load('someones_epi.nii.gzip')
File "C:\Users\AnkitaShinde\AppData\Local\Programs\Python\Python35-32\lib\site-packages\nibabel\loadsave.py", line 47, in load filename)
nibabel.filebasedimages.ImageFileError: Cannot work out file type of "someones_epi.nii.gzip"
This is an old question, however I may have the solution for it.
I just figured out that nibabel.save() does not allow me to have dot . or dash - in the folder names. These can exist in filenames however. In your case, the current path is:
C:\Users\AnkitaShinde\AppData\Local\Programs\Python\Python35-32\Lib\site-packages\nibabel\someones_epi.nii.gzip
I would change it to:
C:\Users\AnkitaShinde\AppData\Local\Programs\Python\Python35_32\Lib\site_packages\nibabel\someones_epi.nii.gzip
This is just to give an example. Of course, I don't mean that you actually change the names of these package folders as it might cause other errors.
The actual solution would be to move the file someones_epi.nii.gzip to the user structure, something like:
C:\Users\AnkitaShinde\Desktop\nibabel\someones_epi.nii.gzip

BrokenPipeError: [WinError 109] The pipe has been ended during data extraction

I am new to multiprocessing in python.I am extracting some features from a list of 70,000 URLs. I have them from 2 different files. After the feature extraction process I pass the result to a list and then to a CSV file.
The code runs but then stops with the error.I tried to catch the error but it produced another one.
Python version = 3.5
from feature_extractor import Feature_extraction
import pandas as pd
from pandas.core.frame import DataFrame
import sys
from multiprocessing.dummy import Pool as ThreadPool
import threading as thread
from multiprocessing import Process,Manager,Array
import time
class main():
lst = None
def __init__(self):
manager = Manager()
self.lst = manager.list()
self.dostuff()
self.read_lst()
def feature_extraction(self,url):
if self.lst is None:
self.lst = []
features = Feature_extraction(url)
self.lst.append(features.get_features())
print(len(self.lst))
def Pool(self,url):
pool = ThreadPool(8)
results = pool.map(self.feature_extraction, url)
def dostuff(self):
df = pd.read_csv('verified_online.csv',encoding='latin-1')
df['label'] = df['phish_id'] * 0
mal_urls = df['url']
df2 = pd.read_csv('new.csv')
df2['label'] = df['phish_id']/df['phish_id']
ben_urls = df2['urls']
t = Process(target=self.Pool,args=(mal_urls,))
t2 = Process(target=self.Pool,args=(ben_urls,))
t.start()
t2.start()
t.join()
t2.join
def read_lst(self):
nw_df = DataFrame(list(self.lst))
nw_df.columns = ['Redirect count','ssl_classification','url_length','hostname_length','subdomain_count','at_sign_in_url','exe_extension_in_request_url','exe_extension_in_landing_url',
'ip_as_domain_name','no_of_slashes_in requst_url','no_of_slashes_in_landing_url','no_of_dots_in_request_url','no_of_dots_in_landing_url','tld_value','age_of_domain',
'age_of_last_modified','content_length','same_landing_and_request_ip','same_landing_and_request_url']
frames = [df['label'],df2['label']]
new_df = pd.concat(frames)
new_df = new_df.reset_index()
nw_df['label'] = new_df['label']
nw_df.to_csv('dataset.csv', sep=',', encoding='latin-1')
if __name__ == '__main__':
start_time = time.clock()
try:
main()
except BrokenPipeError:
print("broken pipe....")
pass
print (time.clock() - start_time, "seconds")
Error Traceback
Process Process-3:
Traceback (most recent call last):
File "F:\Continuum\Anaconda3\lib\multiprocessing\connection.py", line 312, in _recv_bytes
nread, err = ov.GetOverlappedResult(True)
BrokenPipeError: [WinError 109] The pipe has been ended
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "F:\Continuum\Anaconda3\lib\multiprocessing\process.py", line 249, in _bootstrap
self.run()
File "F:\Continuum\Anaconda3\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "H:\Projects\newoproject\src\main.py", line 33, in Pool
results = pool.map(self.feature_extraction, url)
File "F:\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 260, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "F:\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 608, in get
raise self._value
File "F:\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "F:\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 44, in mapstar
return list(map(*args))
File "H:\Projects\newoproject\src\main.py", line 26, in feature_extraction
self.lst.append(features.get_features())
File "<string>", line 2, in append
File "F:\Continuum\Anaconda3\lib\multiprocessing\managers.py", line 717, in _callmethod
kind, result = conn.recv()
File "F:\Continuum\Anaconda3\lib\multiprocessing\connection.py", line 250, in recv
buf = self._recv_bytes()
File "F:\Continuum\Anaconda3\lib\multiprocessing\connection.py", line 321, in _recv_bytes
raise EOFError
EOFError
My response is late and does not address the posted problem directly; but hopefully will provide a clue to others who encounter similar errors.
Errors that I encountered:
BrokenPipeError
WinError 109 The pipe has been ended &
WinError 232 The pipe is being closed
Observed with Python 36 on Windows 7, when:
(1) the same async function was submitted multiple times, each time with a different instance of a multiprocessing data store, a Queue in my case (multiprocessing.Manager().Queue())
AND
(2) the references to the Queues were saved in short-life local variables in the enveloping function.
The errors were occurring despite the fact that the Queues, shared with the successfully spawned and executing async-functions, had items and would still be in active use (put() & get()) at the time of exception.
The error consistently occurred when the same async_func was called the 2nd time with a 2nd instance of the Queue. Immediately after apply_async() of the function, the connection to the 1st Queue supplied to the async_func the 1st time, would get broken.
The issue got resolved when the references to the Queues were saved in non-overlapping (like a Queue-list) & longer-life variables (like variables returned to functions higher in the call-stack) in the enveloping function.

Replace pickle with dill and pymongo call

I finally understood example how to replace pickle with dill from the following discussion: pickle-dill.
For example, the following code worked for me
import os
import dill
import multiprocessing
def run_dill_encoded(what):
fun, args = dill.loads(what)
return fun(*args)
def apply_async(pool, fun, args):
return pool.apply_async(run_dill_encoded, (dill.dumps((fun, args)),))
if __name__ == '__main__':
pool = multiprocessing.Pool(5)
results = [apply_async(pool, lambda x: x*x, args=(x,)) for x in range(1,7)]
output = [p.get() for p in results]
print(output)
I tried to apply the same philosophy to pymongo. The following code
import os
import dill
import multiprocessing
import pymongo
def run_dill_encoded(what):
fun, args = dill.loads(what)
return fun(*args)
def apply_async(pool, fun, args):
return pool.apply_async(run_dill_encoded, (dill.dumps((fun, args)),))
def write_to_db(value_to_insert):
client = pymongo.MongoClient('localhost', 27017)
db = client['somedb']
collection = db['somecollection']
result = collection.insert_one({"filed1": value_to_insert})
client.close()
if __name__ == '__main__':
pool = multiprocessing.Pool(5)
results = [apply_async(pool, write_to_db, args=(x,)) for x in ['one', 'two', 'three']]
output = [p.get() for p in results]
print(output)
produces error:
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Python34\lib\multiprocessing\pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "C:\...\temp2.py", line 10, in run_dill_encoded
return fun(*args)
File "C:\...\temp2.py", line 21, in write_to_db
client = pymongo.MongoClient('localhost', 27017)
NameError: name 'pymongo' is not defined
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/.../temp2.py", line 32, in <module>
output = [p.get() for p in results]
File "C:/.../temp2.py", line 32, in <listcomp>
output = [p.get() for p in results]
File "C:\Python34\lib\multiprocessing\pool.py", line 599, in get
raise self._value
NameError: name 'pymongo' is not defined
Process finished with exit code 1
What is wrong?
As I mentioned in the comments, you need to put an import pymongo inside the function write_to_db. This is because when the function is serialized, it does not take along any of the global references with it when it is shipped to the other process space.

Resources