How to run particular code on the GPU using PyTorch? - multiprocessing

I am using image-processing code written with Python and OpenCV. Because it takes a long time to process, say, 30 images, I tried to process the images in parallel using multiprocessing. The multiprocessing part works well on the CPU, but I want to run it on the GPU (CUDA).
I use torch.multiprocessing to run the tasks in parallel, and I call .to(torch.device('cuda')) on my class so that everything runs on that particular device. When I run the code, it reports the device as "cuda", but no GPU processing actually takes place.
import cv2
import numpy as np
import torch
import torch.nn as nn
from torch.multiprocessing import Process, Pool, Manager, set_start_method
import sys
import os
class RoadShoulderWidth(nn.Module):
    """Run ``get_dim`` over a batch of images, one worker process per image.

    The class subclasses ``nn.Module`` so that ``.to(device)`` can be called on
    it, but it registers no parameters or sub-modules of its own.
    """

    def __init__(self):
        super().__init__()

    # Want to run below method in parallel for 30 images.
    @staticmethod
    def get_dim(image, road_shoulder_width_list):
        """Process one cropped image.

        Args:
            image: The cropped image array handed over by
                ``get_road_shoulder_width``.
            road_shoulder_width_list: The manager-backed list shared by all
                worker processes (presumably where the result is stored; the
                body was elided in the original post).
        """
        ...  # ..... code (implementation elided in the original post)

    def get_road_shoulder_width(self, _root_dir, _img_path_list):
        """Start one process per image and wait for all of them to finish.

        Args:
            _root_dir: Directory that contains the images.
            _img_path_list: File names relative to ``_root_dir``; only the
                first 30 entries are processed.

        Returns:
            The ``Manager().list()`` proxy that was passed to every worker.
        """
        manager = Manager()
        road_shoulder_width_list = manager.list()
        processes = []
        # Use the parameter, not a module-level global of a similar name.
        for img_path in _img_path_list[:30]:
            img = cv2.imread(_root_dir + '/' + img_path)
            if img is None:
                # cv2.imread signals an unreadable / non-image file with None;
                # skip it instead of failing on the slice below.
                continue
            # Keep rows 360..431 and columns 0..1279 of the frame.
            img = img[72 * 5:72 * 6, 0:1280]
            # Do work
            p = Process(target=self.get_dim,
                        args=(img, road_shoulder_width_list))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        return road_shoulder_width_list
Use the code below to run the class:
if __name__ == '__main__':
    # Directory holding the input images; every directory entry is passed on.
    root_dir = '/home/nikhil_m/r'
    img_path_list = os.listdir(root_dir)

    # Pick the CUDA device when PyTorch can see one, otherwise the CPU.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    dataloader_kwargs = dict(pin_memory=True)
    set_start_method('fork')

    # .to(device) returns the module itself, so it can be applied in two steps.
    obj = RoadShoulderWidth()
    obj = obj.to(device)
    val = obj.get_road_shoulder_width(str(root_dir), img_path_list)
    print(val)
    print(torch.cuda.is_available())
Can anybody suggest me how to fix this?

Your class RoadShoulderWidth is a nn.Module subclass which lets you use .to(device). This only means that all other nn.Module objects or nn.Parameters that are members of your RoadShoulderWidth object are moved to the device. As from your example, there are none, so nothing happens.
In general PyTorch does not move code to GPU but data. If all data of a pytorch operation are on the GPU (e.g. a + b, a and b are on GPU) then the operation is executed on the GPU. You can move the data with a.to(device), given a is a torch.Tensor object.
PyTorch can only execute its own operations on GPU. It's not able to execute OpenCV code on GPU.

Related

Is there any good way to rewrite the edgetpu old code by using pycoral api?

I'm a beginner using coral devboard mini.
I want to start a Smart Bird Feeder project.
https://coral.ai/projects/bird-feeder/
I've been trying to execute the code by referring to
I can't run bird_classify.py.
The error is as follows
RuntimeError: Internal: Unsupported data type in custom op handler: 0Node number 0 (edgetpu-custom-op) failed to prepare.
Originally, the samples in this project seemed to be deprecated, and
The edgetpu requires an old runtimeversion of 13, instead of the current 14.
(tflite is 2.5 ) I have downloaded it directly and re-installed it in
/usr/lib/python3/dist-packages
, but I cannot uninstall the new version and cannot match the version.
Is there a better way to do this?
Also, I've decided to give up on running the same environment as the sample, and use the pycoralapi to run the
If there is a good way to rewrite the code to use pycoral, please let me know.
Thanks
#!/usr/bin/python3
"""
Coral Smart Bird Feeder
Uses ClassificationEngine from the EdgeTPU API to analyze animals in
camera frames. Sounds a deterrent if a squirrel is detected.
Users define model, labels file, storage path, deterrent sound, and
optionally can set this to training mode for collecting images for a custom
model.
"""
import argparse
import time
import re
import imp
import logging
import gstreamer
import sys
sys.path.append('/usr/lib/python3/dist-packages/edgetpu')
from edgetpu.classification.engine import ClassificationEngine
from PIL import Image
from playsound import playsound
from pycoral.adapters import classify
from pycoral.adapters import common
from pycoral.utils.dataset import read_label_file
from pycoral.utils.edgetpu import make_interpreter
def save_data(image,results,path,ext='png'):
    """Store a camera frame under ``path`` and log its inference results.

    The file is named ``img-<tag>.<ext>``, where the tag is the monotonic
    clock in milliseconds, zero-padded to at least ten digits. ``image`` only
    needs a ``save(filename)`` method.
    """
    millis = int(time.monotonic() * 1000)
    tag = '%010d' % millis
    name = '%s/img-%s.%s' % (path, tag, ext)
    image.save(name)
    print('Frame saved as: %s' % name)
    logging.info('Image: %s Results: %s', tag, results)
def load_labels(path):
    """Parse a label file into an ``{id: label}`` dictionary.

    Every useful line holds an integer id followed by the label text, with
    optional leading whitespace. Lines that do not fit that shape (for
    example blank lines) are skipped instead of raising ``AttributeError``.

    Args:
        path: Path of the UTF-8 encoded label file.

    Returns:
        Dict mapping the integer id to the stripped label text.
    """
    pattern = re.compile(r'\s*(\d+)(.+)')
    labels = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            match = pattern.match(line)
            if match is None:
                # Blank or malformed line: nothing to record.
                continue
            num, text = match.groups()
            labels[int(num)] = text.strip()
    return labels
def print_results(start_time, last_time, end_time, results):
    """Write timing figures and the classification results to stdout.

    The inference duration (``end_time - start_time``) is shown in
    milliseconds and the frame rate is derived from ``end_time - last_time``.
    ``results`` is an iterable of ``(label, score)`` pairs.
    """
    elapsed_ms = (end_time - start_time) * 1000
    frame_interval = end_time - last_time
    fps = 1.0 / frame_interval
    print('\nInference: %.2f ms, FPS: %.2f fps' % (elapsed_ms, fps))
    for entry in results:
        label, score = entry
        print(' %s, score=%.2f' %(label, score))
def do_training(results,last_results,top_k):
    """Report whether the current labels differ from the previous ones.

    Used to decide when to collect an image for training a custom model.

    Args:
        results: Current ``(label, score)`` pairs.
        last_results: ``(label, score)`` pairs from the previous frame.
        top_k: Number of labels expected to be shared when nothing changed.

    Returns:
        ``True`` if fewer than ``top_k`` labels are common to both result
        sets, otherwise ``False`` (the original fell through and returned
        ``None``).
    """
    new_labels = {entry[0] for entry in results}
    shared_labels = new_labels.intersection(entry[0] for entry in last_results)
    if len(shared_labels) < top_k:
        print('Difference detected')
        return True
    return False
def _str2bool(value):
    """Turn a command-line token into a bool.

    Common spellings of "false" (and the empty string) map to ``False``;
    every other token keeps enabling the flag, as it did before.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')


def user_selections():
    """Parse the command-line options of the bird feeder script.

    ``--print`` and ``--training`` accept an optional value: the bare flag
    enables the option, and an explicit ``False``/``0``/``no`` disables it.
    Without a converter, ``--print False`` produced the truthy string
    ``"False"`` and switched the option on.

    Returns:
        The ``argparse.Namespace`` with model, labels, top_k, threshold,
        storage, sound, print and training.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', required=True,
                        help='.tflite model path')
    parser.add_argument('--labels', required=True,
                        help='label file path')
    parser.add_argument('--top_k', type=int, default=3,
                        help='number of classes with highest score to display')
    parser.add_argument('--threshold', type=float, default=0.1,
                        help='class score threshold')
    parser.add_argument('--storage', required=True,
                        help='File path to store images and results')
    parser.add_argument('--sound', required=True,
                        help='File path to deterrent sound')
    parser.add_argument('--print', default=False, required=False,
                        nargs='?', const=True, type=_str2bool,
                        help='Print inference results to terminal')
    parser.add_argument('--training', default=False, required=False,
                        nargs='?', const=True, type=_str2bool,
                        help='Training mode for image collection')
    args = parser.parse_args()
    return args
def main():
    """Create the camera pipeline and classify every frame.

    Results are logged to ``<storage>/results.log``. In training mode a frame
    is saved whenever its labels differ from the previous frame; otherwise
    every non-background frame is saved and the deterrent sound is played
    when the squirrel label is among the results.

    NOTE(review): this still uses ``ClassificationEngine`` from the legacy
    edgetpu API; the pycoral helpers imported by this file are not used here.
    """
    args = user_selections()
    print("Loading %s with %s labels."%(args.model, args.labels))
    engine = ClassificationEngine(args.model)
    labels = load_labels(args.labels)
    storage_dir = args.storage

    #Initialize logging file
    logging.basicConfig(filename='%s/results.log'%storage_dir,
                        format='%(asctime)s-%(message)s',
                        level=logging.DEBUG)

    last_time = time.monotonic()
    last_results = [('label', 0)]
    deterrent_label = 'fox squirrel, eastern fox squirrel, Sciurus niger'

    def user_callback(image,svg_canvas):
        """Classify one frame and act on the result."""
        nonlocal last_time
        nonlocal last_results
        start_time = time.monotonic()
        results = engine.classify_with_image(image, threshold=args.threshold, top_k=args.top_k)
        end_time = time.monotonic()
        results = [(labels[i], score) for i, score in results]

        if args.print:
            print_results(start_time,last_time, end_time, results)

        if args.training:
            if do_training(results,last_results,args.top_k):
                save_data(image,results, storage_dir)
        elif results:
            # `results` can be empty when no class clears the threshold, so
            # only index into it after the emptiness check above.
            #Custom model mode:
            #The labels can be modified to detect/deter user-selected items
            if results[0][0] !='background':
                # save_data expects (image, results, path); the original call
                # passed the storage directory and the results swapped.
                save_data(image, results, storage_dir)
            # `results` holds (label, score) tuples, so compare the labels
            # rather than testing the bare string against the tuples.
            if any(label == deterrent_label for label, _ in results):
                playsound(args.sound)
                logging.info('Deterrent sounded')

        last_results=results
        last_time = end_time

    gstreamer.run_pipeline(user_callback)


if __name__ == '__main__':
    main()
enter code here
I suggest that you follow one of the examples available from the coral examples. There is an example named classify_image.py which uses the edgetpu (tflite) that I found works. After you install the coral examples, you have to drill down through the directory hierarchy. So, in my case, from root it is: /home/pi/ml-projects/coral/pycoral/tensorflow/examples/lite/examples. There are 17 files in that last examples directory. I'm using: numpy 1.19.3, pycoral 2.0.0, scipy 1.7.1, tensorflow 2.4.0, tflite-runtime 2.5.0.post1. I've installed the following edgetpu-runtime: edgetpu_runtime_20201105.zip.

Pytorch: RAM explodes when using multiprocessing SharedMemory and CUDA

I would like to use multiprocessing to launch multiple training instances on CUDA device. Since the data is common between the processes, I want to avoid data copy for every process. I'm using python 3.8's SharedMemory from multiprocessing module to achieve this following this SO example.
I can allocate a memory block using SharedMemory and create as many processes as I'd like with constant memory (RAM) usage. However, when I try to send tensors to CUDA, the memory scales linearly with the number of processes. It appears as if when c.to(device) is called, the base data is copied for every process.
Does any one know why this is happening? Any ideas to mitigate this issue?
Here is the sample code I'm using:
import numpy as np
from multiprocessing import shared_memory, get_context
import time
import torch
import copy
# Experiment settings: side length of the square array, rows per batch,
# pause (seconds) used to watch memory in top/htop, and worker count.
dim = 10000
batch_size = 10
sleep_time = 2
npe = 1 # number of parallel executions

# cuda
dev = 'cuda:0' if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
def step(i, shr_name):
    """Copy batch ``i`` out of the shared block and move it to ``device``.

    Args:
        i: Batch index; rows ``i * batch_size`` up to ``(i + 1) * batch_size``
            are taken.
        shr_name: Name of the existing shared-memory block to attach to.

    The handle to the shared block is closed even when the body raises.
    """
    existing_shm = shared_memory.SharedMemory(name=shr_name)
    try:
        np_arr = np.ndarray((dim, dim), dtype=np.float32, buffer=existing_shm.buf)
        b = np_arr[i * batch_size: (i + 1) * batch_size, :]
        b = torch.Tensor(b)
        # This is just to explicitly copy the tensor so that it has nothing to do
        # with the shared memory block
        c = copy.deepcopy(b)
        # If tensor c is sent to the cuda device, then RAM scales linearly
        # with the number of parallel executions.
        # If c is not sent to cuda device, memory consumption is constant.
        c = c.to(device)
        time.sleep(sleep_time)
    finally:
        # Detach this process from the block; the creator unlinks it.
        existing_shm.close()
def create_shared_block():
    """Allocate the shared block named 'sha' and fill it with random data.

    Returns:
        ``(shm, np_arr)``: the ``SharedMemory`` handle and a ``(dim, dim)``
        float32 array backed by its buffer.
    """
    source = np.random.random((dim, dim)).astype(np.float32)
    shm = shared_memory.SharedMemory(create=True, size=source.nbytes, name='sha')
    np_arr = np.ndarray(source.shape, dtype=np.float32, buffer=shm.buf)
    # Copy the random values into the shared buffer.
    np_arr[...] = source
    return shm, np_arr
if __name__ == '__main__':
    # create shared memory block
    shm, np_arr = create_shared_block()
    try:
        # create list of inputs to be executed in parallel
        inp = [[x, 'sha'] for x in range(npe)]
        print(inp)
        # sleep added before and after launching multiprocessing to monitor the memory consumption
        print('before pool') # to check memory with top or htop
        time.sleep(sleep_time)
        context = get_context('spawn')
        with context.Pool(npe) as pool:
            print('after pool') # to check memory with top or htop
            time.sleep(sleep_time)
            pool.starmap(step, inp)
            time.sleep(sleep_time)
    finally:
        # Release the block even when a worker fails, so that a failed run
        # does not leave the named block 'sha' behind.
        shm.close()
        shm.unlink()

Problems interrupting a python Input (Mac)

I am trying to allow a user to input multiple answers, but only within an allocated amount of time. I have it running, but the program will not interrupt the input: it only stops the user from inputting if the user enters an answer after the time has ended. Any ideas? Is what I am trying to do even possible in Python?
I have tried using threading and the signal module however they both result in the same issue.
Using Signal:
import signal
def handler(signum, frame):
    """SIGALRM handler: raise ``TimeoutError`` in the main thread.

    A dedicated exception type lets the caller tell the timeout apart from
    any other error raised while answers are being read.
    """
    raise TimeoutError


def answer_loop():
    """Keep prompting for answers; only an exception ends the loop."""
    score = 0
    while True:
        answer = input("Please input your answer")


signal.signal(signal.SIGALRM, handler)
signal.alarm(5)
try:
    answer_loop()
except TimeoutError:
    print("end")
finally:
    # Cancel any pending alarm, whichever way the loop was left.
    signal.alarm(0)
Using Threading:
from threading import Timer
def end():
    """Timer callback: it only prints, it does not interrupt ``input()``."""
    print("Time is up")


def answer_loop():
    """Prompt for answers forever; nothing in here ends the loop."""
    score = 0
    while True:
        answer = input("Please input your answer")


# Schedule the message, then block in the prompt loop on the main thread.
time_limit = 5
t = Timer(interval=time_limit, function=end)
t.start()
answer_loop()
t.cancel()
Your problem is that builtin input does not have a timeout parameter and, AFAIK, threads cannot be terminated by other threads. I suggest instead that you use a GUI with events to finely control user interaction. Here is a bare bones tkinter example.
import tkinter as tk
root = tk.Tk()

# Prompt label above the entry field (pack order: label first, then entry).
label = tk.Label(root, text='answer')
label.pack()
entry = tk.Entry(root)
entry.pack()


def timesup():
    """Read the answer, remove the entry field and show the result."""
    ans = entry.get()
    entry.destroy()
    label.configure(text=f"Time is up. You answered {ans}")


# Fire the timeout once, 5000 ms after start-up, from the Tk event loop.
root.after(5000, timesup)
root.mainloop()

`ProcessPoolExecutor` works on Ubuntu, but fails with `BrokenProcessPool` when running Jupyter 5.0.0 notebook with Python 3.5.3 on Windows 10

I'm running Jupyter 5.0.0 notebook with Python 3.5.3 on Windows 10. The following example code fails to run:
from concurrent.futures import as_completed, ProcessPoolExecutor
import time
import numpy as np
def do_work(idx1, idx2):
    """Pause for 0.2 s to stand in for real work, then return the mean of both indices."""
    pair = [idx1, idx2]
    time.sleep(0.2)
    return np.mean(pair)
with ProcessPoolExecutor(max_workers=4) as executor:
    futures = set()
    for idx in range(32):
        # Submit straight to the executor: this snippet imports no
        # `winprocess` module, so calling it here would raise NameError.
        future = executor.submit(do_work, idx, idx * 2)
        futures.add(future)
    # Print each result as soon as its future completes.
    for future in as_completed(futures):
        print(future.result())
... and throws BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
The code works perfectly fine on Ubuntu 14.04.
I understand that Windows doesn't have os.fork, so multiprocessing is handled differently and doesn't always play nicely with interactive mode and Jupyter.
What are some workarounds to make ProcessPoolExecutor work in this case?
There are some similar questions, but they relate to multiprocessing.Pool:
multiprocessing.Pool in jupyter notebook works on linux but not windows
Closer inspection shows that a Jupyter notebook can run external python modules which is parallelized using ProcessPoolExecutor. So, a solution is to do the parallelizable part of your code in a module and call it from the Jupyter notebook.
That said, this can be generalized as a utility. The following can be stored as a module, say, winprocess.py and imported by jupyter.
import inspect
import types
def execute_source(callback_imports, callback_name, callback_source, args):
    """Rebuild a function from its source text and call it.

    Args:
        callback_imports: Import statements (strings) executed into this
            module's globals so the rebuilt function can resolve them.
        callback_name: Name the function is defined under in the source.
        callback_source: Source text that defines the function.
        args: Positional arguments for the call.

    Returns:
        Whatever the rebuilt function returns.

    NOTE: this runs ``exec`` on the strings it is given, so only pass source
    and import lines that come from your own code.
    """
    for callback_import in callback_imports:
        exec(callback_import, globals())
    # Execute into an explicit namespace instead of the frame's locals():
    # from Python 3.13 on, names created by exec() inside a function are no
    # longer visible through a later locals() call.
    namespace = {}
    exec('import time' + "\n" + callback_source, globals(), namespace)
    callback = namespace[callback_name]
    return callback(*args)
def submit(executor, callback, *args):
    """Send ``callback`` to ``executor`` as source text, not as an object.

    The function's source, its name and the import lines derived from its
    globals are passed to ``execute_source``, which is what the worker runs.

    Returns:
        The future returned by ``executor.submit``.
    """
    source = inspect.getsource(callback)
    import_lines = list(imports(callback.__globals__))
    return executor.submit(
        execute_source,
        import_lines, callback.__name__, source, args
    )
def imports(callback_globals):
    """Yield an import statement for every module found in a globals dict.

    ``builtins`` and this module itself are left out. A module bound to a
    name other than its own yields ``import <module> as <name>``.
    """
    for name, val in list(callback_globals.items()):
        if not isinstance(val, types.ModuleType):
            continue
        module_name = val.__name__
        if module_name == 'builtins' or module_name == __name__:
            continue
        if module_name == name:
            yield 'import ' + module_name
        else:
            yield 'import ' + module_name + ' as ' + name
Here is how you would use this:
from concurrent.futures import as_completed, ProcessPoolExecutor
import time
import numpy as np
import winprocess
def do_work(idx1, idx2):
    """Pause for 0.2 s to stand in for real work, then return the mean of both indices."""
    pair = [idx1, idx2]
    time.sleep(0.2)
    return np.mean(pair)
with ProcessPoolExecutor(max_workers=4) as executor:
    # Route every submission through winprocess so the worker receives the
    # function as source text.
    futures = {
        winprocess.submit(executor, do_work, idx, idx * 2)
        for idx in range(32)
    }
    # Print each result as soon as its future completes.
    for done in as_completed(futures):
        print(done.result())
Notice that executor has been changed with winprocess and the original executor is passed to the submit function as a parameter.
What happens here is that the notebook function code and imports are serialized and passed to the module for execution. The code is not executed until it is safely in a new process, thus does not trip up with trying to make a new process based on the jupyter notebook itself.
Imports are handled in such a way as to maintain aliases. The import magic can be removed if you make sure to import everything needed for the function being executed inside the function itself.
Also, this solution only works if you pass all necessary variables as arguments to the function. The function should be static so to speak, but I think that's a requirement of ProcessPoolExecutor as well. Finally, make sure you don't execute other functions defined elsewhere in the notebook. Only external modules will be imported, thus other notebook functions won't be included.

Tkinter problems with GUI when entering while loop

I have a simple GUI which run various scripts from another python file, everything works fine until the GUI is running a function which includes a while loop, at which point the GUI seems to crash and become in-active. Does anybody have any ideas as to how this can be overcome, as I believe this is something to do with the GUI being updated,Thanks. Below is a simplified version of my GUI.
GUI
#!/usr/bin/env python
# Python 3
from tkinter import *
from tkinter import ttk
from Entry import ConstrainedEntry
import tkinter.messagebox
import functions
AlarmCode = "2222"

# Main window with a single padded frame that stretches with the window.
root = Tk()
root.title("Simple Interface")
mainframe = ttk.Frame(root, padding="3 3 12 12")
mainframe.grid(column=0, row=0, sticky=(N, W, E, S))
mainframe.columnconfigure(0, weight=1)
mainframe.rowconfigure(0, weight=1)

# The button runs functions.test() directly inside the Tk callback.
test_button = ttk.Button(
    mainframe, width=12, text="ButtonTest",
    command=lambda: functions.test(),
)
test_button.grid(column=5, row=5, sticky=SE)

# Give every widget in the frame the same padding.
for child in mainframe.winfo_children():
    child.grid_configure(padx=5, pady=5)

root.mainloop()
functions
def test():
    """Count 1..100, one number per second, then report completion.

    The original condition ``period <= 100`` starting from 0 slept 101 times
    before printing "100 seconds has passed"; the loop now runs exactly 100
    times. The call blocks for the whole duration.
    """
    for period in range(1, 101):
        time.sleep(1)
        print(period)
    print("100 seconds has passed")
What will happen in the above is that when the loop is running the application will crash. If I insert a break in the else statement after the period has elapsed, everything will work fine. I want users to be able to click when in loops as this GUI will run a number of different functions.
Don't use time.sleep in the same thread as your Tkinter code: it freezes the GUI until the execution of test is finished. To avoid this, you should use the after widget method:
# GUI
# Keep `.grid(...)` attached to the closing parenthesis: a line that starts
# with `.grid` after a finished call is a syntax error.
ttk.Button(
    mainframe, width=12, text="ButtonTest",
    command=lambda: functions.test(root),
).grid(column=5, row=5, sticky=SE)
# functions
def test(root, period=0):
    """Print the next count and reschedule itself through ``root.after``.

    Args:
        root: Widget whose ``after`` method schedules the next call.
        period: Number of counts already printed.

    Each call returns immediately. The original condition ``period <= 100``
    printed 101 counts before the final message; ``< 100`` prints exactly 100.
    """
    if period < 100:
        period += 1
        print(period)
        # Run the next step in 1000 ms via the widget's `after` method.
        root.after(1000, lambda: test(root, period))
    else:
        print("100 seconds has passed")
Update:
In your comment you also add that your code won't use time.sleep, so your original example may not be the most appropriate. In that case, you can create a new thread to run your intensive code.
Note that I posted the alternative of after first because multithreading should be used only if it is completely necessary - it adds overhead to your application, as well as making your code more difficult to debug.
from threading import Thread
# Keep `.grid(...)` attached to the closing parenthesis: a line that starts
# with `.grid` after a finished call is a syntax error.
ttk.Button(
    mainframe, width=12, text="ButtonTest",
    # Start the task on a new thread for every click.
    command=lambda: Thread(target=functions.test).start(),
).grid(column=5, row=5, sticky=SE)
# functions
def test():
    """Print 0..99, one number per second, then report completion."""
    x = 0
    while x < 100:
        time.sleep(1) # Simulate intense task (not real code!)
        print(x)
        x += 1
    print("100 seconds has passed")

Resources