Why using "fork" works but using "spawn" fails in Python3.8+ `multiprocessing`? - multiprocessing

I work on macOS and recently got bitten by the "fork" to "spawn" default change in Python 3.8 multiprocessing (see doc). Below is a simplified example where using "fork" succeeds but using "spawn" fails. The purpose of the code is to create a custom queue object that supports calling size() under macOS, hence the inheritance from the Queue object and the call to get multiprocessing's context.
import multiprocessing
from multiprocessing import Process
from multiprocessing.queues import Queue
from time import sleep


class Q(Queue):
    def __init__(self):
        super().__init__(ctx=multiprocessing.get_context())
        self.size = 1

    def call(self):
        return print(self.size)


def foo(q):
    q.call()


if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')  # this would fail
    # multiprocessing.set_start_method('fork')  # this would succeed
    q = Q()
    p = Process(target=foo, args=(q,))
    p.start()
    p.join(timeout=1)
The error message output when using "spawn" is shown below.
Process Process-1:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/fanchen/Private/python_work/sandbox.py", line 23, in foo
    q.call()
  File "/Users/fanchen/Private/python_work/sandbox.py", line 19, in call
    return print(self.size)
AttributeError: 'Q' object has no attribute 'size'
It seems that the child process decides that self.size is not necessary for execution and does not copy it. My question is: why does this happen?
Code snippet tested under macOS Catalina 10.15.6, Python 3.8.5

The problem is that spawned processes do not share the parent's resources: the queue instance is pickled and sent to the child, so to recreate it correctly in each process you need to add serialization and deserialization methods (__getstate__ and __setstate__).
Here is working code:
# Portable queue
# The idea of Victor Terron used in the Lemon project (https://github.com/vterron/lemon/blob/master/util/queue.py).
# Pickling/unpickling methods are added to share the Queue instance between processes correctly.

import multiprocessing
import multiprocessing.queues


class SharedCounter(object):
    """ A synchronized shared counter.

    The locking done by multiprocessing.Value ensures that only a single
    process or thread may read or write the in-memory ctypes object. However,
    in order to do n += 1, Python performs a read followed by a write, so a
    second process may read the old value before the new one is written by the
    first process. The solution is to use a multiprocessing.Lock to guarantee
    the atomicity of the modifications to Value.

    This class comes almost entirely from Eli Bendersky's blog:
    http://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing/
    """

    def __init__(self, n=0):
        self.count = multiprocessing.Value('i', n)

    def __getstate__(self):
        return (self.count,)

    def __setstate__(self, state):
        (self.count,) = state

    def increment(self, n=1):
        """ Increment the counter by n (default = 1) """
        with self.count.get_lock():
            self.count.value += n

    @property
    def value(self):
        """ Return the value of the counter """
        return self.count.value


class Queue(multiprocessing.queues.Queue):
    """ A portable implementation of multiprocessing.Queue.

    Because of multithreading / multiprocessing semantics, Queue.qsize() may
    raise the NotImplementedError exception on Unix platforms like Mac OS X
    where sem_getvalue() is not implemented. This subclass addresses this
    problem by using a synchronized shared counter (initialized to zero) and
    increasing / decreasing its value every time the put() and get() methods
    are called, respectively. This not only prevents NotImplementedError from
    being raised, but also allows us to implement a reliable version of both
    qsize() and empty().
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, ctx=multiprocessing.get_context())
        self._counter = SharedCounter(0)

    def __getstate__(self):
        return super().__getstate__() + (self._counter,)

    def __setstate__(self, state):
        super().__setstate__(state[:-1])
        self._counter = state[-1]

    def put(self, *args, **kwargs):
        super().put(*args, **kwargs)
        self._counter.increment(1)

    def get(self, *args, **kwargs):
        item = super().get(*args, **kwargs)
        self._counter.increment(-1)
        return item

    def qsize(self):
        """ Reliable implementation of multiprocessing.Queue.qsize() """
        return self._counter.value

    def empty(self):
        """ Reliable implementation of multiprocessing.Queue.empty() """
        return not self.qsize()
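For reference, a minimal usage sketch under the "spawn" start method, assuming the Queue subclass above is defined at module level in the same script (or in an importable module):

import multiprocessing

def worker(q):
    # The queue (including its SharedCounter) is pickled via __getstate__ and
    # rebuilt in the spawned child via __setstate__, so qsize() works here.
    print("items in queue:", q.qsize())
    print("got:", q.get())

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    q = Queue()            # the portable Queue defined above
    q.put("hello")
    p = multiprocessing.Process(target=worker, args=(q,))
    p.start()
    p.join()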

You can also use multiprocessing.Manager().Queue(), whose proxy is picklable and supports qsize() because the real queue lives in the manager's server process.
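A minimal sketch of that alternative (assuming only qsize() is needed; the child talks to the queue through a proxy, so this works with the "spawn" start method on macOS):

import multiprocessing

def worker(q):
    print("size seen by child:", q.qsize())  # proxied to the manager process
    q.put("from child")

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    with multiprocessing.Manager() as manager:
        q = manager.Queue()
        q.put("from parent")
        p = multiprocessing.Process(target=worker, args=(q,))
        p.start()
        p.join()
        print("final size:", q.qsize())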

Related

Asyncio worker class to handle parallel jobs

I found an example here, but I don't understand how to make the code work:
class Worker:
    def __init__(self, func, n=3):
        self.func = func
        self.queue = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(n)

    def put(self, *args):
        self.queue.put_nowait(args)

    async def run(self):
        while True:
            args = await self._get()
            if args is None:
                return
            asyncio.ensure_future(self._target(args))

    async def _get(self):
        get_task = asyncio.ensure_future(self.queue.get())
        join_task = asyncio.ensure_future(self.queue.join())
        await asyncio.wait(coros, return_when='FIRST_COMPLETED')
        if get_task.done():
            return task.result()

    async def _target(self, args):
        try:
            async with self.semaphore:
                return await self.func(*args)
        finally:
            self.queue.task_done()
Then I tried:
def work(a1, a2): print('work done', a1, a2)

W = Worker(work, n=3)
W.put(1,2)
W.put(1,2)
W.put(1,2)
result = await W.run()    # for Jupyter notebooks
# asyncio.run(W.run())    # for normal python
I get this error:
NameError: name 'coros' is not defined
I admit, the linked solution confused me, and it didn't seem to work. So, I just rewrote the Worker class which hopefully now works for you:
import asyncio


class Worker:
    def __init__(self, func, n=3):
        self.func = func
        self.queue = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(n)

    def put(self, *args):
        self.queue.put_nowait(args)

    async def run(self):
        tasks = []
        while True:
            try:
                args = self.queue.get_nowait()
            except asyncio.QueueEmpty:
                break
            tasks.append(asyncio.ensure_future(self.do_work(args)))
        await asyncio.gather(*tasks)

    async def do_work(self, args):
        async with self.semaphore:
            await self.func(*args)
To me, this seems like a much simpler way to do it. Basically, Worker.run now just starts a task for each item in the queue, and each task must first acquire the semaphore before calling the provided work function. It finishes after all the work is done.
Here is the usage:
async def work(a1, a2):
    print("Starting work...", a1, a2)
    await asyncio.sleep(1)
    print("Finishing work...")

W = Worker(work, n=3)
W.put(1,2)
W.put(3,4)
W.put(5,6)
W.put(7,8)
W.put(9,10)
asyncio.get_event_loop().run_until_complete(W.run())

"""Output
Starting work... 1 2
Starting work... 3 4
Starting work... 5 6
Finishing work...
Finishing work...
Finishing work...
Starting work... 7 8
Starting work... 9 10
Finishing work...
Finishing work...
"""
It should be noted that you cannot use asyncio.run while also using asyncio.Semaphore this way, because asyncio.run always starts a new loop, while asyncio.Semaphore(n) binds to whatever loop is the default at the moment it is created, before asyncio.run is called. This causes the semaphore to use a different loop than Worker.run.
So just using asyncio.get_event_loop().run_until_complete works fine, because it uses the default loop (which the semaphore is expecting).
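If asyncio.run is preferred, one workaround (a sketch, not part of the original answer) is to construct the Worker inside the coroutine that asyncio.run executes, so its Semaphore binds to the loop asyncio.run just started:

import asyncio

async def work(a1, a2):
    print("Starting work...", a1, a2)
    await asyncio.sleep(1)
    print("Finishing work...")

async def main():
    # Worker is the class from the answer above; creating it here means its
    # Semaphore (and Queue) are bound to the loop that asyncio.run started.
    W = Worker(work, n=3)
    W.put(1, 2)
    W.put(3, 4)
    await W.run()

asyncio.run(main())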

How to update the Wxpython statusbar with run-time data streaming in another *.py file

My project has many Python files. The problem is that when I click a button in the main UI, it invokes a function in another Python file (a sub-program), and I need the running status from that sub-program to be updated in the main UI as well. How do I accomplish this?
The only approach I can think of so far is using a socket, but I would like to know whether you have any other good ideas.
The code looks like this:
Main UI:
import wx, time
from threading import Thread
from path import basicTest

EVT_RESULT_ID = wx.NewId()

def EVT_RESULT(win, func):
    win.Connect(-1, -1, EVT_RESULT_ID, func)

class ResultEvent(wx.PyEvent):
    def __init__(self, data):
        wx.PyEvent.__init__(self)
        self.SetEventType(EVT_RESULT_ID)
        self.data = data

class TestThread(Thread):
    def __init__(self, wxObject):
        Thread.__init__(self)
        self.wxObject = wxObject
        self.start()

    def run(self):
        # This commented-out block runs fine, but what I want is to replace it
        # with a call into another python file:
        '''
        for i in range(6):
            time.sleep(1)
            wx.PostEvent(self.wxObject, ResultEvent(i))
        '''
        data = basicTest().run(10)
        wx.PostEvent(self.wxObject, ResultEvent(data))

class MyForm(wx.Frame):
    def __init__(self):
        wx.Frame.__init__(self, None, wx.ID_ANY, "Tester")
        panel = wx.Panel(self, wx.ID_ANY)
        self.btn = wx.Button(panel, label="Start Test")
        self.statusbar = self.CreateStatusBar()
        self.btn.Bind(wx.EVT_BUTTON, self.onButton)
        EVT_RESULT(self, self.updateStatus)

    def onButton(self, event):
        TestThread(self)
        btn = event.GetEventObject()
        btn.Disable()

    def updateStatus(self, msg):
        t = msg.data
        self.statusbar.SetStatusText("Sequence %i running.." % t)
        self.btn.Enable()

if __name__ == "__main__":
    app = wx.PySimpleApp()
    frame = MyForm().Show()
    app.MainLoop()
Sub-script:
import time

class basicTest():
    def run(self, inter):
        for i in range(inter):
            return i
            time.sleep(1)
As listed above. I also know that the main UI is only updated when the sub-script finishes; it is not refreshed in real time from the sub-script. Can anyone help me? Much appreciated.
I would just leave the long-running code in its thread and have wxPython start the thread. In your thread's run method, use one of wxPython's thread-safe methods to call your UI; I would recommend wx.CallAfter or wx.PostEvent.
Once you have that done, you just execute the necessary method in your main UI.
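A minimal sketch of that idea, reusing the ResultEvent/EVT_RESULT machinery from the question; the callback parameter added to basicTest.run is an assumption for illustration, so the sub-script can report progress without knowing anything about wx:

import time
import wx
from threading import Thread

# Sub-script: report each step through a plain callback instead of returning once.
class basicTest():
    def run(self, inter, callback=None):
        for i in range(inter):
            if callback is not None:
                callback(i)          # hand progress back to the caller
            time.sleep(1)

# In the main UI file: post an event (or use wx.CallAfter) for every step,
# so the status bar refreshes while the sub-script is still running.
class TestThread(Thread):
    def __init__(self, wxObject):
        Thread.__init__(self)
        self.wxObject = wxObject
        self.start()

    def run(self):
        basicTest().run(
            10,
            callback=lambda i: wx.PostEvent(self.wxObject, ResultEvent(i)))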

Celery on Windows without MQ

I have an app running on Linux using Celery/RabbitMQ, and the setup works fine.
But now I have to develop some new features, and the development machine I have available is Windows 7 with no admin rights, so I can't install a RabbitMQ or Redis server.
Is there some way to emulate this so I can run the app (Flask, by the way) on my Windows machine?
Thanks in advance!
You can set CELERY_ALWAYS_EAGER:
If this is True, all tasks will be executed locally by blocking until
the task returns. apply_async() and Task.delay() will return an
EagerResult instance, which emulates the API and behavior of
AsyncResult, except the result is already evaluated.
That is, tasks will be executed locally instead of being sent to the
queue.
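A minimal sketch of how that might look (the setting is spelled CELERY_ALWAYS_EAGER in older Celery versions; newer versions call it task_always_eager):

from celery import Celery

celery = Celery('myapp', broker='memory://')

# Run tasks synchronously in the calling process, so no broker is needed.
celery.conf.update(
    CELERY_ALWAYS_EAGER=True,
    CELERY_EAGER_PROPAGATES_EXCEPTIONS=True,  # re-raise task errors locally
)

@celery.task
def add(a, b):
    return a + b

result = add.delay(1, 2)   # returns an EagerResult
print(result.get())        # 3, already evaluated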
I don't like answering my own question, but if someone finds this while looking for a way to emulate Celery when it is not available, here is what I did:
Since I'm using Flask, I already had a modified Celery class to share the app context; now I have a second make_celery function to be used when I need the emulation.
from celery import Celery  # needed for the production branch below

if usefake == True:
    # This is a fake task! The actual task will run in the same thread as the caller.
    def make_celery(app):
        class FakeCelery(object):
            def task(self, func):
                class FakeTask(object):
                    def __init__(self, f):
                        self.f = f

                    def delay(self, *args, **kwargs):
                        return self.f(*args, **kwargs)

                return FakeTask(func)

        return FakeCelery()
else:
    # This is the production maker, which shares the app context with the task.
    def make_celery(app):
        celery = Celery(app.import_name, broker=app.config['CELERY_BROKER_URL'])
        celery.conf.update(app.config)
        TaskBase = celery.Task

        class ContextTask(TaskBase):
            abstract = True

            def __call__(self, *args, **kwargs):
                with app.app_context():
                    return TaskBase.__call__(self, *args, **kwargs)

        celery.Task = ContextTask
        return celery

celery = make_celery(app)
Both have the same usage:
@celery.task
def mytask(a, b):
    return a+b

mytask.delay(1, 2)
As you can see, I only ever use delay, so the other task functions like get are not implemented, but they could be.
I don't think this is a nice solution, but it solved my problem for now. Any other solution is welcome.
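For completeness, a sketch of how those missing pieces could be filled in. FakeResult is a hypothetical helper, not part of the original answer, and note that here delay wraps the return value so .get() works, unlike the original FakeTask which returned the raw result:

class FakeResult(object):
    """Mimics just enough of AsyncResult for eager, in-thread execution."""
    def __init__(self, value):
        self._value = value

    def get(self, timeout=None):
        return self._value   # already evaluated, timeout is ignored

    def ready(self):
        return True

class FakeTask(object):
    def __init__(self, f):
        self.f = f

    def delay(self, *args, **kwargs):
        return FakeResult(self.f(*args, **kwargs))

    def apply_async(self, args=(), kwargs=None):
        return FakeResult(self.f(*args, **(kwargs or {})))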

How to run REP and PUB in single instance non-blocking Pyzmq

My object:
class mysrv(object):
    def __init__(self):
        self._pubsocket = context.socket(zmq.PUB)
        self._socket = context.socket(zmq.REP)
        self._socket.bind("tcp://127.0.0.1:9003")
        self._pubsocket.bind("tcp://127.0.0.1:9004")
Then I run both functions:
def main():
    s = mysrv()
    Process(target=s.publoop()).start()
    Process(target=s.reqrep()).start()
The first function blocks the second. Is there any way to run them from a single class instance?
The answer is that a `while` loop blocks everything, even when sleep is called inside it.
So running several `while` loops in the same process, one after another without breaks, is blocking.
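A sketch of one way around this, assuming the goal is simply to keep both loops running from a single instance: give each loop its own thread so neither `while` loop starves the other, and pass the method itself (target=s.publoop), not its result (target=s.publoop()), as the target. The publoop and reqrep bodies below are illustrative assumptions, since the original question does not show them:

import threading
import time
import zmq

context = zmq.Context()

class mysrv(object):
    def __init__(self):
        self._pubsocket = context.socket(zmq.PUB)
        self._socket = context.socket(zmq.REP)
        self._socket.bind("tcp://127.0.0.1:9003")
        self._pubsocket.bind("tcp://127.0.0.1:9004")

    def publoop(self):
        # Illustrative body: publish a heartbeat once a second.
        while True:
            self._pubsocket.send_string("heartbeat")
            time.sleep(1)

    def reqrep(self):
        # Illustrative body: echo every request back to the client.
        while True:
            msg = self._socket.recv()
            self._socket.send(msg)

def main():
    s = mysrv()
    # Each loop gets its own thread (and its own socket), so neither
    # while loop blocks the other.
    threading.Thread(target=s.publoop, daemon=True).start()
    threading.Thread(target=s.reqrep, daemon=True).start()

if __name__ == '__main__':
    main()
    while True:
        time.sleep(1)   # keep the main thread alive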

Pyside qwebview custom protocol

I'd like to support a custom protocol inside my PySide app, but so far without success. This is what I tried:
class MainWindow(QWebView):
    def __init__(self, parent=None):
        oldManager = self.page().networkAccessManager()
        self.page().setNetworkAccessManager(NetworkAccessManager(self, oldManager))

# in another file
class NetworkAccessManager(QNetworkAccessManager):
    def __init__(self, parent, oldManager):
        QNetworkAccessManager.__init__(self)
        self.oldManager = oldManager
        self.setCache(oldManager.cache())
        self.setCookieJar(oldManager.cookieJar())
        self.setProxy(oldManager.proxy())
        self.setProxyFactory(oldManager.proxyFactory())
        print('There')

    def createRequest(self, operation, request, data):
        print('And there')
This results in a segmentation fault under Windows. I saw this:
It is currently not supported to change the network access manager after the PySide.QtWebKit.QWebPage has used it.
But I don't see where it would have been used in this case. I tried setting a web page object after setting the network access manager, and the segmentation fault disappeared.
PS: none of the print statements are displayed in the console.
If createRequest doesn't return a reply, it crashes. So the complete solution is:
class MainWindow(QWebView):
    def __init__(self, parent=None):
        oldManager = self.page().networkAccessManager()
        self.setPage(DebugWebPage())  # if you want to set a custom page
        self.page().setNetworkAccessManager(NetworkAccessManager(self))

class NetworkAccessManager(QNetworkAccessManager):
    def __init__(self, parent):
        QNetworkAccessManager.__init__(self)

    def createRequest(self, operation, request, data):
        if request.url().scheme() != 'page':
            return QNetworkAccessManager.createRequest(self, operation, request, data)
        if operation == self.GetOperation:
            # Handle page:// URLs separately by creating custom
            # QNetworkReply objects.
            reply = PageReply(self, request.url(), self.GetOperation)
            print('here')
            return reply
        else:
            return QNetworkAccessManager.createRequest(self, operation, request, data)

class PageReply(QNetworkReply):
    def __init__(self, parent, url, operation):
        QNetworkReply.__init__(self, parent)
        self.content = '<html><head><title>Test</title></head><body>This is a test.</body></html>'
        self.offset = 0
        self.setHeader(QNetworkRequest.ContentTypeHeader, 'text/html; charset=utf-8')
        self.setHeader(QNetworkRequest.ContentLengthHeader, len(self.content))
        QTimer.singleShot(0, self, SIGNAL('readyRead()'))
        QTimer.singleShot(0, self, SIGNAL('finished()'))
        self.open(self.ReadOnly | self.Unbuffered)
        self.setUrl(url)

    def abort(self):
        pass

    def bytesAvailable(self):
        return len(self.content) - self.offset + QNetworkReply.bytesAvailable(self)

    def isSequential(self):
        return True

    def readData(self, maxSize):
        if self.offset < len(self.content):
            end = min(self.offset + maxSize, len(self.content))
            data = self.content[self.offset:end]
            self.offset = end
            return data
Note: I don't really know why, but any error raised while the script is inside the network access manager or the reply results in a segmentation fault.
Based on this, with some corrections.
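A minimal usage sketch (the application scaffolding below is an assumption for illustration; it installs the NetworkAccessManager from the snippet above on the default page before anything is loaded, then requests a page:// URL that PageReply serves):

import sys
from PySide.QtGui import QApplication
from PySide.QtCore import QUrl
from PySide.QtWebKit import QWebView

app = QApplication(sys.argv)
view = QWebView()
# Install the custom manager before the page has used the default one.
view.page().setNetworkAccessManager(NetworkAccessManager(view))
view.load(QUrl('page://anything'))   # served by PageReply above
view.show()
sys.exit(app.exec_())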
