PyTorch Distributed SocketTimeout error on Windows - windows

i've developed this little POC using pytorch distributed package: essentially a Trainer spawns N processes and orchestrate them using python Pipes (it could also be Queues). Normally it should send data at every epoch, but in this POC the data is just sent once on process creation. The processes train a model through DDP.
import os
import signal
import socket
from contextlib import closing
from multiprocessing.connection import Connection, Pipe
from typing import List
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.nn.parallel import DistributedDataParallel as DDP
def init_process(rank, world_size, ddp_free_port, recv, train_data):
"""Initialize the distributed environment."""
torch.set_num_threads(1)
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = ddp_free_port
os.environ["RANK"] = str(rank)
os.environ["LOCAL_RANK"] = str(rank)
os.environ["WORLD_SIZE"] = str(world_size)
os.environ["NODE_RANK"] = "0"
dist.init_process_group("gloo", init_method=f"tcp://localhost:{ddp_free_port}", rank=rank, world_size=world_size)
Worker(recv, train_data).train()
class Worker:
def __init__(self, queue, train_dset):
self.rank = dist.get_rank()
self.world_size = dist.get_world_size()
self.queue: Connection = queue
self.train_dset = train_dset
self.model = torch.nn.Sequential(nn.Linear(784, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10))
self.model = DDP(self.model)
self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
def train(self):
loss_fn = nn.CrossEntropyLoss()
sampler = torch.utils.data.distributed.DistributedSampler(
self.train_dset, num_replicas=self.world_size, rank=self.rank, shuffle=True
)
train_loader = torch.utils.data.DataLoader(self.train_dset, sampler=sampler, batch_size=32)
while True:
epoch = self.queue.recv()
if epoch is False:
print(f"Rank-{self.rank} done!")
return
total_loss = 0
sampler.set_epoch(epoch)
for i, batch in enumerate(train_loader):
images, labels = batch
out = self.model(images.view(-1, 28 * 28))
loss = loss_fn(out, labels)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
total_loss += loss.item()
dist.barrier()
if self.rank == 0:
print(f"Epoch: {epoch}, Loss#rank-{self.rank}: {total_loss / len(train_loader):.4f}")
print(f"Rank-0 is telling the trainer that everything is done for the epoch {epoch}")
self.queue.send(True)
class Trainer:
def __init__(self, world_size: int, epochs: int = 5) -> None:
self.world_size = world_size
self.epochs = epochs
self.train_data = torchvision.datasets.MNIST(
"/tmp/data",
train=True,
download=True,
transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor()]),
)
self.test_data = torchvision.datasets.MNIST(
"/tmp/data",
train=False,
download=True,
transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor()]),
)
self.ddp_free_port = str(find_free_port())
def run(self):
"""Run the distributed environment."""
print("Start training")
queues = []
processes = []
for rank in range(self.world_size):
if rank == 0:
recv, send = Pipe(duplex=True)
else:
recv, send = Pipe(duplex=False)
p = mp.Process(
target=init_process,
args=(rank, self.world_size, self.ddp_free_port, recv, self.train_data),
daemon=True,
)
p.start()
queues.append(send)
processes.append(p.pid)
self.train(queues, processes)
def train(self, queues, processes):
for epoch in range(self.epochs):
for rank in range(self.world_size):
queues[rank].send(epoch)
print("Training waiting for rank-0")
queues[0].recv()
for rank in range(self.world_size):
queues[rank].send(False)
queues[rank].close()
os.kill(processes[rank], signal.SIGTERM)
def find_free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(("", 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
return s.getsockname()[1]
if __name__ == "__main__":
os.environ["LOGLEVEL"] = "DEBUG"
mp.set_start_method("spawn")
trainer = Trainer(world_size=16)
trainer.run()
print("Finished training")
I receive the following error, for every process spawned, randomly, if i increase the number of processes from 16 to 32 for example:
...
Process Process-1:
Traceback (most recent call last):
File "C:\Program Files (x86)\Python38\lib\multiprocessing\process.py", line 315, in _bootstrap
self.run()
File "C:\Program Files (x86)\Python38\lib\multiprocessing\process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "c:\Users\belof\Desktop\temp\examples\ddp_cpu.py", line 27, in init_process
dist.init_process_group("gloo", init_method=f"tcp://localhost:{ddp_free_port}", rank=rank, world_size=world_size)
File "C:\Users\belof\Desktop\temp\.venv\lib\site-packages\torch\distributed\distributed_c10d.py", line 602, in init_process_group
default_pg = _new_process_group_helper(
File "C:\Users\belof\Desktop\temp\.venv\lib\site-packages\torch\distributed\distributed_c10d.py", line 703, in _new_process_group_helper
pg = ProcessGroupGloo(prefix_store, rank, world_size, timeout=timeout)
RuntimeError: Socket Timeout
Traceback (most recent call last):
File "C:\Program Files (x86)\Python38\lib\multiprocessing\connection.py", line 312, in _recv_bytes
nread, err = ov.GetOverlappedResult(True)
BrokenPipeError: [WinError 109] The pipe has been ended
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:/Users/belof/Desktop/temp/examples/ddp_cpu.py", line 131, in <module>
trainer.run()
File "c:/Users/belof/Desktop/temp/examples/ddp_cpu.py", line 106, in run
self.train(queues, processes)
File "c:/Users/belof/Desktop/temp/examples/ddp_cpu.py", line 113, in train
queues[0].recv()
File "C:\Program Files (x86)\Python38\lib\multiprocessing\connection.py", line 250, in recv
buf = self._recv_bytes()
File "C:\Program Files (x86)\Python38\lib\multiprocessing\connection.py", line 321, in _recv_bytes
raise EOFError
EOFError
It seems to me something related to windows spawn method and the queue references passed to the processes, but i don't really know what is happening here.
This is the result of the collect_env.py script:
Collecting environment information...
PyTorch version: 1.12.1+cpu
Is debug build: False
CUDA used to build PyTorch: None
ROCM used to build PyTorch: N/A
OS: Microsoft Windows 10 Pro
GCC version: Could not collect
Clang version: Could not collect
CMake version: Could not collect
Libc version: N/A
Python version: 3.8.8 (tags/v3.8.8:024d805, Feb 19 2021, 13:18:16) [MSC v.1928 64 bit (AMD64)] (64-bit runtime)
Python platform: Windows-10-10.0.19041-SP0
Is CUDA available: False
CUDA runtime version: No CUDA
GPU models and configuration: No CUDA
Nvidia driver version: No CUDA
cuDNN version: No CUDA
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
Versions of relevant libraries:
[pip3] mypy==0.931
[pip3] mypy-extensions==0.4.3
[pip3] numpy==1.23.3
[pip3] pytorch-lightning==1.6.4
[pip3] torch==1.12.1
[pip3] torchmetrics==0.9.3
[pip3] torchvision==0.12.0
[conda] Could not collect

Related

sys:1: RuntimeWarning: coroutine 'AirthingsWaveDetect.get_sensors' was never awaited

I am a beginner trying to get the data from airthings plus to hubitat using a raspberry pi. When running a python script I get this error message :
readwaveplus.py:391: RuntimeWarning: coroutine 'AirthingsWaveDetect.get_sensor_data' was never awaited
data = airthingsdetect.get_sensor_data()[MAC]
RuntimeWarning: Enable tracemalloc to get the object allocation traceback
Read Error
^CTraceback (most recent call last):
File "readwaveplus.py", line 414, in <module>
time.sleep(SamplePeriod)
KeyboardInterrupt
sys:1: RuntimeWarning: coroutine 'AirthingsWaveDetect.get_sensors' was never awaited
It seems I need to add "await" somewhere but I am not able to figure out where. I tried to add await but not sure where It should be added.
Here is the script :
async def main():
logging.basicConfig()
_LOGGER.setLevel(logging.DEBUG)
ad = AirthingsWaveDetect(0)
num_dev_found = await ad.find_devices()
if num_dev_found > 0:
devices = await ad.get_info()
for mac, dev in devices.items():
_LOGGER.info("Device: {}: {}".format(mac, dev))
devices_sensors = await ad.get_sensors()
for mac, sensors in devices_sensors.items():
for sensor in sensors:
_LOGGER.info("Sensor: {}: {}".format(mac, sensor))
sensordata = await ad.get_sensor_data()
for mac, data in sensordata.items():
for name, val in data.items():
_LOGGER.info("Sensor data: {}: {}: {}".format(mac, name, val))
if __name__ == "__main__":
asyncio.run(main())
import time
import requests
# The period between two measurements in seconds (Default: 300)
SamplePeriod = 300
MAC = 'd8:71:4d:ca:de:dc'
# The hostname or IP address of the MQTT broker Hubitat hub to connect
makerAPIHostname = "192.168.1.38"
makerAPIAppID = "2416"
makerAPIToken = "8ca5e2ac-7d0b-4c24-a89b-f6022844e2ae"
makerAPIDeviceID = "AirThings Wave Plus"
from airthings import AirthingsWaveDetect
scan_interval = 120
airthingsdetect = AirthingsWaveDetect(scan_interval, MAC)
V_MAX = 3.0
V_MIN = 2.0
#---- Initialize ----#
URL = "http://{}/apps/api/{}/devices/{}/{}?access_token={}".format(makerAPIHostname, makerAPIAppID, makerAPIDeviceID, '{}/{}', makerAPIToken)
devices_sensors = airthingsdetect.get_sensors()
while True:
try:
data = airthingsdetect.get_sensor_data()[MAC]
except:
print( 'Read Error' )
pass
else:
sensorData = "{},{},{},{},{},{},{},{},{}".format(
data['temperature'],
data['humidity'],
data['rel_atm_pressure'],
data['co2'],
data['voc'],
round(data['radon_1day_avg']/37.,2),
round(data['radon_longterm_avg']/37.,2),
max(0, min(100, round( (data['battery']-V_MIN)/(V_MAX-V_MIN)*100))),
round(data['illuminance'])
)
#print( sensorData )
try:
request = URL.format('setValues', sensorData)
requests.get(request)
except:
pass
finally:
time.sleep(SamplePeriod)
Any idea ? Thanks.
Async needs an await but I don't know where. Thanks.

Serial Async Acquisition

I'ven't understood well the asyncio functionaly and I need a clarification.
I've checked that this code works:
from asyncio import get_event_loop
from serial_asyncio import open_serial_connection
async def main():
reader, writer = await open_serial_connection(url='COM10', baudrate=115200)
while True:
line = await reader.readline()
print(str(line, 'utf-8'))
asyncio.run(main())
basically the connection is established and I see messages coming from the serial port but it is not what I was expecting.
I'd like to start the acquisition from the serial and at the same time I'd like to go ahead doing other stuffs in my code.
Basically something like this one in scapy:
t = AsyncSniffer()
t.start()
print("Acquisition started")
wait (10)
t.stop()
I've also tried using the Thread option like this:
import serial
import threading
global stop_threads
import time
stop_threads=False
def thread_function_1():
ser = serial.Serial('COM1',115200)
while thread_function_1():
line=ser.readline()
if line:
string=line.decode()
print(string)
if stop_threads==true:
ser.close()
break
print ('print_1')
x= threading.Thread(name = "thread_function_1", target = thread_function_1)
x.start()
time.sleep(100)
stop_threads = True
print ('print_2')
x.join()
I'm getting this error which I cannot solve:
Traceback (most recent call last):
File "C:\Users\SIM\AppData\Local\Programs\Python\Python38\lib\threading.py", line 932, in _bootstrap_inner
self.run()
File "C:\Users\SIM\AppData\Local\Programs\Python\Python38\lib\threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "C:\python\connessione_seriale.py", line 31, in thread_function_1
while thread_function_1():
File "C:\python\connessione_seriale.py", line 14, in thread_function_1
ser = serial.Serial('COM1',115200)
File "C:\Users\SIM\AppData\Local\Programs\Python\Python38\lib\site-packages\serial\serialwin32.py", line 33, in __init__
super(Serial, self).__init__(*args, **kwargs)
File "C:\Users\SIM\AppData\Local\Programs\Python\Python38\lib\site-packages\serial\serialutil.py", line 244, in __init__
self.open()
File "C:\Users\SIM\AppData\Local\Programs\Python\Python38\lib\site-packages\serial\serialwin32.py", line 64, in open
raise SerialException("could not open port {!r}: {!r}".format(self.portstr, ctypes.WinError()))
serial.serialutil.SerialException: could not open port 'COM1': PermissionError(13, 'Access is denied.', None, 5)
the COM1 is available.I've tried to open it via putty and it works.
this is the solution I've found. I've used a thread inside a class in order to start/stop the serial acquisition when I need it.
class ThreadTest:
def seriale(self, n):
ser = serial.Serial('COM9', 115200, timeout=1)
time.sleep(2)
with open('C:/python/test.txt', 'w') as output:
if self._running==False:
ser.close()
while self._running and n > 0:
line = ser.readline()
#print (line)
n -= 1
if line:
# Converting Byte Strings into unicode strings
string = line.decode()
#print (string)
output.write(string)
def terminate(self):
self._running = False
def __init__(self):
self._running = True
c = ThreadTest()
t1 = Thread(target=c.seriale, args=(10,)) #10 are the seconds of the acquisition via serial and can be replaced
t1.start() # start of the serial acquisition
time.sleep(10)# here the sleep can be replaced with code
c.terminate()

When streaming tweets with Python Twitter I get the error message: OpenSSL.SSL.SysCallError: (104, 'ECONNRESET')

I got this error where I have no clue where it come from and where to start to debug. I do not understand this error at all. I'll just post my code here. I utilized multithreading in my code to periodically save the tweets on separate files. My program will work fine for maybe over 10 days, and then throw me this error message. Maybe twitter reset my connection?
import twitter
import json
import time
from tqdm import tqdm
import string
from datetime import timedelta, datetime
import threading
import os
consumer_key = '*'
consumer_secret = '*'
access_token_key = '*'
access_token_secret = '*'
api = twitter.Api(consumer_key = consumer_key,\
consumer_secret = consumer_secret,\
access_token_key = access_token_key,\
access_token_secret = access_token_secret)
os.chdir('*/social_media/streamed_tweets')
#multiThreading class and functions starts here
class timerThread(threading.Thread):
def __init__(self, threadID, name, stoptime, stopevent): #remember to change args accordingly
threading.Thread.__init__(self)
self.threadID = threadID
self.name = name
self.stoptime = stoptime
self.stopevent = stopevent
def run(self):
print("Starting " + self.name)
twiTimer(self.stoptime, self.stopevent)
print("Exiting " + self.name)
class streamerThread(threading.Thread):
def __init__(self, threadID, name, keywords, stop): #remember to change args accordingly
threading.Thread.__init__(self)
self.threadID = threadID
self.name = name
self.keywords = keywords
self.stop = stop
def run(self):
print("Starting " + self.name)
twiStream(self.keywords, self.stop)
print("Exiting " + self.name)
def twiTimer(stop_time, stop_event):
while True:
now1 = datetime.now()
t = str(now1.timetz())
if t[:8] == stop_time: #make it t[:8] for daily loop
stop_event.set()
print('stop triggered')
time.sleep(1) #wait to pass the '00' time
time.sleep(0.1)
def twiStream(keywords, stop_event):
while True:
stream = api.GetStreamFilter(track = keywords, languages=['en'], filter_level = None)
now2 = datetime.now()
filename = str(now2.date())+'.txt' #change this to date() for daily loop
f = open(filename, 'w+')
stop_event.clear() #reset the event
print(now2)
with tqdm() as pbar:
while not stop_event.is_set():
counter = 2
for tweet in stream:
if counter <= 0:
break
f.write(json.dumps(tweet))
f.write('\n')
pbar.update()
counter -= 1
f.close()
time.sleep(0.5)
#multiThreading class and functions ends here
def main():
keywords = ['*']
stop = threading.Event()
stop_at = '00:00:00' #make it '00:00:00' for daily loop
#count = 10
thread1 = timerThread(1, 'timerThread', stop_at, stop)
thread2 = streamerThread(2, 'streamerThread', keywords, stop)
thread1.start()
thread2.start()
print("Exiting Main Thread")
if __name__ == "__main__":
main()
Here is the full error message:
Exception in thread streamerThread:
Traceback (most recent call last):
File "/home/abe/anaconda3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py", line 285, in recv_into
return self.connection.recv_into(*args, **kwargs)
File "/home/abe/anaconda3/lib/python3.6/site-packages/OpenSSL/SSL.py", line 1814, in recv_into
self._raise_ssl_error(self._ssl, result)
File "/home/abe/anaconda3/lib/python3.6/site-packages/OpenSSL/SSL.py", line 1631, in _raise_ssl_error
raise SysCallError(errno, errorcode.get(errno))
OpenSSL.SSL.SysCallError: (104, 'ECONNRESET')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/abe/anaconda3/lib/python3.6/site-packages/urllib3/response.py", line 331, in _error_catcher
yield
File "/home/abe/anaconda3/lib/python3.6/site-packages/urllib3/response.py", line 637, in read_chunked
self._update_chunk_length()
File "/home/abe/anaconda3/lib/python3.6/site-packages/urllib3/response.py", line 569, in _update_chunk_length
line = self._fp.fp.readline()
File "/home/abe/anaconda3/lib/python3.6/socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "/home/abe/anaconda3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py", line 290, in recv_into
raise SocketError(str(e))
OSError: (104, 'ECONNRESET')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/abe/anaconda3/lib/python3.6/site-packages/requests/models.py", line 749, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "/home/abe/anaconda3/lib/python3.6/site-packages/urllib3/response.py", line 461, in stream
for line in self.read_chunked(amt, decode_content=decode_content):
File "/home/abe/anaconda3/lib/python3.6/site-packages/urllib3/response.py", line 665, in read_chunked
self._original_response.close()
File "/home/abe/anaconda3/lib/python3.6/contextlib.py", line 99, in __exit__
self.gen.throw(type, value, traceback)
File "/home/abe/anaconda3/lib/python3.6/site-packages/urllib3/response.py", line 349, in _error_catcher
raise ProtocolError('Connection broken: %r' % e, e)
urllib3.exceptions.ProtocolError: ('Connection broken: OSError("(104, \'ECONNRESET\')",)', OSError("(104, 'ECONNRESET')",))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/abe/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "streamer.py", line 48, in run
twiStream(self.keywords, self.stop)
File "streamer.py", line 72, in twiStream
for tweet in stream:
File "/home/abe/anaconda3/lib/python3.6/site-packages/twitter/api.py", line 4575, in GetStreamFilter
for line in resp.iter_lines():
File "/home/abe/anaconda3/lib/python3.6/site-packages/requests/models.py", line 793, in iter_lines
for chunk in self.iter_content(chunk_size=chunk_size, decode_unicode=decode_unicode):
File "/home/abe/anaconda3/lib/python3.6/site-packages/requests/models.py", line 752, in generate
raise ChunkedEncodingError(e)
requests.exceptions.ChunkedEncodingError: ('Connection broken: OSError("(104, \'ECONNRESET\')",)', OSError("(104, 'ECONNRESET')",))

WinError 10038 an operaton was attempted on something that is not a socket

I'm trying to make chatting server in Python and I can't solve it. I'm running my code in CMD by using command python client.py localhost 9009.
this is the code that I am using:
#chat_client.py
import sys
import socket
import select
def chat_client():
if(len(sys.argv) < 3):
print("Usage: python chat_client.py hostname port")
sys.exit()
host = sys.argv[1]
port = int(sys.argv[2])
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(2)
# connect to remote host
try:
s.connect((host, port))
except:
print("Unable to connect")
sys.exit()
print("Connected to remote host. You can start sending messages")
sys.stdout.write("[Me] "); sys.stdout.flush()
while 1:
socket_list = [sys.stdin, s]
# Get the list sockets which are readable
read_sockets, write_sockets, error_sockets = select.select(socket_list , [], [])
for sock in read_sockets:
if sock == s:
# incoming message from remote server, s
data = sock.recv(4096)
if not data:
print("\nDisconnected from chat server")
sys.exit()
else:
#print data
sys.stdout.write(data)
sys.stdout.write("[Me] "); sys.stdout.flush()
else:
# user entered a message
msg = sys.stdin.readline()
s.send(msg)
sys.stdout.write("[Me] "); sys.stdout.flush()
if __name__ == "__main__":
sys.exit(chat_client())
And this is the error that I'm getting:
I don't have a clue how to fix it. Help would be appreciated! :)
[Me] Traceback (most recent call last):
File "client.py", line 54, in <module>
sys.exit(chat_client())
File "client.py", line 32, in chat_client
read_sockets, write_sockets, error_sockets = select.select(socket_list , [],
[])

Python 3.4 Multiprocess throws TypeError("cannot serialize '_io.BufferedReader' object",)

Recently I wrote a multiprocess code in Python 3.4 to download some images, it's working blazingly fast at first, then I get the following error and cannot start the program anymore.
Traceback (most recent call last):
File "multiprocessing_d.py", line 23, in <module>
main()
File "multiprocessing_d.py", line 16, in main
p.map(download, lines)
File "/usr/local/lib/python3.4/multiprocessing/pool.py", line 260, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/local/lib/python3.4/multiprocessing/pool.py", line 608, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x7f1e047f32e8>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'
My code is as following
download_helper.py
import sys
import os
from pathlib import Path
url_prefix = r"Some prefix"
def setup_download_dir(dictionary):
download_dir = Path(dictionary)
if not download_dir.exists():
download_dir.mkdir()
return dictionary
def download_link(dictionary, line):
from urllib.request import urlretrieve
itemid = line.split()[0].decode()
link = line.split()[1].decode()
if (link.startswith("http")):
image_url = link
else:
image_url = url_prefix + link
if os.path.isfile(dictionary + "/" + itemid + ".jpg"):
#print("Already have " + itemid + ".jpg")
pass
else:
urlretrieve(image_url, dictionary + "/" + itemid + ".jpg")
multiprocessing_d.py
from functools import partial
from multiprocessing.pool import Pool
import sys
from time import time
from download_helper import setup_download_dir, download_link
def main():
file_path = sys.argv[1]
dic_path = sys.argv[2]
download_dir = setup_download_dir(dic_path)
download = partial(download_link, download_dir)
with open(file_path, 'rb') as f:
lines = f.readlines()
ts = time()
p = Pool(processes=16, maxtasksperchild=1)
p.map(download, lines)
p.close()
p.join()
print('Took {}s'.format(time() - ts))
f.close()
if __name__ == "__main__":
main()
I've tried to search online but didn't find much information useful. My suspect is that there might be some exception raised in urlretrieve, but I don't know how to debug it. Any comments or suggestions would be appreciated!!
James

Resources