Pytorch dataloader with iterable dataset stops after one epoch in multiprocessing mode - multiprocessing

I have a dataloader that is initialised with a iterable dataset. I found that when I use multiprocessing (i.e. num_workers>0 in DataLoader) in dataloader, once the dataloader is exhausted after one epoch, it doesn't get reset automatically when I iterate it again in the second epoch. Below is a small reproducible example.
I am aware that "Workers are shut down once the end of the iteration is reached," according to the documentation. However, I would like to know how to achieve my expected behaviour of "resetting automatically". Thanks for any help in advance!
import torch
class MyIterableDataset(torch.utils.data.IterableDataset):
def __init__(self, start, end):
super().__init__()
self.start = start
self.end = end
def __iter__(self):
return iter(range(self.start, self.end))
dataset = MyIterableDataset(0, 4)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=False, num_workers=1, drop_last=False)
for epoch in range(2):
for i, data in enumerate(dataloader):
print(i, data)
"""
stdout:
0 tensor([0, 1])
1 tensor([2, 3])
2 _IterableDatasetStopIteration(worker_id=0)
"""
While my expectation of stdout is
"""
0 tensor([0, 1])
1 tensor([2, 3])
0 tensor([0, 1])
1 tensor([2, 3])
"""
I am using the latest pytorch version (1.6.0)

Related

How to update training dataset at epoch begin in Huggingface Trainer using Callback?

I want to recreate the training dataset by a function generate_custom_train_set at the beginning of every epoch, however, is there a way I could do it with Trainer using callback?
My trainer looks like
trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset.,
eval_dataset=validation_dataset,
tokenizer=tokenizer,
)
I'm having the same question as I try to implement Examples-proportional mixing from the T5 paper. I didn't find support from hugging face.
My current solution is to modify the trainer.train_dataset in the on_epoch_begin callback.
Here's an implementation. I'm using this in my own project. Seems to work.
First, implement your per-epoch change in your Dataset, in my case, it's the sample function for Examples-Proportional Mixing.
class ProportionMixingDataset:
"""
Examples-proportional mixing from T5
TODO: failed to find a pytorch working implementation
Equivalent to, for the larger datasets, a new subset is taken at each epoch,
then sample in the joined subset once
"""
def __init__(self, dataset_list: List[Dataset] = None, k: int = None):
"""
:param dataset_list: Ordered list of datasets
:param k: Artificial limit
"""
self.dsets = dataset_list
assert k is not None
self.k = k
self.dset_szs = [min(len(d), k) for d in self.dsets]
self.sz = sum(self.dset_szs)
self._sampled_idxs: List[Optional[torch.Tensor]] = [None] * len(self.dsets)
self.sample()
def sample(self):
"""
Sub-sample datasets larger than k
Intended to call in each epoch
"""
for i, dset in enumerate(self.dsets):
sz = len(dset)
if sz > self.k:
self._sampled_idxs[i] = torch.randperm(sz)[:self.k]
def __len__(self):
return self.sz
def _idx2dset_idx(self, idx: int) -> Tuple[int, int]:
"""
Convert a global index to a dataset index
"""
for i, sz in enumerate(self.dset_szs):
if idx < sz:
return i, idx
idx -= sz
raise ValueError('Should not happen')
def __getitem__(self, idx):
if not isinstance(idx, int):
raise ValueError('Batched indexing not supported')
idx_dset, idx = self._idx2dset_idx(idx)
dset = self.dsets[idx_dset]
if self._sampled_idxs[idx_dset] is not None: # A sub-sample index
idx = self._sampled_idxs[idx_dset][idx].item()
return dset[idx]
Then pass that dataset to Trainer.
Now comes the magic part:
class ProportionalMixCallback(TrainerCallback):
"""
Trigger re-computing subset for dataset Examples-proportional mixing, see `dataset::ProportionMixingDataset`
A hack that modifies the train dataset, pointed by Trainer's dataloader
"""
def __init__(self, trainer: Trainer):
self.trainer = trainer
def on_epoch_begin(self, args: TrainingArguments, state, control, **kwargs):
self.trainer.train_dataset.sample()
Pass this to your trainer as a callback.
This triggers the sample call which modifies the dataset at the times we need it.
This works becasue train_dataLoader in trainer still points to the same train dataset object.

Change all images in training set

I have a convolutional neural network. And I wanted to train it on images from the training set but first they should be wrapped with my function change(tensor, float) that takes in a tensor/image of the form [hight,width,3] and a float.
Batch size =4
loading data
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
shuffle=True, num_workers=2)
Cnn architecture
for epoch in range(2): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
#size of inputs [4,3,32,32]
#size of labels [4]
inputs = change(inputs,0.1) <----------------------------
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs) #[4, 10]
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
running_loss = 0.0
print('Finished Training')
I am trying to apply the image function change but it gives an object error.
it there a quick way to fix it?
I am using a Julia function but it works completely fine with other objects. Error message:
JULIA: MethodError: no method matching copy(::PyObject)
Closest candidates are:
copy(!Matched::T) where T<:SHA.SHA3_CTX at /opt/julia-1.7.2/share/julia/stdlib/v1.7/SHA/src/types.jl:213
copy(!Matched::T) where T<:SHA.SHA2_CTX at /opt/julia-1.7.2/share/julia/stdlib/v1.7/SHA/src/types.jl:212
copy(!Matched::Number) at /opt/julia-1.7.2/share/julia/base/number.jl:113
I would recommend to put change function to transforms list, so you do data changes on transformation stage.
partial from functools will help you to fix number of arguments, like this:
from functools import partial
def change(input, float):
pass
# Use partial to fix number of params, such that change accepts only input
change_partial = partial(change, float=pass_float_value_here)
# Add change_partial to a list of transforms before or after converting to tensors
transforms = Compose([
RandomResizedCrop(img_size), # example
# Add change_partial here if it operates on PIL Image
change_partial,
ToTensor(), # convert to tensor
# Add change_partial here if it operates on torch tensors
change_partial,
])

Multiprocessing on Python 3 Jupyter

I come here because I have an issue with my Jupiter's Python3 notebook.
I need to create a function that uses the multiprocessing library.
Before to implement it, I make some tests.
I found a looooot of different examples but the issue is everytime the same : my code is executed but nothing happens in the notebook's interface :
The code i try to run on jupyter is this one :
import os
from multiprocessing import Process, current_process
def doubler(number):
"""
A doubling function that can be used by a process
"""
result = number * 2
proc_name = current_process().name
print('{0} doubled to {1} by: {2}'.format(
number, result, proc_name))
return result
if __name__ == '__main__':
numbers = [5, 10, 15, 20, 25]
procs = []
proc = Process(target=doubler, args=(5,))
for index, number in enumerate(numbers):
proc = Process(target=doubler, args=(number,))
proc2 = Process(target=doubler, args=(number,))
procs.append(proc)
procs.append(proc2)
proc.start()
proc2.start()
proc = Process(target=doubler, name='Test', args=(2,))
proc.start()
procs.append(proc)
for proc in procs:
proc.join()
It's OK when I just run my code without Jupyter but with the command "python my_progrem.py" and I can see the logs :
Is there, for my example, and in Jupyter, a way to catch the results of my two tasks (proc1 and proc2 which both call thefunction "doubler") in a variable/object that I could use after ?
If "yes", how can I do it?
#Konate's answer really helped me. Here is a simplified version using multiprocessing.pool:
import multiprocessing
def double(a):
return a * 2
def driver_func():
PROCESSES = 4
with multiprocessing.Pool(PROCESSES) as pool:
params = [(1, ), (2, ), (3, ), (4, )]
results = [pool.apply_async(double, p) for p in params]
for r in results:
print('\t', r.get())
driver_func()
I succeed by using multiprocessing.pool.
I was inspired by this approach :
def test():
PROCESSES = 4
print('Creating pool with %d processes\n' % PROCESSES)
with multiprocessing.Pool(PROCESSES) as pool:
TASKS = [(mul, (i, 7)) for i in range(10)] + \
[(plus, (i, 8)) for i in range(10)]
results = [pool.apply_async(calculate, t) for t in TASKS]
imap_it = pool.imap(calculatestar, TASKS)
imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)
print('Ordered results using pool.apply_async():')
for r in results:
print('\t', r.get())
print()
print('Ordered results using pool.imap():')
for x in imap_it:
print('\t', x)
...etc
For more, the code is at : https://docs.python.org/3.4/library/multiprocessing.html?
Another way of running multiprocessing jobs in a Jupyter notebook is to use one of the approaches supported by the nbmultitask package.
This works for me on MAC (cannot make it work on windows):
import multiprocessing as mp
mp_start_count = 0
if __name__ == '__main__':
if mp_start_count == 0:
mp.set_start_method('fork')
mp_start_count += 1

PyTorch custom dataset dataloader returns strings (of keys) not tensors

I am trying to load my own dataset and I use a custom Dataloader that reads in images and labels and converts them to PyTorch Tensors. However when the Dataloader is instantiated it returns strings x "image" and y "labels" but not the real values or tensors when read (iter)
print(self.train_loader) # shows a Tensor object
tic = time.time()
with tqdm(total=self.num_train) as pbar:
for i, (x, y) in enumerate(self.train_loader): # x and y are returned as string (where it fails)
if self.use_gpu:
x, y = x.cuda(), y.cuda()
x, y = Variable(x), Variable(y)
This is how dataloader.py looks like:
from __future__ import print_function, division #ds
import numpy as np
from utils import plot_images
import os #ds
import pandas as pd #ds
from skimage import io, transform #ds
import torch
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader #ds
from torchvision import transforms
from torchvision import utils #ds
from torch.utils.data.sampler import SubsetRandomSampler
class CDataset(Dataset):
def __init__(self, csv_file, root_dir, transform=None):
"""
Args:
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.frame = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.frame)
def __getitem__(self, idx):
img_name = os.path.join(self.root_dir,
self.frame.iloc[idx, 0]+'.jpg')
image = io.imread(img_name)
# image = image.transpose((2, 0, 1))
labels = np.array(self.frame.iloc[idx, 1])#.as_matrix() #ds
#landmarks = landmarks.astype('float').reshape(-1, 2)
#print(image.shape)
#print(img_name,labels)
sample = {'image': image, 'labels': labels}
if self.transform:
sample = self.transform(sample)
return sample
class ToTensor(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, sample):
image, labels = sample['image'], sample['labels']
#print(image)
#print(labels)
# swap color axis because
# numpy image: H x W x C
# torch image: C X H X W
image = image.transpose((2, 0, 1))
#print(image.shape)
#print((torch.from_numpy(image)))
#print((torch.from_numpy(labels)))
return {'image': torch.from_numpy(image),
'labels': torch.from_numpy(labels)}
def get_train_valid_loader(data_dir,
batch_size,
random_seed,
#valid_size=0.1, #ds
#shuffle=True,
show_sample=False,
num_workers=4,
pin_memory=False):
"""
Utility function for loading and returning train and valid
multi-process iterators over the MNIST dataset. A sample
9x9 grid of the images can be optionally displayed.
If using CUDA, num_workers should be set to 1 and pin_memory to True.
Args
----
- data_dir: path directory to the dataset.
- batch_size: how many samples per batch to load.
- random_seed: fix seed for reproducibility.
- #ds valid_size: percentage split of the training set used for
the validation set. Should be a float in the range [0, 1].
In the paper, this number is set to 0.1.
- shuffle: whether to shuffle the train/validation indices.
- show_sample: plot 9x9 sample grid of the dataset.
- num_workers: number of subprocesses to use when loading the dataset.
- pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
True if using GPU.
Returns
-------
- train_loader: training set iterator.
- valid_loader: validation set iterator.
"""
#ds
#error_msg = "[!] valid_size should be in the range [0, 1]."
#assert ((valid_size >= 0) and (valid_size <= 1)), error_msg
#ds
# define transforms
#normalize = transforms.Normalize((0.1307,), (0.3081,))
trans = transforms.Compose([
ToTensor(), #normalize,
])
# load train dataset
#train_dataset = datasets.MNIST(
# data_dir, train=True, download=True, transform=trans
#)
train_dataset = CDataset(csv_file='/home/Desktop/6June17/util/train.csv',
root_dir='/home/caffe/data/images/',transform=trans)
# load validation dataset
#valid_dataset = datasets.MNIST( #ds
# data_dir, train=True, download=True, transform=trans #ds
#)
valid_dataset = CDataset(csv_file='/home/Desktop/6June17/util/eval.csv',
root_dir='/home/caffe/data/images/',transform=trans)
num_train = len(train_dataset)
train_indices = list(range(num_train))
#ds split = int(np.floor(valid_size * num_train))
num_valid = len(valid_dataset) #ds
valid_indices = list(range(num_valid)) #ds
#if shuffle:
# np.random.seed(random_seed)
# np.random.shuffle(indices)
#ds train_idx, valid_idx = indices[split:], indices[:split]
train_idx = train_indices #ds
valid_idx = valid_indices #ds
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=batch_size, sampler=train_sampler,
num_workers=num_workers, pin_memory=pin_memory,
)
print(train_loader)
valid_loader = torch.utils.data.DataLoader(
valid_dataset, batch_size=batch_size, sampler=valid_sampler,
num_workers=num_workers, pin_memory=pin_memory,
)
# visualize some images
if show_sample:
sample_loader = torch.utils.data.DataLoader(
dataset, batch_size=9, #shuffle=shuffle,
num_workers=num_workers, pin_memory=pin_memory
)
data_iter = iter(sample_loader)
images, labels = data_iter.next()
X = images.numpy()
X = np.transpose(X, [0, 2, 3, 1])
plot_images(X, labels)
return (train_loader, valid_loader)
def get_test_loader(data_dir,
batch_size,
num_workers=4,
pin_memory=False):
"""
Utility function for loading and returning a multi-process
test iterator over the MNIST dataset.
If using CUDA, num_workers should be set to 1 and pin_memory to True.
Args
----
- data_dir: path directory to the dataset.
- batch_size: how many samples per batch to load.
- num_workers: number of subprocesses to use when loading the dataset.
- pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
True if using GPU.
Returns
-------
- data_loader: test set iterator.
"""
# define transforms
#normalize = transforms.Normalize((0.1307,), (0.3081,))
trans = transforms.Compose([
ToTensor(), #normalize,
])
# load dataset
#dataset = datasets.MNIST(
# data_dir, train=False, download=True, transform=trans
#)
test_dataset = CDataset(csv_file='/home/Desktop/6June17/util/test.csv',
root_dir='/home/caffe/data/images/',transform=trans)
test_loader = torch.utils.data.DataLoader(
test_dataset, batch_size=batch_size, shuffle=False,
num_workers=num_workers, pin_memory=pin_memory,
)
return test_loader
#for i_batch, sample_batched in enumerate(dataloader):
# print(i_batch, sample_batched['image'].size(),
# sample_batched['landmarks'].size())
# # observe 4th batch and stop.
# if i_batch == 3:
# plt.figure()
# show_landmarks_batch(sample_batched)
# plt.axis('off')
# plt.ioff()
# plt.show()
# break
A minimal working sample will be difficult to post here but basically I am trying to modify this project http://torch.ch/blog/2015/09/21/rmva.html which works smoothly with MNIST. I am just trying to run it with my own dataset with the custom dataloader.py I use above.
It instantiates a Dataloader like this:
in trainer.py:
if config.is_train:
self.train_loader = data_loader[0]
self.valid_loader = data_loader[1]
self.num_train = len(self.train_loader.sampler.indices)
self.num_valid = len(self.valid_loader.sampler.indices)
-> run from main.py:
if config.is_train:
data_loader = get_train_valid_loader(
config.data_dir, config.batch_size,
config.random_seed, #config.valid_size,
#config.shuffle,
config.show_sample, **kwargs
)
You are not properly using python's enumerate(). (x, y) are currently assigned the 2 keys of your batch dictionary i.e. the strings "image" and "labels". This should solve your problem:
for i, batch in enumerate(self.train_loader):
x, y = batch["image"], batch["labels"]
# ...

Using multiple validation sets with keras

I am training a model with keras using the model.fit() method.
I would like to use multiple validation sets that should be validated on separately after each training epoch so that i get one loss value for each validation set. If possible they should be both displayed during training and as well be returned by the keras.callbacks.History() callback.
I am thinking of something like this:
history = model.fit(train_data, train_targets,
epochs=epochs,
batch_size=batch_size,
validation_data=[
(validation_data1, validation_targets1),
(validation_data2, validation_targets2)],
shuffle=True)
I currently have no idea how to implement this. Is it possible to achieve this by writing my own Callback? Or how else would you approach this problem?
I ended up writing my own Callback based on the History callback to solve the problem. I'm not sure if this is the best approach but the following Callback records losses and metrics for the training and validation set like the History callback as well as losses and metrics for additional validation sets passed to the constructor.
class AdditionalValidationSets(Callback):
def __init__(self, validation_sets, verbose=0, batch_size=None):
"""
:param validation_sets:
a list of 3-tuples (validation_data, validation_targets, validation_set_name)
or 4-tuples (validation_data, validation_targets, sample_weights, validation_set_name)
:param verbose:
verbosity mode, 1 or 0
:param batch_size:
batch size to be used when evaluating on the additional datasets
"""
super(AdditionalValidationSets, self).__init__()
self.validation_sets = validation_sets
for validation_set in self.validation_sets:
if len(validation_set) not in [3, 4]:
raise ValueError()
self.epoch = []
self.history = {}
self.verbose = verbose
self.batch_size = batch_size
def on_train_begin(self, logs=None):
self.epoch = []
self.history = {}
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
self.epoch.append(epoch)
# record the same values as History() as well
for k, v in logs.items():
self.history.setdefault(k, []).append(v)
# evaluate on the additional validation sets
for validation_set in self.validation_sets:
if len(validation_set) == 3:
validation_data, validation_targets, validation_set_name = validation_set
sample_weights = None
elif len(validation_set) == 4:
validation_data, validation_targets, sample_weights, validation_set_name = validation_set
else:
raise ValueError()
results = self.model.evaluate(x=validation_data,
y=validation_targets,
verbose=self.verbose,
sample_weight=sample_weights,
batch_size=self.batch_size)
for metric, result in zip(self.model.metrics_names,results):
valuename = validation_set_name + '_' + metric
self.history.setdefault(valuename, []).append(result)
which i am then using like this:
history = AdditionalValidationSets([(validation_data2, validation_targets2, 'val2')])
model.fit(train_data, train_targets,
epochs=epochs,
batch_size=batch_size,
validation_data=(validation_data1, validation_targets1),
callbacks=[history]
shuffle=True)
I tested this on TensorFlow 2 and it worked. You can evaluate on as many validation sets as you want at the end of each epoch:
class MyCustomCallback(tf.keras.callbacks.Callback):
def on_epoch_end(self, epoch, logs=None):
res_eval_1 = self.model.evaluate(X_test_1, y_test_1, verbose = 0)
res_eval_2 = self.model.evaluate(X_test_2, y_test_2, verbose = 0)
print(res_eval_1)
print(res_eval_2)
And later:
my_val_callback = MyCustomCallback()
# Your model creation code
model.fit(..., callbacks=[my_val_callback])
Considering the current keras docs, you can pass callbacks to evaluate and evaluate_generator. So you can call evaluate multiple times with different datasets.
I have not tested it, so I am happy if you comment your experiences with it below.

Resources