I have a problem with training using tff.simulation.FilePerUserClientData - I am quickly running out of RAM after 5-6 rounds with 10 clients per round.
The RAM usage is steadily increasing with each round.
I tried to narrow it down and realized that the issue is not the actual iterative process but the creation of the client datasets.
Simply calling create_tf_dataset_for_client(client) in a loop causes the problem.
So this is a minimal version of my code:
import tensorflow as tf
import tensorflow_federated as tff
import numpy as np
import pickle
BATCH_SIZE = 16
EPOCHS = 2
MAX_SEQUENCE_LEN = 20
NUM_ROUNDS = 100
CLIENTS_PER_ROUND = 10
def decode_fn(record_bytes):
return tf.io.parse_single_example(
record_bytes,
{"x": tf.io.FixedLenFeature([MAX_SEQUENCE_LEN], dtype=tf.string),
"y": tf.io.FixedLenFeature([MAX_SEQUENCE_LEN], dtype=tf.string)}
)
def dataset_fn(path):
return tf.data.TFRecordDataset([path]).map(decode_fn).padded_batch(BATCH_SIZE).repeat(EPOCHS)
def sample_client_data(data, client_ids, sampling_prob):
clients_total = len(client_ids)
x = np.random.uniform(size=clients_total)
sampled_ids = [client_ids[i] for i in range(clients_total) if x[i] < sampling_prob]
data = [train_data.create_tf_dataset_for_client(client) for client in sampled_ids]
return data
with open('users.pkl', 'rb') as f:
users = pickle.load(f)
train_client_ids = users["train"]
client_id_to_train_file = {i: "reddit_leaf_tf/" + i for i in train_client_ids}
train_data = tff.simulation.datasets.FilePerUserClientData(
client_ids_to_files=client_id_to_train_file,
dataset_fn=dataset_fn
)
sampling_prob = CLIENTS_PER_ROUND / len(train_client_ids)
for round_num in range(0, NUM_ROUNDS):
print('Round {r}'.format(r=round_num))
participants_data = sample_client_data(train_data, train_client_ids, sampling_prob)
print("Round Completed")
I am using tensorflow-federated 19.0.
Is there something wrong with the way I create the client datasets or is it somehow expected that the RAM from the previous round is not freed?
schmana# noticed this occurs when changing the cardinality of the CLIENTS placement (different number of client datasets) each round. This results in a cache filing up as documented in http://github.com/tensorflow/federated/issues/1215.
A workaround in the immediate term would be to call:
tff.framework.get_context_stack().current.executor_factory.clean_up_executors()
At the start or end of every round.
I have submitted autoML run on remote compute (Standard_D12_v2 - 4 node cluster 28GB, 4 cores each)
My input file is roughly 350 MB.
the status is "Preparing" for more than 2 hours. And then it fails.
User error: Run timed out. No model completed training in the specified time. Possible solutions:
1) Please check if there are enough compute resources to run the experiment.
2) Increase experiment timeout when creating a run.
3) Subsample your dataset to decrease featurization/training time.
below is my python-Notebook code, please help.
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.core.compute import ComputeTarget
from azureml.train.automl import AutoMLConfig
ws = Workspace.from_config()
experiment=Experiment(ws, 'nyc-taxi')
cpu_cluster_name = "low-cluster"
compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
data = "https://betaml4543906917.blob.core.windows.net/betadata/2015_08.csv"
dataset = Dataset.Tabular.from_delimited_files(data)
training_data, validation_data = dataset.random_split(percentage=0.8, seed=223)
label_column_name = 'totalAmount'
automl_settings = {
"n_cross_validations": 3,
"primary_metric": 'normalized_root_mean_squared_error',
"enable_early_stopping": True,
"max_concurrent_iterations": 2, # This is a limit for testing purpose, please increase it as per cluster size
"experiment_timeout_hours": 2, # This is a time limit for testing purposes, remove it for real use cases, this will drastically limit ablity to find the best model possible
"verbosity": logging.INFO,
}
automl_config = AutoMLConfig(task = 'regression',
debug_log = 'automl_errors.log',
compute_target = compute_target,
training_data = training_data,
label_column_name = label_column_name,
**automl_settings
)
remote_run = experiment.submit(automl_config, show_output = False)
I guess i have made something in folowing simple neural network with PyTorch, because this runs much slower with CUDA then in CPU, can you find the mistake pls. The using function like
def backward(ctx, input):
return backward_sigm(ctx, input)
seems have no real impact on preformance
import torch
import torch.nn as nn
import torch.nn.functional as f
dname = 'cuda:0'
dname = 'cpu'
device = torch.device(dname)
print(torch.version.cuda)
def forward_sigm(ctx, input):
sigm = 1 / (1 + torch.exp(-input))
ctx.save_for_backward(sigm)
return sigm
def forward_step(ctx, input):
return torch.tensor(input > 0.5, dtype = torch.float32, device = device)
def backward_sigm(ctx, grad_output):
sigm, = ctx.saved_tensors
return grad_output * sigm * (1-sigm)
def backward_step(ctx, grad_output):
return grad_output
class StepAF(torch.autograd.Function):
#staticmethod
def forward(ctx, input):
return forward_sigm(ctx, input)
#staticmethod
def backward(ctx, input):
return backward_sigm(ctx, input)
#else return grad_output
class StepNN(torch.nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(StepNN, self).__init__()
self.linear1 = torch.nn.Linear(input_size, hidden_size)
#self.linear1.cuda()
self.linear2 = torch.nn.Linear(hidden_size, output_size)
#self.linear2.cuda()
#self.StepAF = StepAF.apply
def forward(self,x):
h_line_1 = self.linear1(x)
h_thrash_1 = StepAF.apply(h_line_1)
h_line_2 = self.linear2(h_thrash_1)
output = StepAF.apply(h_line_2)
return output
inputs = torch.tensor( [[1,0,1,0],[1,0,0,1],[0,1,0,1],[0,1,1,0],[1,0,0,0],[0,0,0,1],[1,1,0,1],[0,1,0,0],], dtype = torch.float32, device = device)
expected = torch.tensor( [[1,0,0],[1,0,0],[0,1,0],[0,1,0],[1,0,0],[0,0,1],[0,1,0],[0,0,1],], dtype = torch.float32, device = device)
nn = StepNN(4,8,3)
#print(*(x for x in nn.parameters()))
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(nn.parameters(), lr=1e-3)
steps = 50000
print_steps = steps // 20
good_loss = 1e-5
for t in range(steps):
output = nn(inputs)
loss = criterion(output, expected)
if t % print_steps == 0:
print('step ',t, ', loss :' , loss.item())
if loss < good_loss:
print('step ',t, ', loss :' , loss.item())
break
optimizer.zero_grad()
loss.backward()
optimizer.step()
test = torch.tensor( [[0,1,0,1],[0,1,1,0],[1,0,1,0],[1,1,0,1],], dtype = torch.float32, device=device)
print(nn(test))
Unless you have large enough data, you won't see any performance improvement while using GPU. The problem is that GPUs use parallel processing, so unless you have large amounts of data, the CPU can process the samples almost as fast as the GPU.
As far as I can see in your example, you are using 8 samples of size (4, 1). I would imagine maybe when having over hundreds or thousands of samples, then you would see the performance improvement on a GPU. In your case, the sample size is (4, 1), and the hidden layer size is 8, so the CPU can perform the calculations fairly quickly.
There are lots of example notebooks online of people using MNIST data (it has around 60000 images for training), so you could load one in maybe Google Colab and then try training on the CPU and then on GPU and observe the training times. You could try this link for example. It uses TensorFlow instead of PyTorch but it will give you an idea of the performance improvement of a GPU.
Note : If you haven't used Google Colab before, then you need to change the runtime type (None for CPU and GPU for GPU) in the runtime menu at the top.
Also, I will post the results from this notebook here itself (look at the time mentioned in the brackets, and if you run it, you can see firsthand how fast it runs) :
On CPU :
INFO:tensorflow:loss = 294.3736, step = 1
INFO:tensorflow:loss = 28.285727, step = 101 (23.769 sec)
INFO:tensorflow:loss = 23.518856, step = 201 (24.128 sec)
On GPU :
INFO:tensorflow:loss = 295.08328, step = 0
INFO:tensorflow:loss = 47.37291, step = 100 (4.709 sec)
INFO:tensorflow:loss = 23.31364, step = 200 (4.581 sec)
INFO:tensorflow:loss = 9.980572, step = 300 (4.572 sec)
INFO:tensorflow:loss = 17.769928, step = 400 (4.560 sec)
INFO:tensorflow:loss = 16.345463, step = 500 (4.531 sec)
I have a big dataset divided in files.
I would like to read and process my data one file at the time and for this I have this keras generator:
def myGenerator():
while 1:
rnd = random.randint(1,200)
strRnd = str(rnd)
lenRnd = len(strRnd)
rndPadded = strRnd.rjust(5, '0')
nSearchesInBatch = 100
f = "path/part-" + rndPadded + "*" #read one block of data
data = sqlContext.read.load(f).toPandas()
imax = int(data.shape[0]/nSearchesInBatch) #number of batches that will be created sequentially from the generator
for i in range(imax):
data_batch = data[i*nSearchesInBatch:(i+1)*nSearchesInBatch]
features = data_batch['features']
output = data_batch['output']
yield features, output
The problem is that the reading takes the biggest part (each file is around 200mb), and in the meanwhile the GPU sits waiting, it is possible to pre-read the next batch while the GPU is traning on the previous one?
At the moment one file is read and split in steps (the inner loop), the CPUs are hidden and the GPU training, as soon as the epoch finishes, the GPU goes idle and the cpu start reading (which takes 20/30 seconds).
Any solution to parallelize this?
I'd like to implement a real time plot of my CPU and GPU load.
I already have a script that retrieve the data and echo it in a terminal.
What I want to do now is to plot this information to see the evolution with time.
I don't know if I need python for instance, and in this case using which module?
Is there any other alternative?
Does someone have any experience doing real time plotting?
Éric.
here is what I did so far
#! /usr/bin/python
import pylab
import time
t = 0
dt = .25
old = None
if __name__ == "__main__":
pylab.ion()
#pylab.xlabel('this is x!')
#pylab.ylabel('this is y!')
#pylab.title('My First Plot')
while True:
with open('/proc/stat') as stat:
new = map(float, stat.readline().strip().split()[1:])
if old is not None:
diff = [n - o for n, o in zip(new, old)]
idle = diff[3] / sum(diff)
pylab.clf()
pylab.plot(t,int(255 * (1 - idle)),'-b', label='cpu')
#print t,int(255 * (1 - idle))
pylab.draw()
old = new
time.sleep(dt)
t = t + dt
Well, something happens when I execute it but nothing is displayed.
Any suggestion?
Thank you,
Éric.