Unable to use multi GPU training with TensorFlow 2 - multi-gpu

Setup: Win 10, 2x Geforce RTX 2080 Ti, Tensorflow 2.9.1 (also tested older versions), Geforce Driver 512.95
I tried multiple tutorials for multi-GPU training with TensorFlow 2 and was never able to utilize more than one GPU.
Here is my code.
def mnist_dataset(batch_size):
    """Build a shuffled, endlessly repeating, batched MNIST training dataset.

    Args:
        batch_size: Number of examples in each emitted batch.

    Returns:
        A ``tf.data.Dataset`` of ``(images, labels)`` batches.
    """
    # Only the training split is used; the second split is discarded.
    (images, labels), _ = tf.keras.datasets.mnist.load_data()
    images = images / np.float32(255)
    labels = labels.astype(np.int64)

    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    dataset = dataset.shuffle(60000)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    return dataset
def build_and_compile_cnn_model():
    """Create, compile and summarize the convolutional classifier.

    The network takes ``(28, 28)`` inputs, reshapes them to ``(28, 28, 1)``,
    applies three 512-filter 3x3 ReLU convolutions, flattens, and ends in a
    128-unit ReLU layer followed by a 10-unit layer without activation.

    Returns:
        The compiled ``tf.keras.Sequential`` model (its summary is printed
        as a side effect).
    """
    layers = tf.keras.layers
    stack = [
        layers.InputLayer(input_shape=(28, 28)),
        layers.Reshape(target_shape=(28, 28, 1)),
    ]
    stack.extend(layers.Conv2D(512, 3, activation='relu') for _ in range(3))
    stack.append(layers.Flatten())
    stack.append(layers.Dense(128, activation='relu'))
    stack.append(layers.Dense(10))

    model = tf.keras.Sequential(stack)
    # The last layer has no activation, hence from_logits=True in the loss.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        loss=loss_fn,
        optimizer=tf.keras.optimizers.Adam(),
        metrics=['accuracy'],
    )
    model.summary()
    return model
# Create the distribution strategy before anything that creates variables.
strategy = tf.distribute.MirroredStrategy()
# Alternative from the original post: explicit devices, reducing on one device.
#strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"], cross_device_ops=tf.distribute.ReductionToOneDevice())
print(tf.__version__)
print(tf.config.list_physical_devices())
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# The model (and its optimizer, created in compile()) must be built inside
# the strategy scope so its variables are mirrored across the replicas.
with strategy.scope():
    multi_worker_model = build_and_compile_cnn_model()

batch_size = 64
multi_worker_dataset = mnist_dataset(batch_size)
# The model is intentionally NOT rebuilt here. The original code called
# build_and_compile_cnn_model() a second time, which replaced the model
# created under strategy.scope() with a freshly built one.
multi_worker_model.fit(multi_worker_dataset, epochs=10, steps_per_epoch=250)
Output from tf.config.list_physical_devices() and strategy.num_replicas_in_sync.
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
Number of devices: 2
But only one GPU gets used.
GPU usage and temperature multi GPU test
To test whether both GPUs are working and accessible by TensorFlow, I tried this code.
# Sanity check: build and train the same model on each GPU in turn,
# pinning the work to one device at a time.
for gpu_name in ('/gpu:0', '/gpu:1'):
    with tf.device(gpu_name):
        batch_size = 64
        single_worker_dataset = mnist_dataset(batch_size)
        single_worker_model = build_and_compile_cnn_model()
        single_worker_model.fit(single_worker_dataset, epochs=5, steps_per_epoch=250)
Runs as expected. First GPU:0 gets used and then GPU:1. No Error. Everything is fine.
GPU usage and temperature single GPU test
In Tensorflow 1 I was able to use multiple GPUs with keras.utils multi_gpu_model and the same setup.
Any idea what the problem could be?

Related

Statsmodels - Poisson GLM is taking more than an hour to fit the model - does not return anything

I am trying to run a Poisson GLM regression with statsmodels on my training dataset (30,000 rows/observations) in a Jupyter Notebook with the following code, but it has been running for more than an hour and has not returned anything yet. I tried shutting it down and starting it again, and I have run it on two different laptops (one with 8 GB RAM, the other with 16 GB RAM), but nothing works...
Can somebody please give me some suggestions on how to fix it?
code
# Load the semicolon-separated data set.
df = pd.read_csv('gravity.csv', sep=';')

# Random row mask: rows whose draw is below 0.8 go to the training set,
# the remaining rows go to the test set.
mask = np.random.rand(len(df)) < 0.8
df_train, df_test = df[mask], df[~mask]
print(f'Training data set length={len(df_train)}')
print(f'Testing data set length={len(df_test)}')

# Formula for the design matrices: response on the left, regressors on the right.
expr = """BTI ~ GDP_mult + distwces + DAI_mult + year + contig + comlang_off + colony"""
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

# Fit a GLM with a Poisson family on the training matrices.
poisson_model = sm.GLM(y_train, X_train, family=sm.families.Poisson())
poisson_training_results = poisson_model.fit()

How to use BertForSequenceClassification for token max_length set at 1700?

I want to perform author classification on the Reuters 50 50 dataset, where the max token length is 1600+ tokens and there are 50 classes/authors in total.
With max_length=1700 and batch_size=1, I'm getting RuntimeError: CUDA out of memory. This error can be prevented by setting max_length=512, but this has the unwanted effect of truncating the texts.
Tokenizing and encoding:
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 1700
def get_encodings(texts):
    """Encode every text into a list of token ids.

    Each text is passed to the module-level ``tokenizer`` with special tokens
    added and ``max_length=MAX_LEN``.

    Args:
        texts: Iterable of strings to encode.

    Returns:
        A list with one token-id sequence per input text, in input order.
    """
    return [
        tokenizer.encode(text, add_special_tokens=True, max_length=MAX_LEN)
        for text in texts
    ]
def pad_encodings(encodings):
    """Pad or truncate every encoding to ``MAX_LEN`` entries.

    Padding (value 0) and truncation are both applied at the end ("post").

    Args:
        encodings: Sequences of token ids.

    Returns:
        The result of ``pad_sequences`` for the given encodings.
    """
    padding_options = dict(
        maxlen=MAX_LEN,
        dtype="long",
        value=0,
        truncating="post",
        padding="post",
    )
    return pad_sequences(encodings, **padding_options)
def get_attention_masks(padded_encodings):
    """Build a 0/1 attention mask for each padded encoding.

    A position gets 1 when its token id is greater than 0 and 0 otherwise,
    so positions holding the padding value 0 are masked out.

    Args:
        padded_encodings: Iterable of token-id sequences.

    Returns:
        A list of lists of ints (0 or 1), one mask per encoding.
    """
    return [
        [int(token_id > 0) for token_id in encoding]
        for encoding in padded_encodings
    ]
# Training split: tokenize, pad to a fixed length, then derive the masks.
train_encodings = pad_encodings(get_encodings(train_df.text.values))
train_attention_masks = get_attention_masks(train_encodings)

# Test split: same pipeline.
test_encodings = pad_encodings(get_encodings(test_df.text.values))
test_attention_masks = get_attention_masks(test_encodings)
Packing into Dataset and Dataloader:
batch_size = 1

# Training tensors: token ids, labels and attention masks.
X_train = torch.tensor(train_encodings)
y_train = torch.tensor(train_df.author_id.values)
train_masks = torch.tensor(train_attention_masks)

# Test tensors, built the same way.
X_test = torch.tensor(test_encodings)
y_test = torch.tensor(test_df.author_id.values)
test_masks = torch.tensor(test_attention_masks)

# Training loader: batches are drawn through a RandomSampler.
train_data = TensorDataset(X_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data, sampler=train_sampler, batch_size=batch_size)

# Validation loader: batches are drawn through a SequentialSampler.
validation_data = TensorDataset(X_test, test_masks, y_test)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data, sampler=validation_sampler, batch_size=batch_size)
Model setup:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from the 'bert-base-uncased' configuration and override the number of
# labels and the maximum number of position embeddings.
config = BertConfig.from_pretrained(
    'bert-base-uncased',
    num_labels=50,
    output_attentions=False,
    output_hidden_states=False,
    max_position_embeddings=MAX_LEN,
)

# The model is constructed from the config object alone.
model = BertForSequenceClassification(config)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
Training:
# Training loop as shown in the post; the snippet stops at the forward pass.
for epoch_i in range(epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Each batch holds (token ids, attention masks, labels), in the
        # order they were given to the TensorDataset; move all to the device.
        b_texts = batch[0].to(device)
        b_attention_masks = batch[1].to(device)
        b_authors = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_texts,
                        token_type_ids=None,
                        attention_mask=b_attention_masks,
                        labels=b_authors)  # <------- ERROR HERE
Error:
RuntimeError: CUDA out of memory. Tried to allocate 6.00 GiB (GPU 0; 7.93 GiB total capacity; 1.96 GiB already allocated; 5.43 GiB free; 536.50 KiB cached)
Unless you are training on a TPU, your chances are extremely low of ever having enough GPU RAM with any of the available GPUs right now.
For some BERT models, the model alone takes well above 10GB in RAM, and a doubling in sequence length beyond 512 tokens takes about that much more in memory. For reference, a Titan RTX with 24 GB GPU RAM (most of what is currently available for a single GPU), can barely fit 24 samples of 512 tokens in length at the same time.
Fortunately, most of the networks still yield a very decent performance when truncating the samples, but this is of course task-specific. Also keep in mind that - unless you are training from scratch - all of the pre-trained models are generally trained on 512 token limits. To my knowledge, the only model currently supporting longer sequences is Bart, which allows up to 1024 tokens in length.

Tensorflow 1 vs Tensorflow 2 Keras Inference Speed Differ by 2+ times

I'm trying to figure out the reason behind the speed difference between two models.
an LSTM RNN model built using tensorflow 1.x:
# Integer placeholders for input and target ids, both (batch_size, num_steps).
self.input_placeholder = tf.placeholder(
    tf.int32, shape=[self.config.batch_size, self.config.num_steps], name='Input')
self.labels_placeholder = tf.placeholder(
    tf.int32, shape=[self.config.batch_size, self.config.num_steps], name='Target')

# Non-trainable embedding table initialized from self.embedding_matrix.
embedding = tf.get_variable(
    'Embedding', initializer=self.embedding_matrix, trainable=False)
inputs = tf.nn.embedding_lookup(embedding, self.input_placeholder)
# static_rnn takes a list with one tensor per time step: split along the
# step axis and drop that axis from each piece.
inputs = [tf.squeeze(x, axis=1) for x in tf.split(inputs, self.config.num_steps, axis=1)]

# NOTE(review): initial_state is stored but not passed to static_rnn below.
self.initial_state = tf.zeros([self.config.batch_size, self.config.hidden_size])
lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.config.hidden_size)
# Bind the per-step RNN outputs to `rnn_outputs`: the projection below reads
# that name, which the original snippet never defined (it used `outputs`).
rnn_outputs, _ = tf.contrib.rnn.static_rnn(
    lstm_cell, inputs, dtype=tf.float32,
    sequence_length=[self.config.num_steps] * self.config.batch_size)

# Project every step's hidden state to vocabulary size.
with tf.variable_scope('Projection'):
    proj_U = tf.get_variable('Matrix', [self.config.hidden_size, self.config.vocab_size])
    proj_b = tf.get_variable('Bias', [self.config.vocab_size])
    outputs = [tf.matmul(o, proj_U) + proj_b for o in rnn_outputs]
the same model (at least from my understanding) built using tensorflow 2.0 keras:
def setup_model():
    """Build the Keras model: frozen embedding, LSTM, softmax over the vocabulary.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # Embedding weights come from embedding_matrix and are not trained.
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=4,
        trainable=False,
    )
    recurrent = LSTM(config.hidden_size, activation="tanh")
    classifier = Dense(vocab_size, activation="softmax")
    return Sequential([embedding, recurrent, classifier])
The architecture is:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 4, 100) 55400
_________________________________________________________________
lstm (LSTM) (None, 100) 80400
_________________________________________________________________
dense (Dense) (None, 554) 55954
=================================================================
Total params: 191,754
Trainable params: 136,354
Non-trainable params: 55,400
_________________________________________________________________
I was expecting similar inference runtime, but the one built with TensorFlow 1.x is much, much faster. I tried to convert the TensorFlow 1.x model to TensorFlow 2 using only native TensorFlow functions, but I had trouble converting it due to the big changes from TensorFlow 1.x to 2, and I was only able to recreate it using tf.keras.
As for speed: I use both models for generating text sequences and getting word probabilities, so I don't have a single-inference timing comparison (I can't modify the existing API of the TensorFlow 1.x model to get this). But in general, I'm seeing at least a 2x difference in time across my use cases.
What can be the possible reasons behind this difference of inference speed? I'm happy to provide more information if needed.

Why pytorch training on CUDA works much slower than in CPU?

I guess I have done something wrong in the following simple neural network with PyTorch, because it runs much slower with CUDA than on the CPU. Can you find the mistake, please? Using a function like
def backward(ctx, input):
return backward_sigm(ctx, input)
seems to have no real impact on performance.
import torch
import torch.nn as nn
import torch.nn.functional as f
# Select the compute device. The second assignment overrides the first, so as
# written the script runs on the CPU; remove it to run on 'cuda:0'.
dname = 'cuda:0'
dname = 'cpu'
device = torch.device(dname)
# Print the value of torch.version.cuda.
print(torch.version.cuda)
def forward_sigm(ctx, input):
    """Compute the logistic sigmoid of ``input`` and save it for backward.

    Args:
        ctx: Autograd context; the result is stored via ``save_for_backward``.
        input: Input tensor.

    Returns:
        Tensor holding ``1 / (1 + exp(-input))``.
    """
    denominator = 1 + torch.exp(-input)
    activation = 1 / denominator
    # The backward pass needs only the activation itself.
    ctx.save_for_backward(activation)
    return activation
def forward_step(ctx, input):
    """Apply a hard threshold: 1.0 where ``input > 0.5``, else 0.0.

    Args:
        ctx: Autograd context (unused).
        input: Input tensor.

    Returns:
        float32 tensor on the module-level ``device``.
    """
    # Convert the boolean mask with .to() rather than re-wrapping an existing
    # tensor in torch.tensor(), which is the discouraged copy-construct form.
    return (input > 0.5).to(device=device, dtype=torch.float32)
def backward_sigm(ctx, grad_output):
    """Backward pass for the sigmoid saved by ``forward_sigm``.

    Args:
        ctx: Autograd context whose ``saved_tensors`` holds the activation.
        grad_output: Gradient with respect to the activation output.

    Returns:
        ``grad_output * sigm * (1 - sigm)``.
    """
    (activation,) = ctx.saved_tensors
    scaled = grad_output * activation
    return scaled * (1 - activation)
def backward_step(ctx, grad_output):
    """Pass the incoming gradient through unchanged.

    Args:
        ctx: Autograd context (unused).
        grad_output: Incoming gradient.

    Returns:
        ``grad_output`` itself.
    """
    passthrough = grad_output
    return passthrough
class StepAF(torch.autograd.Function):
    """Custom autograd function applying the sigmoid helpers.

    ``forward`` and ``backward`` are static methods taking the autograd
    context as first argument; the function is used through ``StepAF.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return the sigmoid of ``input``, saving it on ``ctx`` for backward."""
        return forward_sigm(ctx, input)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient with respect to the input of ``forward``."""
        # Alternative noted in the original: return grad_output unchanged.
        return backward_sigm(ctx, grad_output)
class StepNN(torch.nn.Module):
    """Two linear layers, each followed by the ``StepAF`` activation."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output units.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run ``x`` through linear1, StepAF, linear2, StepAF."""
        hidden = StepAF.apply(self.linear1(x))
        return StepAF.apply(self.linear2(hidden))
# Training data: 8 samples with 4 features each and 3 target values each,
# created directly on the selected device.
inputs = torch.tensor(
    [[1, 0, 1, 0], [1, 0, 0, 1], [0, 1, 0, 1], [0, 1, 1, 0],
     [1, 0, 0, 0], [0, 0, 0, 1], [1, 1, 0, 1], [0, 1, 0, 0]],
    dtype=torch.float32, device=device)
expected = torch.tensor(
    [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 1, 0],
     [1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]],
    dtype=torch.float32, device=device)

# Move the model's parameters to the same device as the data; the original
# left the model where it was created while the tensors used `device`.
# The variable is named `net` so it no longer shadows the `torch.nn as nn` import.
net = StepNN(4, 8, 3).to(device)

criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(net.parameters(), lr=1e-3)

steps = 50000
print_steps = steps // 20
good_loss = 1e-5

for t in range(steps):
    output = net(inputs)
    loss = criterion(output, expected)
    # Progress report every print_steps iterations.
    if t % print_steps == 0:
        print('step ',t, ', loss :' , loss.item())
    # Stop early once the loss drops below the target.
    if loss < good_loss:
        print('step ',t, ', loss :' , loss.item())
        break
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

test = torch.tensor(
    [[0, 1, 0, 1], [0, 1, 1, 0], [1, 0, 1, 0], [1, 1, 0, 1]],
    dtype=torch.float32, device=device)
print(net(test))
Unless you have large enough data, you won't see any performance improvement while using GPU. The problem is that GPUs use parallel processing, so unless you have large amounts of data, the CPU can process the samples almost as fast as the GPU.
As far as I can see in your example, you are using 8 samples of size (4, 1). I would imagine maybe when having over hundreds or thousands of samples, then you would see the performance improvement on a GPU. In your case, the sample size is (4, 1), and the hidden layer size is 8, so the CPU can perform the calculations fairly quickly.
There are lots of example notebooks online of people using MNIST data (it has around 60000 images for training), so you could load one in maybe Google Colab and then try training on the CPU and then on GPU and observe the training times. You could try this link for example. It uses TensorFlow instead of PyTorch but it will give you an idea of the performance improvement of a GPU.
Note : If you haven't used Google Colab before, then you need to change the runtime type (None for CPU and GPU for GPU) in the runtime menu at the top.
Also, I will post the results from this notebook here itself (look at the time mentioned in the brackets, and if you run it, you can see firsthand how fast it runs) :
On CPU :
INFO:tensorflow:loss = 294.3736, step = 1
INFO:tensorflow:loss = 28.285727, step = 101 (23.769 sec)
INFO:tensorflow:loss = 23.518856, step = 201 (24.128 sec)
On GPU :
INFO:tensorflow:loss = 295.08328, step = 0
INFO:tensorflow:loss = 47.37291, step = 100 (4.709 sec)
INFO:tensorflow:loss = 23.31364, step = 200 (4.581 sec)
INFO:tensorflow:loss = 9.980572, step = 300 (4.572 sec)
INFO:tensorflow:loss = 17.769928, step = 400 (4.560 sec)
INFO:tensorflow:loss = 16.345463, step = 500 (4.531 sec)

How to do parallel processing in pytorch

I am working on a deep learning problem. I am solving it using pytorch. I have two GPU's which are on the same machine (16273MiB,12193MiB). I want to use both the GPU's for my training (video dataset).
I get a warning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
I also get an error:
raise TypeError('Broadcast function not implemented for CPU tensors')
TypeError: Broadcast function not implemented for CPU tensors
# Script entry point: prepares options, model, loss, data loader, loggers and
# optimizer, then runs the training epochs.
# NOTE(review): `opt` and `training_data` are not defined in the visible
# snippet — presumably created in parts of the script omitted from the post.
if __name__ == '__main__':
    # Build the list of scales: each entry is the previous one times scale_step.
    opt.scales = [opt.initial_scale]
    for i in range(1, opt.n_scales):
        opt.scales.append(opt.scales[-1] * opt.scale_step)
    opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
    opt.mean = get_mean(opt.norm_value)
    opt.std = get_std(opt.norm_value)
    print("opt",opt)
    # Dump the options to opts.json in the result directory.
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)
    torch.manual_seed(opt.manual_seed)
    model, parameters = generate_model(opt)
    #print(model)
    # Count only parameters that require gradients.
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Total number of trainable parameters: ", pytorch_total_params)
    # Define Class weights
    if opt.weighted:
        print("Weighted Loss is created")
        if opt.n_finetune_classes == 2:
            weight = torch.tensor([1.0, 3.0])
        else:
            weight = torch.ones(opt.n_finetune_classes)
    else:
        weight = None
    # NOTE(review): `weight` is computed above but not passed to the loss
    # here — confirm whether CrossEntropyLoss(weight=weight) was intended.
    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = nn.DataParallel(criterion.cuda())
    # Choose the normalization from the mean/std option flags.
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)
    train_loader = torch.utils.data.DataLoader(
        training_data,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.n_threads,
        pin_memory=True)
    # Per-epoch and per-batch loggers writing into the result directory.
    train_logger = Logger(
        os.path.join(opt.result_path, 'train.log'),
        ['epoch', 'loss', 'acc', 'precision','recall','lr'])
    train_batch_logger = Logger(
        os.path.join(opt.result_path, 'train_batch.log'),
        ['epoch', 'batch', 'iter', 'loss', 'acc', 'precision', 'recall', 'lr'])
    # With Nesterov momentum enabled, dampening is forced to 0.
    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening
    optimizer = optim.SGD(
        parameters,
        lr=opt.learning_rate,
        momentum=opt.momentum,
        dampening=dampening,
        weight_decay=opt.weight_decay,
        nesterov=opt.nesterov)
    # scheduler = lr_scheduler.ReduceLROnPlateau(
    # optimizer, 'min', patience=opt.lr_patience)
    if not opt.no_val:
        # NOTE(review): spatial_transform is built but not used in the
        # visible snippet — presumably consumed by omitted validation code.
        spatial_transform = Compose([
            Scale(opt.sample_size),
            CenterCrop(opt.sample_size),
            ToTensor(opt.norm_value), norm_method
        ])
    print('run')
    # Epoch loop, inclusive of opt.n_epochs.
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            adjust_learning_rate(optimizer, i, opt.lr_steps)
            train_epoch(i, train_loader, model, criterion, optimizer, opt,
                        train_logger, train_batch_logger)
I have also made changes in my train file:
# Wrap the model for GPUs 0 and 1, then move the wrapper to CUDA.
# NOTE(review): `model()` calls `model` with no arguments before wrapping —
# confirm whether `model` is a class/factory here or an already-built module.
model = nn.DataParallel(model(),device_ids=[0,1]).cuda()
outputs = model(inputs)
It does not seem to work properly and gives an error. Please advise; I am new to PyTorch.
Thanks
As mentioned in this link, you have to do model.cuda() before passing it to nn.DataParallel.
# Move the model to the GPU first, then wrap it in DataParallel for devices 0 and 1.
net = nn.DataParallel(model.cuda(), device_ids=[0,1])
https://github.com/pytorch/pytorch/issues/17065

Resources