How to turn off Auto Mixed Precision at validation time?

I am trying to run the MAC network (https://github.com/stanfordnlp/mac-network/tree/gqa) with Auto Mixed Precision. The optimizer is added as follows:
def addOptimizerOp(self):
    with tf.variable_scope("trainAddOptimizer"):
        self.globalStep = tf.Variable(0, dtype = tf.int32, trainable = False, name = "globalStep") # init to 0 every run?
        optimizer = tf.train.AdamOptimizer(learning_rate = self.lr)
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
        if config.subsetOpt:
            self.subsetOptimizer = tf.train.AdamOptimizer(learning_rate = self.lr * config.subsetOptMult)
    return optimizer
In the first epoch, training is fine. However, when the model runs evaluation on the validation set, I get this error.
Training epoch 1...
2019-08-05 14:51:13.625899: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:13.709959: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1723] Converted 1504/6920 nodes to float16 precision using 150 cast(s) to float16 (excluding Const and Variable casts)
2019-08-05 14:51:16.930248: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-08-05 14:51:17.331687: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudnn.so.7
2019-08-05 14:51:29.378905: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:29.380633: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1241] No whitelist ops found, nothing to do
eb 1, 10000,(160010 / 943000), t = 0.12 (0.00+0.11), lr 0.0003, l = 2.8493, a = 0.4250, avL = 2.5323, avA = 0.4188, g = 3.7617, emL = 2.3097, emA = 0.4119; gqaExperiment
Restoring EMA weights
2019-08-05 14:51:31.132804: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:31.136122: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1241] No whitelist ops found, nothing to do
2019-08-05 14:51:32.322369: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:32.341609: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1723] Converted 661/1848 nodes to float16 precision using 38 cast(s) to float16 (excluding Const and Variable casts)
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1356, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1341, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1429, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: TensorArray dtype is float but Op is trying to write dtype half.
[[{{node macModel/tower0/encoder/birnnLayer/bidirectional_rnn/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3}}]]
[[macModel/tower0/MACnetwork/MACCell_3/write/inter2attselfAttention/Softmax/_1661]]
(1) Invalid argument: TensorArray dtype is float but Op is trying to write dtype half.
[[{{node macModel/tower0/encoder/birnnLayer/bidirectional_rnn/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 866, in <module>
main()
File "main.py", line 728, in main
evalRes = runEvaluation(sess, model, data["main"], dataOps, epoch, getPreds = getPreds, prevRes = evalRes)
File "main.py", line 248, in runEvaluation
minLoss = prevRes["train"]["minLoss"] if prevRes else float("inf"))
File "main.py", line 594, in runEpoch
res = model.runBatch(sess, batch, imagesBatch, train, getPreds, getAtt)
File "/content/model.py", line 948, in runBatch
feed_dict = feed)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
I think the errors may come from not turning off auto mixed precision at evaluation time, or from using the same session and model for training and evaluation. I tried tf.train.experimental.disable_mixed_precision_graph_rewrite(), but I do not know how to use it correctly.
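For reference, this is my rough understanding of how the enable/disable pair is meant to be used. The rewrite is a global graph-optimizer setting, so presumably it has to be toggled before the graph it should affect is built and run. This is only a sketch, not code I have working:

# sketch: enable the rewrite only around the training graph
optimizer = tf.train.AdamOptimizer(learning_rate = lr)
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
# ... build the training graph and run training steps ...

# sketch: disable the rewrite again before building the evaluation graph
tf.train.experimental.disable_mixed_precision_graph_rewrite()
# ... build and run evaluation in a separate graph/session ...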
How can I fix this? Thanks all. For context, here is my main():
def main():
    with open(config.configFile(), "a+") as outFile:
        json.dump(vars(config), outFile)

    # set gpus
    if config.gpus != "":
        config.gpusNum = len(config.gpus.split(","))
        os.environ["CUDA_VISIBLE_DEVICES"] = config.gpus

    tf.logging.set_verbosity(tf.logging.ERROR)

    # process data
    print(bold("Preprocess data..."))
    start = time.time()
    preprocessor = Preprocesser()
    data, embeddings, answerDict, questionDict = preprocessor.preprocessData()
    print("took {} seconds".format(bcolored("{:.2f}".format(time.time() - start), "blue")))

    nextElement = None
    dataOps = None

    # build model
    print(bold("Building model..."))
    start = time.time()
    model = MACnet(embeddings, answerDict, questionDict, nextElement)
    print("took {} seconds".format(bcolored("{:.2f}".format(time.time() - start), "blue")))

    # initializer
    init = tf.global_variables_initializer()

    # savers
    savers = setSavers(model)
    saver, emaSaver = savers["saver"], savers["emaSaver"]

    # sessionConfig
    sessionConfig = setSession()

    with tf.Session(config = sessionConfig) as sess:
        # ensure no more ops are added after model is built
        sess.graph.finalize()

        # restore / initialize weights, initialize epoch variable
        epoch = loadWeights(sess, saver, init)

        trainRes, evalRes = None, None

        if config.train:
            start0 = time.time()

            bestEpoch = epoch
            bestRes = None
            prevRes = None

            # epoch in [restored + 1, epochs]
            for epoch in range(config.restoreEpoch + 1, config.epochs + 1):
                print(bcolored("Training epoch {}...".format(epoch), "green"))
                start = time.time()

                # train
                # calle = lambda: model.runEpoch(), collectRuntimeStats, writer
                trainingData, alterData = chooseTrainingData(data)
                trainRes = runEpoch(sess, model, trainingData, dataOps, train = True, epoch = epoch,
                    saver = saver, alterData = alterData,
                    maxAcc = trainRes["maxAcc"] if trainRes else 0.0,
                    minLoss = trainRes["minLoss"] if trainRes else float("inf"),)

                # save weights
                saver.save(sess, config.weightsFile(epoch))
                if config.saveSubset:
                    subsetSaver.save(sess, config.subsetWeightsFile(epoch))

                # load EMA weights
                if config.useEMA:
                    print(bold("Restoring EMA weights"))
                    emaSaver.restore(sess, config.weightsFile(epoch))

                # evaluation
                getPreds = config.getPreds or (config.analysisType != "")

                evalRes = runEvaluation(sess, model, data["main"], dataOps, epoch, getPreds = getPreds, prevRes = evalRes)
                extraEvalRes = runEvaluation(sess, model, data["extra"], dataOps, epoch,
                    evalTrain = not config.extraVal, getPreds = getPreds)

                # restore standard weights
                if config.useEMA:
                    print(bold("Restoring standard weights"))
                    saver.restore(sess, config.weightsFile(epoch))

                print("")

                epochTime = time.time() - start
                print("took {:.2f} seconds".format(epochTime))

                # print results
                printDatasetResults(trainRes, evalRes, extraEvalRes)

Related

How to test a model before fine-tuning in Pytorch Lightning?

I am doing this on Google Colab.
transformers: 4.10.2
pytorch-lightning: 1.2.7
import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl
dataset_for_loader = [
    {'data':torch.tensor([0,1]), 'labels':torch.tensor(0)},
    {'data':torch.tensor([2,3]), 'labels':torch.tensor(1)},
    {'data':torch.tensor([4,5]), 'labels':torch.tensor(2)},
    {'data':torch.tensor([6,7]), 'labels':torch.tensor(3)},
]
loader = DataLoader(dataset_for_loader, batch_size=2)

for idx, batch in enumerate(loader):
    print(f'# batch {idx}')
    print(batch)
import glob
import random
from tqdm import tqdm

category_list = [
    'dokujo-tsushin',
    'it-life-hack',
    'kaden-channel',
    'livedoor-homme',
    'movie-enter',
    'peachy',
    'smax',
    'sports-watch',
    'topic-news'
]

tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
max_length = 128
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    # the ./text directory holds lots of articles, categorized by category;
    # they are plain text files whose content begins on the fourth line
    for file in glob.glob(f'./text/{category}/{category}*'):
        lines = open(file).read().splitlines()
        text = '\n'.join(lines[3:])
        encoding = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        encoding['labels'] = label
        encoding = { k: torch.tensor(v) for k, v in encoding.items() }
        dataset_for_loader.append(encoding)
SEED = lambda: 0.0
# random.shuffle(dataset_for_loader) # shuffle randomly
random.shuffle(dataset_for_loader, SEED)
n = len(dataset_for_loader)
n_train = int(0.6*n)
n_val = int(0.2*n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_train+n_val]
dataset_test = dataset_for_loader[n_train+n_val:]
dataloader_train = DataLoader(
    dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)
class BertForSequenceClassification_pl(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        labels = batch.pop('labels')
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)
trainer = pl.Trainer(
    gpus=1,
    max_epochs=10,
    callbacks = [checkpoint]
)
model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=9, lr=1e-5
)

### (a) ###
# I think this is where I am doing fine-tuning
trainer.fit(model, dataloader_train, dataloader_val)

# this is to score after fine-tuning
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
But I am not sure how to run a test before fine-tuning, in order to compare the model before and after fine-tuning and show how effective fine-tuning is.
Inserting the following two lines at ### (a) ###:
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
I got this result:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-c8b2c67f2d5c> in <module>()
9
10 # 6-19
---> 11 test = trainer.test(test_dataloaders=dataloader_test)
12 print(f'Accuracy: {test[0]["accuracy"]:.2f}')
13
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in test(self, model, test_dataloaders, ckpt_path, verbose, datamodule)
896 self.verbose_test = verbose
897
--> 898 self._set_running_stage(RunningStage.TESTING, model or self.lightning_module)
899
900 # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _set_running_stage(self, stage, model_ref)
563 the trainer and the model
564 """
--> 565 model_ref.running_stage = stage
566 self._running_stage = stage
567
AttributeError: 'NoneType' object has no attribute 'running_stage'
I noticed that Trainer.fit() can take None for all arguments other than model, so I tried this:
trainer.fit(model)
test=trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
The result:
MisconfigurationException: No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined.
Thanks.
The Trainer needs its .fit() called first in order to set up a lot of things; only then can you call .test() or other methods.
You are right about putting a .fit() just before .test(), but the fit call needs to be a valid one: you have to feed a dataloader/datamodule to it. Since you don't want to do any training/validation in this fit call, just pass limit_[train/val]_batches=0 when constructing the Trainer.
trainer = Trainer(gpus=..., ..., limit_train_batches=0, limit_val_batches=0)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # without fine-tuning
The fit call here will just set things up for you and skip training/validation, and then the testing follows. Next, run the same code but without the limit_[train/val]_batches arguments; this will do the fine-tuning for you:
trainer = Trainer(gpus=..., ...)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # with fine-tuning
Clarifying a bit about .fit() taking None for everything but model: it's not quite true - you must provide either a DataLoader or a DataModule.
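Putting the two runs together, a minimal sketch of the full before/after comparison could look like this (Trainer arguments besides the batch limits are illustrative):

# 1) score the pretrained model without fine-tuning
trainer = pl.Trainer(gpus=1, limit_train_batches=0, limit_val_batches=0)
trainer.fit(model, dataloader_train, dataloader_val)  # sets things up, skips train/val
result_before = trainer.test(test_dataloaders=dataloader_test)

# 2) fine-tune the very same model object and score again
trainer = pl.Trainer(gpus=1, max_epochs=10, callbacks=[checkpoint])
trainer.fit(model, dataloader_train, dataloader_val)
result_after = trainer.test(test_dataloaders=dataloader_test)

print(f'Accuracy before: {result_before[0]["accuracy"]:.2f}')
print(f'Accuracy after:  {result_after[0]["accuracy"]:.2f}')

Note that the second run fine-tunes the same model object that was just scored, which is exactly what makes the comparison meaningful.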

multiple input does not work on Tensorflow 2 installation on MAC OS M1 - AttributeError: 'tuple' object has no attribute 'shape'

I have a Tensorflow (sequence) model that takes 2 input streams. Here is the part of the code where the model is defined:
generic_in = Input(shape=(sequence_length, nr_features), name='generic_input')
input1_in = Input(shape=(sequence_length, nr_features), name='input1_input')
input2_in = Input(shape=(sequence_length, nr_features), name='input2_input')

generic_out, generic_state_h, generic_state_c = LSTM(50,
                                                     return_sequences = False,
                                                     return_state = True,
                                                     dropout = 0.15,
                                                     recurrent_dropout = 0.15,
                                                     name="generic_lstm")(generic_in)

concatenated_gen_out = Concatenate()([ generic_state_h, generic_state_c ])

gen_dense_out = Dense(100,
                      activation='relu',
                      name="generic_dense")(concatenated_gen_out)
gen_dense_out = BatchNormalization()(gen_dense_out)
gen_dense_out = Dropout(0.15)(gen_dense_out)

generic_model = Model( inputs = [ generic_in ], outputs = [ gen_dense_out ] )

input1_dense_out = generic_model(input1_in)
input2_dense_out = generic_model(input2_in)

concatenated_out = Concatenate()([ input1_dense_out, input2_dense_out ])

dense2_out = Dense(100,
                   activation='relu',
                   name="dense_2")(concatenated_out)
dense2_out = BatchNormalization()(dense2_out)
dense2_out = Dropout(0.2)(dense2_out)

softmax_out = Dense(nr_classes,
                    activation='softmax',
                    name="final_output_layer")(dense2_out)

model = Model(inputs = [ input1_in, input2_in ],
              outputs = [ softmax_out ])

#opt = tf.keras.optimizers.Adam(lr=0.00008, decay=0.000001)
opt = tf.keras.optimizers.Adam(lr=0.0001)

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

history = model.fit(x=train_x,
                    y=train_y,
                    batch_size=BATCH_SIZE,
                    epochs=80,
                    verbose=2,
                    validation_data=(dev_x, dev_y),
                    shuffle=True)
Please note that train_x, the input to model.fit, is a list containing 2 inputs, matching model = Model(inputs = [input1_in, input2_in], outputs = [softmax_out]).
This works perfectly fine in my Tensorflow v1.13.1 installation on Windows. I am trying to migrate my project to macOS Big Sur v11.3.1 with the M1 chip. The Tensorflow version on macOS is 2.4.0-rc0.
I obviously made some changes to make it work with Tensorflow 2, but these are mainly API call updates based on the new API.
The error I get on MAC Tensorflow installation:
Traceback (most recent call last):
File "/Users/me/Developer/AI_Projects/Football/M1_FULL_TIME_MR/src/train_fit_v2.py", line 288, in <module>
history = model.fit(x=train_x,
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1100, in fit
tmp_logs = self.train_function(iterator)
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 828, in __call__
result = self._call(*args, **kwds)
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 871, in _call
self._initialize(args, kwds, add_initializers_to=initializers)
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 725, in _initialize
self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 2969, in _get_concrete_function_internal_garbage_collected
graph_function, _ = self._maybe_define_function(args, kwargs)
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 3361, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 3196, in _create_graph_function
func_graph_module.func_graph_from_py_func(
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py", line 990, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 634, in wrapped_fn
out = weak_wrapped_fn().__wrapped__(*args, **kwds)
File "/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py", line 977, in wrapper
raise e.ag_error_metadata.to_exception(e)
AttributeError: in user code:
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:805 train_function *
return step_function(self, iterator)
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:795 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
return fn(*args, **kwargs)
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:788 run_step **
outputs = model.train_step(data)
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:758 train_step
self.compiled_metrics.update_state(y, y_pred, sample_weight)
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:387 update_state
self.build(y_pred, y_true)
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:317 build
self._metrics = nest.map_structure_up_to(y_pred, self._get_metric_objects,
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/util/nest.py:1159 map_structure_up_to
return map_structure_with_tuple_paths_up_to(
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/util/nest.py:1257 map_structure_with_tuple_paths_up_to
results = [
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/util/nest.py:1258 <listcomp>
func(*args, **kwargs) for args in zip(flat_path_gen, *flat_value_gen)
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/util/nest.py:1161 <lambda>
lambda _, *values: func(*values), # Discards the path arg.
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:418 _get_metric_objects
return [self._get_metric_object(m, y_t, y_p) for m in metrics]
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:418 <listcomp>
return [self._get_metric_object(m, y_t, y_p) for m in metrics]
/Users/me/miniforge3/envs/tf_dev/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:439 _get_metric_object
y_t_rank = len(y_t.shape.as_list())
AttributeError: 'tuple' object has no attribute 'shape'
I am totally out of solutions. What am I supposed to do to make it work?
As per the error you posted, y_t is of type tuple, and a tuple has no attribute shape.
You can try casting y_t to an array:
numpy.array(y_t)
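Since y_t comes from the data handed to model.fit, the cast usually has to happen on the targets themselves. A sketch, assuming train_y and dev_y are currently Python lists or tuples (as the error suggests):

import numpy as np

# make sure the targets are arrays, not tuples/lists, before fitting
train_y = np.asarray(train_y)
dev_y = np.asarray(dev_y)

history = model.fit(x=train_x,            # list of 2 arrays, one per input stream
                    y=train_y,
                    batch_size=BATCH_SIZE,
                    epochs=80,
                    verbose=2,
                    validation_data=(dev_x, dev_y),
                    shuffle=True)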

Am getting error trying to predict on a single image CNN pytorch

Error message
Traceback (most recent call last):
File "pred.py", line 134, in
output = model(data)
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [16, 3, 3, 3], but got 3-dimensional input of size [1, 32, 32] instead.
Prediction code
import torch
import torch.nn as nn
from PIL import Image
from torchvision import transforms

normalize = transforms.Normalize(mean=[0.4914, 0.4824, 0.4467],
                                 std=[0.2471, 0.2435, 0.2616])

train_set = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

model = models.condensenet(args)
model = nn.DataParallel(model)
PATH = "results/savedir/save_models/checkpoint_001.pth.tar"
model.load_state_dict(torch.load(PATH)['state_dict'])
device = torch.device("cpu")
model.eval()

image = Image.open("horse.jpg")
input = train_set(image)
train_loader = torch.utils.data.DataLoader(
    input,
    batch_size=1, shuffle=True, num_workers=1)

for i, data in enumerate(train_loader):
    #input_var = torch.autograd.Variable(data, volatile=True)
    #input_var = input_var.view(1, 3, 32,32)
    output = model(data)
    topk = (1, 5)
    maxk = max(topk)
    _, pred = output.topk(maxk, 1, True, True)
I get this error when trying to predict on a single image.
Image shape/size error message
Link to saved model
Training code repository
Please uncomment the line #input_var = input_var.view(1, 3, 32, 32) so that your input dimension is 4.
I assume your number of input channels is 3; if it is one (grayscale), use input_var = input_var.view(1, 1, 32, 32) instead.
Instead of using the for loop and train_loader, I solved this by passing the input directly into the model, like this:
input = train_set(image)
input = input.unsqueeze(0)
model.eval()
output = model(input)
More details can be found here: link
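One more point worth noting: RandomCrop and RandomHorizontalFlip are training-time augmentations, so for prediction a deterministic transform is usually preferable. A sketch, assuming 32x32 inputs as in the question:

from PIL import Image
import torch
from torchvision import transforms

# deterministic preprocessing for inference (no random augmentation)
eval_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4824, 0.4467],
                         std=[0.2471, 0.2435, 0.2616]),
])

image = Image.open("horse.jpg")
input = eval_transform(image).unsqueeze(0)  # add the batch dimension -> [1, 3, 32, 32]
model.eval()
with torch.no_grad():
    output = model(input)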

How to do parallel processing in pytorch

I am working on a deep learning problem, which I am solving using pytorch. I have two GPUs on the same machine (16273MiB and 12193MiB). I want to use both GPUs for my training (video dataset).
I get a warning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
I also get an error:
raise TypeError('Broadcast function not implemented for CPU tensors')
TypeError: Broadcast function not implemented for CPU tensors
if __name__ == '__main__':
    opt.scales = [opt.initial_scale]
    for i in range(1, opt.n_scales):
        opt.scales.append(opt.scales[-1] * opt.scale_step)
    opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
    opt.mean = get_mean(opt.norm_value)
    opt.std = get_std(opt.norm_value)

    print("opt", opt)
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)

    torch.manual_seed(opt.manual_seed)
    model, parameters = generate_model(opt)
    #print(model)
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Total number of trainable parameters: ", pytorch_total_params)

    # Define class weights
    if opt.weighted:
        print("Weighted Loss is created")
        if opt.n_finetune_classes == 2:
            weight = torch.tensor([1.0, 3.0])
        else:
            weight = torch.ones(opt.n_finetune_classes)
    else:
        weight = None

    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = nn.DataParallel(criterion.cuda())

    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)

    train_loader = torch.utils.data.DataLoader(
        training_data,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.n_threads,
        pin_memory=True)
    train_logger = Logger(
        os.path.join(opt.result_path, 'train.log'),
        ['epoch', 'loss', 'acc', 'precision', 'recall', 'lr'])
    train_batch_logger = Logger(
        os.path.join(opt.result_path, 'train_batch.log'),
        ['epoch', 'batch', 'iter', 'loss', 'acc', 'precision', 'recall', 'lr'])

    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening
    optimizer = optim.SGD(
        parameters,
        lr=opt.learning_rate,
        momentum=opt.momentum,
        dampening=dampening,
        weight_decay=opt.weight_decay,
        nesterov=opt.nesterov)
    # scheduler = lr_scheduler.ReduceLROnPlateau(
    #     optimizer, 'min', patience=opt.lr_patience)

    if not opt.no_val:
        spatial_transform = Compose([
            Scale(opt.sample_size),
            CenterCrop(opt.sample_size),
            ToTensor(opt.norm_value), norm_method
        ])

    print('run')
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            adjust_learning_rate(optimizer, i, opt.lr_steps)
            train_epoch(i, train_loader, model, criterion, optimizer, opt,
                        train_logger, train_batch_logger)
I have also made changes in my train file:
model = nn.DataParallel(model(),device_ids=[0,1]).cuda()
outputs = model(inputs)
It does not seem to work properly and gives the errors above. Please advise; I am new to pytorch.
Thanks
As mentioned in this link, you have to do model.cuda() before passing it to nn.DataParallel.
net = nn.DataParallel(model.cuda(), device_ids=[0,1])
https://github.com/pytorch/pytorch/issues/17065
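The same reasoning likely extends to the criterion: the "Broadcast function not implemented for CPU tensors" error means DataParallel is being asked to scatter something that still lives on the CPU. A sketch of the relevant lines with everything moved to the GPU first (inputs and targets stand in for a batch from the loader):

import torch.nn as nn

# move the model to the GPU before wrapping it
net = nn.DataParallel(model.cuda(), device_ids=[0, 1])

# a loss module usually does not need DataParallel at all;
# putting it on the GPU is enough
criterion = nn.CrossEntropyLoss().cuda()

outputs = net(inputs.cuda())
loss = criterion(outputs, targets.cuda())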

Dask multiprocessing fails with embarrassingly parallel for loop including call to MongoDB when number of iterations is high enough

I'm trying to run a kind of simulation in a Python for loop in parallel using Dask multiprocessing. Parallelization works fine when the number of iterations is fairly low but fails when it increases. The issue occurs on Win7 (4 cores, 10 GB RAM), Win10 (8 cores, 8 GB RAM) and an Azure VM running Windows Server 2016 (16 cores, 32 GB RAM). The slowest one, Win7, can get through the most iterations before failing. The issue can be mitigated by adding a long enough sleep at the end of each function included in the process, but the required amount of sleeping results in very low performance, similar to running sequentially.
I hope someone will be able to help me out here. Thanks in advance for comments and answers!
The following simple code contains some phases of the for loop and repeats the error.
import json
import numpy as np
import pandas as pd
from pymongo import MongoClient

# Create random DataFrame
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 11)), columns=list('ABCDEFGHIJK'))

# Save to Mongo
client = MongoClient()
db = client.errordemo
res = db.errordemo.insert_many(json.loads(df.to_json(orient='records')))
db.client.close()
class ToBeRunParallel:
    def __init__(self):
        pass

    def functionToBeRunParallel(self, i):
        # Read data from mongo
        with MongoClient() as client:
            db = client.errordemo
            dataFromMongo = pd.DataFrame.from_records(db.errordemo.find({}, {'_id': 0}))
        # Randomize data
        dataRand = dataFromMongo.apply(pd.to_numeric).apply(rand, volatility=0.1)
        # Sum rows
        dataSum = dataRand.sum(axis=1)
        # Select randomly one of the resulting values and return
        return dataSum.sample().values[0]
Call the function functionToBeRunParallel either in the console or in Jupyter (both fail). 'errordemo' is a local module containing the class ToBeRunParallel. While running on the Azure VM, the code succeeds with 500 loops and fails with 5,000.
import errordemo
from dask import delayed, compute, multiprocessing

# Determine how many times to loop
rng = range(15000)

# Define empty result list
resList = []

# Create instance
err = errordemo.ToBeRunParallel()

# Loop in parallel using Dask
for i in rng:
    sampleValue = delayed(err.functionToBeRunParallel)(i)
    resList.append(sampleValue)

# Compute in parallel
result = compute(*resList, get=multiprocessing.get)
The error stack in Jupyter is as follows.
---------------------------------------------------------------------------
AutoReconnect Traceback (most recent call last)
<ipython-input-3-9f535dd4c621> in <module>()
----> 1 get_ipython().run_cell_magic('time', '', '# Determine how many times to loop\nrng = range(50000)\n\n# Define empty result lists\nresList = []\n\n# Create instance\nerr = errordemo.ToBeRunParallel()\n\n# Loop in parallel using Dask\nfor i in rng:\n sampleValue = delayed(err.functionToBeRunParallel)(i)\n resList.append(sampleValue)\n \n# Compute in parallel \nresult = compute(*resList, get=dask.multiprocessing.get)')
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2113 magic_arg_s = self.var_expand(line, stack_depth)
2114 with self.builtin_trap:
-> 2115 result = fn(magic_arg_s, cell)
2116 return result
2117
<decorator-gen-60> in time(self, line, cell, local_ns)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\magics\execution.py in time(self, line, cell, local_ns)
1178 else:
1179 st = clock2()
-> 1180 exec(code, glob, local_ns)
1181 end = clock2()
1182 out = None
<timed exec> in <module>()
C:\ProgramData\Anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
200 dsk = collections_to_dsk(variables, optimize_graph, **kwargs)
201 keys = [var._keys() for var in variables]
--> 202 results = get(dsk, keys, **kwargs)
203
204 results_iter = iter(results)
C:\ProgramData\Anaconda3\lib\site-packages\dask\multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, **kwargs)
85 result = get_async(pool.apply_async, len(pool._pool), dsk3, keys,
86 get_id=_process_get_id,
---> 87 dumps=dumps, loads=loads, **kwargs)
88 finally:
89 if cleanup:
C:\ProgramData\Anaconda3\lib\site-packages\dask\async.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, dumps, loads, **kwargs)
498 _execute_task(task, data) # Re-execute locally
499 else:
--> 500 raise(remote_exception(res, tb))
501 state['cache'][key] = res
502 finish_task(dsk, key, state, results, keyorder.get)
AutoReconnect: localhost:27017: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted
Traceback
---------
File "C:\ProgramData\Anaconda3\lib\site-packages\dask\async.py", line 266, in execute_task
result = _execute_task(task, data)
File "C:\ProgramData\Anaconda3\lib\site-packages\dask\async.py", line 247, in _execute_task
return func(*args2)
File "C:\Git_repository\footie\Pipeline\errordemo.py", line 20, in functionToBeRunParallel
dataFromMongo = pd.DataFrame.from_records(db.errordemo.find({}, {'_id': 0}))
File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py", line 981, in from_records
first_row = next(data)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\cursor.py", line 1090, in next
if len(self.__data) or self._refresh():
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\cursor.py", line 1012, in _refresh
self.__read_concern))
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\cursor.py", line 850, in __send_message
**kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\mongo_client.py", line 844, in _send_message_with_response
exhaust)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\mongo_client.py", line 855, in _reset_on_error
return func(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\server.py", line 99, in send_message_with_response
with self.get_socket(all_credentials, exhaust) as sock_info:
File "C:\ProgramData\Anaconda3\lib\contextlib.py", line 82, in __enter__
return next(self.gen)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\server.py", line 163, in get_socket
with self.pool.get_socket(all_credentials, checkout) as sock_info:
File "C:\ProgramData\Anaconda3\lib\contextlib.py", line 82, in __enter__
return next(self.gen)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\pool.py", line 582, in get_socket
sock_info = self._get_socket_no_auth()
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\pool.py", line 618, in _get_socket_no_auth
sock_info, from_pool = self.connect(), False
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\pool.py", line 555, in connect
_raise_connection_failure(self.address, error)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\pool.py", line 65, in _raise_connection_failure
raise AutoReconnect(msg)
UPDATE:
Following this post, I created a decorator to catch the AutoReconnect exception, as shown below. Together with parameters for MongoClient, the looping works, but it's still very slow: double the time it should take (timing on the Azure VM):
500 iterations: 3.74 s
50,000 iterations: 12 min 12 s
def safe_mongocall(call):
    def _safe_mongocall(*args, **kwargs):
        for i in range(5):
            try:
                return call(*args, **kwargs)
            except errors.AutoReconnect:
                sleep(random.random() / 100)
        print('Error: Failed operation!')
    return _safe_mongocall

@safe_mongocall
def functionToBeRunParallel(self, i):
    # Read data from mongo
    with MongoClient(connect=False, maxPoolSize=None, maxIdleTimeMS=100) as client:
        db = client.errordemo
        dataFromMongo = pd.DataFrame.from_records(db.errordemo.find({}, {'_id': 0}))
    # Randomize data
    dataRand = dataFromMongo.apply(pd.to_numeric).apply(rand, volatility=0.1)
    # Sum rows
    dataSum = dataRand.sum(axis=1)
    # Select randomly one of the resulting values and return
    return dataSum.sample().values[0]
The actual issue is exhaustion of TCP/IP ports, so the solution is to avoid that exhaustion. Following an article by Microsoft, I added the following registry keys and values under HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\Tcpip\Parameters:
MaxUserPort: 65534
TcpTimedWaitDelay: 30
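On the application side, port exhaustion can also be reduced by reusing one MongoClient per worker process instead of opening and closing a connection on every call. A sketch of that idea (the module-level client cache is illustrative, not the code from the question):

import pandas as pd
from pymongo import MongoClient

_client = None  # one client per worker process, created lazily

def get_client():
    global _client
    if _client is None:
        # connect=False defers the actual connection until first use,
        # which is safer when workers are spawned/forked
        _client = MongoClient(connect=False)
    return _client

def functionToBeRunParallel(i):
    # reuse the pooled connection instead of opening a new socket per call
    db = get_client().errordemo
    dataFromMongo = pd.DataFrame.from_records(db.errordemo.find({}, {'_id': 0}))
    return dataFromMongo.sum(axis=1).sample().values[0]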
