InvalidArgumentError: input_14:0 is both fed and fetched - filter

I want to visualize my CNN filters at every layer. I wrote code for this, but it is giving me an error. I want to see the filter images of every layer, and I also want to see the heat maps of the areas my neural net uses most to predict a particular label. This would help me understand how my CNN works and do further work on my model for better results.
I searched on Google, but I mostly found sites with theory; I need to see code for the solution.
x = Conv2D(64,(3,3),strides = (1,1),name='layer_conv1',padding='same')(input)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2,2),name='maxPool1')(x)
x = Conv2D(64,(3,3),strides = (1,1),name='layer_conv2',padding='same')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2,2),name='maxPool2')(x)
x = Conv2D(32,(3,3),strides = (1,1),name='conv3',padding='same')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2,2),name='maxPool3')(x)
x = Flatten()(x)
x = Dense(64,activation = 'relu',name='fc0')(x)
x = Dropout(0.25)(x)
x = Dense(32,activation = 'relu',name='fc1')(x)
x = Dropout(0.25)(x)
x = Dense(2,activation = 'softmax',name='fc2')(x)
model = Model(inputs = input,outputs = x,name='Predict')
a=np.expand_dims( X_train[10],axis=0)
a.shape
from keras.models import Model
layer_outputs = [layer.output for layer in model.layers]
activation_model = Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(a)
I am getting this error
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-249-119bf7ea835a> in <module>()
2 layer_outputs = [layer.output for layer in model.layers]
3 activation_model = Model(inputs=model.input, outputs=layer_outputs)
----> 4 activations = activation_model.predict(a)
5
6
/opt/conda/lib/python3.6/site-packages/Keras-2.2.4-py3.6.egg/keras/engine/training.py in predict(self, x, batch_size, verbose, steps, callbacks)
1185 verbose=verbose,
1186 steps=steps,
-> 1187 callbacks=callbacks)
1188
1189 def train_on_batch(self, x, y,
/opt/conda/lib/python3.6/site-packages/Keras-2.2.4-py3.6.egg/keras/engine/training_arrays.py in predict_loop(model, f, ins, batch_size, verbose, steps, callbacks)
320 batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
321 callbacks._call_batch_hook('predict', 'begin', batch_index, batch_logs)
--> 322 batch_outs = f(ins_batch)
323 batch_outs = to_list(batch_outs)
324 if batch_index == 0:
/opt/conda/lib/python3.6/site-packages/Keras-2.2.4-py3.6.egg/keras/backend/tensorflow_backend.py in __call__(self, inputs)
2919 return self._legacy_call(inputs)
2920
-> 2921 return self._call(inputs)
2922 else:
2923 if py_any(is_tensor(x) for x in inputs):
/opt/conda/lib/python3.6/site-packages/Keras-2.2.4-py3.6.egg/keras/backend/tensorflow_backend.py in _call(self, inputs)
2873 feed_symbols,
2874 symbol_vals,
-> 2875 session)
2876 if self.run_metadata:
2877 fetched = self._callable_fn(*array_vals, run_metadata=self.run_metadata)
/opt/conda/lib/python3.6/site-packages/Keras-2.2.4-py3.6.egg/keras/backend/tensorflow_backend.py in _make_callable(self, feed_arrays, feed_symbols, symbol_vals, session)
2825 callable_opts.run_options.CopyFrom(self.run_options)
2826 # Create callable.
-> 2827 callable_fn = session._make_callable_from_options(callable_opts)
2828 # Cache parameters corresponding to the generated callable, so that
2829 # we can detect future mismatches and refresh the callable.
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _make_callable_from_options(self, callable_options)
1469 """
1470 self._extend_graph()
-> 1471 return BaseSession._Callable(self, callable_options)
1472
1473
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in __init__(self, session, callable_options)
1423 with errors.raise_exception_on_not_ok_status() as status:
1424 self._handle = tf_session.TF_SessionMakeCallable(
-> 1425 session._session, options_ptr, status)
1426 finally:
1427 tf_session.TF_DeleteBuffer(options_ptr)
/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
526 None, None,
527 compat.as_text(c_api.TF_Message(self.status.status)),
--> 528 c_api.TF_GetCode(self.status.status))
529 # Delete the underlying status object from memory otherwise it stays alive
530 # as there is a reference to status from this from the traceback due to
InvalidArgumentError: input_14:0 is both fed and fetched.
I tried removing some layers and adding others, but it didn't help. I found very little code about this on Google.

To access any layer's output, you can use a Keras backend function, something like this:
from keras import backend as K
last_layer_output = K.function([model.layers[0].input],
                               [model.layers[-1].output])
layer_output = last_layer_output([x])[0]
So to access all layers' outputs, you can create as many such functions as follows:
outputs = [layer.output for layer in model.layers]
functors = [K.function([model.input, K.learning_phase()], [out]) for out in outputs]
layer_outs = [func([x_test[:4], 1.]) for func in functors]
Note: a Keras function produces one output for one layer.
You can read more about it here.
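As for the InvalidArgumentError itself: it appears because model.layers includes the InputLayer, so the input tensor ends up in the fetch list while it is also being fed by predict. A minimal sketch of the usual workaround (assuming, as is typical, that model.layers[0] is the InputLayer) is to skip that first layer when collecting outputs:
from keras.models import Model

# Exclude the InputLayer so the input tensor is not both fed and fetched
layer_outputs = [layer.output for layer in model.layers[1:]]
activation_model = Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(a)  # one activation array per remaining layer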

My problem was not getting solved with that model, so I made a simpler model and used Keras functions to get the layer outputs; this was easy compared to my previous model.
model = Sequential()
model.add(Conv2D(16,kernel_size = (5,5),activation = 'relu', activity_regularizer=regularizers.l2(1e-8)))
model.add(Conv2D(32,kernel_size = (5,5),activation = 'relu', activity_regularizer = regularizers.l2(1e-8)))
model.add(MaxPooling2D(3,3))
model.add(Conv2D(64,kernel_size = (5,5),activation = 'relu', activity_regularizer = regularizers.l2(1e-8)))
model.add(MaxPooling2D(3,3))
model.add(Conv2D(128,activation = 'relu',kernel_size = (3,3),activity_regularizer = regularizers.l2(1e-8)))
model.add(Flatten())
model.add(Dropout(0.8))
model.add(Dense(64,activation = 'relu',activity_regularizer = regularizers.l2(1e-8)))
model.add(Dropout(0.8))
model.add(Dense(64,activation = 'relu',activity_regularizer = regularizers.l2(1e-8)))
model.add(Dropout(0.8))
model.add(Dense(2,activation = 'softmax'))
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.SGD(lr = 0.001,clipnorm = 1,momentum= 0.9), metrics=["accuracy"])
model.fit(X_train,y_train, epochs = 10 ,batch_size = 16,validation_data=(X_test,y_test_Categorical))
model.summary()
#a is my one example from test set
a=np.expand_dims( X_train[10],axis=0)
a.shape
layer_outputs = [layer.output for layer in model.layers]
activation_model = Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(a)
def display_activation(activations, col_size, row_size, act_index):
    activation = activations[act_index]
    activation_index = 0
    fig, ax = plt.subplots(row_size, col_size, figsize=(row_size*2.5, col_size*1.5))
    for row in range(0, row_size):
        for col in range(0, col_size):
            ax[row][col].imshow(activation[0, :, :, activation_index])
            activation_index += 1

display_activation(activations, 4, 4, 0)
By doing this I am able to get my outputs.
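The original question also asked for heat maps of the regions the network relies on for a prediction, which the code above does not produce. Below is a minimal Grad-CAM-style sketch (my addition, not from the original post) written against the Keras 2.2 backend API and reusing model and a from the snippet above; the choice of the last Conv2D layer and the normalisation epsilon are assumptions:
from keras import backend as K
from keras.layers import Conv2D
import numpy as np
import matplotlib.pyplot as plt

class_idx = np.argmax(model.predict(a)[0])                    # predicted class for example `a`
last_conv = [l for l in model.layers if isinstance(l, Conv2D)][-1]

class_score = model.output[:, class_idx]
grads = K.gradients(class_score, last_conv.output)[0]         # d(score)/d(feature map)
pooled_grads = K.mean(grads, axis=(0, 1, 2))                  # per-channel importance
grad_fn = K.function([model.input], [pooled_grads, last_conv.output[0]])

pooled_grads_val, conv_out_val = grad_fn([a])
heatmap = np.mean(conv_out_val * pooled_grads_val, axis=-1)   # weight channels and average
heatmap = np.maximum(heatmap, 0)
heatmap /= (heatmap.max() + 1e-8)                             # normalise to [0, 1]
plt.matshow(heatmap)
plt.show()
The resulting low-resolution heatmap can then be resized to the input image size and overlaid on the image to see which regions drive the prediction.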

Related

PyTorch - NotImplementedError at model fit using MAC M1 MPS in Notebooks

I am getting this error:
NotImplementedError: The operator 'aten::hardswish_' is not currently implemented for the MPS device. The message goes on to say to comment at https://github.com/pytorch/pytorch/issues/77764, but there is nothing there.
I have gone through the appropriate environment setup (conda activate 'en'), and torch.backends.mps.is_available() returns True.
Environment:
Python Platform: macOS-13.2-arm64-arm-64bit
PyTorch Version: 1.13.1
Python 3.9.13 (main, Aug 25 2022, 18:24:45)
[Clang 12.0.0 ]
GPU is NOT AVAILABLE
MPS is AVAILABLE
Target device is mps
Googling indicates that some operators are not currently supported by PyTorch on MPS. If that's the case, should my implementation be different? Has anyone had experience with this, or will it simply not work? (It obviously works with CPU.)
Getting the error here:
Input In [9], in <cell line: 4>()
13 imgs = list(img.to(device) for img in imgs)
14 annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
---> 15 loss_dict = model(imgs, annotations)
16 losses = sum(loss for loss in loss_dict.values())
18 optimizer.zero_grad()
Setup:
def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained on COCO
    #model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
    model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=False)
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model

for img_, annotations in tr_loader:
    list(img.to(device) for img in img_)
    annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
    #print(annotations)

for img_, annotations in val_loader:
    list(img.to(device) for img in img_)
    annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
    #print(annotations)

model = get_model_instance_segmentation(config.num_classes)

if torch.backends.mps.is_available():
    model.to(device)
    print(device)

# parameters
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params, lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay
)

len_trloader = len(tr_loader)
len_valloader = len(val_loader)

train_loss = []
val_loss = []
min_valid_loss = np.inf

for epoch in range(config.num_epochs):
    print(f"Epoch: {epoch}/{config.num_epochs}")
    model.train()
    i = 0
    j = 0
    for imgs, annotations in tr_loader:
        i += 1
        imgs = list(img.to(device) for img in imgs)
        annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
        loss_dict = model(imgs, annotations)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        train_loss.append(losses.tolist())
        print(f"Iteration_Train: {i}/{len_trloader}, Loss: {losses}")

    valid_loss = 0.0
    for imgs, annotations in val_loader:
        j += 1
        imgs = list(img.to(device) for img in imgs)
        annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
        loss_dict_val = model(imgs, annotations)
        losses_val = sum(loss_val for loss_val in loss_dict_val.values())
        val_loss.append(losses_val.tolist())
        print(f"Iteration_Val: {i}/{len_valloader}, Loss: {losses_val}")

XAI for transformer custom model using AllenNLP

I have been solving an NER problem for a Vietnamese dataset with 15 tags in IO format. I have been using the AllenNLP Interpret toolkit for my model, but I cannot configure it completely.
I have used the pre-trained language model "xlm-roberta-base" from HuggingFace. I concatenate the last 4 BERT layers and pass the result through a linear layer. You can see the model architecture in the source below.
class BaseBertSoftmax(nn.Module):
    def __init__(self, model, drop_out, num_labels):
        super(BaseBertSoftmax, self).__init__()
        self.num_labels = num_labels
        self.model = model
        self.dropout = nn.Dropout(drop_out)
        self.classifier = nn.Linear(4*768, num_labels)  # concatenation of the last 4 layers

    def forward_custom(self, input_ids, attention_mask=None,
                       labels=None, head_mask=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = torch.cat((outputs[1][-1], outputs[1][-2], outputs[1][-3], outputs[1][-4]), -1)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)  # bsz, seq_len, num_labels
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs
        return outputs  # scores, (hidden_states), (attentions)
What steps do I have to take to integrate this model into AllenNLP Interpret?
Could you please help me with this problem?
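No answer is reproduced here, but for orientation: AllenNLP Interpret operates on a Predictor, so the nn.Module above would first need to be wrapped in an AllenNLP Model (whose forward() returns a dict), together with a DatasetReader and a Predictor. Once such a predictor exists, invoking a saliency interpreter is short; the snippet below is only a sketch under those assumptions, and both predictor and the JSON input key are placeholders rather than working code for this exact model:
from allennlp.interpret.saliency_interpreters import SimpleGradient

# `predictor` is assumed to be an AllenNLP Predictor wrapping the custom model above.
interpreter = SimpleGradient(predictor)
saliency = interpreter.saliency_interpret_from_json(
    {"sentence": "Hà Nội là thủ đô của Việt Nam"}  # the key depends on your predictor
)
print(saliency)  # gradient-based importance scores per input token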

How to test a model before fine-tuning in Pytorch Lightning?

Doing things on Google Colab.
transformers: 4.10.2
pytorch-lightning: 1.2.7
import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl
dataset_for_loader = [
    {'data': torch.tensor([0,1]), 'labels': torch.tensor(0)},
    {'data': torch.tensor([2,3]), 'labels': torch.tensor(1)},
    {'data': torch.tensor([4,5]), 'labels': torch.tensor(2)},
    {'data': torch.tensor([6,7]), 'labels': torch.tensor(3)},
]
loader = DataLoader(dataset_for_loader, batch_size=2)

for idx, batch in enumerate(loader):
    print(f'# batch {idx}')
    print(batch)
category_list = [
    'dokujo-tsushin',
    'it-life-hack',
    'kaden-channel',
    'livedoor-homme',
    'movie-enter',
    'peachy',
    'smax',
    'sports-watch',
    'topic-news'
]

tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

max_length = 128
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    # the ./text directory has lots of articles, categorized by category,
    # and they are just plain texts whose content begins from the fourth line
    for file in glob.glob(f'./text/{category}/{category}*'):
        lines = open(file).read().splitlines()
        text = '\n'.join(lines[3:])
        encoding = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        encoding['labels'] = label
        encoding = {k: torch.tensor(v) for k, v in encoding.items()}
        dataset_for_loader.append(encoding)

SEED = lambda: 0.0
# random.shuffle(dataset_for_loader)  # shuffle randomly
random.shuffle(dataset_for_loader, SEED)

n = len(dataset_for_loader)
n_train = int(0.6*n)
n_val = int(0.2*n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_train+n_val]
dataset_test = dataset_for_loader[n_train+n_val:]

dataloader_train = DataLoader(
    dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)
class BertForSequenceClassification_pl(pl.LightningModule):

    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        labels = batch.pop('labels')
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

trainer = pl.Trainer(
    gpus=1,
    max_epochs=10,
    callbacks=[checkpoint]
)

model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=9, lr=1e-5
)

### (a) ###
# I think this is where I am doing fine-tuning
trainer.fit(model, dataloader_train, dataloader_val)

# this is to score after fine-tuning
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
But I am not really sure how to run the test before fine-tuning, so that I can compare the models before and after fine-tuning and show how effective fine-tuning is.
Inserting the following two lines at ### (a) ###:
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
I got this result:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-c8b2c67f2d5c> in <module>()
9
10 # 6-19
---> 11 test = trainer.test(test_dataloaders=dataloader_test)
12 print(f'Accuracy: {test[0]["accuracy"]:.2f}')
13
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in test(self, model, test_dataloaders, ckpt_path, verbose, datamodule)
896 self.verbose_test = verbose
897
--> 898 self._set_running_stage(RunningStage.TESTING, model or self.lightning_module)
899
900 # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _set_running_stage(self, stage, model_ref)
563 the trainer and the model
564 """
--> 565 model_ref.running_stage = stage
566 self._running_stage = stage
567
AttributeError: 'NoneType' object has no attribute 'running_stage'
I noticed that Trainer.fit() can take None as arguments other than model, so I tried this:
trainer.fit(model)
test=trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
The result:
MisconfigurationException: No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined.
Thanks.
The Trainer needs to call its .fit() in order to set up a lot of things; only then can you call .test() or other methods.
You are right about putting a .fit() just before .test(), but the fit call needs to be a valid one: you have to feed a dataloader/datamodule to it. Since you don't want to do any training/validation in this fit call, just pass limit_[train/val]_batches=0 when constructing the Trainer.
trainer = Trainer(gpus=..., ..., limit_train_batches=0, limit_val_batches=0)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # without fine-tuning
The fit call here will just set things up for you and skip training/validation, and then the testing follows. Next time, run the same code but without the limit_[train/val]_batches arguments; this will do the actual fine-tuning for you:
trainer = Trainer(gpus=..., ...)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # with fine-tuning
Clarifying a bit about .fit() taking None for everything but model: it's not quite true - you must provide either a DataLoader or a DataModule.

How to turn off Auto Mixed Precision in validation time?

I am trying to run the MAC network (https://github.com/stanfordnlp/mac-network/tree/gqa) with Auto Mixed Precision.
def addOptimizerOp(self):
    with tf.variable_scope("trainAddOptimizer"):
        self.globalStep = tf.Variable(0, dtype = tf.int32, trainable = False, name = "globalStep")  # init to 0 every run?
        optimizer = tf.train.AdamOptimizer(learning_rate = self.lr)
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)

        if config.subsetOpt:
            self.subsetOptimizer = tf.train.AdamOptimizer(learning_rate = self.lr * config.subsetOptMult)

    return optimizer
In the first epoch, training is OK. However, when the model runs evaluation on the validation set, I get this error.
Training epoch 1...
2019-08-05 14:51:13.625899: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:13.709959: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1723] Converted 1504/6920 nodes to float16 precision using 150 cast(s) to float16 (excluding Const and Variable casts)
2019-08-05 14:51:16.930248: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-08-05 14:51:17.331687: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudnn.so.7
2019-08-05 14:51:29.378905: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:29.380633: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1241] No whitelist ops found, nothing to do
eb 1, 10000,(160010 / 943000), t = 0.12 (0.00+0.11), lr 0.0003, l = 2.8493, a = 0.4250, avL = 2.5323, avA = 0.4188, g = 3.7617, emL = 2.3097, emA = 0.4119; gqaExperiment
Restoring EMA weights
2019-08-05 14:51:31.132804: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:31.136122: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1241] No whitelist ops found, nothing to do
2019-08-05 14:51:32.322369: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1767] Running auto_mixed_precision graph optimizer
2019-08-05 14:51:32.341609: I tensorflow/core/grappler/optimizers/auto_mixed_precision.cc:1723] Converted 661/1848 nodes to float16 precision using 38 cast(s) to float16 (excluding Const and Variable casts)
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1356, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1341, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1429, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: TensorArray dtype is float but Op is trying to write dtype half.
[[{{node macModel/tower0/encoder/birnnLayer/bidirectional_rnn/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3}}]]
[[macModel/tower0/MACnetwork/MACCell_3/write/inter2attselfAttention/Softmax/_1661]]
(1) Invalid argument: TensorArray dtype is float but Op is trying to write dtype half.
[[{{node macModel/tower0/encoder/birnnLayer/bidirectional_rnn/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 866, in <module>
main()
File "main.py", line 728, in main
evalRes = runEvaluation(sess, model, data["main"], dataOps, epoch, getPreds = getPreds, prevRes = evalRes)
File "main.py", line 248, in runEvaluation
minLoss = prevRes["train"]["minLoss"] if prevRes else float("inf"))
File "main.py", line 594, in runEpoch
res = model.runBatch(sess, batch, imagesBatch, train, getPreds, getAtt)
File "/content/model.py", line 948, in runBatch
feed_dict = feed)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
I think the errors may come from not turning off auto mixed precision at evaluation time, or from using the same session and model for training and evaluation. I tried tf.train.experimental.disable_mixed_precision_graph_rewrite(), but I do not know how to use it in the right way.
How can I fix it? Thanks all.
def main():
    with open(config.configFile(), "a+") as outFile:
        json.dump(vars(config), outFile)

    # set gpus
    if config.gpus != "":
        config.gpusNum = len(config.gpus.split(","))
        os.environ["CUDA_VISIBLE_DEVICES"] = config.gpus

    tf.logging.set_verbosity(tf.logging.ERROR)

    # process data
    print(bold("Preprocess data..."))
    start = time.time()
    preprocessor = Preprocesser()
    data, embeddings, answerDict, questionDict = preprocessor.preprocessData()
    print("took {} seconds".format(bcolored("{:.2f}".format(time.time() - start), "blue")))

    nextElement = None
    dataOps = None

    # build model
    print(bold("Building model..."))
    start = time.time()
    model = MACnet(embeddings, answerDict, questionDict, nextElement)
    print("took {} seconds".format(bcolored("{:.2f}".format(time.time() - start), "blue")))

    # initializer
    init = tf.global_variables_initializer()

    # savers
    savers = setSavers(model)
    saver, emaSaver = savers["saver"], savers["emaSaver"]

    # sessionConfig
    sessionConfig = setSession()

    with tf.Session(config = sessionConfig) as sess:

        # ensure no more ops are added after model is built
        sess.graph.finalize()

        # restore / initialize weights, initialize epoch variable
        epoch = loadWeights(sess, saver, init)

        trainRes, evalRes = None, None

        if config.train:
            start0 = time.time()

            bestEpoch = epoch
            bestRes = None
            prevRes = None

            # epoch in [restored + 1, epochs]
            for epoch in range(config.restoreEpoch + 1, config.epochs + 1):
                print(bcolored("Training epoch {}...".format(epoch), "green"))
                start = time.time()

                # train
                # calle = lambda: model.runEpoch(), collectRuntimeStats, writer
                trainingData, alterData = chooseTrainingData(data)
                trainRes = runEpoch(sess, model, trainingData, dataOps, train = True, epoch = epoch,
                                    saver = saver, alterData = alterData,
                                    maxAcc = trainRes["maxAcc"] if trainRes else 0.0,
                                    minLoss = trainRes["minLoss"] if trainRes else float("inf"),)

                # save weights
                saver.save(sess, config.weightsFile(epoch))
                if config.saveSubset:
                    subsetSaver.save(sess, config.subsetWeightsFile(epoch))

                # load EMA weights
                if config.useEMA:
                    print(bold("Restoring EMA weights"))
                    emaSaver.restore(sess, config.weightsFile(epoch))

                # evaluation
                getPreds = config.getPreds or (config.analysisType != "")

                evalRes = runEvaluation(sess, model, data["main"], dataOps, epoch, getPreds = getPreds, prevRes = evalRes)
                extraEvalRes = runEvaluation(sess, model, data["extra"], dataOps, epoch,
                                             evalTrain = not config.extraVal, getPreds = getPreds)

                # restore standard weights
                if config.useEMA:
                    print(bold("Restoring standard weights"))
                    saver.restore(sess, config.weightsFile(epoch))

                print("")

                epochTime = time.time() - start
                print("took {:.2f} seconds".format(epochTime))

                # print results
                printDatasetResults(trainRes, evalRes, extraEvalRes)

How to do parallel processing in pytorch

I am working on a deep learning problem and solving it with PyTorch. I have two GPUs on the same machine (16273MiB, 12193MiB). I want to use both GPUs for training on my video dataset.
I get a warning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
I also get an error:
raise TypeError('Broadcast function not implemented for CPU tensors')
TypeError: Broadcast function not implemented for CPU tensors
if __name__ == '__main__':
    opt.scales = [opt.initial_scale]
    for i in range(1, opt.n_scales):
        opt.scales.append(opt.scales[-1] * opt.scale_step)
    opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
    opt.mean = get_mean(opt.norm_value)
    opt.std = get_std(opt.norm_value)

    print("opt", opt)
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)

    torch.manual_seed(opt.manual_seed)
    model, parameters = generate_model(opt)
    #print(model)
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Total number of trainable parameters: ", pytorch_total_params)

    # Define class weights
    if opt.weighted:
        print("Weighted Loss is created")
        if opt.n_finetune_classes == 2:
            weight = torch.tensor([1.0, 3.0])
        else:
            weight = torch.ones(opt.n_finetune_classes)
    else:
        weight = None

    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = nn.DataParallel(criterion.cuda())

    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)

    train_loader = torch.utils.data.DataLoader(
        training_data,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.n_threads,
        pin_memory=True)

    train_logger = Logger(
        os.path.join(opt.result_path, 'train.log'),
        ['epoch', 'loss', 'acc', 'precision', 'recall', 'lr'])
    train_batch_logger = Logger(
        os.path.join(opt.result_path, 'train_batch.log'),
        ['epoch', 'batch', 'iter', 'loss', 'acc', 'precision', 'recall', 'lr'])

    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening

    optimizer = optim.SGD(
        parameters,
        lr=opt.learning_rate,
        momentum=opt.momentum,
        dampening=dampening,
        weight_decay=opt.weight_decay,
        nesterov=opt.nesterov)
    # scheduler = lr_scheduler.ReduceLROnPlateau(
    #     optimizer, 'min', patience=opt.lr_patience)

    if not opt.no_val:
        spatial_transform = Compose([
            Scale(opt.sample_size),
            CenterCrop(opt.sample_size),
            ToTensor(opt.norm_value), norm_method
        ])

    print('run')
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            adjust_learning_rate(optimizer, i, opt.lr_steps)
            train_epoch(i, train_loader, model, criterion, optimizer, opt,
                        train_logger, train_batch_logger)
I have also made changes in my train file:
model = nn.DataParallel(model(),device_ids=[0,1]).cuda()
outputs = model(inputs)
It does not seem to work properly and is giving an error. Please advise, I am new to PyTorch.
Thanks
As mentioned in this link, you have to do model.cuda() before passing it to nn.DataParallel.
net = nn.DataParallel(model.cuda(), device_ids=[0,1])
https://github.com/pytorch/pytorch/issues/17065
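To make that concrete, here is a minimal sketch of the usual DataParallel pattern (my addition; variable names such as train_loader are illustrative rather than taken from the training script above). Note that the loss function itself normally does not need to be wrapped in DataParallel:
import torch
import torch.nn as nn

# Move the model to the default GPU first, then wrap it for both devices.
model = model.cuda()
model = nn.DataParallel(model, device_ids=[0, 1])

criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # illustrative optimizer

for inputs, targets in train_loader:  # illustrative loader
    inputs = inputs.cuda(non_blocking=True)
    targets = targets.cuda(non_blocking=True)

    outputs = model(inputs)           # the batch is split across GPU 0 and GPU 1
    loss = criterion(outputs, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()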
