I'm using a MacBook Air running macOS Monterey 12.5 (an update to Ventura 13.1 is available).
I'm on Python 3.10.8 and have also tried 3.11.
Pylance reported that none of the imports I was trying to use could be resolved, so I changed the VS Code interpreter to Python 3.10.
Anyway, here's the code:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizerFast
from transformers import BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
# Load the NER corpus; each entry of the 'labels' column is split on
# whitespace into a list of word-level tags.
df = pd.read_csv('ner.csv')
labels = [i.split() for i in df['labels'].values.tolist()]

# Collect every distinct tag. set.update() already ignores duplicates, so no
# membership test (or throw-away list comprehension) is needed.
unique_labels = set()
for lb in labels:
    unique_labels.update(lb)
# print(unique_labels)

# Sorting makes the tag <-> id mapping deterministic across runs.
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
# print(labels_to_ids)

text = df['text'].values.tolist()
example = text[36]

# NOTE(review): the tokenizer is loaded from 'bert-base-uncased' while the
# model later in this script is loaded from 'bert-base-cased' -- confirm that
# the mismatch is intentional.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
text_tokenized = tokenizer(example, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
def align_label_example(tokenized_input, labels):
    """Align word-level labels with the tokens of one tokenized example.

    Reads the module-level ``labels_to_ids`` mapping and the
    ``label_all_tokens`` switch.

    Args:
        tokenized_input: Tokenizer output exposing ``word_ids()``, a sequence
            giving, for every token position, the index of the word it came
            from, or ``None`` when the token belongs to no word.
        labels: Word-level label strings of the example.

    Returns:
        A list with one entry per token: the label id for the first token of
        each word, and ``-100`` for tokens that belong to no word. Further
        tokens of the same word get the label id when ``label_all_tokens`` is
        true and ``-100`` otherwise.
    """
    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_input.word_ids():
        if word_idx is None:
            # Token is not part of any word: nothing to label.
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First token of a new word. A word without a (known) label is
            # masked out instead of aborting the whole alignment.
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except (IndexError, KeyError):
                label_ids.append(-100)
        else:
            # Continuation token of the word seen at the previous position.
            label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
        previous_word_idx = word_idx
    return label_ids
# Word-level tags of the example at index 36.
label = labels[36]
# Module-level switch read inside the alignment functions, which choose
# between the real label id and -100 depending on it.
label_all_tokens = False
# Token-level label ids for the tokenized example.
new_label = align_label_example(text_tokenized, label)
def align_label(texts, labels):
    """Tokenize ``texts`` and align its word-level labels with the tokens.

    Reads the module-level ``tokenizer``, the ``labels_to_ids`` mapping and
    the ``label_all_tokens`` switch.

    Args:
        texts: The raw text of one example; it is tokenized here with
            padding/truncation to 512 tokens.
        labels: Word-level label strings of the example.

    Returns:
        A list with one entry per token: the label id for the first token of
        each word, and ``-100`` for tokens that belong to no word. Further
        tokens of the same word get the label id when ``label_all_tokens`` is
        true and ``-100`` otherwise.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            # Token is not part of any word: nothing to label.
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First token of a new word. A word without a (known) label is
            # masked out instead of aborting the whole alignment.
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except (IndexError, KeyError):
                label_ids.append(-100)
        else:
            # Continuation token of the word seen at the previous position.
            label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
        previous_word_idx = word_idx
    return label_ids
class DataSequence(Dataset):
    """Dataset pairing each tokenized text with its token-aligned label ids.

    Args:
        df: DataFrame with a 'text' column and a 'labels' column whose
            entries are whitespace-separated word-level tags.
    """

    def __init__(self, df):
        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [
            tokenizer(str(i), padding='max_length', max_length=512, truncation=True, return_tensors='pt')
            for i in txt
        ]
        self.labels = [align_label(i, j) for i, j in zip(txt, lb)]

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def get_batch_data(self, idx):
        """Return the tokenized text stored at position ``idx``."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the label ids at position ``idx`` as a ``LongTensor``."""
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        """Return the ``(tokenized text, label tensor)`` pair at ``idx``."""
        return self.get_batch_data(idx), self.get_batch_labels(idx)
# Work on the first 1000 rows only.
df = df[0:1000]
# Shuffle all rows with a fixed seed, then cut at the 80% and 90% marks to
# obtain the train / validation / test frames.
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
[int(.8 * len(df)), int(.9 * len(df))])
class BertModel(torch.nn.Module):
    """``torch.nn.Module`` wrapper around ``BertForTokenClassification``.

    The classification head is sized to the number of entries in the
    module-level ``unique_labels`` set.
    """

    def __init__(self):
        super().__init__()
        num_tags = len(unique_labels)
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_tags)

    def forward(self, input_id, mask, label):
        """Run the wrapped model and return its output unchanged.

        ``return_dict=False`` is passed through, so the wrapped model is asked
        for a plain tuple rather than a dict-like object.
        """
        return self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
def _batch_metrics(logits, labels, loss):
    """Return (summed per-sample accuracy, per-sample-weighted loss) for a batch."""
    acc_sum = 0.0
    for sample_logits, sample_labels in zip(logits, labels):
        keep = sample_labels != -100  # drop positions that carry no label
        predictions = sample_logits[keep].argmax(dim=1)
        acc_sum += (predictions == sample_labels[keep]).float().mean().item()
    # The batch loss is counted once per sample so that dividing the running
    # total by the number of rows yields a per-sample average.
    return acc_sum, loss.item() * logits.shape[0]


def train_loop(model, df_train, df_val):
    """Train ``model`` on ``df_train`` and print metrics on ``df_val`` each epoch.

    Reads the module-level ``BATCH_SIZE``, ``LEARNING_RATE`` and ``EPOCHS``
    settings.

    Both loaders use ``num_workers=4``, i.e. worker processes. When worker
    processes are not started by fork, the main module is imported again in
    every worker, so this function must only be reached from under an
    ``if __name__ == '__main__':`` guard; otherwise Python raises
    "An attempt has been made to start a new process before the current
    process has finished its bootstrapping phase".

    Args:
        model: Module called as ``model(input_id, mask, label)`` and returning
            ``(loss, logits)``.
        df_train: Training DataFrame accepted by ``DataSequence``.
        df_val: Validation DataFrame accepted by ``DataSequence``.
    """
    train_dataloader = DataLoader(DataSequence(df_train), num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(DataSequence(df_val), num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
    if use_cuda:
        model = model.cuda()

    for epoch_num in range(EPOCHS):
        total_acc_train = 0
        total_loss_train = 0
        model.train()
        for train_data, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)
            batch_acc, batch_loss = _batch_metrics(logits, train_label, loss)
            total_acc_train += batch_acc
            total_loss_train += batch_loss
            loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0
        model.eval()
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_data['attention_mask'].squeeze(1).to(device)
                input_id = val_data['input_ids'].squeeze(1).to(device)

                loss, logits = model(input_id, mask, val_label)
                batch_acc, batch_loss = _batch_metrics(logits, val_label, loss)
                total_acc_val += batch_acc
                total_loss_val += batch_loss

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')
model = BertModel()
train_loop(model, df_train, df_val)
And the debugger says:
Exception has occurred: RuntimeError (note: full exception trace is shown but execution is paused at: <module>)
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
File "/Users/filipedonatti/Projects/pyCodes/", line 141, in train_loop
for train_data, train_label in tqdm(train_dataloader):
File "/Users/filipedonatti/Projects/pyCodes/", line 197, in <module>
train_loop(model, df_train, df_val)
File "<string>", line 1, in <module> (Current frame)
By the way,
Although I'm on a Mac and have installed Anaconda Navigator, I've been writing and running this code in VS Code. I've installed numpy, torch, datasets and other libraries through Brew with the pip3 command.
I'm at a loss. I can run the code in a Google Colab notebook or a Jupyter notebook, and I know that training models on my humble Mac is not advisable, but I'm only practising so that I can later train and use the model on a much more powerful machine.
Please help me with this issue, I've been trying to find a solution for days.
Peace and happy holidays.
I've tried solving the issue by writing:
if __name__ == '__main__':
I've tried using this:
import parallelTestModule
extractor = parallelTestModule.ParallelExtractor()
extractor.runInParallel(numProcesses=2, numThreads=4)
It turns out the correct way to solve this is to wrap the training call in a function and call that function under the `if __name__ == '__main__':` guard, like this:
def run():
    """Build the model and start training."""
    model = BertModel()
    train_loop(model, df_train, df_val)


# Worker processes that are not started by fork import this module again;
# the guard keeps training from being launched a second time during that
# import, which is what triggers the "bootstrapping phase" RuntimeError.
if __name__ == '__main__':
    run()
Redefining that train_loop line in the end. Issue solved. For more see this link:
I have customized a parameter in my model:
self.params = list(self.backbone.parameters())
for head in self.headlist:
self.params += list(head.parameters())
When I wrap my model with DDP, an error occurs when defining the optimizer
optimizer = optim.SGD(model.params,, momentum=FLAGS.momentum, weight_decay=FLAGS.weight_decay)
AttributeError 'DistributedDataParallel' object has no attribute 'params '
I think the error is probably caused by my customized "self.params"
Is the following code correct:
model = torch.nn.parallel.DistributedDataParallel(model,device_ids=local_rank)
model_without_ddp = model.module
optimizer = optim.SGD(model_without_ddp.params,, momentum=FLAGS.momentum, weight_decay=FLAGS.weight_decay)
Or is there any simpler code?
The detailed definition of the network is as follows:
# Pairs one shared backbone network with `num_heads` head networks
# (VGG16_FCN8s or Res_Deeplab, selected by `model_name`) and bundles the
# forward, validation and checkpoint helpers that operate on them.
# NOTE(review): this listing has lost its indentation and several statements
# look incomplete (loops without a body, assignments/calls missing their first
# part, branches without an `else:`). Confirm every note below against the
# original file before relying on it.
class multiheadModel():
# Create backbone, heads and a full single model on `device`, set which
# backbone parameters are trainable, and collect the parameter list.
def __init__(self, num_heads, device, model_name):
self.device = device
self.num_heads = num_heads # global+K
if model_name == 'fcn8s':
self.backbone = VGG16_FCN8s(num_classes=19, backbone=1, head=0).to(device)
self.headlist = [VGG16_FCN8s(num_classes=19, backbone=0, head=1).to(device) for i in range(num_heads)]
self.model = VGG16_FCN8s(num_classes=19).to(device)
for name, param in self.backbone.named_parameters():
if ('conv3' in name) or ('conv4' in name):
param.requires_grad = True
# NOTE(review): an `else:` presumably preceded the next line; as listed it
# would overwrite the flag that was just set -- confirm.
param.requires_grad = False
elif model_name == 'deeplab':
self.backbone = Res_Deeplab(num_classes=19, backbone=1, head=0).to(device)
self.headlist = [Res_Deeplab(num_classes=19, backbone=0, head=1).to(device) for i in range(num_heads)]
self.model = Res_Deeplab(num_classes=19).to(device)
for name, param in self.backbone.named_parameters():
if 'layer3' in name:
param.requires_grad = True
# NOTE(review): `required_grad` differs from the `requires_grad` attribute used
# everywhere else in this method; this looks like a typo that would leave the
# flag of these parameters untouched -- confirm. An `else:` is presumably
# missing before it as well.
param.required_grad = False
# NOTE(review): presumably the body of a final `else:` for unknown model names.
print('ERROR : wrong model name')
# Flat list holding the backbone parameters followed by every head's parameters.
self.params = list(self.backbone.parameters())
for head in self.headlist:
self.params += list(head.parameters())
self.loss_fn = None
#self.k2head = {0:2,1:1,2:0,3:0,4:0,5:4,6:4,7:5}
#self.k2head = {0:2,1:1,2:0,3:0,4:0,5:3,6:3,7:4}
# Lookup used by forward() to turn the input's 'k' value into a head index.
self.k2head = {0:2,1:1,2:0,3:0,4:3,5:3,6:4}
# set train and eval mode
# NOTE(review): the loop bodies of train() and eval() are missing from this listing.
def train(self):
for head in self.headlist:
def eval(self):
for head in self.headlist:
# Run the backbone once, then every head; each prediction is resized to the
# trailing dimensions (index 2 onward) of `rgb` and its loss is added to a
# running total. Returns the prediction of the last head and the summed loss.
# The `k` argument is not read in the visible body.
def computePredLoss(self, rgb, lbl, k):
x = self.backbone(rgb)
head_id = list(range(self.num_heads))
input_size = rgb.size()[2:]
loss = 0
for i in head_id:
pred = self.headlist[i](x)
pred = F.interpolate(pred, size=input_size, mode='bilinear', align_corners=True)
loss += self.loss_fn(pred, lbl)
return pred, loss
# Build the output dict: 'pred' and 'loss' when the input carries a "label",
# a single-head 'pred' otherwise.
# NOTE(review): an `else:` presumably separated the two paths; as listed the
# second path always runs and overwrites output['pred'] -- confirm.
def forward(self, input):
output = {}
if "label" in input:
pred,loss = self.computePredLoss(input['rgb'], input['label'], input['k'])
output['pred'], output['loss']=pred, loss
x = self.backbone(input['rgb'])
# Default to the last head (-1) unless the input names a 'k'.
k = -1
if "k" in input:
k = self.k2head[input['k']]
pred = self.headlist[k](x)
input_size = input['rgb'].size()[2:]
pred = F.interpolate(pred, size=input_size, mode='bilinear', align_corners=True)
output['pred'] = pred
return output
# Evaluate on `loader`: a single head when `k` differs from the -2 default,
# every head otherwise; scores come from StreamSegMetrics.get_results().
# NOTE(review): the `else:` between the two paths, the body of the
# `for metric in val_metrics:` loop and the left part of the `rgb_batch =` /
# `batch =` assignments are missing from this listing.
def validate(self, loader, k=-2):
if k!=-2:
val_metrics = StreamSegMetrics(19)
with torch.no_grad():
for i, (batch, rgb_batch) in enumerate(loader):
rgb_batch =, dtype=torch.float)
batch =, dtype=torch.int64)
input_size = rgb_batch.size()[2:]
x = self.backbone(rgb_batch)
pred = self.headlist[k](x)
pred = F.interpolate(pred, size=input_size, mode='bilinear', align_corners=True)
# Index [1] of max(dim=1) is the argmax along dimension 1.
preds = pred.detach().max(dim=1)[1].cpu().numpy()
targets = batch.cpu().numpy()
val_metrics.update(targets, preds)
score = val_metrics.get_results()
val_metrics = [StreamSegMetrics(19) for i in range(self.num_heads)]
for metric in val_metrics:
with torch.no_grad():
for i, (batch, rgb_batch) in enumerate(loader):
rgb_batch =, dtype=torch.float)
batch =, dtype=torch.int64)
input_size = rgb_batch.size()[2:]
x = self.backbone(rgb_batch)
for k in range(self.num_heads):
pred = self.headlist[k](x)
pred = F.interpolate(pred, size=input_size, mode='bilinear', align_corners=True)
preds = pred.detach().max(dim=1)[1].cpu().numpy()
targets = batch.cpu().numpy()
val_metrics[k].update(targets, preds)
score = [val_metrics[k].get_results() for k in range(self.num_heads)]
return score
# Derive the per-head checkpoint paths from `model_path`.
# NOTE(review): the statements that fill `head_paths` are missing from this
# listing, and an `else:` presumably preceded the second `base_path =` line.
def getHeadPaths(self, model_path, iteration=-1):
head_paths = []
if '_iter' in model_path:
base_path = model_path.split('_iter')[0]
base_path = model_path.split('.pth')[0]
if iteration==-1:
for i in range(self.num_heads-1):
for i in range(self.num_heads-1):
return head_paths
# Copy the backbone weights and then, per head, that head's weights into
# self.model (strict=False, so non-matching keys are ignored).
# NOTE(review): the call that writes to head_paths[i] is truncated in this listing.
def save(self, model_path, iteration=-1):
self.model.load_state_dict(self.backbone.state_dict(), strict=False)
head_paths = self.getHeadPaths(model_path, iteration)
for i in range(self.num_heads):
self.model.load_state_dict(self.headlist[i].state_dict(), strict=False), head_paths[i])
# Load weights into the backbone and the heads from self.model's state dict.
# NOTE(review): the statements that read the checkpoint files into self.model
# and the `else:` before the 'singlehead' branch are presumably missing.
def load(self, model_path):
iteration = -1
if '_iter' in model_path:
iteration = int(model_path.split('_iter')[1].split('.pth')[0])
self.backbone.load_state_dict(self.model.state_dict(), strict=False)
head_paths = self.getHeadPaths(model_path, iteration)
# Stays 1 only if every per-head checkpoint file exists.
existance = 1
for path in head_paths:
if os.path.isfile(path)==False:
existance = 0
if existance==1:
print('loading from multiheads')
for i in range(self.num_heads):
self.headlist[i].load_state_dict(self.model.state_dict(), strict=False)
print('loading from singlehead')
for i in range(self.num_heads):
self.headlist[i].load_state_dict(self.model.state_dict(), strict=False)
# Calling the object is the same as calling forward().
def __call__(self, input):
return self.forward(input)
My CNN needs three images as input, which I preprocess using ImageDataGenerator and flow_from_dataframe:
idg = ImageDataGenerator(rescale = 1./255)
A_gen = idg.flow_from_dataframe(df,directory = path,x_col = 'A',y_col = 'class',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = 'binary',seed=1,batch_size=batch_size)
B_gen = idg.flow_from_dataframe(df,directory = path,x_col = 'taste1',y_col = 'class',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = 'binary',seed=1,batch_size=batch_size)
C_gen = idg.flow_from_dataframe(df,directory = path,x_col = 'taste2',y_col = 'class',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = 'binary',seed=1,batch_size=batch_size)
Then, I put all the 3 generators in one, using:
def combine(A, B, C):
    """Merge three labelled batch iterators into one multi-input generator.

    Args:
        A, B, C: Iterators that are advanced in lockstep and each produce
            ``(inputs, labels)`` pairs.

    Yields:
        ``([inputs_A, inputs_B, inputs_C], labels_A)`` -- the labels are taken
        from ``A`` only. Iteration ends when any of the three iterators is
        exhausted.
    """
    # zip() pulls one batch from each iterator per step.
    for X1i, X2i, X3i in zip(A, B, C):
        yield [X1i[0], X2i[0], X3i[0]], X1i[1]
inputgenerator = combine(A_gen,B_gen,C_gen)
The beginning of my CNN look like this :
def simple_cnn():
pic_input1 = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))
pic_input2 = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))
pic_input3 = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))
cnn1 = BatchNormalization()(pic_input1)
cnn2 = BatchNormalization()(pic_input2)
cnn3 = BatchNormalization()(pic_input3)
... (rest is not relevant I guess)
Then, I fit my model using :,steps_per_epoch=len(df) / batch_size, epochs=4)
Up to this point, everything works perfectly. (I know I need to use a validation set etc., but first I want to make sure I know how to deal with multiple generators.)
But, when I want to make prediction, with my testgenerator that is :
idg2 = ImageDataGenerator(rescale = 1./255)
D_gen = idg2.flow_from_dataframe(df2,directory = path,x_col = 'D',y_col = 'None',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = None,seed=1,batch_size=1)
E_gen = idg2.flow_from_dataframe(df2,directory = path,x_col = 'E',y_col = 'None',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = None,seed=1,batch_size=1)
F_gen = idg2.flow_from_dataframe(df2,directory = path,x_col = 'F',y_col = 'None',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = None,seed=1,batch_size=1)
testgenerator = combine_test(D_gen,E_gen,F_gen)
pred = model.predict(testgenerator)
def combine_test(A,B,C):
while True:
X1i =
X2i =
X3i =
yield [X1i[0], X2i[0],X3i[0]]
I got the following error :
Traceback (most recent call last):
File "/home/maeul/Documents/ETHZ/2ndSemester/IntroToMachineLearning/Task4/", line 228, in <module>
pred = model.predict(testgenerator)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/", line 1013, in predict
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/", line 498, in predict
workers=workers, use_multiprocessing=use_multiprocessing, **kwargs)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/", line 426, in _model_iteration
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/", line 706, in _process_inputs
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/", line 767, in __init__
dataset = standardize_function(dataset)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/", line 684, in standardize_function
return, num_parallel_calls=dataset_ops.AUTOTUNE)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/data/ops/", line 1591, in map
self, map_func, num_parallel_calls, preserve_cardinality=True)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/data/ops/", line 3926, in __init__
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/data/ops/", line 3147, in __init__
self._function = wrapper_fn._get_concrete_function_internal()
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/", line 2395, in _get_concrete_function_internal
*args, **kwargs)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/", line 2389, in _get_concrete_function_internal_garbage_collected
graph_function, _, _ = self._maybe_define_function(args, kwargs)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/", line 2703, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/", line 2593, in _create_graph_function
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/", line 978, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/data/ops/", line 3140, in wrapper_fn
ret = _wrapper_helper(*args)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/data/ops/", line 3082, in _wrapper_helper
ret = autograph.tf_convert(func, ag_ctx)(*nested_args)
File "/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/ map_fn
/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/ _standardize_tensors
/home/maeul/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/ standardize_input_data
'with shape ' + str(data_shape))
ValueError: Error when checking input: expected input_10 to have 4 dimensions, but got array with shape (None, None, None)
I guess this is related to the batch size of a single generator, but I don't know how to "trick" the model.predict by adding a trivial dimension in each image generated...
Thanks in advance for your help !
I finally found an answer. Since the generators below have no labels (class_mode = None):
D_gen = idg2.flow_from_dataframe(df2,directory = path,x_col = 'D',y_col = 'None',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = None,seed=1,batch_size=1)
E_gen = idg2.flow_from_dataframe(df2,directory = path,x_col = 'E',y_col = 'None',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = None,seed=1,batch_size=1)
F_gen = idg2.flow_from_dataframe(df2,directory = path,x_col = 'F',y_col = 'None',target_size = (IMG_HEIGHT,IMG_WIDTH),
class_mode = None,seed=1,batch_size=1)
This means that each generator yields a bare batch of images rather than an (images, labels) pair. So when I write Xi[0] in combine_test, I am taking the first element of the batch instead of the whole batch (my terminology may be off, but that is how I understand it with my basic knowledge), which drops the batch dimension. I therefore need to replace Xi[0] with simply Xi.
To fix this error, modify the "combine_test" function as follows:
def combine_test(A, B, C):
    """Merge three label-free batch iterators into one multi-input generator.

    Args:
        A, B, C: Iterators that are advanced in lockstep; each item is used
            whole, without indexing into it.

    Yields:
        ``[item_A, item_B, item_C]``. Iteration ends when any of the three
        iterators is exhausted.
    """
    # zip() pulls one batch from each iterator per step.
    for X1i, X2i, X3i in zip(A, B, C):
        yield [X1i, X2i, X3i]
I have downloaded a code of FCN for image segmentation and it ran well. Now I want to add a rnn layer attempting to refine the result according to the work "ReSeg: A Recurrent Neural Network-Based Model for Semantic Segmentation". My code shows as follows:
This part is for the inference:
# Build the segmentation graph: VGG layers, three 1x1/7x7 conv stages
# (conv6-conv8), three transposed convolutions back to the image size, and
# two bidirectional LSTM passes appended after them. Returns the argmax
# prediction (with a trailing axis added) and the tensor it was computed from.
# NOTE(review): this listing has lost its indentation and several pieces
# (docstring quotes, `try:` lines, call prefixes, continuation arguments).
# Confirm the notes below against the original file.
def inference(image, keep_prob):
# NOTE(review): the next three lines were presumably a docstring whose quotes were lost.
Semantic segmentation network definition
:param image: input image. Should have values in range 0-255
:param keep_prob:
print("setting up vgg initialized conv layers ...")
#model_data = utils.get_model_data(FLAGS.model_dir, MODEL_URL)
# NOTE(review): the function that loads the .mat file is missing before the
# opening parenthesis.
model_data ="H:/Deep Learning/FCN.tensorflow-master/imagenet-vgg-verydeep-19.mat")
mean = model_data['normalization'][0][0][0]
mean_pixel = np.mean(mean, axis=(0, 1))
weights = np.squeeze(model_data['layers'])
processed_image = utils.process_image(image, mean_pixel)
with tf.variable_scope("inference"):
image_net = vgg_net(weights, processed_image)
conv_final_layer = image_net["conv5_3"]
pool5 = utils.max_pool_2x2(conv_final_layer)
W6 = utils.weight_variable([7, 7, 512, 4096], name="W6")
b6 = utils.bias_variable([4096], name="b6")
conv6 = utils.conv2d_basic(pool5, W6, b6)
relu6 = tf.nn.relu(conv6, name="relu6")
# NOTE(review): the body of this `if FLAGS.debug:` is missing from the listing.
if FLAGS.debug:
relu_dropout6 = tf.nn.dropout(relu6, keep_prob=keep_prob)
W7 = utils.weight_variable([1, 1, 4096, 4096], name="W7")
b7 = utils.bias_variable([4096], name="b7")
conv7 = utils.conv2d_basic(relu_dropout6, W7, b7)
relu7 = tf.nn.relu(conv7, name="relu7")
# NOTE(review): the body of this `if FLAGS.debug:` is missing from the listing.
if FLAGS.debug:
relu_dropout7 = tf.nn.dropout(relu7, keep_prob=keep_prob)
W8 = utils.weight_variable([1, 1, 4096, NUM_OF_CLASSESS], name="W8")
b8 = utils.bias_variable([NUM_OF_CLASSESS], name="b8")
conv8 = utils.conv2d_basic(relu_dropout7, W8, b8)
# annotation_pred1 = tf.argmax(conv8, dimension=3, name="prediction1")
# now to upscale to actual image size
deconv_shape1 = image_net["pool4"].get_shape()
W_t1 = utils.weight_variable([4, 4, deconv_shape1[3].value, NUM_OF_CLASSESS], name="W_t1")
b_t1 = utils.bias_variable([deconv_shape1[3].value], name="b_t1")
conv_t1 = utils.conv2d_transpose_strided(conv8, W_t1, b_t1, output_shape=tf.shape(image_net["pool4"]))
#fuse_1 = tf.add(conv_t1, image_net["pool4"], name="fuse_1")
deconv_shape2 = image_net["pool3"].get_shape()
W_t2 = utils.weight_variable([4, 4, deconv_shape2[3].value, deconv_shape1[3].value], name="W_t2")
b_t2 = utils.bias_variable([deconv_shape2[3].value], name="b_t2")
conv_t2 = utils.conv2d_transpose_strided(conv_t1, W_t2, b_t2, output_shape=tf.shape(image_net["pool3"]))
#fuse_2 = tf.add(conv_t2, image_net["pool3"], name="fuse_2")
shape = tf.shape(image)
deconv_shape3 = tf.stack([shape[0], shape[1], shape[2], NUM_OF_CLASSESS])
W_t3 = utils.weight_variable([16, 16, NUM_OF_CLASSESS, deconv_shape2[3].value], name="W_t3")
b_t3 = utils.bias_variable([NUM_OF_CLASSESS], name="b_t3")
conv_t3 = utils.conv2d_transpose_strided(conv_t2, W_t3, b_t3, output_shape=deconv_shape3, stride=8)
# NOTE(review): the slash-only marker line below (and the matching one further
# down) is not valid Python; it delimits the part the author added.
/////////////////////////////////////////////////////this is from where i added the rnn
shape_5 = tf.shape(image)
# Hard-coded spatial size used by the reshapes and unstacks below.
W_a = 224
H_a = 224
p_size_a = NUM_OF_CLASSESS
# x = tf.reshape(conv_t1, [shape_5[0],H_a,W_a, p_size_a])
# First pass: swap axes 1 and 2, flatten to [-1, H_a, p_size_a] and split along
# axis 1 into H_a inputs for a bidirectional LSTM.
x = tf.transpose(conv_t3, perm=[0,2,1,3])
x = tf.reshape(x,[-1,H_a,p_size_a])
mat = tf.unstack(x, H_a, 1)
lstm_fw_cell = rnn.BasicLSTMCell(N_HIDDEN, forget_bias=1.0)
lstm_bw_cell = rnn.BasicLSTMCell(N_HIDDEN, forget_bias=1.0)
#with tf.variable_scope('rnn1_1'):
# NOTE(review): the `try:` line and the continuation arguments of the two calls
# below are missing from the listing.
outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, mat,
except Exception: # Old TensorFlow version only returns outputs not states
outputs = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, mat,
outputs1 = tf.reshape(outputs,[H_a, shape_5[0], W_a, 2 * N_HIDDEN])
outputs1 = tf.transpose(outputs1,(1,0,2,3))
# Second pass: same procedure, split along axis 1 into W_a inputs.
x_1 = tf.reshape(outputs1,[-1,W_a,2 * N_HIDDEN])
mat_1 = tf.unstack(x_1, W_a, 1)
lstm_lw_cell = rnn.BasicLSTMCell(N_HIDDEN, forget_bias=1.0)
lstm_rw_cell = rnn.BasicLSTMCell(N_HIDDEN, forget_bias=1.0)
#with tf.variable_scope('rnn1_2'):
# NOTE(review): the `try:` line and part of the fallback call are missing here too.
outputs2, _, _ = rnn.static_bidirectional_rnn(lstm_lw_cell, lstm_rw_cell, mat_1,
dtype=tf.float32,scope = 'rnn1_2')
except Exception: # Old TensorFlow version only returns outputs not states
outputs2 = rnn.static_bidirectional_rnn(lstm_lw_cell, lstm_rw_cell, mat_1,
# NOTE(review): this reshapes `outputs` (result of the first pass) rather than
# the `outputs2` just computed, so the second pass would be discarded -- looks
# like a copy/paste slip; confirm.
outputs2 = tf.reshape(outputs,[W_a, shape_5[0], H_a, 2 * N_HIDDEN])
outputs2 = tf.transpose(outputs2,(1,2,0,3))
///////////////////////////////////////////////////till here
# NOTE(review): the reshape above gives the last axis size 2 * N_HIDDEN, not
# NUM_OF_CLASSESS; verify that this is the intended class axis for argmax.
annotation_pred = tf.argmax(outputs2, dimension=3, name="prediction")
return tf.expand_dims(annotation_pred, dim=3), outputs2
and this part is for the training:
def train(loss_val, var_list):
    """Create the Adam update op that minimizes ``loss_val`` over ``var_list``.

    The learning rate comes from ``FLAGS.learning_rate``. When ``FLAGS.debug``
    is set, a gradient summary is registered for every (gradient, variable)
    pair before the update op is built.
    """
    adam = tf.train.AdamOptimizer(FLAGS.learning_rate)
    grads_and_vars = adam.compute_gradients(loss_val, var_list=var_list)
    if FLAGS.debug:
        for gradient, variable in grads_and_vars:
            utils.add_gradient_summary(gradient, variable)
    train_op = adam.apply_gradients(grads_and_vars)
    return train_op
# Entry point: builds placeholders, the inference graph, loss, summaries and
# the train op, restores a checkpoint from FLAGS.logs_dir when one exists, then
# either trains (FLAGS.mode == "train") or saves example predictions
# (FLAGS.mode == "visualize").
# NOTE(review): this listing has lost its indentation and several calls are
# truncated (their first part is missing before the first comma). Confirm the
# notes below against the original file.
def main(argv=None):
keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")
image = tf.placeholder(tf.float32, shape=[None, IMAGE_SIZE, IMAGE_SIZE, 3], name="input_image")
annotation = tf.placeholder(tf.int32, shape=[None, IMAGE_SIZE, IMAGE_SIZE, 1], name="annotation")
pred_annotation, logits = inference(image, keep_probability)
tf.summary.image("input_image", image, max_outputs=2)
tf.summary.image("ground_truth", tf.cast(annotation, tf.uint8), max_outputs=2)
tf.summary.image("pred_annotation", tf.cast(pred_annotation, tf.uint8), max_outputs=2)
# NOTE(review): the closing part of this loss expression is missing from the listing.
loss = tf.reduce_mean((tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
labels=tf.squeeze(annotation, squeeze_dims=[3]),
tf.summary.scalar("entropy", loss)
trainable_var = tf.trainable_variables()
# NOTE(review): the body of the debug loop below is missing from the listing.
if FLAGS.debug:
for var in trainable_var:
train_op = train(loss, trainable_var)
print("Setting up summary op...")
summary_op = tf.summary.merge_all()
print("Setting up image reader...")
train_records, valid_records = scene_parsing.read_dataset(FLAGS.data_dir)
print("Setting up dataset reader")
image_options = {'resize': True, 'resize_size': IMAGE_SIZE}
if FLAGS.mode == 'train':
train_dataset_reader = dataset.BatchDatset(train_records, image_options)
validation_dataset_reader = dataset.BatchDatset(valid_records, image_options)
sess = tf.Session()
print("Setting up Saver...")
# The Saver is created without arguments, after the whole graph has been built.
saver = tf.train.Saver()
summary_writer = tf.summary.FileWriter(FLAGS.logs_dir, sess.graph)
# NOTE(review): no variable-initialization call is visible before the restore
# below; presumably it was lost from the listing -- confirm.
ckpt = tf.train.get_checkpoint_state(FLAGS.logs_dir)
if ckpt and ckpt.model_checkpoint_path:
# NOTE(review): presumably this restore is where the reported
# "Key inference/rnn1_2/... not found in checkpoint" error is raised when the
# checkpoint predates the added RNN variables -- confirm.
saver.restore(sess, ckpt.model_checkpoint_path)
print("Model restored...")
if FLAGS.mode == "train":
for itr in xrange(MAX_ITERATION):
train_images, train_annotations = train_dataset_reader.next_batch(FLAGS.batch_size)
# NOTE(review): a truncated call (presumably running the train op) is fused
# onto the end of the next line.
feed_dict = {image: train_images, annotation: train_annotations, keep_probability: 0.85}, feed_dict=feed_dict)
# Every 10 iterations: print the training loss and write the summaries.
if itr % 10 == 0:
train_loss, summary_str =[loss, summary_op], feed_dict=feed_dict)
print("Step: %d, Train_loss:%g" % (itr, train_loss))
summary_writer.add_summary(summary_str, itr)
# Every 500 iterations: evaluate on one validation batch with dropout disabled
# (keep_probability 1.0).
if itr % 500 == 0:
valid_images, valid_annotations = validation_dataset_reader.next_batch(FLAGS.batch_size)
valid_loss =, feed_dict={image: valid_images, annotation: valid_annotations,
keep_probability: 1.0})
# NOTE(review): a truncated call writing "model.ckpt" under FLAGS.logs_dir is
# fused onto the end of the next line.
print("%s ---> Validation_loss: %g" % (, valid_loss)), FLAGS.logs_dir + "model.ckpt", itr)
elif FLAGS.mode == "visualize":
valid_images, valid_annotations = validation_dataset_reader.get_random_batch(FLAGS.batch_size)
pred =, feed_dict={image: valid_images, annotation: valid_annotations,
keep_probability: 1.0})
# Drop axis 3 before saving the images.
valid_annotations = np.squeeze(valid_annotations, axis=3)
pred = np.squeeze(pred, axis=3)
for itr in range(FLAGS.batch_size):
utils.save_image(valid_images[itr].astype(np.uint8), FLAGS.logs_dir, name="inp_" + str(5+itr))
utils.save_image(valid_annotations[itr].astype(np.uint8), FLAGS.logs_dir, name="gt_" + str(5+itr))
utils.save_image(pred[itr].astype(np.uint8), FLAGS.logs_dir, name="pred_" + str(5+itr))
print("Saved image: %d" % itr)
The error was described as:
Not found: Key inference/rnn1_2/fw/basic_lstm_cell/weights not found in checkpoint
So I think there must be something wrong with the variables.
I would really appreciate it if someone could tell me how to fix it!
Looking forward to your help!
The following code is my pipeline for reading images and labels from files:
import tensorflow as tf
import numpy as np
import tflearn.data_utils
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
import sys
#process labels in the input file
# NOTE(review): only the return statement is visible; `info` is never assigned
# in the shown body, so the label-parsing logic appears to be missing from
# this listing.
def process_label(label):
return info
# Read a comma-separated label file and return (filepaths, stacked labels, lines).
# NOTE(review): the statements that fill `filepaths`, `labels` and `lines` from
# `tokens` are missing from this listing; as shown, np.vstack would receive an
# empty list.
# NOTE(review): the file handle is opened but never closed in the visible code.
def read_label_file(file):
f = open(file, "r")
filepaths = []
labels = []
lines = []
for line in f:
tokens = line.split(",")
return filepaths, np.vstack(labels), lines
# Build the queue-based input pipeline for the train and test sets and return
# batched image / label (and optionally line-info) tensors, selected by
# `params.loadSlice`.
# NOTE(review): this listing has lost its indentation and several branch
# headers (`if` / `else`) are missing. Confirm the notes below against the
# original file.
def get_data_batches(params):
# reading labels and file path
train_filepaths, train_labels, train_line = read_label_file(params.train_info)
test_filepaths, test_labels, test_line = read_label_file(params.test_info)
# convert string into tensors
train_images = ops.convert_to_tensor(train_filepaths)
train_labels = ops.convert_to_tensor(train_labels)
train_line = ops.convert_to_tensor(train_line)
test_images = ops.convert_to_tensor(test_filepaths)
test_labels = ops.convert_to_tensor(test_labels)
test_line = ops.convert_to_tensor(test_line)
# create input queues
train_input_queue = tf.train.slice_input_producer([train_images, train_labels, train_line], shuffle=params.shuffle)
test_input_queue = tf.train.slice_input_producer([test_images, test_labels, test_line],shuffle=False)
# process path and string tensor into an image and a label
# Each example holds several file paths; every image is loaded, scaled to
# [0, 1], resized, randomly cropped and flipped, standardized, and the results
# are concatenated along axis 2.
for i in range(train_input_queue[0].get_shape()[0]):
file_content = tf.read_file(params.path_prefix+train_input_queue[0][i])
train_imageT = (tf.to_float(tf.image.decode_jpeg(file_content, channels=params.num_channels)))*(1.0/255)
train_imageT = tf.image.resize_images(train_imageT,[params.load_size[0],params.load_size[1]])
train_imageT = tf.random_crop(train_imageT,size=[params.crop_size[0],params.crop_size[1],params.num_channels])
train_imageT = tf.image.random_flip_up_down(train_imageT)
train_imageT = tf.image.per_image_standardization(train_imageT)
# NOTE(review): presumably `if i == 0:` / `else:` selected between the next two
# lines; as listed both would run on every iteration -- confirm.
train_image = train_imageT
train_image = tf.concat([train_image, train_imageT], 2)
train_label = train_input_queue[1]
train_lineInfo = train_input_queue[2]
# Same processing for the test images, but with a deterministic central crop
# and no flip.
for i in range(test_input_queue[0].get_shape()[0]):
file_content = tf.read_file(params.path_prefix+test_input_queue[0][i])
test_imageT = tf.to_float(tf.image.decode_jpeg(file_content, channels=params.num_channels))*(1.0/255)
test_imageT = tf.image.resize_images(test_imageT,[params.load_size[0],params.load_size[1]])
test_imageT = tf.image.central_crop(test_imageT, (params.crop_size[0]+0.0)/params.load_size[0])
test_imageT = tf.image.per_image_standardization(test_imageT)
# NOTE(review): same missing `if` / `else` as in the training loop above.
test_image = test_imageT
test_image = tf.concat([test_image, test_imageT],2)
test_label = test_input_queue[1]
test_lineInfo = test_input_queue[2]
# define tensor shape
# The channel count is fixed at num_channels*3, i.e. three concatenated images.
train_image.set_shape([params.crop_size[0], params.crop_size[1], params.num_channels*3])
test_image.set_shape( [params.crop_size[0], params.crop_size[1], params.num_channels*3])
# collect batches of images before processing
# NOTE(review): the slice_input_producer calls above set no num_epochs, so the
# queues presumably cycle indefinitely and allow_smaller_final_batch never
# takes effect -- confirm.
train_image_batch, train_label_batch, train_lineno = tf.train.batch([train_image, train_label, train_lineInfo],batch_size=params.batch_size,num_threads=params.num_threads,allow_smaller_final_batch=True)
test_image_batch, test_label_batch, test_lineno = tf.train.batch([test_image, test_label, test_lineInfo],batch_size=params.test_size,num_threads=params.num_threads,allow_smaller_final_batch=True)
# NOTE(review): the leading `if params.loadSlice == ...:` and the final `else:`
# of this dispatch are missing from the listing.
return train_image_batch, train_label_batch, train_lineno, test_image_batch, test_label_batch, test_lineno
elif params.loadSlice=='train':
return train_image_batch, train_label_batch
elif params.loadSlice=='test':
return test_image_batch, test_label_batch
elif params.loadSlice=='train_info':
return train_image_batch, train_label_batch, train_lineno
elif params.loadSlice=='test_info':
return test_image_batch, test_label_batch, test_lineno
return train_image_batch, train_label_batch, test_image_batch, test_label_batch
I want to use the same pipeline for loading the test data. My test set is huge, so I cannot load all of it at once.
I have 20453 test examples, which is not an integer multiple of the batch size (here 512).
How can I read every test example through this pipeline exactly once and then measure the performance on them?
Currently, I am using the code below to batch my test data, and it does not work: it always reads a full batch from the queue, even when I set allow_smaller_final_batch to True.
with tf.Session() as sess:
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
more = True
img_test, lbl_test,[test_image_batch,test_label_batch,test_lineno])
num_examples += size
if size<args.batch_size:
more = False
This is the code of my model:
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.normalization import batch_normalization
from tflearn.layers.estimator import regression
from tflearn.activations import relu
def get_alexnet(x, num_output):
    """Build an AlexNet-style trunk on ``x`` with three fully connected heads.

    The trunk is five convolutions, each followed by batch normalization and
    ReLU, with 3x3 stride-2 max pooling after the first, second and fifth
    convolution. Two 4096-unit fully connected stages follow, each with batch
    normalization, ReLU and ``dropout(network, 0.5)``.

    Returns:
        A tuple of three fully connected outputs of width ``num_output``,
        12 and 6, all fed by the same trunk.
    """

    def bn_relu(tensor, eps=0.001):
        # Batch-normalize, then apply ReLU.
        return relu(batch_normalization(tensor, epsilon=eps))

    net = bn_relu(conv_2d(x, 64, 11, strides=4))
    net = max_pool_2d(net, 3, strides=2)
    net = bn_relu(conv_2d(net, 192, 5))
    net = max_pool_2d(net, 3, strides=2)
    # The third convolution is the only one normalized with epsilon=0.0001.
    net = bn_relu(conv_2d(net, 384, 3), eps=0.0001)
    net = bn_relu(conv_2d(net, 256, 3))
    net = bn_relu(conv_2d(net, 256, 3))
    net = max_pool_2d(net, 3, strides=2)

    for _ in range(2):
        net = bn_relu(fully_connected(net, 4096))
        net = dropout(net, 0.5)

    head_main = fully_connected(net, num_output)
    head_12 = fully_connected(net, 12)
    head_6 = fully_connected(net, 6)
    return head_main, head_12, head_6
This can be achieved simply by setting num_epochs=1 and allow_smaller_final_batch=True!
Another solution is to set batch_size equal to the size of the test set.