Tensorflow/Keras: volatile validation loss

I've been training a U-Net for single-class small lesion segmentation, and have been getting consistently volatile validation loss. I have about 20k images split 70/30 between training and validation sets, so I don't think the issue is too little data. I've tried shuffling and resplitting the sets a few times with no change in volatility, so I don't think the validation set is unrepresentative. I have tried lowering the learning rate with no effect on volatility. And I have tried a few loss functions (dice coefficient, focal Tversky, weighted binary cross-entropy). I'm using a decent amount of augmentation so as to avoid overfitting. I've also run through all my data (512x512 float64s with corresponding 512x512 int64 masks, both stored as numpy arrays) to double-check that the value ranges, dtypes, etc. aren't screwy, and I even removed any ROIs in the masks under 35 pixels in area, which I thought might be artifacts messing with the loss.
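For reference, a minimal sketch of that kind of dtype/range audit (the file names here are hypothetical):

import numpy as np

# Hypothetical file names; spot-check dtype, shape, and value range of one image/mask pair.
img = np.load('train_ct/train/case_0001.npy')
mask = np.load('train_masks/train/case_0001.npy')
assert img.shape == (512, 512) and mask.shape == (512, 512)
print(img.dtype, img.min(), img.max())   # expect float64 in a sane intensity range
print(mask.dtype, np.unique(mask))       # expect int64 with values {0, 1}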
I'm using keras ImageDataGenerator.flow_from_directory. I was initially using zca_whitening and brightness_range augmentation, but I think these cause issues with flow_from_directory (the link between mask and image gets lost), so I skipped them.
I've tried validation generators with and without shuffle=True. Batch size is 8.
Here's some of my code, happy to include more if it would help:
# loss
from keras.losses import binary_crossentropy
import keras.backend as K
import tensorflow as tf

epsilon = 1e-5
smooth = 1

def dsc(y_true, y_pred):
    # Dice similarity coefficient
    smooth = 1.
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    score = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
    return score

def dice_loss(y_true, y_pred):
    loss = 1 - dsc(y_true, y_pred)
    return loss

def bce_dice_loss(y_true, y_pred):
    loss = binary_crossentropy(y_true, y_pred) + dice_loss(y_true, y_pred)
    return loss

def confusion(y_true, y_pred):
    # returns smoothed (precision, recall)
    smooth = 1
    y_pred_pos = K.clip(y_pred, 0, 1)
    y_pred_neg = 1 - y_pred_pos
    y_pos = K.clip(y_true, 0, 1)
    y_neg = 1 - y_pos
    tp = K.sum(y_pos * y_pred_pos)
    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)
    prec = (tp + smooth) / (tp + fp + smooth)
    recall = (tp + smooth) / (tp + fn + smooth)
    return prec, recall

def tp(y_true, y_pred):
    # true-positive rate (sensitivity)
    smooth = 1
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pos = K.round(K.clip(y_true, 0, 1))
    tp = (K.sum(y_pos * y_pred_pos) + smooth) / (K.sum(y_pos) + smooth)
    return tp

def tn(y_true, y_pred):
    # true-negative rate (specificity)
    smooth = 1
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos
    tn = (K.sum(y_neg * y_pred_neg) + smooth) / (K.sum(y_neg) + smooth)
    return tn

def tversky(y_true, y_pred):
    y_true_pos = K.flatten(y_true)
    y_pred_pos = K.flatten(y_pred)
    true_pos = K.sum(y_true_pos * y_pred_pos)
    false_neg = K.sum(y_true_pos * (1 - y_pred_pos))
    false_pos = K.sum((1 - y_true_pos) * y_pred_pos)
    alpha = 0.7  # weights false negatives more heavily than false positives
    return (true_pos + smooth) / (true_pos + alpha * false_neg + (1 - alpha) * false_pos + smooth)

def tversky_loss(y_true, y_pred):
    return 1 - tversky(y_true, y_pred)

def focal_tversky(y_true, y_pred):
    pt_1 = tversky(y_true, y_pred)
    gamma = 0.75
    return K.pow((1 - pt_1), gamma)
model = BlockModel((len(os.listdir(os.path.join(imageroot, 'train_ct', 'train'))), 512, 512, 1),
                   filt_num=16, numBlocks=4)
#model.compile(optimizer=Adam(learning_rate=0.001), loss=weighted_cross_entropy)
#model.compile(optimizer=Adam(learning_rate=0.001), loss=dice_coef_loss)
model.compile(optimizer=Adam(learning_rate=0.001), loss=focal_tversky)

train_mask = os.path.join(imageroot, 'train_masks')
val_mask = os.path.join(imageroot, 'val_masks')

model.load_weights(model_weights_path)  # I'm initializing with pre-trained weights from a similar model

data_gen_args_mask = dict(
    rotation_range=10,
    shear_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=[0.8, 1.2],
    horizontal_flip=True,
    #vertical_flip=True,
    fill_mode='nearest',
    data_format='channels_last'
)
data_gen_args = dict(
    **data_gen_args_mask
)

image_datagen_train = ImageDataGenerator(**data_gen_args)
mask_datagen_train = ImageDataGenerator(**data_gen_args)  # could also use **data_gen_args_mask; the two dicts are identical here
image_datagen_val = ImageDataGenerator()
mask_datagen_val = ImageDataGenerator()

seed = 1
BS = 8
steps = int(np.floor(len(os.listdir(os.path.join(train_ct, 'train'))) / BS))
print(steps)
val_steps = int(np.floor(len(os.listdir(os.path.join(val_ct, 'val'))) / BS))
print(val_steps)
train_image_generator = image_datagen_train.flow_from_directory(
    train_ct,
    target_size=(512, 512),
    color_mode="grayscale",
    classes=None,
    class_mode=None,
    seed=seed,
    shuffle=True,
    batch_size=BS)
train_mask_generator = mask_datagen_train.flow_from_directory(
    train_mask,
    target_size=(512, 512),
    color_mode="grayscale",
    classes=None,
    class_mode=None,
    seed=seed,
    shuffle=True,
    batch_size=BS)
val_image_generator = image_datagen_val.flow_from_directory(
    val_ct,
    target_size=(512, 512),
    color_mode="grayscale",
    classes=None,
    class_mode=None,
    seed=seed,
    shuffle=True,
    batch_size=BS)
val_mask_generator = mask_datagen_val.flow_from_directory(
    val_mask,
    target_size=(512, 512),
    color_mode="grayscale",
    classes=None,
    class_mode=None,
    seed=seed,
    shuffle=True,
    batch_size=BS)
train_generator = zip(train_image_generator, train_mask_generator)
val_generator = zip(val_image_generator, val_mask_generator)

# make callback for checkpointing
plot_losses = PlotLossesCallback(skip_first=0, plot_extrema=False)
%matplotlib inline
filepath = os.path.join(versionPath, model_version + "_saved-model-{epoch:02d}-{val_loss:.2f}.hdf5")
if reduce:
    cb_check = [ModelCheckpoint(filepath, monitor='val_loss',
                                verbose=1, save_best_only=False,
                                save_weights_only=True, mode='auto', period=1),
                reduce_lr,
                plot_losses]
else:
    cb_check = [ModelCheckpoint(filepath, monitor='val_loss',
                                verbose=1, save_best_only=False,
                                save_weights_only=True, mode='auto', period=1),
                plot_losses]

# train model
history = model.fit_generator(train_generator, epochs=numEp,
                              steps_per_epoch=steps,
                              validation_data=val_generator,
                              validation_steps=val_steps,
                              verbose=1,
                              callbacks=cb_check,
                              use_multiprocessing=False)
And here's how my loss looks:
Another potentially relevant thing: I tweaked the flow_from_directory source a bit (added npy to the whitelist of file formats). But the training loss looks fine, so I'm assuming the issue isn't here.

Two suggestions:
1. Switch to the classic validation data format (i.e. a numpy array) instead of using a generator; this ensures you always use exactly the same validation data at every epoch. If the validation curve then changes, something "random" in the validation generator was giving you different data at different epochs. A sketch follows after these suggestions.
2. Use a fixed set of samples (100 or 1000 should be enough) without any data augmentation for both training and validation. If everything is wired correctly, the network should quickly overfit this subset, and the training and validation curves should look very similar. If not, debug your network.
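A minimal sketch of the first suggestion, assuming the 512x512 .npy files can be stacked into arrays (the directory names mirror the question and are otherwise hypothetical):

import os
import numpy as np

def load_fixed_set(img_dir, mask_dir):
    # Stack every .npy file into one (N, 512, 512, 1) array pair.
    names = sorted(os.listdir(img_dir))
    X = np.stack([np.load(os.path.join(img_dir, n)) for n in names])[..., None].astype('float32')
    y = np.stack([np.load(os.path.join(mask_dir, n)) for n in names])[..., None].astype('float32')
    return X, y

X_val, y_val = load_fixed_set('val_ct/val', 'val_masks/val')

# Passing arrays instead of a generator pins the validation data across epochs.
history = model.fit_generator(train_generator, epochs=numEp,
                              steps_per_epoch=steps,
                              validation_data=(X_val, y_val),
                              verbose=1, callbacks=cb_check)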

Related

Is there a way to extract the variables from `lmfit` report?

I'm using the python package lmfit to fit my dataset with this model:
def GaussianFit(results, highest_num, Peak_shot, nuni, dif=None):
    ...
    Gauss_mod = GaussianModel(prefix='gauss_')
    Const_mod = ConstantModel(prefix='const_')
    mod = Gauss_mod + Const_mod
    pars = mod.make_params(gauss_center=ig, gauss_sigma=1/12)
    out = mod.fit(y_sel, pars, x=x_sel, weights=get_weights(last_sel, Peak_shot, nuni))
    print(out.fit_report())
And the fit report looks like:
[[Model]]
    (Model(gaussian, prefix='gauss_') + Model(constant, prefix='const_'))
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 101
    # data points      = 18
    # variables        = 4
    chi-square         = 2.1571e-05
    reduced chi-square = 1.5408e-06
    Akaike info crit   = -237.421693
    Bayesian info crit = -233.860206
[[Variables]]
    gauss_amplitude:  0.02133733 +/- 0.01122602 (52.61%) (init = 0.25)
    gauss_center:     0.98316682 +/- 0.02152806 (2.19%) (init = 1.041587)
    gauss_sigma:      0.11847360 +/- 0.04182091 (35.30%) (init = 0.08333333)
    const_c:          0.09532047 +/- 0.01831759 (19.22%) (init = 0)
    gauss_fwhm:       0.27898399 +/- 0.09848070 (35.30%) == '2.3548200*gauss_sigma'
I was wondering if it is possible to extract gauss_center and its error into two variables, instead of copying and pasting these results by hand. Thanks!
out.params['gauss_center'].value will be the best-fit value for the gauss_center parameter, and out.params['gauss_center'].stderr will be its standard error.
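For example (the variable names here are arbitrary):

center = out.params['gauss_center'].value       # best-fit value
center_err = out.params['gauss_center'].stderr  # standard error from the fit
print(center, center_err)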

How can we explain the validation loss below?

I have trained a CNN+LSTM encoder-decoder model with attention, varying the number of layers.
The problem I am facing is very strange to me: the validation loss fluctuates around 3.***, as the loss graphs below show. One configuration has 3 CNN layers + 1 BLSTM layer at the encoder and 1 LSTM layer at the decoder;
another has 3 CNN layers + 2 BLSTM layers at the encoder and 1 LSTM layer at the decoder.
I have also tried weight decay values from 0.1 down to 0.000001, but I still get this kind of loss graph. Note that the accuracy of the model is increasing on both the validation set and the train set. How is it possible that the validation loss is still around 3 while accuracy is increasing? Can someone explain this?
Thanks
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    def __init__(self, height, width, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.height = height
        self.enc_hid_dim = enc_hid_dim
        self.width = width
        self.layer0 = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.MaxPool2d(2, 2),
        )
        self.layer1 = nn.Sequential(
            nn.Conv2d(8, 32, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2, 2),
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2, 2)
        )
        self.rnn = nn.LSTM(self.height // 8 * 64, self.enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        self.cnn_dropout = nn.Dropout(p=0.2)

    def forward(self, src, in_data_len, train):
        batch_size = src.shape[0]
        out = self.layer0(src)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.dropout(out)  # torch.Size([batch, channel, h, w])
        out = out.permute(3, 0, 2, 1)  # (width, batch, height, channels)
        out = out.contiguous()  # contiguous() is not in-place, so the result must be reassigned
        out = out.reshape(-1, batch_size, self.height // 8 * 64)  # (w, batch, height*channels)
        width = out.shape[0]
        src_len = in_data_len.numpy() * (width / self.width)
        src_len = src_len + 0.999  # round up so no length truncates to 0 on the float-to-int cast
        src_len = src_len.astype('int')
        out = pack_padded_sequence(out, src_len.tolist(), batch_first=False)
        outputs, hidden_out = self.rnn(out)
        hidden = hidden_out[0]
        cell = hidden_out[1]
        # outputs: (t, b, f*2); hidden: (2, b, f)
        outputs, output_len = pad_packed_sequence(outputs, batch_first=False)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        cell = torch.tanh(self.fc(torch.cat((cell[-2, :, :], cell[-1, :, :]), dim=1)))
        return outputs, hidden, cell, output_len

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs, train):
        input = torch.topk(input, 1)[1]
        embedded = self.embedding(input)
        if train:
            embedded = self.dropout_layer(embedded)
        embedded = embedded.permute(1, 0, 2)
        # embedded = [1, batch size, emb dim]
        a = self.attention(hidden, encoder_outputs)
        # a = [batch size, src len]
        a = a.unsqueeze(1)
        # a = [batch size, 1, src len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs = [batch size, src len, enc hid dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        # weighted = [1, batch size, enc hid dim * 2]
        rnn_input = torch.cat((embedded, weighted), dim=2)
        # the initial state is a second argument; the original line was missing this comma
        output, hidden_out = self.rnn(rnn_input, (hidden.unsqueeze(0), cell.unsqueeze(0)))
        hidden = hidden_out[0]
        cell = hidden_out[1]
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden.squeeze(0), cell.squeeze(0)
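For reference, a minimal sketch (with hypothetical dimensions) of the nn.LSTM call signature the fixed line above relies on: the initial state is passed as an (h_0, c_0) tuple in the second argument, separated from the input by a comma.

import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=16, hidden_size=32)
seq = torch.randn(5, 2, 16)            # (seq_len, batch, input_size)
h0 = torch.zeros(1, 2, 32)             # (num_layers * num_directions, batch, hidden_size)
c0 = torch.zeros(1, 2, 32)
output, (hn, cn) = rnn(seq, (h0, c0))  # note the comma before the state tuple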

Pytorch - Not able to achieve reproducibility

I am training an image classifier model using PyTorch, but I am not able to get reproducible results even after setting seeds. I have exhausted all my options but still do not get consistent results between runs. Please help me with this.
I was using the following, but my model is still not consistent.
torch.manual_seed(1)
torch.cuda.manual_seed(1)
np.random.seed(1)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 10)

# Define loss function & optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
lrscheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, threshold=0.9)
model = model.to(device)

# Train model
model.train()
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        train_acc = (labels == predicted).sum().item() / images.size(0)
        if (i + 1) % 2 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dset) // batch_size,
                     loss.item(), train_acc))
        if (i + 1) % 5 == 0:
            model.eval()
            with torch.no_grad():
                num_correct, num_total = 0, 0
                for (images, labels) in val_loader:
                    images, labels = images.to(device), labels.to(device)
                    outputs = model(images)
                    _, predicted = torch.max(outputs.data, 1)
                    num_correct += (labels == predicted).sum().item()
                    num_total += labels.size(0)
                val_acc = 1. * num_correct / num_total
                print('Epoch [%d/%d], Step [%d/%d], Val Acc: %.4f'
                      % (epoch + 1, num_epochs, i + 1, len(train_dset) // batch_size,
                         val_acc))
            model.train()
I use the following code to make my results reproducible and it seems to work :)
import random
import numpy as np
import torch

torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
# for cuda
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False
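One way to keep this tidy is to wrap the calls in a helper and invoke it once before building the model, optimizer, and data loaders (a sketch; the function name is arbitrary):

import random
import numpy as np
import torch

def set_seed(seed=1):
    # Seed every RNG the training loop touches.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(1)  # call before creating the model, optimizer, and DataLoaders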

Kernel Restarting The kernel appears to have died & dst tensor is not initialized

I'm using Tensorflow (GPU) to fit a CNN model (the total input data size is only 9.8 MB in numpy array form). I'm on Windows 10 (Kaby Lake) with Tensorflow in GPU mode, a GeForce GTX 1050, and 32 GB of RAM.
Each time I try running the piece of code below, it either kills the kernel or throws the error "dst tensor is not initialized". This code seems to be executable by others with relatively less computing power than mine, but I'm not sure how to get it to work.
I am able to run the code below in Tensorflow CPU mode without any problem (but it takes almost 12 hours to finish, especially when the number of epochs is set to more than 3). That's why I need to run it on my GPU for faster execution.
import tensorflow as tf
import numpy as np

IMG_PX_SIZE = 50
HM_SLICES = 20
n_classes = 2

x = tf.placeholder('float')
y = tf.placeholder('float')

keep_rate = 0.8
keep_prob = tf.placeholder(tf.float32)

def conv3d(x, W):
    return tf.nn.conv3d(x, W, strides=[1, 1, 1, 1, 1], padding='SAME')

def maxpool3d(x):
    return tf.nn.max_pool3d(x, ksize=[1, 2, 2, 2, 1], strides=[1, 2, 2, 2, 1],
                            padding='SAME')

def convolutional_neural_network(x):
    weights = {'W_conv1': tf.Variable(tf.random_normal([3, 3, 3, 1, 32])),
               'W_conv2': tf.Variable(tf.random_normal([3, 3, 3, 32, 64])),
               'W_fc': tf.Variable(tf.random_normal([62720, 1024])),
               'out': tf.Variable(tf.random_normal([1024, n_classes]))}
    biases = {'b_conv1': tf.Variable(tf.random_normal([32])),
              'b_conv2': tf.Variable(tf.random_normal([64])),
              'b_fc': tf.Variable(tf.random_normal([1024])),
              'out': tf.Variable(tf.random_normal([n_classes]))}
    x = tf.reshape(x, shape=[-1, IMG_PX_SIZE, IMG_PX_SIZE, HM_SLICES, 1])
    conv1 = tf.nn.relu(conv3d(x, weights['W_conv1']) + biases['b_conv1'])
    conv1 = maxpool3d(conv1)
    conv2 = tf.nn.relu(conv3d(conv1, weights['W_conv2']) + biases['b_conv2'])
    conv2 = maxpool3d(conv2)
    fc = tf.reshape(conv2, [-1, 62720])
    fc = tf.nn.relu(tf.matmul(fc, weights['W_fc']) + biases['b_fc'])
    fc = tf.nn.dropout(fc, keep_rate)
    output = tf.matmul(fc, weights['out']) + biases['out']
    return output

def train_neural_network(x):
    much_data = np.load('muchdata_sampled-50-50-20.npy')
    train_data = much_data[:100]
    validation_data = much_data[-100:]
    prediction = convolutional_neural_network(x)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    hm_epochs = 3
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(hm_epochs):
            epoch_loss = 0
            for data in train_data:
                X = data[0]
                Y = data[1]
                _, c = sess.run([optimizer, cost], feed_dict={x: X, y: Y})
                epoch_loss += c
            print('Epoch', epoch, 'completed out of', hm_epochs, 'loss:', epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy:', accuracy.eval({x: [i[0] for i in validation_data],
                                          y: [i[1] for i in validation_data]}))

train_neural_network(x)
Please kindly provide some help, as I've been stuck with this for some time now. My only idea is to feed the data in batches instead of feeding the whole thing into the CNN, but I haven't been successful with that technique yet. Could someone please point out a way?
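"dst tensor is not initialized" is commonly reported when the GPU runs out of memory, and the most likely spot here is the final accuracy.eval, which feeds all 100 validation samples at once. A minimal sketch of the batched evaluation the questioner mentions (the chunk size is arbitrary; this reuses the global x, y placeholders):

def batched_accuracy(sess, accuracy, validation_data, chunk=10):
    # Evaluate accuracy over small chunks so the GPU never holds the full set.
    total, n = 0.0, 0
    for start in range(0, len(validation_data), chunk):
        part = validation_data[start:start + chunk]
        acc = sess.run(accuracy, feed_dict={x: [i[0] for i in part],
                                            y: [i[1] for i in part]})
        total += acc * len(part)  # accuracy is a mean, so weight by chunk size
        n += len(part)
    return total / n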

How to model a mixture of 3 Normals in PyMC?

There is a question on CrossValidated on how to use PyMC to fit two Normal distributions to data. The answer from Cam.Davidson.Pilon was to use a Bernoulli distribution to assign data to one of the two Normals:
size = 10
p = Uniform("p", 0, 1)  # this is the fraction that come from mean1 vs mean2
ber = Bernoulli("ber", p=p, size=size)  # produces 1 with proportion p

precision = Gamma('precision', alpha=0.1, beta=0.1)
mean1 = Normal("mean1", 0, 0.001)
mean2 = Normal("mean2", 0, 0.001)

@deterministic
def mean(ber=ber, mean1=mean1, mean2=mean2):
    return ber * mean1 + (1 - ber) * mean2
Now my question is: how do I do this with three Normals?
Basically, the issue is that you can't use a Bernoulli distribution and 1 - Bernoulli anymore. But how to do it then?
edit: Following CDP's suggestion, I wrote the following code:
import numpy as np
import pymc as mc

n = 3
ndata = 500

dd = mc.Dirichlet('dd', theta=(1,) * n)
category = mc.Categorical('category', p=dd, size=ndata)

precs = mc.Gamma('precs', alpha=0.1, beta=0.1, size=n)
means = mc.Normal('means', 0, 0.001, size=n)

@mc.deterministic
def mean(category=category, means=means):
    return means[category]

@mc.deterministic
def prec(category=category, precs=precs):
    return precs[category]

v = np.random.randint(0, n, ndata)
data = (v == 0) * (50 + np.random.randn(ndata)) \
     + (v == 1) * (-50 + np.random.randn(ndata)) \
     + (v == 2) * np.random.randn(ndata)
obs = mc.Normal('obs', mean, prec, value=data, observed=True)

model = mc.Model({'dd': dd,
                  'category': category,
                  'precs': precs,
                  'means': means,
                  'obs': obs})
The traces with the following sampling procedure look good as well. Solved!
mcmc = mc.MCMC(model)
mcmc.sample(50000, 0)
mcmc.trace('means').gettrace()[-1, :]
There is a mc.Categorical object that does just this.
p = [0.2, 0.3, 0.5]
t = mc.Categorical('test', p)
t.random()
# array(2, dtype=int32)
It returns an int between 0 and len(p)-1. To model the 3 Normals, you make p a mc.Dirichlet object (it accepts a length-k array as the hyperparameters; setting all the values in the array to be equal sets the prior probabilities to be equal). The rest of the model is nearly identical.
This is a generalization of the model I suggested above.
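For instance, a minimal sketch of that prior for three components (the hyperparameter values here are arbitrary):

import pymc as mc

# Symmetric Dirichlet prior over 3 components (equal prior probability for each),
# feeding a Categorical that draws one component label per data point.
dd = mc.Dirichlet('dd', theta=(1.0, 1.0, 1.0))
category = mc.Categorical('category', p=dd, size=500)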
Update:
Okay, so instead of having different means, we can collapse them all into one:
means = Normal("means", 0, 0.001, size=3)
...

@mc.deterministic
def mean(categorical=categorical, means=means):
    return means[categorical]
