Pytorch - Not able to achieve reproducibility - image

I am a training a image classifier model using Pytorch. While training it I am not able to set the seed. I have exploited all my options but still not getting any consistent results. Please help me with the same.
I was using this but my model is still not consistent.
torch.manual_seed(1)
torch.cuda.manual_seed(1)
np.random.seed(1)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 10)
#Define loss function & optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
lrscheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, threshold = 0.9)
model = model.to(device)
#Train model
model.train()
for epoch in range(num_epochs):
for i, (images, labels) in enumerate(train_loader):
images, labels = images.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
loss = loss_function(outputs, labels)
loss.backward()
optimizer.step()
train_acc = (labels==predicted).sum().item() / images.size(0)
if (i+1) % 2 == 0:
print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.4f'
% (epoch+1, num_epochs, i+1, len(train_dset)//batch_size,
loss.item(), train_acc))
if (i+1) % 5 == 0:
model.eval()
with torch.no_grad():
num_correct, num_total = 0, 0
for (images, labels) in val_loader:
images, labels = images.to(device), labels.to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
num_correct += (labels==predicted).sum().item()
num_total += labels.size(0)
val_acc = 1. * num_correct / num_total
print('Epoch [%d/%d], Step [%d/%d], Val Acc: %.4f'
%(epoch+1, num_epochs, i+1, len(train_dset)//batch_size,
val_acc))
model.train()

I use the following code to make my results reproducible and it seems to work :)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
# for cuda
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

Related

Emotion detection using facial landmarks

I plan on using scikit svm for class prediction.
I have been trying this :
Get images from a webcam
Detect Facial Landmarks
Train a machine learning algorithm (we will use a linear SVM)
Predict emotions
I have a problem in this line : clf.fit(npar_train, training_labels)
also I have a problem in site-packages\sklearn\svm_base.py and in site-packages\sklearn\utils\validation.py
How can I remove this error?
thank you in advance
python script
emotions = ['neutral', 'sad', 'happy', 'anger']
data={}
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor('shape_predictor_68_face_landmarks.dat')
clf = SVC(kernel='linear', probability=True, tol=1e-3)
def get_files(emotion):
files = glob.glob('img\\datasets\\%s\\*' %emotion)
random.shuffle(files)
training = files[:int(len(files)*0.8)]
prediction = files[-int(len(files)*0.2)]
return training, prediction
def get_landmarks(image):
detections = detector(image, 1)
for k, d in enumerate(detections): # For all detected face instances individually
shape = predictor(image, d) # Draw Facial Landmarks with the predictor class
xlist = []
ylist = []
for i in range(1, 68): # Store X and Y coordinates in two lists
xlist.append(float(shape.part(i).x))
ylist.append(float(shape.part(i).y))
xmean = np.mean(xlist)
ymean = np.mean(ylist)
xcentral = [(x - xmean) for x in xlist]
ycentral = [(y - ymean) for y in ylist]
landmarks_vectorised = []
for x, y, w, z in zip(xcentral, ycentral, xlist, ylist):
landmarks_vectorised.append(w)
landmarks_vectorised.append(z)
meannp = np.asarray((ymean, xmean))
coornp = np.asarray((z, w))
dist = np.linalg.norm(coornp - meannp)
landmarks_vectorised.append(dist)
landmarks_vectorised.append((math.atan2(y, x) * 360) / (2 * math.pi))
data['landmarks_vectorised'] = landmarks_vectorised
if len(detections) < 1:
data['landmarks_vestorised'] = "error"
def make_sets():
training_data = []
training_labels = []
prediction_data = []
prediction_labels = []
for emotion in emotions:
print("Working on %s emotion" %emotion)
training, prediction = get_files(emotion)
for item in training:
image = cv2.imread(item)
try:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
except:
print()
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
clahe_image = clahe.apply(image)
get_landmarks(clahe_image)
if data['landmarks_vectorised'] == "error":
print("no face detected on this one")
else:
training_data.append(data['landmarks_vectorised']) # append image array to training data list
training_labels.append(emotions.index(emotion))
for item in prediction:
image = cv2.imread(item)
try:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
except:
print()
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
clahe_image = clahe.apply(image)
get_landmarks(clahe_image)
if data['landmarks_vectorised'] == "error":
print("no face detected on this one")
else:
prediction_data.append(data['landmarks_vectorised'])
prediction_labels.append(emotions.index(emotion))
return training_data, training_labels, prediction_data, prediction_labels
accur_lin = []
for i in range(0,10):
print("Making sets %s" % i) # Make sets by random sampling 80/20%
training_data, training_labels, prediction_data, prediction_labels = make_sets()
npar_train = np.array(training_data)
npar_trainlabs = np.array(training_labels)
print("training SVM linear %s" % i) # train SVM
clf.fit(npar_train, training_labels)
print("getting accuracies %s" % i)
npar_pred = np.array(prediction_data)
pred_lin = clf.score(npar_pred, prediction_labels)
print("Mean value lin svm: %s" % np.mean(accur_lin))

When using the five-fold cross validation to train the network, some folds perform well and some perform poorly, how can I do

I am trying to create a binary CNN classifier for a dataset (class 0 = 77 images, class 1 = 41 images), which I want to do 5-Fold cross validation. In each fold, using the validation sets to save best model, and sharing same model, Hyperparameters, and training strategy. And here is my results.
fold - test sets accuracy
fold0 - 0.68
fold1 - 0.71
fold2 - 0.91
fold3 - 0.96
fold4 - 0.64
My question is:
Fine tuning by changing the Hyperparameters. It was found that fold2 and fold3 performed better each time, but fold0 and fold4 performed poorly. What is willing to cause it and what should I do.
The possible problem is that each initialization is random.
Thank you all for your answers.
import os
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data.sampler import WeightedRandomSampler
import monai
from monai.data import NiftiDataset
from monai.transforms import Compose, AddChannel, ScaleIntensity, RandFlip, RandRotate, ToTensor
from monai.data import CSVSaver
from data_process import read_csv, get_sample_weights
def train(train_file, val_file, stage='exp0'):
'''
:param train_file:
:param val_file:
:param stage:
:return:
'''
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
img_src_path = '../samples/T1c_images/' #
img_list_train, label_list_train = read_csv(train_file)
img_list_val, label_list_val = read_csv(val_file)
img_train = [os.path.join(img_src_path, i) for i in img_list_train]
labels_train = [int(i) for i in label_list_train]
img_val = [os.path.join(img_src_path, i) for i in img_list_val]
labels_val = [int(i) for i in label_list_val]
print('val images: ', len(img_val))
# Define transforms
# train_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((182, 218, 182)), RandRotate90(), ToTensor()])
# val_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((182, 218, 182)), ToTensor()])
train_transforms = Compose([ScaleIntensity(), RandRotate(range_x=45, range_y=45, range_z=45, prob=0.5),
RandFlip(prob=0.5, spatial_axis=1),
AddChannel(), ToTensor()]) # if x=y=z RandRotate90()
val_transforms = Compose([ScaleIntensity(), AddChannel(), ToTensor()])
train_ds = NiftiDataset(image_files=img_train, labels=labels_train, transform=train_transforms, image_only=False)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=2,
pin_memory=torch.cuda.is_available())
# create a validation data_process loader
val_ds = NiftiDataset(image_files=img_val, labels=labels_val, transform=val_transforms, image_only=False)
val_loader = DataLoader(val_ds, batch_size=4, num_workers=2, pin_memory=torch.cuda.is_available())
# Create DenseNet121, CrossEntropyLoss and Adam optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2).to(device)
model = torch.nn.DataParallel(model)
loss_function = torch.nn.CrossEntropyLoss(weight=torch.Tensor([1, 1.2])).cuda()
optimizer = torch.optim.Adam(model.parameters(), 1e-5)
# start a typical PyTorch training
epochs = 50
val_interval = 1
best_metric = -1
best_metric_epoch = -1
writer = SummaryWriter()
for epoch in range(epochs):
print("-" * 10)
print(f"epoch {epoch + 1}/{epochs}")
model.train()
epoch_loss = 0
step = 0
t_metric_count = 0
t_num_correct = 0
for batch_data in train_loader:
step += 1
# ptrint images name
# print('image name', batch_data[2]['filename_or_obj'])
inputs = batch_data[0].to(device)
# print(inputs.shape)
labels = batch_data[1].to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = loss_function(outputs, labels)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
epoch_len = len(train_ds) // train_loader.batch_size
# train acc
t_value = torch.eq(outputs.argmax(dim=1), labels)
t_metric_count += len(t_value) #
t_num_correct += t_value.sum().item() #
# print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
epoch_loss /= step
t_metric = t_num_correct / t_metric_count
writer.add_scalar("train_loss", epoch_loss, epoch + 1)
writer.add_scalar("train_acc", t_metric, epoch + 1)
print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
if (epoch + 1) % val_interval == 0:
model.eval()
with torch.no_grad():
num_correct = 0.0
metric_count = 0
for val_data in val_loader:
val_images, val_labels = val_data[0].to(device), val_data[1].to(device)
val_outputs = model(val_images)
value = torch.eq(val_outputs.argmax(dim=1), val_labels)
metric_count += len(value) #
num_correct += value.sum().item() #
metric = num_correct / metric_count
if metric > best_metric:
best_metric = metric
best_metric_epoch = epoch + 1
save_path = 'checkpoint_07201/' + stage + '_' + str(epoch + 1) + "_best_metric_model.pth"
torch.save(model.state_dict(), save_path)
print("saved new best metric model")
print(
"current epoch: {} current accuracy: {:.4f} best val accuracy: {:.4f} at epoch {}".format(
epoch + 1, metric, best_metric, best_metric_epoch
))
print('current train accuracy: {:.4f}, num_correct: {}, num_count:{}'.
format(t_metric, t_num_correct, t_metric_count ))
writer.add_scalar("val_accuracy", metric, epoch + 1)
print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}")
writer.close()
if __name__ == "__main__":
# 5 folder
for i in range(5):
folder = 'exp'+str(i)
train_path = './data/'+ folder +'/train.csv'
val_path = './data/'+ folder + '/val.csv'
train(train_path, val_path, stage=folder)

not enough values to unpack (expected 2, got 1) adaboost algorithm

def adaboost(X_train, Y_train, X_test, Y_test, lamb=0.01, num_iterations=200, learning_rate=0.001):
label_train = 2*Y_train -1
label_test = 2*Y_test -1
[n,p] = X_train.shape
[ntest, ptest] = X_test.shape
X_train_1 = np.concatenate((np.ones([n,1]), X_train), axis=1)
X_test_1 = np.concatenate((np.ones([ntest,1]), X_test), axis=1)
beta = np.zeros([p+1])
acc_train = []
acc_test = []
#margins = []
for it in range(num_iterations):
score = np.matmul(X_train_1, beta)
error = (score*label_train < 1)
dbeta = np.mean(X_train_1 * (error * label_train).reshape(-1,1), axis=0)
beta += learning_rate * dbeta
beta[1:] -= lamb * beta[1:]
#margins.append(np.min(score*label_train))
# train
predict = (np.sign(score) == label_train)
acc = np.sum(predict)/n
acc_train.append(acc)
# test
score_test = np.matmul(X_test_1, beta)
predict = (np.sign(score_test) == label_test)
acc = np.sum(predict)/ntest
acc_test.append(acc)
return beta, acc_train, acc_test
I am calling this function by:
_, train_acc, test_acc = adaboost(X_train, y_train, X_test, y_test)
and it is giving the error provided in title:
for line 68 '''[ntest, ptest] = X_test.shape'''
Any idea how to stop getting this error?
Can someone explain what I am doing wrong??
Whatever X_test is, it must have only a single dimension when it should be two dimensional

Tensorflow/Keras: volatile validation loss

I've been training a U-Net for single class small lesion segmentation, and have been getting consistently volatile validation loss. I have about 20k images split 70/30 between training and validation sets-so I don't think the issue is too little data. I've tried shuffling and resplitting the sets a few times with no change in volatility-so I don't think the validation set is unrepresentative. I have tried lowering the learning rate with no effect on volatility. And I have tried a few loss functions (dice coefficient, focal tversky, weighted binary cross-entropy). I'm using a decent amount of augmentation so as to avoid overfitting. I've also run through all my data (512x512 float64s with corresponding 512x512 int64 masks--both stored as numpy arrays) do double check that the value range, dtypes, etc. aren't screwy...and I even removed any ROIs in the masks under 35 pixels in area which I thought might be artifact and messing with loss.
I'm using keras ImageDataGen.flow_from_directory...I was initially using zca_whitening and brightness_range augmentation but I think this causes issues with flow_from_directory and the link between mask and image being lost.. so I skipped this.
I've tried validation generators with and without shuffle=True. Batch size is 8.
Here's some of my code, happy to include more if it would help:
# loss
from keras.losses import binary_crossentropy
import keras.backend as K
import tensorflow as tf
epsilon = 1e-5
smooth = 1
def dsc(y_true, y_pred):
smooth = 1.
y_true_f = K.flatten(y_true)
y_pred_f = K.flatten(y_pred)
intersection = K.sum(y_true_f * y_pred_f)
score = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
return score
def dice_loss(y_true, y_pred):
loss = 1 - dsc(y_true, y_pred)
return loss
def bce_dice_loss(y_true, y_pred):
loss = binary_crossentropy(y_true, y_pred) + dice_loss(y_true, y_pred)
return loss
def confusion(y_true, y_pred):
smooth=1
y_pred_pos = K.clip(y_pred, 0, 1)
y_pred_neg = 1 - y_pred_pos
y_pos = K.clip(y_true, 0, 1)
y_neg = 1 - y_pos
tp = K.sum(y_pos * y_pred_pos)
fp = K.sum(y_neg * y_pred_pos)
fn = K.sum(y_pos * y_pred_neg)
prec = (tp + smooth)/(tp+fp+smooth)
recall = (tp+smooth)/(tp+fn+smooth)
return prec, recall
def tp(y_true, y_pred):
smooth = 1
y_pred_pos = K.round(K.clip(y_pred, 0, 1))
y_pos = K.round(K.clip(y_true, 0, 1))
tp = (K.sum(y_pos * y_pred_pos) + smooth)/ (K.sum(y_pos) + smooth)
return tp
def tn(y_true, y_pred):
smooth = 1
y_pred_pos = K.round(K.clip(y_pred, 0, 1))
y_pred_neg = 1 - y_pred_pos
y_pos = K.round(K.clip(y_true, 0, 1))
y_neg = 1 - y_pos
tn = (K.sum(y_neg * y_pred_neg) + smooth) / (K.sum(y_neg) + smooth )
return tn
def tversky(y_true, y_pred):
y_true_pos = K.flatten(y_true)
y_pred_pos = K.flatten(y_pred)
true_pos = K.sum(y_true_pos * y_pred_pos)
false_neg = K.sum(y_true_pos * (1-y_pred_pos))
false_pos = K.sum((1-y_true_pos)*y_pred_pos)
alpha = 0.7
return (true_pos + smooth)/(true_pos + alpha*false_neg + (1-alpha)*false_pos + smooth)
def tversky_loss(y_true, y_pred):
return 1 - tversky(y_true,y_pred)
def focal_tversky(y_true,y_pred):
pt_1 = tversky(y_true, y_pred)
gamma = 0.75
return K.pow((1-pt_1), gamma)
model = BlockModel((len(os.listdir(os.path.join(imageroot,'train_ct','train'))), 512, 512, 1),filt_num=16,numBlocks=4)
#model.compile(optimizer=Adam(learning_rate=0.001), loss=weighted_cross_entropy)
#model.compile(optimizer=Adam(learning_rate=0.001), loss=dice_coef_loss)
model.compile(optimizer=Adam(learning_rate=0.001), loss=focal_tversky)
train_mask = os.path.join(imageroot,'train_masks')
val_mask = os.path.join(imageroot,'val_masks')
model.load_weights(model_weights_path) #I'm initializing with some pre-trained weights from a similar model
data_gen_args_mask = dict(
rotation_range=10,
shear_range=20,
width_shift_range=0.1,
height_shift_range=0.1,
zoom_range=[0.8,1.2],
horizontal_flip=True,
#vertical_flip=True,
fill_mode='nearest',
data_format='channels_last'
)
data_gen_args = dict(
**data_gen_args_mask
)
image_datagen_train = ImageDataGenerator(**data_gen_args)
mask_datagen_train = ImageDataGenerator(**data_gen_args)#_mask)
image_datagen_val = ImageDataGenerator()
mask_datagen_val = ImageDataGenerator()
seed = 1
BS = 8
steps = int(np.floor((len(os.listdir(os.path.join(train_ct,'train'))))/BS))
print(steps)
val_steps = int(np.floor((len(os.listdir(os.path.join(val_ct,'val'))))/BS))
print(val_steps)
train_image_generator = image_datagen_train.flow_from_directory(
train_ct,
target_size = (512, 512),
color_mode = ("grayscale"),
classes=None,
class_mode=None,
seed = seed,
shuffle = True,
batch_size = BS)
train_mask_generator = mask_datagen_train.flow_from_directory(
train_mask,
target_size = (512, 512),
color_mode = ("grayscale"),
classes=None,
class_mode=None,
seed = seed,
shuffle = True,
batch_size = BS)
val_image_generator = image_datagen_val.flow_from_directory(
val_ct,
target_size = (512, 512),
color_mode = ("grayscale"),
classes=None,
class_mode=None,
seed = seed,
shuffle = True,
batch_size = BS)
val_mask_generator = mask_datagen_val.flow_from_directory(
val_mask,
target_size = (512, 512),
color_mode = ("grayscale"),
classes=None,
class_mode=None,
seed = seed,
shuffle = True,
batch_size = BS)
train_generator = zip(train_image_generator, train_mask_generator)
val_generator = zip(val_image_generator, val_mask_generator)
# make callback for checkpointing
plot_losses = PlotLossesCallback(skip_first=0,plot_extrema=False)
%matplotlib inline
filepath = os.path.join(versionPath, model_version + "_saved-model-{epoch:02d}-{val_loss:.2f}.hdf5")
if reduce:
cb_check = [ModelCheckpoint(filepath,monitor='val_loss',
verbose=1,save_best_only=False,
save_weights_only=True,mode='auto',period=1),
reduce_lr,
plot_losses]
else:
cb_check = [ModelCheckpoint(filepath,monitor='val_loss',
verbose=1,save_best_only=False,
save_weights_only=True,mode='auto',period=1),
plot_losses]
# train model
history = model.fit_generator(train_generator, epochs=numEp,
steps_per_epoch=steps,
validation_data=val_generator,
validation_steps=val_steps,
verbose=1,
callbacks=cb_check,
use_multiprocessing = False
)
And here's how my loss looks:
Another potentially relevant thing: I tweaked the flow_from_directory code a bit (added npy to the white list). But training loss looks fine so assuming the issue isnt here
Two suggestions:
Switch to the classic validation data format (i.e. numpy array) instead of using a generator -- this will ensure you always use the exactly same validation data every time. If you see a different validation curve, then there is something "random" in the validation generator giving you different data at different epochs.
Use a fixed set of samples (100 or 1000 should be enough w/o any data augmentation) for both training and validation. If everything goes well, you should see your network quickly overfit to this dataset and your training and validation curves should very much similar. If not, debug your network.

how store train_loss and valid_loss separably from epoch_loss?

I am trying to store the train_loss and valid_loss separably from epoch_loss as epoch_loss is return back the two loss values(first is train loss and second is valid loss). the epoch_loss is a float64 object. I tried to convert it to a numpy array and then access to the each slice but again return me two values.
this is snippet
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001) # 1e-3
# Decay LR by a factor of 0.1 every 4 epochs.
#step size: Period of learning rate decay.
#gamma = Multiplicative factor of learning rate decay. Default: 0.1, should
float
scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
def train_model(model, criterion, optimizer, scheduler, num_epochs=4):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs): # loop over the dataset multiple times
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 30)
# Each epoch has a training and validation phase
for phase in ['train', 'valid']:
if phase == 'train':
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
train_loss = 0.0
total_train = 0
correct_train = 0
#iterate over data
for t_image, mask, image_paths, target_paths in dataLoaders[phase]:
# get the inputs
t_image = t_image.to(device)
mask = mask.to(device)
# zeroes the gradient buffers of all parameters
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(t_image)
_, predicted = torch.max(outputs.data, 1)
loss = criterion(outputs, mask) # calculate the loss
# backward + optimize only if in training phase
if phase == 'train':
loss.backward() # back propagation
optimizer.step() # update gradients
# accuracy
train_loss += loss.item()
total_train += mask.nelement() # number of pixel in the batch
correct_train += predicted.eq(mask.data).sum().item() # sum all precited pixel values
epoch_loss = train_loss / len(dataLoaders[phase].dataset)
epoch_acc = (correct_train / total_train)
print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
#deep copy the model
if phase == 'valid' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
torch.save(model.state_dict(), 'train_valid_exp1.pth')
# load best model weights
model.load_state_dict(best_model_wts)

Resources