RuntimeError: CUDA error: device-side assert triggered CUDA kernel errors might be asynchronously reported at some other API call

RuntimeError: CUDA error: device-side assert triggered CUDA kernel errors might be asynchronously reported at some other API call - image

I tried DeIT classification using pytorch, it's a workable code. When I change my collab account to pro I still get an error in the second epochs:
I execute this instruction :
model_ft = train_model(model, criterion, optimizer, exp_lr_scheduler) # now it is a lot faster
Training model function: :
def train_model(model, criterion, optimizer, scheduler, num_epochs=10):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print(f'Epoch {epoch}/{num_epochs - 1}')
print("-"*10)
for phase in ['train', 'val']: # We do training and validation phase per epoch
if phase == 'train':
model.train() # model to training mode
else:
model.eval() # model to evaluate
running_loss = 0.0
running_corrects = 0.0
for inputs, labels in tqdm(dataloaders[phase]):
inputs = inputs.to(device)
labels = labels.to(device)
optimizer.zero_grad()
with torch.set_grad_enabled(phase == 'train'): # no autograd makes validation go faster
outputs = model(inputs)
_, preds = torch.max(outputs, 1) # used for accuracy
loss = criterion(outputs, labels)
if phase == 'train':
loss.backward()
optimizer.step()
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
if phase == 'train':
scheduler.step() # step at end of epoch
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print("{} Loss: {:.4f} Acc: {:.4f}".format(phase, epoch_loss, epoch_acc))
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict()) # keep the best validation accuracy model
print()
time_elapsed = time.time() - since # slight error
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print("Best Val Acc: {:.4f}".format(best_acc))
model.load_state_dict(best_model_wts)
return model
After the first epoch i get this error
Epoch 0/9
----------
100%|██████████| 175/175 [15:40<00:00, 5.37s/it]
train Loss: 6.2008 Acc: 0.0037
1%|▏ | 3/232 [00:11<14:36, 3.83s/it]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-23-c552e1f19194> in <module>
----> 1 model_ft = train_model(model, criterion, optimizer, exp_lr_scheduler) # now it is a lot faster
2 # I will come back after 10 epochs
2 frames
<ipython-input-22-58764f3d79c1> in train_model(model, criterion, optimizer, scheduler, num_epochs)
26 outputs = model(inputs)
27 _, preds = torch.max(outputs, 1) # used for accuracy
---> 28 loss = criterion(outputs, labels)
29
30 if phase == 'train':
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/timm/loss/cross_entropy.py in forward(self, x, target)
20 def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
21 logprobs = F.log_softmax(x, dim=-1)
---> 22 nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
23 nll_loss = nll_loss.squeeze(1)
24 smooth_loss = -logprobs.mean(dim=-1)
RuntimeError: CUDA error: device-side assert triggered
( my outputs are matched, my dataset has 37374 images and 488 classes
Sequential(
(0): Linear(in_features=192, out_features=150, bias=True)
(1): ReLU()
(2): Dropout(p=0.3, inplace=False)
(3): Linear(in_features=150, out_features=488, bias=True)
)
I tried os.environ['CUDA_LAUNCH_BLOCKING'] = "1" and still have the error)

Related

Using GEKKO for Moving Horizon Estimation online 2

This is the following question after appyling comments from: Using GEKKO for Moving Horizon Estimation online
I have studied example from estimation iterative example on the Dynamic Optimization course website and revised my code as follows:
from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt
import matplotlib; matplotlib.use('TkAgg')
class Observer():
def __init__(self, window_size, r_init, alpha_init):
self.m = GEKKO(remote=False)
self.dt = 0.05
self.m.time = [i*self.dt for i in range(window_size)]
#Parameters
self.m.u = self.m.MV()
#Variables
self.m.r = self.m.CV(lb=0) # value=r_init) #ub=20 can be over 20
self.m.alpha = self.m.CV() # value=alpha_init) #ub lb for angle?
#Equations
self.m.Equation(self.m.r.dt()== -self.m.cos(self.m.alpha))
self.m.Equation(self.m.alpha.dt()== self.m.sin(self.m.alpha)/self.m.r - self.m.u) # differential equation
#Options
self.m.options.MV_STEP_HOR = 2
self.m.options.IMODE = 5 # dynamic estimation
self.m.options.EV_TYPE = 2 #Default 1: absolute error form 2: squared error form
self.m.options.DIAGLEVEL = 0 #diagnostic level
self.m.options.NODES = 5 #nodes # collocation nodes default:2
self.m.options.SOLVER = 3 #solver_num
# STATUS = 0, optimizer doesn't adjust value
# STATUS = 1, optimizer can adjust
self.m.u.STATUS = 0
self.m.r.STATUS = 1
self.m.alpha.STATUS = 1
# FSTATUS = 0, no measurement
# FSTATUS = 1, measurement used to update model
self.m.u.FSTATUS = 1 #default
self.m.r.FSTATUS = 1
self.m.alpha.FSTATUS = 1
self.m.r.TR_INIT = 0
self.m.alpha.TR_INIT = 0
self.count = 0
def MHE(self, observed_state, u_data):
self.count =+ 1
self.m.u.MEAS = u_data
self.m.r.MEAS = observed_state[0]
self.m.alpha.MEAS = observed_state[1]
self.m.solve(disp=False)
return self.m.r.MODEL, self.m.alpha.MODEL
if __name__=="__main__":
FILE_PATH00 = '/home/shane16/Project/model_guard/uav_paper/adversarial/SA_PPO/src/DATA/4end_estimation_results_r.csv'
FILE_PATH01 = '/home/shane16/Project/model_guard/uav_paper/adversarial/SA_PPO/src/DATA/4end_estimation_results_alpha.csv'
FILE_PATH02 = '/home/shane16/Project/model_guard/uav_paper/adversarial/SA_PPO/src/DATA/4end_action_buffer_eps0.0_sig0.0.csv'
cycles = 55
x = np.arange(cycles) # 1...300
matrix00 = np.loadtxt(FILE_PATH00, delimiter=',')
matrix01 = np.loadtxt(FILE_PATH01, delimiter=',')
matrix02 = np.loadtxt(FILE_PATH02, delimiter=',')
vanilla_action_sigma_0 = matrix02
vanilla_estimation_matrix_r = np.zeros(cycles)
vanilla_estimation_matrix_alpha = np.zeros(cycles)
# sigma = 0.0
# vanilla model true/observed states
r_vanilla_sigma_0_true = matrix00[0, 3:] # from step 1
r_vanilla_sigma_0_observed = matrix00[1, 3:] # from step1
alpha_vanilla_sigma_0_true = matrix01[0, 3:]
alpha_vanilla_sigma_0_observed = matrix01[1, 3:]
# initialize estimator
sigma = 0.0 #1.0
solver_num = 3
nodes = 5
# for window_size in [5, 10, 20, 30, 40, 50]:
window_size = 5
observer = Observer(window_size, r_vanilla_sigma_0_observed[0], alpha_vanilla_sigma_0_observed[0])
for i in range(cycles):
if i % 100 == 0:
print('cylcle: {}'.format(i))
vanilla_observed_states = np.hstack((r_vanilla_sigma_0_observed[i], alpha_vanilla_sigma_0_observed[i])) # from current observed state
r_hat, alpha_hat = observer.MHE(vanilla_observed_states, vanilla_action_sigma_0[i]) # and current action -> estimate current state
vanilla_estimation_matrix_r[i] = r_hat
vanilla_estimation_matrix_alpha[i] = alpha_hat
#plot vanilla
plt.figure()
plt.subplot(3,1,1)
plt.title('Vanilla model_sig{}'.format(sigma))
plt.plot(x, vanilla_action_sigma_0[:cycles],'b:',label='action (w)')
plt.legend()
plt.subplot(3,1,2)
plt.ylabel('r')
plt.plot(x, r_vanilla_sigma_0_true[:cycles], 'k-', label='true_r')
plt.plot(x, r_vanilla_sigma_0_observed[:cycles], 'gx', label='observed_r')
plt.plot(x, vanilla_estimation_matrix_r, 'r--', label='time window: 10')
# plt.legend()
plt.subplot(3,1,3)
plt.xlabel('time steps')
plt.ylabel('alpha')
plt.plot(x, alpha_vanilla_sigma_0_true[:cycles], 'k-', label='true_alpha')
plt.plot(x, alpha_vanilla_sigma_0_observed[:cycles], 'gx', label='observed_alpha')
plt.plot(x, vanilla_estimation_matrix_alpha, 'r--', label='time window: {}'.format(window_size))
plt.legend()
plt.savefig('plot/revision/4estimated_STATES_vanilla_sig{}_window{}_cycles{}_solver{}_nodes{}.png'.format(sigma, window_size,cycles, solver_num, nodes))
plt.show()
csv files: https://drive.google.com/drive/folders/1jW_6zBCdbJHB7yU3HmCIhamEyOT1LJqD?usp=sharing
The code works when initialized with values specified at line 15,16 (m.r, m.alpha).
However, if I try with no initial value,(as same condition in example), solution is not found.
terminal output:
cylcle: 0 Traceback (most recent call last): File
"4observer_mhe.py", line 86, in
r_hat, alpha_hat = observer.MHE(vanilla_observed_states, vanilla_action_sigma_0[i]) # and current action -> estimate current
state File "4observer_mhe.py", line 49, in MHE
self.m.solve(disp=False) File "/home/shane16/Project/model_guard/LipSDP/lipenv/lib/python3.7/site-packages/gekko/gekko.py",
line 2140, in solve
raise Exception(apm_error) Exception: #error: Solution Not Found
What could be the solution to this problem?
I have tried below strategies, but couldn't find the solution.
Reduce the number of decision variables by using m.FV() or m.MV() with m.options.MV_STEP_HOR=2+ to reduce the degrees of freedom for the solver for the unknown parameters.
Try other solvers with m.options.SOLVER=1 or m.options.SOLVER=2.
I expect to see estimation results that follow the true state well.
But I guess I'm doing something wrong.
Could anyone help me please?
Thank you.

Solvers sometimes need good initial guess values or constraints (lower and upper bounds) on the degrees of freedom (MV or FV) to find the optimal solution.
One of the equations may be the source of the problem:
self.m.alpha.dt() == self.m.sin(self.m.alpha)/self.m.r - self.m.u
The initial value of r is zero (default) because no initial value is provided when it is declared as self.m.r = self.m.CV(lb=0). A comment suggests that it was formerly initialized with value r_init. The zero value creates a divide-by-zero for that equation. Try rearranging the equation into an equivalent form that avoids the potential for divide-by-zero either with the initial guess or when the solver is iterating.
self.m.r*self.m.alpha.dt() == self.m.sin(self.m.alpha) - self.m.r*self.m.u
There may be other things that are also causing the model to not converge. When the solution does not converge then the infeasibilities.txt file can be a source to troubleshoot the specific equations that are having trouble. Here are instructions to retrieve the infeasibilities.txt file: How to retrieve the 'infeasibilities.txt' from the gekko

MSE Loss is reducing with a very small amount during training pytorch

I'm trying out my first deep learning program for speech separation using ideal ratio mask. It is a BLSTM model. The mse loss obtained while training is reducing with a very small quantity. Any idea about why it is happening and how to fix it?? thank you
my learning rate is 0.0003, adam optimizer, batch size 64, training dataset 3000 files and validation dataset is 340 files. I tried changing the learning rate and tried learning rate decay as well. still getting the same problem. I've been stuck here for a long time. Please do help. Thank you
Train Epoch: 1 Loss: 0.027980
val Epoch: 1 Loss: 0.025925
Train Epoch: 2 Loss: 0.027975
val Epoch: 2 Loss: 0.025961
Train Epoch: 3 Loss: 0.027973
val Epoch: 3 Loss: 0.025987
here is my model
import torch.nn as nn
import torch
print("Is the GPU available:", torch.cuda.is_available()) # True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class Base_model(nn.Module):
def __init__(self):
super(Base_model, self).__init__()
self.lstm = nn.LSTM(input_size=322, hidden_size=300, num_layers=4, bidirectional = True, batch_first=True)
self.rel = nn.ReLU()
self.fc = nn.Linear(in_features=600, out_features=161)
self.activation = nn.Sigmoid()
def forward(self, x):
# print(x.size(0))
out, _ = self.lstm(x)
out = self.fc(out)
out = self.activation(out)
return out
my training code
start_epoch = 0
if args.model_name:
print("Load the model:", args.checkpoints_dir + args.model_name)
checkpoint = torch.load(args.checkpoints_dir + args.model_name)
model.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
start_epoch = checkpoint['epoch']
# lr_schedule.load_state_dict(checkpoint['lr_schedule']) # load lr_scheduler
for epoch in range(start_epoch, args.epochs):
model.train() # train model
for batch_idx, (train_X, train_mask, train_nearend_mic_magnitude, train_nearend_magnitude) in enumerate(
train_loader):
train_X = torch.reshape(train_X, (train_X.shape[0],train_X.shape[2],train_X.shape[1]))
train_mask = torch.reshape(train_mask, (train_mask.shape[0],train_mask.shape[2],train_mask.shape[1]))
train_nearend_mic_magnitude = torch.reshape(train_nearend_mic_magnitude, (train_nearend_mic_magnitude.shape[0],train_nearend_mic_magnitude.shape[2],train_nearend_mic_magnitude.shape[1]))
train_nearend_magnitude = torch.reshape(train_nearend_magnitude, (train_nearend_magnitude.shape[0],train_nearend_magnitude.shape[2],train_nearend_magnitude.shape[1]))
train_X = train_X.to(device)
train_mask = train_mask.to(device) # IRM [batch_size 161, 999]
train_nearend_mic_magnitude = train_nearend_mic_magnitude.to(device)
train_nearend_magnitude = train_nearend_magnitude.to(device)
# forward propagation
optimizer.zero_grad() # zero out the gradient
pred_mask = model(train_X) # [batch_size, 322, 999]--> [batch_size, 161, 999]
train_loss = criterion(pred_mask, train_mask)
# backpropagation
train_loss.backward() # backpropagation
optimizer.step() # update parameters
# ########### Visual printing ############
print('Train Epoch: {} Loss: {:.6f} '.format(epoch + 1, train_loss.item()))

I would suggest u use a bigger learning rate and make a decaying learning rate:
torch.optim.lr_scheduler.StepLR
https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html

When using the five-fold cross validation to train the network, some folds perform well and some perform poorly, how can I do

I am trying to create a binary CNN classifier for a dataset (class 0 = 77 images, class 1 = 41 images), which I want to do 5-Fold cross validation. In each fold, using the validation sets to save best model, and sharing same model, Hyperparameters, and training strategy. And here is my results.
fold - test sets accuracy
fold0 - 0.68
fold1 - 0.71
fold2 - 0.91
fold3 - 0.96
fold4 - 0.64
My question is:
Fine tuning by changing the Hyperparameters. It was found that fold2 and fold3 performed better each time, but fold0 and fold4 performed poorly. What is willing to cause it and what should I do.
The possible problem is that each initialization is random.
Thank you all for your answers.
import os
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data.sampler import WeightedRandomSampler
import monai
from monai.data import NiftiDataset
from monai.transforms import Compose, AddChannel, ScaleIntensity, RandFlip, RandRotate, ToTensor
from monai.data import CSVSaver
from data_process import read_csv, get_sample_weights
def train(train_file, val_file, stage='exp0'):
'''
:param train_file:
:param val_file:
:param stage:
:return:
'''
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
img_src_path = '../samples/T1c_images/' #
img_list_train, label_list_train = read_csv(train_file)
img_list_val, label_list_val = read_csv(val_file)
img_train = [os.path.join(img_src_path, i) for i in img_list_train]
labels_train = [int(i) for i in label_list_train]
img_val = [os.path.join(img_src_path, i) for i in img_list_val]
labels_val = [int(i) for i in label_list_val]
print('val images: ', len(img_val))
# Define transforms
# train_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((182, 218, 182)), RandRotate90(), ToTensor()])
# val_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((182, 218, 182)), ToTensor()])
train_transforms = Compose([ScaleIntensity(), RandRotate(range_x=45, range_y=45, range_z=45, prob=0.5),
RandFlip(prob=0.5, spatial_axis=1),
AddChannel(), ToTensor()]) # if x=y=z RandRotate90()
val_transforms = Compose([ScaleIntensity(), AddChannel(), ToTensor()])
train_ds = NiftiDataset(image_files=img_train, labels=labels_train, transform=train_transforms, image_only=False)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=2,
pin_memory=torch.cuda.is_available())
# create a validation data_process loader
val_ds = NiftiDataset(image_files=img_val, labels=labels_val, transform=val_transforms, image_only=False)
val_loader = DataLoader(val_ds, batch_size=4, num_workers=2, pin_memory=torch.cuda.is_available())
# Create DenseNet121, CrossEntropyLoss and Adam optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2).to(device)
model = torch.nn.DataParallel(model)
loss_function = torch.nn.CrossEntropyLoss(weight=torch.Tensor([1, 1.2])).cuda()
optimizer = torch.optim.Adam(model.parameters(), 1e-5)
# start a typical PyTorch training
epochs = 50
val_interval = 1
best_metric = -1
best_metric_epoch = -1
writer = SummaryWriter()
for epoch in range(epochs):
print("-" * 10)
print(f"epoch {epoch + 1}/{epochs}")
model.train()
epoch_loss = 0
step = 0
t_metric_count = 0
t_num_correct = 0
for batch_data in train_loader:
step += 1
# ptrint images name
# print('image name', batch_data[2]['filename_or_obj'])
inputs = batch_data[0].to(device)
# print(inputs.shape)
labels = batch_data[1].to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = loss_function(outputs, labels)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
epoch_len = len(train_ds) // train_loader.batch_size
# train acc
t_value = torch.eq(outputs.argmax(dim=1), labels)
t_metric_count += len(t_value) #
t_num_correct += t_value.sum().item() #
# print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
epoch_loss /= step
t_metric = t_num_correct / t_metric_count
writer.add_scalar("train_loss", epoch_loss, epoch + 1)
writer.add_scalar("train_acc", t_metric, epoch + 1)
print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
if (epoch + 1) % val_interval == 0:
model.eval()
with torch.no_grad():
num_correct = 0.0
metric_count = 0
for val_data in val_loader:
val_images, val_labels = val_data[0].to(device), val_data[1].to(device)
val_outputs = model(val_images)
value = torch.eq(val_outputs.argmax(dim=1), val_labels)
metric_count += len(value) #
num_correct += value.sum().item() #
metric = num_correct / metric_count
if metric > best_metric:
best_metric = metric
best_metric_epoch = epoch + 1
save_path = 'checkpoint_07201/' + stage + '_' + str(epoch + 1) + "_best_metric_model.pth"
torch.save(model.state_dict(), save_path)
print("saved new best metric model")
print(
"current epoch: {} current accuracy: {:.4f} best val accuracy: {:.4f} at epoch {}".format(
epoch + 1, metric, best_metric, best_metric_epoch
))
print('current train accuracy: {:.4f}, num_correct: {}, num_count:{}'.
format(t_metric, t_num_correct, t_metric_count ))
writer.add_scalar("val_accuracy", metric, epoch + 1)
print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}")
writer.close()
if __name__ == "__main__":
# 5 folder
for i in range(5):
folder = 'exp'+str(i)
train_path = './data/'+ folder +'/train.csv'
val_path = './data/'+ folder + '/val.csv'
train(train_path, val_path, stage=folder)

how store train_loss and valid_loss separably from epoch_loss?

I am trying to store the train_loss and valid_loss separably from epoch_loss as epoch_loss is return back the two loss values(first is train loss and second is valid loss). the epoch_loss is a float64 object. I tried to convert it to a numpy array and then access to the each slice but again return me two values.
this is snippet
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001) # 1e-3
# Decay LR by a factor of 0.1 every 4 epochs.
#step size: Period of learning rate decay.
#gamma = Multiplicative factor of learning rate decay. Default: 0.1, should
float
scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
def train_model(model, criterion, optimizer, scheduler, num_epochs=4):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs): # loop over the dataset multiple times
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 30)
# Each epoch has a training and validation phase
for phase in ['train', 'valid']:
if phase == 'train':
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
train_loss = 0.0
total_train = 0
correct_train = 0
#iterate over data
for t_image, mask, image_paths, target_paths in dataLoaders[phase]:
# get the inputs
t_image = t_image.to(device)
mask = mask.to(device)
# zeroes the gradient buffers of all parameters
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(t_image)
_, predicted = torch.max(outputs.data, 1)
loss = criterion(outputs, mask) # calculate the loss
# backward + optimize only if in training phase
if phase == 'train':
loss.backward() # back propagation
optimizer.step() # update gradients
# accuracy
train_loss += loss.item()
total_train += mask.nelement() # number of pixel in the batch
correct_train += predicted.eq(mask.data).sum().item() # sum all precited pixel values
epoch_loss = train_loss / len(dataLoaders[phase].dataset)
epoch_acc = (correct_train / total_train)
print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
#deep copy the model
if phase == 'valid' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
torch.save(model.state_dict(), 'train_valid_exp1.pth')
# load best model weights
model.load_state_dict(best_model_wts)

how to get reproducible result in Tensorflow

I built 5-layer neural network by using tensorflow.
I have a problem to get reproducible results (or stable results).
I found similar questions regarding reproducibility of tensorflow and the corresponding answers, such as How to get stable results with TensorFlow, setting random seed
But the problem is not solved yet.
I also set random seed like the following
tf.set_random_seed(1)
Furthermore, I added seed options to every random function such as
b1 = tf.Variable(tf.random_normal([nHidden1], seed=1234))
I confirmed that the first epoch shows the identical results, but not identical from the second epoch little by little.
How can I get the reproducible results?
Am I missing something?
Here is a code block I use.
def xavier_init(n_inputs, n_outputs, uniform=True):
if uniform:
init_range = tf.sqrt(6.0 / (n_inputs + n_outputs))
return tf.random_uniform_initializer(-init_range, init_range, seed=1234)
else:
stddev = tf.sqrt(3.0 / (n_inputs + n_outputs))
return tf.truncated_normal_initializer(stddev=stddev, seed=1234)
import numpy as np
import tensorflow as tf
import dataSetup
from scipy.stats.stats import pearsonr
tf.set_random_seed(1)
x_train, y_train, x_test, y_test = dataSetup.input_data()
# Parameters
learningRate = 0.01
trainingEpochs = 1000000
batchSize = 64
displayStep = 100
thresholdReduce = 1e-6
thresholdNow = 0.6
#dropoutRate = tf.constant(0.7)
# Network Parameter
nHidden1 = 128 # number of 1st layer nodes
nHidden2 = 64 # number of 2nd layer nodes
nInput = 24 #
nOutput = 1 # Predicted score: 1 output for regression
# save parameter
modelPath = 'model/model_layer5_%d_%d_mini%d_lr%.3f_noDrop_rollBack.ckpt' %(nHidden1, nHidden2, batchSize, learningRate)
# tf Graph input
X = tf.placeholder("float", [None, nInput])
Y = tf.placeholder("float", [None, nOutput])
# Weight
W1 = tf.get_variable("W1", shape=[nInput, nHidden1], initializer=xavier_init(nInput, nHidden1))
W2 = tf.get_variable("W2", shape=[nHidden1, nHidden2], initializer=xavier_init(nHidden1, nHidden2))
W3 = tf.get_variable("W3", shape=[nHidden2, nHidden2], initializer=xavier_init(nHidden2, nHidden2))
W4 = tf.get_variable("W4", shape=[nHidden2, nHidden2], initializer=xavier_init(nHidden2, nHidden2))
WFinal = tf.get_variable("WFinal", shape=[nHidden2, nOutput], initializer=xavier_init(nHidden2, nOutput))
# biases
b1 = tf.Variable(tf.random_normal([nHidden1], seed=1234))
b2 = tf.Variable(tf.random_normal([nHidden2], seed=1234))
b3 = tf.Variable(tf.random_normal([nHidden2], seed=1234))
b4 = tf.Variable(tf.random_normal([nHidden2], seed=1234))
bFinal = tf.Variable(tf.random_normal([nOutput], seed=1234))
# Layers for dropout
L1 = tf.nn.relu(tf.add(tf.matmul(X, W1), b1))
L2 = tf.nn.relu(tf.add(tf.matmul(L1, W2), b2))
L3 = tf.nn.relu(tf.add(tf.matmul(L2, W3), b3))
L4 = tf.nn.relu(tf.add(tf.matmul(L3, W4), b4))
hypothesis = tf.add(tf.matmul(L4, WFinal), bFinal)
print "Layer setting DONE..."
# define loss and optimizer
cost = tf.reduce_mean(tf.square(hypothesis - Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(cost)
# Initialize the variable
init = tf.initialize_all_variables()
# save op to save and restore all the variables
saver = tf.train.Saver()
with tf.Session() as sess:
# initialize
sess.run(init)
print "Initialize DONE..."
# Training
costPrevious = 100000000000000.0
best = float("INF")
totalBatch = int(len(x_train)/batchSize)
print "Total Batch: %d" %totalBatch
for epoch in range(trainingEpochs):
#print "EPOCH: %04d" %epoch
avgCost = 0.
for i in range(totalBatch):
np.random.seed(i+epoch)
randidx = np.random.randint(len(x_train), size=batchSize)
batch_xs = x_train[randidx,:]
batch_ys = y_train[randidx,:]
# Fit traiing using batch data
sess.run(optimizer, feed_dict={X:batch_xs, Y:batch_ys})
# compute average loss
avgCost += sess.run(cost, feed_dict={X:batch_xs, Y:batch_ys})/totalBatch
# compare the current cost and the previous
# if current cost > the previous
# just continue and make the learning rate half
#print "Cost: %1.8f --> %1.8f at epoch %05d" %(costPrevious, avgCost, epoch+1)
if avgCost > costPrevious + .5:
#sess.run(init)
load_path = saver.restore(sess, modelPath)
print "Cost increases at the epoch %05d" %(epoch+1)
print "Cost: %1.8f --> %1.8f" %(costPrevious, avgCost)
continue
costNow = avgCost
reduceCost = abs(costPrevious - costNow)
costPrevious = costNow
#Display logs per epoch step
if costNow < best:
best = costNow
bestMatch = sess.run(hypothesis, feed_dict={X:x_test})
# model save
save_path = saver.save(sess, modelPath)
if epoch % displayStep == 0:
print "step {}".format(epoch)
pearson = np.corrcoef(bestMatch.flatten(), y_test.flatten())
print 'train loss = {}, current loss = {}, test corrcoef={}'.format(best, costNow, pearson[0][1])
if reduceCost < thresholdReduce or costNow < thresholdNow:
print "Epoch: %04d, Cost: %.9f, Prev: %.9f, Reduce: %.9f" %(epoch+1, costNow, costPrevious, reduceCost)
break
print "Optimization Finished"

It seems that your results are perhaps not reproducible because you are using Saver to write/restore from checkpoint each time? (i.e. the second time that you run the code, the variable values aren't initialized using your random seed -- they are restored from your previous checkpoint)
Please trim down your code example to just the code necessary to reproduce irreproducibility.

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

RuntimeError: CUDA error: device-side assert triggered CUDA kernel errors might be asynchronously reported at some other API call - image

Related

Using GEKKO for Moving Horizon Estimation online 2

MSE Loss is reducing with a very small amount during training pytorch

When using the five-fold cross validation to train the network, some folds perform well and some perform poorly, how can I do

how store train_loss and valid_loss separably from epoch_loss?

how to get reproducible result in Tensorflow

Categories

Resources