Major performance differences when using the tokenizer in huggingface - huggingface-transformers

I'm using the MRPC dataset from GLUE:
from transformers import AutoTokenizer, AutoModelForTokenClassification, set_seed, AutoModelForMultipleChoice, AutoModelForMaskedLM, BertForSequenceClassification, AutoModel
from datasets import load_dataset
mrpc = load_dataset('glue', 'mrpc')
mrpc_test = mrpc['test']
and the following tokenizer and model
baseline_model_name = 'Intel/bert-base-uncased-mrpc'
baseline_model = BertForSequenceClassification.from_pretrained(baseline_model_name).to(device)
baseline_tokenizer = AutoTokenizer.from_pretrained(baseline_model_name, truncation=True)
The issue is that I get one accuracy when running
successes = 0
batch_size = 15
for sample in range(0, len(mrpc_test), batch_size):
    tok = baseline_tokenizer(mrpc_test[sample:sample+batch_size]['sentence1'], mrpc_test[sample:sample+batch_size]['sentence2'], return_tensors="pt", padding=True)
    logits = baseline_model(**tok.to(device)).logits
    results_softmax = torch.softmax(logits, dim=1)
    prediction = torch.argmax(results_softmax, dim=1)
    labels = mrpc_test[sample:sample+batch_size]['label']
    successes += torch.sum(torch.tensor(labels).to(device) == prediction)
print(f'final accuracy: {successes/len(mrpc_test)}')
>>>final accuracy: 0.8307245969772339
but a different accuracy if I run
successes = 0
batch_size = 15
for sample in range(0, len(mrpc_test), batch_size):
    sentences = [f'{s1} [SEP] {s2}' for s1, s2 in zip(mrpc_test[sample:sample+batch_size]['sentence1'], mrpc_test[sample:sample+batch_size]['sentence2'])]
    tok = baseline_tokenizer(sentences, return_tensors="pt", padding=True)
    logits = baseline_model(**tok.to(device)).logits
    results_softmax = torch.softmax(logits, dim=1)
    prediction = torch.argmax(results_softmax, dim=1)
    labels = mrpc_test[sample:sample+batch_size]['label']
    successes += torch.sum(torch.tensor(labels).to(device) == prediction)
print(f'final accuracy: {successes/len(mrpc_test)}')
>>>final accuracy: 0.49159419536590576
Note that the only difference is that instead of passing the two sentences to the tokenizer as a pair, I join them into a single string myself:
sentences = [f'{s1} [SEP] {s2}' for s1, s2 in zip(mrpc_test[sample:sample+batch_size]['sentence1'],mrpc_test[sample:sample+batch_size]['sentence2'])]
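The drop is most likely because the two encodings are not equivalent for BERT. When the sentences are passed to the tokenizer as a pair, the second sentence is marked with token_type_ids = 1 (segment B), which is what the MRPC-finetuned model saw during training; when they are joined into one string with a literal "[SEP]", every token gets token_type_ids = 0, so the model loses the pair structure. A minimal check, assuming the baseline_tokenizer defined above (the example sentences are made up):

# Encode one pair both ways and compare what the tokenizer actually produces
pair_enc = baseline_tokenizer("He went home.", "He returned home.")
manual_enc = baseline_tokenizer("He went home. [SEP] He returned home.")
# Passed as a pair: the second sentence is segment B (token_type_ids == 1)
print(pair_enc["token_type_ids"])
# Joined manually: everything is segment 0, so the pair information is gone
print(manual_enc["token_type_ids"])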

Related

Comparing Biweekly HFMD Cases with and without using the Squared Error Objective & L1-Norm Objective

I wish to model the biweekly HFMD cases in Malaysia.
I then want to show that the model using the squared error objective and the L1-norm objective can model the biweekly HFMD cases better than the model without these objectives.
My question is: is it possible to model the biweekly HFMD cases without using the squared error objective and the L1-norm objective?
I have attached the code below:
from gekko import GEKKO
import numpy as np
import matplotlib.pyplot as plt
m1 = GEKKO(remote=False)
m2 = GEKKO(remote=False)
m = m1
# Known parameters
nb = 26 # Number of biweeks in a year
ny = 3 # Number of years
biweeks = np.zeros((nb,ny*nb+1))
biweeks[0][0] = 1
for i in range(nb):
    for j in range(ny):
        biweeks[i][j*nb+i+1] = 1
# Write csv data file
tm = np.linspace(0,78,79)
# case data
# Malaysia weekly HFMD data from the year 2013 - 2015
cases = np.array([506,506,700,890,1158,1605,1694,1311,1490,1310,1368,\
1009,1097,934,866,670,408,481,637,749,700,648,710,\
740,627,507,516,548,636,750,1066,1339,1565,\
1464,1575,1759,1631,1601,1227,794,774,623,411,\
750,1017,976,1258,1290,1546,1662,1720,1553,1787,1291,1712,2227,2132,\
2550,2140,1645,1743,1296,1153,871,621,570,388,\
347,391,446,442,390,399,421,398,452,470,437,411])
data = np.vstack((tm,cases))
data = data.T
# np.savetxt('measles_biweek_2.csv',data,delimiter=',',header='time,cases')
np.savetxt('hfmd_biweek_2.csv',data,delimiter=',',header='time,cases')
# Load data from csv
# m.time, cases_meas = np.loadtxt('measles_biweek_2.csv', \
m.time, cases_hfmd = np.loadtxt('hfmd_biweek_2.csv', \
delimiter=',',skiprows=1,unpack=True)
# m.Vr = m.Param(value = 0)
# Variables
# m.N = m.FV(value = 3.2e6)
# m.mu = m.FV(value = 7.8e-4)
# m.N = m.FV(value = 3.11861e7)
# m.mu = m.FV(value = 6.42712e-4)
m.N = m.FV(value = 3.16141e7) # Malaysia average total population (2015 - 2017)
m.mu = m.FV(value = 6.237171519e-4) # Malaysia scaled birth rate (births/biweek/total population)
m.rep_frac = m.FV(value = 0.45) # Percentage of underreporting
# Beta values (unknown parameters in the model)
m.beta = [m.FV(value=1, lb=0.1, ub=5) for i in range(nb)]
# Predicted values
m.S = m.SV(value = 0.162492875*m.N.value, lb=0,ub=m.N) # Susceptibles (Kids from 0 - 9 YO: 5137066 people) - Average of 94.88% from total reported cases
m.I = m.SV(value = 7.907863896e-5*m.N.value, lb=0,ub=m.N) #
# m.V = m.Var(value = 2e5)
# measured values
m.cases = m.CV(value = cases_hfmd, lb=0)
# turn on feedback status for CASES
m.cases.FSTATUS = 1
# weight on prior model predictions
m.cases.WMODEL = 0
# meas_gap = deadband that represents level of
# accuracy / measurement noise
db = 100
m.cases.MEAS_GAP = db
for i in range(nb):
    m.beta[i].STATUS = 1
#m.gamma = m.FV(value=0.07)
m.gamma = m.FV(value=0.07)
m.gamma.STATUS = 1
m.gamma.LOWER = 0.05
m.gamma.UPPER = 0.5
m.biweek=[None]*nb
for i in range(nb):
    m.biweek[i] = m.Param(value=biweeks[i])
# Intermediate
m.Rs = m.Intermediate(m.S*m.I/m.N)
# Equations
sum_biweek = sum([m.biweek[i]*m.beta[i]*m.Rs for i in range(nb)])
# m.Equation(m.S.dt()== -sum_biweek + m.mu*m.N - m.Vr)
m.Equation(m.S.dt()== -sum_biweek + m.mu*m.N)
m.Equation(m.I.dt()== sum_biweek - m.gamma*m.I)
m.Equation(m.cases == m.rep_frac*sum_biweek)
# m.Equation(m.V.dt()==-m.Vr)
# options
m.options.SOLVER = 1
m.options.NODES=3
# imode = 5, dynamic estimation
m.options.IMODE = 5
# ev_type = 1 (L1-norm) or 2 (squared error)
m.options.EV_TYPE = 2
# solve model and print solver output
m.solve()
[print('beta['+str(i+1)+'] = '+str(m.beta[i][0])) \
for i in range(nb)]
print('gamma = '+str(m.gamma.value[0]))
# export data
# stack time and avg as column vectors
my_data = np.vstack((m.time,np.asarray(m.beta),m.gamma))
# transpose data
my_data = my_data.T
# save text file with comma delimiter
beta_str = ''
for i in range(nb):
    beta_str = beta_str + ',beta[' + str(i+1) + ']'
header_name = 'time,gamma' + beta_str
##np.savetxt('solution_data.csv',my_data,delimiter=',',\
## header = header_name, comments='')
np.savetxt('solution_data_EVTYPE_'+str(m.options.EV_TYPE)+\
'_gamma'+str(m.gamma.STATUS)+'.csv',\
my_data,delimiter=',',header = header_name)
plt.figure(num=1, figsize=(16,8))
plt.suptitle('Estimation')
plt.subplot(2,2,1)
plt.plot(m.time,m.cases, label='Cases (model)')
plt.plot(m.time,cases_hfmd, label='Cases (measured)')
if m.options.EV_TYPE==2:
    plt.plot(m.time,cases_hfmd+db/2, 'k-.',
             lw=0.5, label=r'$Cases_{db-hi}$')
    plt.plot(m.time,cases_hfmd-db/2, 'k-.',
             lw=0.5, label=r'$Cases_{db-lo}$')
    plt.fill_between(m.time,cases_hfmd-db/2,
                     cases_hfmd+db/2,color='gold',alpha=.5)
plt.legend(loc='best')
plt.ylabel('Cases')
plt.subplot(2,2,2)
plt.plot(m.time,m.S,'r--')
plt.ylabel('S')
plt.subplot(2,2,3)
[plt.plot(m.time,m.beta[i], label='_nolegend_')\
for i in range(nb)]
plt.plot(m.time,m.gamma,'c--', label=r'$\gamma$')
plt.legend(loc='best')
plt.ylabel(r'$\beta, \gamma$')
plt.xlabel('Time')
plt.subplot(2,2,4)
plt.plot(m.time,m.I,'g--')
plt.xlabel('Time')
plt.ylabel('I')
plt.subplots_adjust(hspace=0.2,wspace=0.4)
name = 'cases_EVTYPE_'+ str(m.options.EV_TYPE) +\
'_gamma' + str(m.gamma.STATUS) + '.png'
plt.savefig(name)
plt.show()
To define a custom objective, use the m.Minimize() or m.Maximize() functions instead of the squared error or L1-norm objectives that are built into m.CV() objects, and use m.Var() instead of m.CV(), such as:
from gekko import GEKKO
import numpy as np
m = GEKKO()
x = m.Array(m.Var,4,value=1,lb=1,ub=5)
x1,x2,x3,x4 = x
# change initial values
x2.value = 5; x3.value = 5
m.Equation(x1*x2*x3*x4>=25)
m.Equation(x1**2+x2**2+x3**2+x4**2==40)
m.Minimize(x1*x4*(x1+x2+x3)+x3)
m.solve()
print('x: ', x)
print('Objective: ',m.options.OBJFCNVAL)
Here is a similar problem with disease prediction (Measles) that uses m.CV().
import numpy as np
from gekko import GEKKO
import matplotlib.pyplot as plt
# Import Data
# write csv data file
t_s = np.linspace(0,78,79)
# case data
cases_s = np.array([180,180,271,423,465,523,649,624,556,420,\
423,488,441,268,260,163,83,60,41,48,65,82,\
145,122,194,237,318,450,671,1387,1617,2058,\
3099,3340,2965,1873,1641,1122,884,591,427,282,\
174,127,84,97,68,88,79,58,85,75,121,174,209,458,\
742,929,1027,1411,1885,2110,1764,2001,2154,1843,\
1427,970,726,416,218,160,160,188,224,298,436,482,468])
# Initialize gekko model
m = GEKKO()
# Number of collocation nodes
nodes = 4
# Number of phases (years in this case)
n = 3
#Biweek periods per year
bi = 26
# Time horizon (for all 3 phases)
m.time = np.linspace(0,1,bi+1)
# Parameters that will repeat each year
N = m.Param(3.2e6)
mu = m.Param(7.8e-4)
rep_frac = m.Param(0.45)
Vr = m.Param(0)
beta = m.MV(2,lb = 0.1)
beta.STATUS = 1
gamma = m.FV(value=0.07)
gamma.STATUS = 1
gamma.LOWER = 0.05
gamma.UPPER = 0.5
# Data used to control objective function
casesobj1 = m.Param(cases_s[0:(bi+1)])
casesobj2 = m.Param(cases_s[bi:(2*bi+1)])
casesobj3 = m.Param(cases_s[2*bi:(3*bi+1)])
# Variables that vary between years, one version for each year
cases = [m.CV(value = cases_s[(i*bi):(i+1)*(bi+1)-i],lb=0) for i in range(n)]
for i in cases:
    i.FSTATUS = 1
    i.WMODEL = 0
    i.MEAS_GAP = 100
S = [m.Var(0.06*N,lb = 0,ub = N) for i in range(n)]
I = [m.Var(0.001*N, lb = 0,ub = N) for i in range(n)]
V = [m.Var(2e5) for i in range(n)]
# Equations (created for each year)
for i in range(n):
    R = m.Intermediate(beta*S[i]*I[i]/N)
    m.Equation(S[i].dt() == -R + mu*N - Vr)
    m.Equation(I[i].dt() == R - gamma*I[i])
    m.Equation(cases[i] == rep_frac*R)
    m.Equation(V[i].dt() == -Vr)
# Connect years together at endpoints
for i in range(n-1):
    m.Connection(cases[i+1],cases[i],1,bi,1,nodes)#,1,nodes)
    m.Connection(cases[i+1],'CALCULATED',pos1=1,node1=1)
    m.Connection(S[i+1],S[i],1,bi,1,nodes)
    m.Connection(S[i+1],'CALCULATED',pos1=1,node1=1)
    m.Connection(I[i+1],I[i],1,bi,1,nodes)
    m.Connection(I[i+1],'CALCULATED',pos1=1, node1=1)
# Solver options
m.options.IMODE = 5
m.options.NODES = nodes
m.options.EV_TYPE = 1
m.options.SOLVER = 1
# Solve
m.Obj(2*(casesobj1-cases[0])**2+(casesobj3-cases[2])**2)
m.solve()
# Calculate the start time of each phase
ts = np.linspace(1,n,n)
# Plot
plt.figure()
plt.subplot(4,1,1)
tm = np.empty(len(m.time))
for i in range(n):
    tm = m.time + ts[i]
    plt.plot(tm,cases[i].value,label='Cases Year %s'%(i+1))
    plt.plot(tm,cases_s[(i*bi):(i+1)*(bi+1)-i],'.')
plt.legend()
plt.ylabel('Cases')
plt.subplot(4,1,2)
for i in range(n):
    tm = m.time + ts[i]
    plt.plot(tm,beta.value,label='Beta Year %s'%(i+1))
plt.legend()
plt.ylabel('Contact Rate')
plt.subplot(4,1,3)
for i in range(n):
    tm = m.time + ts[i]
    plt.plot(tm,I[i].value,label='I Year %s'%(i+1))
plt.legend()
plt.ylabel('Infectives')
plt.subplot(4,1,4)
for i in range(n):
    tm = m.time + ts[i]
    plt.plot(tm,S[i].value,label='S Year %s'%(i+1))
plt.legend()
plt.ylabel('Susceptibles')
plt.xlabel('Time (yr)')
plt.show()

I got an error in code to train a model - multivariate time series

I need help with multivariate time series forecasting using GRU/LSTM.
The dataset I am using has about 4000 rows and 7 columns.
I already used this for input shaping:
def create_dataset(X, look_back=1):
    Xs, ys = [], []
    for i in range(len(X) - look_back):
        v = X[i:i+look_back]
        Xs.append(v)
        ys.append(X[i+look_back][0])
    return np.array(Xs), np.array(ys)
LOOK_BACK = 30
X_train, y_train = create_dataset(train_scaled,LOOK_BACK)
X_test, y_test = create_dataset(test_scaled,LOOK_BACK)
print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_test.shape: ', y_test.shape)
This part is for model creation:
from keras.models import Sequential           # imports needed for this snippet
from keras.layers import GRU, Dropout, Dense

def create_gru(units):
    model = Sequential()
    # Input layer
    model.add(GRU(units=units, return_sequences=True,
                  input_shape=[X_train.shape[1], X_train.shape[2]]))
    model.add(Dropout(0.2))
    # Hidden layer
    model.add(GRU(units=units))
    model.add(Dropout(0.2))
    # Output layer
    model.add(Dense(units=1))
    return model
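The compile and fit step is not shown in the question; a sketch of how the model is presumably built and trained (the names model_gru, the unit count, and the optimizer/loss choices are illustrative, not from the original code):

model_gru = create_gru(64)                        # 64 units is an arbitrary choice
model_gru.compile(optimizer='adam', loss='mse')   # typical single-output regression setup
history_gru = model_gru.fit(X_train, y_train, epochs=50, batch_size=32,
                            validation_split=0.2, shuffle=False)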
When I execute, I get an error saying that a non-broadcastable output operand with shape (854,1) doesn't match the broadcast shape (854,7).
The error happens when executing this part:
y_train = scaler.inverse_transform(y_train)
y_test = scaler.inverse_transform(y_test)
def prediction(model):
    prediction = model.predict(X_test)
    prediction = scaler.inverse_transform(prediction)
    return prediction
prediction_gru = prediction(model_gru)
prediction_bilstm = prediction(model_bilstm)
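A common cause of this error is that the scaler was fit on all 7 columns, but inverse_transform() is then called on a single-column array of targets or predictions. One common workaround, sketched below with illustrative names (data, scaler_X, scaler_y are not from the original code), is to fit a separate scaler on the target column only:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

data = np.random.rand(1000, 7)                       # stand-in for the real 7-column dataset
scaler_X = MinMaxScaler().fit(data)                  # scales all 7 features for the model input
scaler_y = MinMaxScaler().fit(data[:, [0]])          # scales only the target column (column 0)

scaled = scaler_X.transform(data)
# ...build X/y with create_dataset() and train the model as in the question...

y_pred_scaled = scaled[:854, [0]]                    # pretend predictions, shape (854, 1)
y_pred = scaler_y.inverse_transform(y_pred_scaled)   # shapes match, no broadcast error
print(y_pred.shape)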

What is the formula being used in the in-sample prediction of statsmodels?

I would like to know what formula is being used in statsmodels ARIMA predict/forecast. For a simple AR(1) model I thought that it would be y_t = a1 * y_t-1. However, I am not able to recreate the results produced by forecast or predict.
Here's what I am trying to do:
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
def ar_series(n):
    # generate the series y_t = a1 * y_t-1 + eps
    np.random.seed(1)
    y0 = np.random.rand()
    y = [y0]
    a1 = 0.7  # the AR coefficient
    for i in range(1, n):
        y.append(a1 * y[i - 1] + 0.3 * np.random.rand())
    return np.array(y)
series = ar_series(10)
model = ARIMA(series, order=(1, 0, 0))
fit = model.fit()
#print(fit.summary())
# const = 0.3441; ar.L1 = 0.6518
print(fit.predict())
y_pred = [0.3441]
for i in range(1, 10):
    y_pred.append(0.6518 * series[i-1])
y_pred = np.array(y_pred)
print(y_pred)
The two series don't match, and I have no idea how the in-sample predictions are being calculated.
Found the answer here. I think what I was trying to do is valid only if the process mean is zero.
https://faculty.washington.edu/ezivot/econ584/notes/forecast.pdf
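For reference, here is a mean-adjusted version of the same loop, which is my understanding of how the in-sample prediction works when a constant is included: the reported const is treated as the process mean mu, and the one-step forecast is y_hat_t = mu + phi * (y_{t-1} - mu). This sketch reuses series and np from the snippet above:

mu, phi = 0.3441, 0.6518                 # const and ar.L1 reported by fit.summary()
y_pred = [mu]
for i in range(1, 10):
    y_pred.append(mu + phi * (series[i - 1] - mu))
print(np.array(y_pred))                  # should now closely match fit.predict()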

When using five-fold cross validation to train the network, some folds perform well and some perform poorly; what can I do?

I am trying to create a binary CNN classifier for a dataset (class 0 = 77 images, class 1 = 41 images) on which I want to do 5-fold cross validation. In each fold, I use the validation set to save the best model, and all folds share the same model, hyperparameters, and training strategy. Here are my results.
fold - test sets accuracy
fold0 - 0.68
fold1 - 0.71
fold2 - 0.91
fold3 - 0.96
fold4 - 0.64
My question is:
When fine-tuning by changing the hyperparameters, I found that fold2 and fold3 performed better every time, while fold0 and fold4 performed poorly. What could be causing this, and what should I do?
One possible problem is that each initialization is random.
Thank you all for your answers.
import os
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data.sampler import WeightedRandomSampler
import monai
from monai.data import NiftiDataset
from monai.transforms import Compose, AddChannel, ScaleIntensity, RandFlip, RandRotate, ToTensor
from monai.data import CSVSaver
from data_process import read_csv, get_sample_weights
def train(train_file, val_file, stage='exp0'):
    '''
    :param train_file:
    :param val_file:
    :param stage:
    :return:
    '''
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    img_src_path = '../samples/T1c_images/'  #
    img_list_train, label_list_train = read_csv(train_file)
    img_list_val, label_list_val = read_csv(val_file)
    img_train = [os.path.join(img_src_path, i) for i in img_list_train]
    labels_train = [int(i) for i in label_list_train]
    img_val = [os.path.join(img_src_path, i) for i in img_list_val]
    labels_val = [int(i) for i in label_list_val]
    print('val images: ', len(img_val))
    # Define transforms
    # train_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((182, 218, 182)), RandRotate90(), ToTensor()])
    # val_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((182, 218, 182)), ToTensor()])
    train_transforms = Compose([ScaleIntensity(), RandRotate(range_x=45, range_y=45, range_z=45, prob=0.5),
                                RandFlip(prob=0.5, spatial_axis=1),
                                AddChannel(), ToTensor()])  # if x=y=z RandRotate90()
    val_transforms = Compose([ScaleIntensity(), AddChannel(), ToTensor()])
    train_ds = NiftiDataset(image_files=img_train, labels=labels_train, transform=train_transforms, image_only=False)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=2,
                              pin_memory=torch.cuda.is_available())
    # create a validation data_process loader
    val_ds = NiftiDataset(image_files=img_val, labels=labels_val, transform=val_transforms, image_only=False)
    val_loader = DataLoader(val_ds, batch_size=4, num_workers=2, pin_memory=torch.cuda.is_available())
    # Create DenseNet121, CrossEntropyLoss and Adam optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = monai.networks.nets.densenet.densenet121(spatial_dims=3, in_channels=1, out_channels=2).to(device)
    model = torch.nn.DataParallel(model)
    loss_function = torch.nn.CrossEntropyLoss(weight=torch.Tensor([1, 1.2])).cuda()
    optimizer = torch.optim.Adam(model.parameters(), 1e-5)
    # start a typical PyTorch training
    epochs = 50
    val_interval = 1
    best_metric = -1
    best_metric_epoch = -1
    writer = SummaryWriter()
    for epoch in range(epochs):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{epochs}")
        model.train()
        epoch_loss = 0
        step = 0
        t_metric_count = 0
        t_num_correct = 0
        for batch_data in train_loader:
            step += 1
            # print images name
            # print('image name', batch_data[2]['filename_or_obj'])
            inputs = batch_data[0].to(device)
            # print(inputs.shape)
            labels = batch_data[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            epoch_len = len(train_ds) // train_loader.batch_size
            # train acc
            t_value = torch.eq(outputs.argmax(dim=1), labels)
            t_metric_count += len(t_value)  #
            t_num_correct += t_value.sum().item()  #
            # print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
        epoch_loss /= step
        t_metric = t_num_correct / t_metric_count
        writer.add_scalar("train_loss", epoch_loss, epoch + 1)
        writer.add_scalar("train_acc", t_metric, epoch + 1)
        print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
        if (epoch + 1) % val_interval == 0:
            model.eval()
            with torch.no_grad():
                num_correct = 0.0
                metric_count = 0
                for val_data in val_loader:
                    val_images, val_labels = val_data[0].to(device), val_data[1].to(device)
                    val_outputs = model(val_images)
                    value = torch.eq(val_outputs.argmax(dim=1), val_labels)
                    metric_count += len(value)  #
                    num_correct += value.sum().item()  #
                metric = num_correct / metric_count
                if metric > best_metric:
                    best_metric = metric
                    best_metric_epoch = epoch + 1
                    save_path = 'checkpoint_07201/' + stage + '_' + str(epoch + 1) + "_best_metric_model.pth"
                    torch.save(model.state_dict(), save_path)
                    print("saved new best metric model")
                print(
                    "current epoch: {} current accuracy: {:.4f} best val accuracy: {:.4f} at epoch {}".format(
                        epoch + 1, metric, best_metric, best_metric_epoch
                    ))
                print('current train accuracy: {:.4f}, num_correct: {}, num_count:{}'.
                      format(t_metric, t_num_correct, t_metric_count))
                writer.add_scalar("val_accuracy", metric, epoch + 1)
    print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}")
    writer.close()
if __name__ == "__main__":
    # 5 folds
    for i in range(5):
        folder = 'exp' + str(i)
        train_path = './data/' + folder + '/train.csv'
        val_path = './data/' + folder + '/val.csv'
        train(train_path, val_path, stage=folder)
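Two things worth checking, sketched below with illustrative names (set_seed and the StratifiedKFold splitting are not part of the original code): fixing the random seeds before each fold and building the folds with stratification, so every fold keeps the same 77/41 class ratio. This removes two common sources of fold-to-fold variance, namely weight initialization and class imbalance in the split.

import random
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold

def set_seed(seed=42):
    # make initialization and shuffling repeatable across folds
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

labels = np.array([0] * 77 + [1] * 41)       # class 0 = 77 images, class 1 = 41 images
indices = np.arange(len(labels))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(indices, labels)):
    set_seed(42)                             # same initialization for every fold
    print(f"fold {fold}: {len(train_idx)} train / {len(val_idx)} val")
    # write train_idx / val_idx out as the per-fold train.csv / val.csv and call train(...)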

Tensorflow/Keras: volatile validation loss

I've been training a U-Net for single-class small lesion segmentation, and have been getting consistently volatile validation loss. I have about 20k images split 70/30 between training and validation sets, so I don't think the issue is too little data. I've tried shuffling and resplitting the sets a few times with no change in volatility, so I don't think the validation set is unrepresentative. I have tried lowering the learning rate with no effect on volatility. And I have tried a few loss functions (dice coefficient, focal tversky, weighted binary cross-entropy). I'm using a decent amount of augmentation so as to avoid overfitting. I've also run through all my data (512x512 float64s with corresponding 512x512 int64 masks, both stored as numpy arrays) to double-check that the value range, dtypes, etc. aren't screwy, and I even removed any ROIs in the masks under 35 pixels in area which I thought might be artifacts messing with the loss.
I'm using Keras ImageDataGenerator.flow_from_directory. I was initially using zca_whitening and brightness_range augmentation, but I think this causes issues with flow_from_directory and the link between mask and image being lost, so I skipped it.
I've tried validation generators with and without shuffle=True. Batch size is 8.
Here's some of my code, happy to include more if it would help:
# loss
from keras.losses import binary_crossentropy
import keras.backend as K
import tensorflow as tf
epsilon = 1e-5
smooth = 1
def dsc(y_true, y_pred):
    smooth = 1.
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    score = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
    return score

def dice_loss(y_true, y_pred):
    loss = 1 - dsc(y_true, y_pred)
    return loss

def bce_dice_loss(y_true, y_pred):
    loss = binary_crossentropy(y_true, y_pred) + dice_loss(y_true, y_pred)
    return loss

def confusion(y_true, y_pred):
    smooth = 1
    y_pred_pos = K.clip(y_pred, 0, 1)
    y_pred_neg = 1 - y_pred_pos
    y_pos = K.clip(y_true, 0, 1)
    y_neg = 1 - y_pos
    tp = K.sum(y_pos * y_pred_pos)
    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)
    prec = (tp + smooth)/(tp+fp+smooth)
    recall = (tp+smooth)/(tp+fn+smooth)
    return prec, recall

def tp(y_true, y_pred):
    smooth = 1
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pos = K.round(K.clip(y_true, 0, 1))
    tp = (K.sum(y_pos * y_pred_pos) + smooth)/ (K.sum(y_pos) + smooth)
    return tp

def tn(y_true, y_pred):
    smooth = 1
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos
    tn = (K.sum(y_neg * y_pred_neg) + smooth) / (K.sum(y_neg) + smooth)
    return tn

def tversky(y_true, y_pred):
    y_true_pos = K.flatten(y_true)
    y_pred_pos = K.flatten(y_pred)
    true_pos = K.sum(y_true_pos * y_pred_pos)
    false_neg = K.sum(y_true_pos * (1-y_pred_pos))
    false_pos = K.sum((1-y_true_pos)*y_pred_pos)
    alpha = 0.7
    return (true_pos + smooth)/(true_pos + alpha*false_neg + (1-alpha)*false_pos + smooth)

def tversky_loss(y_true, y_pred):
    return 1 - tversky(y_true,y_pred)

def focal_tversky(y_true,y_pred):
    pt_1 = tversky(y_true, y_pred)
    gamma = 0.75
    return K.pow((1-pt_1), gamma)
model = BlockModel((len(os.listdir(os.path.join(imageroot,'train_ct','train'))), 512, 512, 1),filt_num=16,numBlocks=4)
#model.compile(optimizer=Adam(learning_rate=0.001), loss=weighted_cross_entropy)
#model.compile(optimizer=Adam(learning_rate=0.001), loss=dice_coef_loss)
model.compile(optimizer=Adam(learning_rate=0.001), loss=focal_tversky)
train_mask = os.path.join(imageroot,'train_masks')
val_mask = os.path.join(imageroot,'val_masks')
model.load_weights(model_weights_path) #I'm initializing with some pre-trained weights from a similar model
data_gen_args_mask = dict(
rotation_range=10,
shear_range=20,
width_shift_range=0.1,
height_shift_range=0.1,
zoom_range=[0.8,1.2],
horizontal_flip=True,
#vertical_flip=True,
fill_mode='nearest',
data_format='channels_last'
)
data_gen_args = dict(
**data_gen_args_mask
)
image_datagen_train = ImageDataGenerator(**data_gen_args)
mask_datagen_train = ImageDataGenerator(**data_gen_args)#_mask)
image_datagen_val = ImageDataGenerator()
mask_datagen_val = ImageDataGenerator()
seed = 1
BS = 8
steps = int(np.floor((len(os.listdir(os.path.join(train_ct,'train'))))/BS))
print(steps)
val_steps = int(np.floor((len(os.listdir(os.path.join(val_ct,'val'))))/BS))
print(val_steps)
train_image_generator = image_datagen_train.flow_from_directory(
train_ct,
target_size = (512, 512),
color_mode = ("grayscale"),
classes=None,
class_mode=None,
seed = seed,
shuffle = True,
batch_size = BS)
train_mask_generator = mask_datagen_train.flow_from_directory(
train_mask,
target_size = (512, 512),
color_mode = ("grayscale"),
classes=None,
class_mode=None,
seed = seed,
shuffle = True,
batch_size = BS)
val_image_generator = image_datagen_val.flow_from_directory(
val_ct,
target_size = (512, 512),
color_mode = ("grayscale"),
classes=None,
class_mode=None,
seed = seed,
shuffle = True,
batch_size = BS)
val_mask_generator = mask_datagen_val.flow_from_directory(
val_mask,
target_size = (512, 512),
color_mode = ("grayscale"),
classes=None,
class_mode=None,
seed = seed,
shuffle = True,
batch_size = BS)
train_generator = zip(train_image_generator, train_mask_generator)
val_generator = zip(val_image_generator, val_mask_generator)
# make callback for checkpointing
plot_losses = PlotLossesCallback(skip_first=0,plot_extrema=False)
%matplotlib inline
filepath = os.path.join(versionPath, model_version + "_saved-model-{epoch:02d}-{val_loss:.2f}.hdf5")
if reduce:
    cb_check = [ModelCheckpoint(filepath, monitor='val_loss',
                                verbose=1, save_best_only=False,
                                save_weights_only=True, mode='auto', period=1),
                reduce_lr,
                plot_losses]
else:
    cb_check = [ModelCheckpoint(filepath, monitor='val_loss',
                                verbose=1, save_best_only=False,
                                save_weights_only=True, mode='auto', period=1),
                plot_losses]
# train model
history = model.fit_generator(train_generator, epochs=numEp,
steps_per_epoch=steps,
validation_data=val_generator,
validation_steps=val_steps,
verbose=1,
callbacks=cb_check,
use_multiprocessing = False
)
And here's how my loss looks:
Another potentially relevant thing: I tweaked the flow_from_directory code a bit (added npy to the white list). But the training loss looks fine, so I'm assuming the issue isn't here.
Two suggestions:
Switch to the classic validation data format (i.e. a numpy array) instead of using a generator -- this will ensure you always use exactly the same validation data every time. If you see a different validation curve, then there is something "random" in the validation generator giving you different data at different epochs.
Use a fixed set of samples (100 or 1000 should be enough without any data augmentation) for both training and validation. If everything goes well, you should see your network quickly overfit to this dataset, and your training and validation curves should be very similar. If not, debug your network.
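A minimal sketch of the first suggestion, assuming the model, numEp, steps, train_generator, and cb_check variables from the training code above; val_image_paths and val_mask_paths are placeholders for however the validation files are enumerated:

import numpy as np

# Build the validation set once as fixed numpy arrays (placeholder file lists),
# so every epoch is evaluated on exactly the same images with no generator randomness.
X_val = np.stack([np.load(p) for p in sorted(val_image_paths)])[..., np.newaxis]
y_val = np.stack([np.load(p) for p in sorted(val_mask_paths)])[..., np.newaxis]

history = model.fit_generator(train_generator, epochs=numEp,
                              steps_per_epoch=steps,
                              validation_data=(X_val, y_val),  # fixed arrays, not val_generator
                              verbose=1,
                              callbacks=cb_check)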

Resources