I tried SGD, Adadelta, AdaBound, and Adam; every optimizer gives me fluctuations in validation accuracy. I also tried all the activation functions available in Keras, but I am still getting fluctuations in val_acc.
Training samples: 1352
Validation Samples: 339
[Plot: validation accuracy over epochs]
# imports assumed from tf.keras; adjust to plain keras if that is what the rest of the code uses
from tensorflow.keras.layers import (Input, Conv2D, Activation, BatchNormalization,
                                     MaxPooling2D, Dropout, Flatten, Dense)
from tensorflow.keras.models import Model

# input_shape, channel_dim and nc (number of classes) are defined elsewhere in my code

# first (and only) CONV => RELU => POOL block
inpt = Input(shape = input_shape)
x = Conv2D(32, (3, 3), padding = "same")(inpt)
x = Activation("swish")(x)
x = BatchNormalization(axis = channel_dim)(x)
x = MaxPooling2D(pool_size = (3, 3))(x)
# x = Dropout(0.25)(x)
# first CONV => RELU => CONV => RELU => POOL block
x = Conv2D(64, (3, 3), padding = "same")(x)
x = Activation("swish")(x)
x = BatchNormalization(axis = channel_dim)(x)
x = Conv2D(64, (3, 3), padding = "same")(x)
x = Activation("swish")(x)
x = BatchNormalization(axis = channel_dim)(x)
x = MaxPooling2D(pool_size = (2, 2))(x)
# x = Dropout(0.25)(x)
# second CONV => RELU => CONV => RELU => POOL Block
x = Conv2D(128, (3, 3), padding = "same")(x)
x = Activation("swish")(x)
x = BatchNormalization(axis = channel_dim)(x)
x = Conv2D(128, (3, 3), padding = "same")(x)
x = Activation("swish")(x)
x = BatchNormalization(axis = channel_dim)(x)
x = MaxPooling2D(pool_size = (2, 2))(x)
# x = Dropout(0.25)(x)
# first (and only) FC layer
x = Flatten()(x) # Change to GlobalMaxPooling2D
x = Dense(256, activation = 'swish')(x)
x = BatchNormalization(axis = channel_dim)(x)
x = Dropout(0.4)(x)
x = Dense(128, activation = 'swish')(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)
x = Dense(64, activation = 'swish')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(32, activation = 'swish')(x)
x = BatchNormalization()(x)
x = Dense(nc, activation = 'softmax')(x)
model = Model(inputs=inpt, outputs = x)
model.compile(loss = 'categorical_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])
Your model may be too noise-sensitive; see this answer.
Based on the answer in the link and what I see of your model, your network may be too deep for the amount of data you have (large model + not enough data ==> overfitting ==> noise sensitivity). I suggest using a simpler model as a sanity check.
The learning rate could also be a possible cause (as stated by Neb). You are using the default learning rate of SGD (0.01), which may be too high. Try 1e-3 or lower.
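For instance, a stripped-down sanity-check version of your model with an explicit, smaller learning rate could look roughly like this (just a sketch reusing your input_shape and nc variables, not a tuned architecture; learning_rate is the tf.keras argument name, older Keras uses lr):
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD

# Minimal sanity-check model: one conv block, no deep FC stack.
inpt = Input(shape=input_shape)
x = Conv2D(32, (3, 3), padding="same", activation="relu")(inpt)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = GlobalAveragePooling2D()(x)
x = Dense(nc, activation="softmax")(x)

model = Model(inputs=inpt, outputs=x)
# Explicit, smaller learning rate instead of the SGD default of 0.01.
model.compile(loss="categorical_crossentropy",
              optimizer=SGD(learning_rate=1e-3),
              metrics=["accuracy"])
If this small model trains smoothly, the fluctuations are more likely coming from model capacity or learning rate than from the data itself.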
I have an object detection task that requires recognising multiple objects in an image. The idea is to define a simple network from scratch (i.e. without using ready-made implementations of YOLO or similar algorithms). Our approach was to define an architecture able to detect a single bounding box, and then to run several independent copies of the same model on the data in parallel. Our architecture is defined as follows:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiObjectNet(nn.Module):
    def __init__(self, image_width, image_height, num_classes=13, num_boxes=5, device=torch.device('mps')):
        super(MultiObjectNet, self).__init__()
        self.device = device
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.initial_conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=4, stride=2, padding=1, device=self.device)
        self.initial_conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=4, stride=2, padding=1, device=self.device)
        self.initial_conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=2, stride=2, device=self.device)
        self.no_pools1 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, device=self.device)
        self.no_pools2 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=3, device=self.device)
        self.fc1 = nn.Linear(in_features=1024, out_features=512, device=self.device)
        self.fc2 = nn.Linear(in_features=512, out_features=256, device=self.device)
        self.fc3 = nn.Linear(in_features=256, out_features=128, device=self.device)
        self.fc4 = nn.Linear(in_features=128, out_features=64, device=self.device)
        self.pc_layer = nn.Linear(in_features=64, out_features=1, device=self.device)
        self.box_layer = nn.Linear(in_features=64, out_features=4, device=self.device)
        self.category_layer = nn.Linear(in_features=64, out_features=num_classes, device=self.device)
        self.sigmoid = nn.Sigmoid()
        self.num_classes = num_classes
        self.num_boxes = num_boxes

    def __device__(self):
        return self.device

    def forward(self, x):
        x = self.initial_conv1(x)
        x = F.relu(x)
        x = self.pool(x)
        x = self.initial_conv2(x)
        x = F.relu(x)
        x = self.pool(x)
        x = self.initial_conv3(x)
        x = F.relu(x)
        x = self.pool(x)
        x = self.no_pools1(x)
        x = F.relu(x)
        x = self.no_pools2(x)
        x = F.relu(x)
        # collapse the remaining 1x1 spatial dimensions -> (batch, 1024)
        x = torch.transpose(x, 1, 3)[::, 0][:, 0]
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.fc4(x)
        x = F.relu(x)
        return self.sigmoid(self.pc_layer(x)), self.box_layer(x), self.category_layer(x)
The network outputs a triple $(P_C, [x_{min}, y_{min}, x_{max}, y_{max}], CAT)$, where the inner list contains the coordinates of the bounding box and $CAT$ is a probability distribution over the classes of the dataset.
The training loop and loss calculation are as follows:
import pandas as pd
import torch
from tqdm import tqdm

# resizing_width, resizing_height, max_boxes, num_classes and train_dataloader are defined elsewhere
nets = [MultiObjectNet(resizing_width, resizing_height, device=torch.device('mps')) for _ in range(max_boxes)]
loss_df = pd.DataFrame(index=list(range(100)), columns=["loss"])

learning_rate = 1e-02
optims = [torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9) for net in nets]

bce_loss = torch.nn.BCELoss()
mse_loss = torch.nn.MSELoss()
xent_loss = torch.nn.CrossEntropyLoss()

for epoch in range(5):
    running_loss = 0.0
    for batch_number, batch in tqdm(enumerate(train_dataloader)):
        inputs, labels = batch
        batch_size = len(inputs)
        # one independent model/optimizer pair per bounding box
        for i, (optim, net, label) in enumerate(zip(optims, nets, labels)):
            pred_pc, pred_box, pred_category = net(inputs)
            y_pc, y_box, y_category = (label[0].reshape(batch_size, 1),
                                       label[1].reshape(batch_size, 4),
                                       label[2].reshape(batch_size, num_classes))
            optim.zero_grad()
            confidence_loss = bce_loss(pred_pc, y_pc)
            box_loss = mse_loss(pred_box, y_box)
            category_loss = xent_loss(pred_category, y_category)
            loss = confidence_loss + box_loss + category_loss
            loss.backward()
            optim.step()
            running_loss += loss.item()
My model fails to generalise and fixates on a single output no matter what input it is given (it always produces very similar confidence probabilities, boxes, and category distributions). Looking at the first-layer parameters, I can see that the gradients are not None, so I assume some kind of learning is happening, but I can't understand what is going on.
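For reference, this is roughly how I inspected the outputs and gradients (a simplified sketch; net and inputs stand for one of the models and one batch from the loop above):
import torch

# Do the predictions actually differ across the batch?
with torch.no_grad():
    pc, box, cat = net(inputs)
    print("pc std:", pc.std(dim=0))    # ~0 means the same confidence for every image
    print("box std:", box.std(dim=0))  # ~0 means the same box for every image

# After loss.backward(): how large are the gradients in each layer?
for name, param in net.named_parameters():
    if param.grad is not None:
        print(name, param.grad.abs().mean().item())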
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
image_size = (180, 180)
batch_size = 32
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "PetImages",
    validation_split=0.2,
    subset="training",
    seed=1337,
    image_size=image_size,
    batch_size=batch_size,
)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "PetImages",
    validation_split=0.2,
    subset="validation",
    seed=1337,
    image_size=image_size,
    batch_size=batch_size,
)

data_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.1),
    ]
)
train_ds = train_ds.prefetch(buffer_size=32)
val_ds = val_ds.prefetch(buffer_size=32)
def make_model(input_shape, num_classes):
    inputs = keras.Input(shape=input_shape)
    # Image augmentation block
    x = data_augmentation(inputs)

    # Entry block
    x = layers.Rescaling(1.0 / 255)(x)
    x = layers.Conv2D(32, 3, strides=2, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.Conv2D(64, 3, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    previous_block_activation = x  # Set aside residual

    for size in [128, 256, 512, 728]:
        x = layers.Activation("relu")(x)
        x = layers.SeparableConv2D(size, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.Activation("relu")(x)
        x = layers.SeparableConv2D(size, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.MaxPooling2D(3, strides=2, padding="same")(x)

        # Project residual
        residual = layers.Conv2D(size, 1, strides=2, padding="same")(
            previous_block_activation
        )
        x = layers.add([x, residual])  # Add back residual
        previous_block_activation = x  # Set aside next residual

    x = layers.SeparableConv2D(1024, 3, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.GlobalAveragePooling2D()(x)
    if num_classes == 2:
        activation = "sigmoid"
        units = 1
    else:
        activation = "softmax"
        units = num_classes

    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(units, activation=activation)(x)
    return keras.Model(inputs, outputs)
model = make_model(input_shape=image_size + (3,), num_classes=2)
keras.utils.plot_model(model, show_shapes=True)
epochs = 50
callbacks = [
    keras.callbacks.ModelCheckpoint("save_at_{epoch}.h5"),
]
model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.fit(
    train_ds, epochs=epochs, callbacks=callbacks, validation_data=val_ds,
)
So the strategy was to begin the model with the data_augmentation preprocessor, followed by a Rescaling layer, and to add a dropout layer before the final classification layer, as shown in the make_model function.
For training, as you can see, I set epochs=50 and used buffered prefetching for my input data so it can be yielded from disk without I/O blocking. The rest of the parameters are pretty standard, nothing too complicated, but when I run my code each epoch takes approximately 40 minutes and I don't know why.
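For reference, the prefetching I am referring to is just this part of the pipeline (a sketch; the AUTOTUNE variant shown here is a common alternative and not what the code above actually uses, which has a fixed buffer of 32):
import tensorflow as tf

# On older TF versions use tf.data.experimental.AUTOTUNE instead.
AUTOTUNE = tf.data.AUTOTUNE

# Overlap data loading with training so the model is not blocked on disk I/O.
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)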
Any suggestions?
I implemented the same part in two ways, PyTorch and TensorFlow 2.3, but in the 'for loop' part PyTorch is 10 times faster than TensorFlow 2.3, and I don't know why. Could you please help me with that?
The 'for loop' part in TF 2.3:
for i in range(seq_len):
    m_t = mels[:, i, :]
    a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
    x = tf.concat([x, m_t, a1_t], axis=1)
    x = self.I(x)
    _, h1 = rnn1(tf.expand_dims(x, axis=1))
    x = x + h1
    inp = tf.concat([x, a2_t], axis=1)
    _, h2 = rnn2(tf.expand_dims(inp, axis=1))
    x = x + h2
    x = tf.concat([x, a3_t], axis=1)
    x = tf.nn.relu(self.fc1(x))
    x = tf.concat([x, a4_t], axis=1)
    x = tf.nn.relu(self.fc2(x))
    logits = self.fc3(x)
    if self.mode == 'RAW':
        posterior = tf.nn.softmax(logits, axis=1)
        distrib = tfp.distributions.Categorical(posterior, dtype=tf.float32)
        sample = 2 * distrib.sample() / (self.n_classes - 1.) - 1.
        output.append(sample)
        x = tf.expand_dims(sample, axis=-1)
    else:
        raise RuntimeError("Unknown model mode value - ", self.mode)
    if i % 100 == 0:
        gen_rate = (i + 1) / (time.time() - start) * b_size_np / 1000
        progress_callback(i, seq_len, b_size, gen_rate)
The loop takes about 50 s to finish.
The 'for loop' part in PyTorch:
for i in range(seq_len):
    m_t = mels[:, i, :]
    a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
    x = torch.cat([x, m_t, a1_t], dim=1)
    x = self.I(x)
    h1 = rnn1(x, h1)
    x = x + h1
    inp = torch.cat([x, a2_t], dim=1)
    h2 = rnn2(inp, h2)
    x = x + h2
    x = torch.cat([x, a3_t], dim=1)
    x = F.relu(self.fc1(x))
    x = torch.cat([x, a4_t], dim=1)
    x = F.relu(self.fc2(x))
    logits = self.fc3(x)
    if self.mode == 'RAW':
        posterior = F.softmax(logits, dim=1)
        distrib = torch.distributions.Categorical(posterior)
        sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
        output.append(sample)
        x = sample.unsqueeze(-1)
    else:
        raise RuntimeError("Unknown model mode value - ", self.mode)
The loop takes just 5 s to finish!
I really need help understanding how to change the TF 2.3 code. Thanks!
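From what I have read, one possible explanation (an assumption on my part, not verified) is that TF 2.x runs this Python loop eagerly, dispatching each op through Python, whereas wrapping the per-step computation in tf.function traces it into a graph. A toy comparison of the same step run eagerly and inside tf.function (the names dense, step_eager and step_graph are made up for this sketch):
import time
import tensorflow as tf

dense = tf.keras.layers.Dense(256)

def step_eager(x):
    return tf.nn.relu(dense(x))

@tf.function
def step_graph(x):
    return tf.nn.relu(dense(x))

x = tf.random.normal([4, 256])

for fn, name in [(step_eager, "eager"), (step_graph, "tf.function")]:
    fn(x)  # warm-up; for tf.function this also traces the graph
    t0 = time.time()
    for _ in range(1000):
        fn(x)
    print(name, time.time() - t0)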
I am trying to run this model, but I keep getting the error below. There seems to be some mistake with regard to the shape of the input data; I have played around with it but I still get these errors.
Error:
ValueError: Input 0 of layer sequential is incompatible with the layer: expected axis -1 of input shape to have value 1 but received input with shape (None, 32, 32, 3)
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Image size
img_width = 32
img_height = 32

# Define X as the feature variable and Y as the class label
X = []
Y = []
for features, label in data_set:
    X.append(features)
    Y.append(label)

X = np.array(X).reshape(-1, img_width, img_height, 3)
Y = np.array(Y)
print(X.shape) # Output: (4943, 32, 32, 3)
print(Y.shape) # Output: (4943,)
# Normalize the pixels
X = X/255.0
# Build the model
cnn = Sequential()
cnn.add(keras.Input(shape = (32,32,1)))
cnn.add(Conv2D(32, (3, 3), activation = "relu", input_shape = X.shape[1:]))
cnn.add(MaxPooling2D(pool_size = (2, 2)))
cnn.add(Conv2D(32, (3, 3), activation = "relu",input_shape = X.shape[1:]))
cnn.add(MaxPooling2D(pool_size = (2, 2)))
cnn.add(Conv2D(64, (3,3), activation = "relu",input_shape = X.shape[1:]))
cnn.add(MaxPooling2D(pool_size = (2,2)))
cnn.add(Flatten())
cnn.add(Dense(activation = "relu", units = 150))
cnn.add(Dense(activation = "relu", units = 50))
cnn.add(Dense(activation = "relu", units = 10))
cnn.add(Dense(activation = 'softmax', units = 1))
cnn.summary()
cnn.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
# Model fit
cnn.fit(X, Y, epochs = 15)
I tried reading about this issue, but still didn't understand it very well.
Your input shape should be (32, 32, 3). Y is your label array; I assume it contains N unique integer values, where N is the number of classes. If N = 2 you can treat this as a binary classification problem, in which case your code for the top layer should be
cnn.add(Dense(1, activation = 'sigmoid'))
and your compile call should be
cnn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
If you have more than 2 classes, then your code should be
cnn.add(Dense(N, activation = 'softmax'))
cnn.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
where N is the number of classes.
Change this line (the last dimension):
cnn.add(keras.Input(shape = (32,32,3)))
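Putting both suggestions together, a corrected version of your model could look roughly like this (a sketch, not a tuned setup; N is derived from your integer labels in Y, and the imports mirror the ones your code already relies on):
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

N = len(np.unique(Y))  # number of classes, taken from the integer labels

cnn = Sequential()
cnn.add(keras.Input(shape=(32, 32, 3)))        # last dimension matches the RGB images
cnn.add(Conv2D(32, (3, 3), activation="relu"))
cnn.add(MaxPooling2D(pool_size=(2, 2)))
cnn.add(Conv2D(64, (3, 3), activation="relu"))
cnn.add(MaxPooling2D(pool_size=(2, 2)))
cnn.add(Flatten())
cnn.add(Dense(150, activation="relu"))
cnn.add(Dense(50, activation="relu"))
cnn.add(Dense(N, activation="softmax"))        # one output unit per class
cnn.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
cnn.fit(X, Y, epochs=15)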
I have trained a CNN+LSTM encoder-decoder model with attention, using different numbers of layers.
The problem I am facing is very strange to me: the validation loss fluctuates around 3.***, as can be seen from the loss graphs below. One configuration has a 3-layer CNN + 1-layer BLSTM encoder and a 1-layer LSTM decoder;
the other has a 3-layer CNN + 2-layer BLSTM encoder and a 1-layer LSTM decoder.
I have also tried weight decay values from 0.1 down to 0.000001, but I still get this type of loss graph. Note that the accuracy of the model is increasing on both the validation set and the training set. How is it possible that the validation loss is still around 3 while the accuracy is increasing? Can someone explain this?
Thanks
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    def __init__(self, height, width, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.height = height
        self.enc_hid_dim = enc_hid_dim
        self.width = width
        self.layer0 = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.MaxPool2d(2, 2),
        )
        self.layer1 = nn.Sequential(
            nn.Conv2d(8, 32, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2, 2),
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2, 2),
        )
        self.rnn = nn.LSTM(self.height // 8 * 64, self.enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        self.cnn_dropout = nn.Dropout(p=0.2)

    def forward(self, src, in_data_len, train):
        batch_size = src.shape[0]
        out = self.layer0(src)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.dropout(out)        # torch.Size([batch, channel, h, w])
        out = out.permute(3, 0, 2, 1)  # (width, batch, height, channels)
        out = out.contiguous()
        out = out.reshape(-1, batch_size, self.height // 8 * 64)  # (w, batch, height*channels)
        width = out.shape[0]
        src_len = in_data_len.numpy() * (width / self.width)
        src_len = src_len + 0.999      # avoid zero lengths when casting float to int
        src_len = src_len.astype('int')
        out = pack_padded_sequence(out, src_len.tolist(), batch_first=False)
        outputs, hidden_out = self.rnn(out)
        hidden = hidden_out[0]
        cell = hidden_out[1]
        # outputs: (t, b, f*2); hidden: (2, b, f)
        outputs, output_len = pad_packed_sequence(outputs, batch_first=False)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        cell = torch.tanh(self.fc(torch.cat((cell[-2, :, :], cell[-1, :, :]), dim=1)))
        return outputs, hidden, cell, output_len
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs, train):
        input = torch.topk(input, 1)[1]
        embedded = self.embedding(input)
        if train:
            embedded = self.dropout_layer(embedded)
        embedded = embedded.permute(1, 0, 2)
        # embedded = [1, batch size, emb dim]
        a = self.attention(hidden, encoder_outputs)
        # a = [batch size, src len]
        a = a.unsqueeze(1)
        # a = [batch size, 1, src len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs = [batch size, src len, enc hid dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        # weighted = [1, batch size, enc hid dim * 2]
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden_out = self.rnn(rnn_input, (hidden.unsqueeze(0), cell.unsqueeze(0)))
        hidden = hidden_out[0]
        cell = hidden_out[1]
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden.squeeze(0), cell.squeeze(0)