How to keep the same sequence length across all batches in Hugging Face BERT training - huggingface-transformers

I'm using the code below to train a Hugging Face BERT model. I noticed that different batches have different sequence lengths at training time, but I want every batch to have the same sequence length. How can I do that? And how does Hugging Face handle differing sequence lengths across batches?
from transformers import BertTokenizer
# use the cased checkpoint, consistent with do_lower_case=False
bert_cased_tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)

# Define model
from transformers import BertConfig, BertForPreTraining
config = BertConfig()
model = BertForPreTraining(config)

# Next sentence prediction
from transformers import TextDatasetForNextSentencePrediction
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=bert_cased_tokenizer,
    file_path="/path/to/your/dataset",
    block_size=256)

# MLM
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_cased_tokenizer,
    mlm=True,
    mlm_probability=0.15)

# Train
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="/path/to/output/dir/for/training/arguments",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,  # per_gpu_train_batch_size is deprecated
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset)
trainer.train()
trainer.save_model("path/to/your/model")
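By default, DataCollatorForLanguageModeling pads each batch dynamically to the longest example in that batch, which is why the sequence length differs between batches. To force every batch to the same length, you can pad to a fixed size instead; a minimal sketch, assuming block_size=256 already caps examples at 256 tokens so that pad_to_multiple_of=256 yields a constant length of 256:

data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_cased_tokenizer,
    mlm=True,
    mlm_probability=0.15,
    pad_to_multiple_of=256)  # pads every example up to 256 tokens

Alternatively, tokenize with padding="max_length" so the dataset itself stores fixed-length sequences.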

Related

Forecasting validation loss fluctuation

I have a question for those who have some experience with time-series forecasting.
I have been experimenting in this field for a few weeks, trying to forecast some time series with both ARIMA and LSTM models to compare the results.
I plotted the graph in Figure 1, which has four panels:
Top left: ARIMA training data points and fitted model points.
Top right: ARIMA test and forecast points.
Bottom left: LSTM training data and fitted data (I could not really find fitted points for an LSTM, so I just forecasted the training data; you can ignore that part).
Bottom right: test and forecast data for the LSTM model.
This graph was acceptable, and I also computed the RMSE and MSE; the LSTM gave the lower error, which agrees with most literature online on the superiority of LSTM over ARIMA models.
However, after plotting the loss and validation loss of the LSTM model to get more insight, I noticed that the validation loss follows a weird fluctuating pattern (Figure 2).
I can explain this as follows: the time series has a lot of outliers or abnormal behaviour, so splitting it into train/validation/test means the validation set cannot really be a good measure of how well the model learns.
But since research papers never show this graph or discuss this problem, I don't have a solid argument to defend this idea.
What do you think?
Thank you in advance.
import warnings
import time
import random as rn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from pandas.plotting import register_matplotlib_converters
import pmdarima as pm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import initializers
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional
from keras.preprocessing.sequence import TimeseriesGenerator
import keras_tuner as kt

register_matplotlib_converters()
print(tf.__version__)

# seed everything for reproducibility
np.random.seed(123)
rn.seed(123)
tf.random.set_seed(123)
tf.keras.utils.set_random_seed(123)
warnings.filterwarnings('ignore')
df3 = pd.read_csv('favorita_train.csv')

## 1 - Get TS and do STL
print("TS length: " + str(len(df3)))
results = seasonal_decompose(df3['unit_sales'], period=30)
results.plot();

train_all = df3.iloc[:int(len(df3)*0.8)]
train = df3.iloc[:int(len(df3)*0.6)]
val = df3.iloc[int(len(df3)*0.6):int(len(df3)*0.8)]
test = df3.iloc[int(len(df3)*0.8):]

scaler = MinMaxScaler()
scaler.fit(train_all)  # fit the scaler on training data only to avoid leakage
scaled_all = scaler.transform(df3)
scaled_train = scaler.transform(train)
scaled_train_all = scaler.transform(train_all)
scaled_val = scaler.transform(val)
scaled_test = scaler.transform(test)

# sliding-window generators: each sample is n_input past steps -> next step
n_features = 1
n_input = 5
train_generator_all = TimeseriesGenerator(scaled_train_all, scaled_train_all, length=n_input, batch_size=1, shuffle=True)
train_generator = TimeseriesGenerator(scaled_train, scaled_train, length=n_input, batch_size=1, shuffle=True)
val_generator = TimeseriesGenerator(scaled_val, scaled_val, length=n_input, batch_size=1, shuffle=True)

# demand-pattern diagnostics
adfPValue = adfuller(scaled_all)
adfPValue = adfPValue[1]
adi = len(scaled_all) / ((scaled_all != 0).sum())
sd = scaled_all.std()
mean = scaled_all.mean()
cv2 = np.square(sd/mean)
print("CV2 (describes magnitude of demand variability, <0.5 is good): " + str(cv2))
print("SD (-2,2 is good | mean data variance is low): " + str(sd))
print("ADI (1.3 or smaller means smooth ts): " + str(adi))
print("Stationarity test (stationary if <0.05): " + str(adfPValue))
def model_builder(hp):
    model = keras.Sequential()
    hp_units = hp.Int('units', min_value=1, max_value=50, step=1)
    hp_layers = hp.Int('layers', min_value=1, max_value=3, step=1)
    if hp_layers == 1:
        model.add(Bidirectional(LSTM(hp_units, activation='relu'), input_shape=(n_input, n_features)))
    elif hp_layers == 2:
        model.add(Bidirectional(LSTM(hp_units, activation='relu', return_sequences=True), input_shape=(n_input, n_features)))
        model.add(Bidirectional(LSTM(hp_units, activation='relu')))
    else:
        model.add(Bidirectional(LSTM(hp_units, activation='relu', return_sequences=True), input_shape=(n_input, n_features)))
        for i in range(hp_layers-2):
            model.add(Bidirectional(LSTM(hp_units, activation='relu', return_sequences=True)))
        model.add(Bidirectional(LSTM(hp_units, activation='relu')))
    model.add(Dense(1))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate), loss='mse', metrics=['accuracy'])
    return model
tuner = kt.Hyperband(model_builder,
                     objective='val_loss',
                     max_epochs=300,
                     factor=3,
                     directory='499',
                     project_name='949',
                     seed=123)
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
# batch_size and shuffle are controlled by the generators, so they are not passed here
tuner.search(train_generator, epochs=300, validation_data=val_generator, callbacks=[stop_early])
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.get('units'))
print(best_hps.get('layers'))
print(best_hps.get('learning_rate'))  # note: no 'window' hyperparameter is defined in model_builder

model = tuner.hypermodel.build(best_hps)
history = model.fit(train_generator, epochs=50, validation_data=val_generator)
# the loss is MSE (regression), so pick the epoch with the lowest validation loss
val_loss_per_epoch = history.history['val_loss']
best_epoch = val_loss_per_epoch.index(min(val_loss_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))
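One way to test the outlier hypothesis is to validate on several contiguous windows rather than a single split: if the validation loss varies wildly from fold to fold, the series really is heterogeneous. A minimal walk-forward sketch using scikit-learn's TimeSeriesSplit (scaled_all comes from the code above; the per-fold model fitting is left out):

from sklearn.model_selection import TimeSeriesSplit

# walk-forward validation: each fold trains on an expanding prefix of the
# series and validates on the block that immediately follows it
tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(tscv.split(scaled_all)):
    fold_train, fold_val = scaled_all[train_idx], scaled_all[val_idx]
    print("fold %d: train=%d, val=%d" % (fold, len(fold_train), len(fold_val)))
    # build generators and fit a fresh model per fold, then compare val losses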

Sparse matrix use in PyCaret for NLP

First of all, thank you for letting me use this wonderful library.
I am doing Korean NLP now, so I pre-processed the Korean text and converted it with a TfidfVectorizer.
I want to pass the result to setup() and use it, but I get errors.
Can't I use a sparse matrix with PyCaret?
If not, is there any way to do Korean NLP?
X_train = train_data.Text.tolist()
Y_train = train_data['Label'].values
X_test = test_data.Text.tolist()
Y_test = test_data['Label'].values

from soynlp.word import WordExtractor
word_extractor = WordExtractor(min_frequency=100,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)
word_extractor.train(X_train)  # list of str or similar
words = word_extractor.extract()
scores = word_extractor.word_scores()

import math
score_dict = {key: scores[key].cohesion_forward *
                   math.exp(scores[key].right_branching_entropy)
              for key in scores}

from soynlp.tokenizer import LTokenizer
cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
tokenizer = LTokenizer(scores=score_dict)

import os
from scipy.sparse import save_npz, load_npz
from sklearn.feature_extraction.text import TfidfVectorizer
# if not os.path.isfile('soy_train.npz'):
tfidf = TfidfVectorizer(ngram_range=(1, 2),
                        min_df=3,
                        tokenizer=tokenizer.tokenize,
                        token_pattern=None)
tfidf.fit(X_train)
X_train_soy = tfidf.transform(X_train)
X_test_soy = tfidf.transform(X_test)
save_npz('soy_train.npz', X_train_soy)
save_npz('soy_test.npz', X_test_soy)
type(X_train_soy[0])

import numpy as np
import pandas as pd  # pd is used below but was never imported
train = pd.DataFrame(X_train_soy)
y_train = np.array(Y_train, dtype=float)
train['Label'] = y_train

from pycaret.classification import *
exp1 = setup(train, train_size=0.8, target='Label', use_gpu=True)

TypeError: Cannot compare types 'ndarray(dtype=object)' and 'float'
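The TypeError comes from wrapping the SciPy sparse matrix directly in pd.DataFrame, which yields a single object-dtype column instead of one numeric column per feature. A sketch of two possible fixes, assuming scikit-learn >= 1.0 for get_feature_names_out (the dense option is only viable for a small vocabulary):

# Option 1: a sparse-backed DataFrame, one column per TF-IDF feature
train = pd.DataFrame.sparse.from_spmatrix(X_train_soy,
                                          columns=tfidf.get_feature_names_out())
# Option 2: densify the matrix
# train = pd.DataFrame(X_train_soy.toarray(), columns=tfidf.get_feature_names_out())
train['Label'] = np.array(Y_train, dtype=float)

Either way, setup() then sees ordinary numeric columns rather than an object column of sparse rows.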

Uploading models with custom forward functions to the huggingface model hub?

Is it possible to upload a model with a custom forward function to the huggingface model hub?
I can see how to do it when the model has a standard architecture, but not how to customise the forward function and still upload it.
Yes, absolutely. You can create your own model with any number of added layers/customisations and upload it to the model hub. Let me present a demo that walks through the entire process.
Uploading custom model to 🤗 model hub
import tqdm
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, BertConfig
from transformers import AdamW
from transformers import get_scheduler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# setting device to `cuda` if gpu exists
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# initialising the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
bert = AutoModel.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
def tokenize_function(examples):
    '''Function for tokenizing raw texts'''
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
# downloading IMDB dataset from 🤗 `datasets`
raw_datasets = load_dataset("imdb")
# Running tokenizing function on the raw texts
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# for simplicity I have taken only the train split
tokenized_datasets = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# Now let's create the torch Dataset class
class IMDBClassificationDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        d = self.dataset[idx]
        ids = torch.tensor(d['input_ids'])
        mask = torch.tensor(d['attention_mask'])
        label = torch.tensor(d['label'])
        return ids, mask, label
# Preparing the dataset and the Dataloader
dataset = IMDBClassificationDataset(tokenized_datasets)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=8)
# Now let's create a custom BERT model
class CustomBert(transformers.PreTrainedModel):
    '''Custom model class
    ------------------
    The trick is to inherit not from `nn.Module` but from `transformers.PreTrainedModel`,
    and to pass the model config during initialisation.'''
    def __init__(self, bert):
        super(CustomBert, self).__init__(config=BertConfig.from_pretrained('google/bert_uncased_L-2_H-128_A-2'))
        self.bert = bert
        self.l1 = nn.Linear(128, 1)
        self.do = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, sent_id, mask):
        '''For simplicity only one linear head is added; you can build any network you want'''
        bert_out = self.bert(sent_id, attention_mask=mask)
        o = bert_out.last_hidden_state[:, 0, :]  # [CLS] token representation
        o = self.do(o)
        o = self.relu(o)
        o = self.l1(o)
        o = self.sigmoid(o)
        return o
# initialising model, loss and optimizer
model = CustomBert(bert)
model.to(device)
criterion = torch.nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)
# setting epochs, num_training_steps and the lr_scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
# training loop
model.train()
for epoch in tqdm.tqdm(range(num_epochs)):
    for batch in train_dataloader:
        ids, masks, labels = batch
        labels = labels.type(torch.float32)
        o = model(ids.to(device), masks.to(device))
        loss = criterion(torch.squeeze(o), labels.to(device))
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
# save the tokenizer and the model in `./test-model/` directory
tokenizer.save_pretrained("./test-model/")
model.save_pretrained("./test-model/", push_to_hub=False)
Now create a new model repo on 🤗 and push all the contents of the test-model directory to the 🤗 model hub.
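If you prefer not to upload manually, recent transformers versions can also push programmatically; a minimal sketch, assuming you have already authenticated with huggingface-cli login ("test-model" is the repo name from this demo):

# pushes the model weights/config and the tokenizer files to the hub
model.push_to_hub("test-model")
tokenizer.push_to_hub("test-model")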
To sanity-check the uploaded model, you can run it through 🤗's pipeline and see if anything is wrong.
from transformers import pipeline
# since this is classification, pass `text-classification` as the task
classifier = pipeline('text-classification', model='tanmoyio/test-model')
classifier("This movie was superb")
It will output something like this
[{'label': 'LABEL_0', 'score': 0.5571992993354797}]
This is a real demo, check the model here - https://huggingface.co/tanmoyio/test-model. Let me know if you have further questions.

Why fine-tuning BERT mlm on specific domain doesn't work? What am I doing wrong?

I'm new to this. I'm trying to fine-tune a BERT MLM (bert-base-uncased) on a target domain. Unfortunately, the results are not good.
Before fine-tuning, the pre-trained model fills the mask of a sentence with words in line with human expectations.
E.g. Wikipedia is a free online [MASK], created and edited by volunteers around the world.
The most probable predictions are encyclopedia (score: 0.650) and resource (score: 0.087).
After fine-tuning, the predictions are completely wrong; often stopwords are predicted instead.
E.g. Wikipedia is a free online [MASK], created and edited by volunteers around the world.
The most probable predictions are the (score: 0.052) and be (score: 0.033).
I experimented with different numbers of epochs (from 1 to 10) and different datasets (from a few MB to a few GB) but got the same issue. What am I doing wrong? I'm using the following code; I hope you can help me.
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
config = AutoConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
model = AutoModelForMaskedLM.from_config(config) # BertForMaskedLM.from_pretrained(path)
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                file_path="data/english/corpora.txt",
                                block_size=512)
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(output_dir="output/models/english",
                                  overwrite_output_dir=True,
                                  num_train_epochs=5,
                                  per_device_train_batch_size=8,  # per_gpu_train_batch_size is deprecated
                                  save_steps=22222222,
                                  save_total_limit=2)
trainer = Trainer(model=model, args=training_args, data_collator=data_collator, train_dataset=dataset)
trainer.train()
trainer.save_model("output/models/english")
from transformers import pipeline
# Initialize MLM pipeline
mlm = pipeline('fill-mask', model="output/models/english", tokenizer="output/models/english")
# Get mask token
mask = mlm.tokenizer.mask_token
# Get result for particular masked phrase
phrase = f'Wikipedia is a free online {mask}, created and edited by volunteers around the world'
result = mlm(phrase)
# Print result
print(result)
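A likely culprit, offered as a hypothesis rather than a confirmed answer: AutoModelForMaskedLM.from_config(config) builds the architecture with randomly initialised weights, so this setup trains BERT from scratch on a comparatively small corpus, and predicting high-frequency stopwords is exactly what an undertrained language model does. A sketch of the fix, starting from the released weights instead:

from transformers import AutoModelForMaskedLM
# from_pretrained loads the published weights; from_config only random-initialises the architecture
model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased', output_hidden_states=True)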

Can't get train and test sets

I applied k-fold cross-validation to split the data into train and test sets.
But when I try to get the train and test sets I get this error:
AttributeError: 'numpy.ndarray' object has no attribute 'iloc'
Thanks for your help.
y = df_dummies['Churn'].values
X = df_dummies.drop(columns=['Churn'])

from sklearn.preprocessing import MinMaxScaler
features = X.columns.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
X.columns = features

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True)
for train, test in kf.split(X):
    print("%s %s" % (train, test))
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

from sklearn.linear_model import LogisticRegression
CLF = LogisticRegression().fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
      .format(CLF.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
      .format(CLF.score(X_test, y_test)))
NameError: name 'y_train' is not defined
The issue is that df_dummies['Churn'].values returns a NumPy array, not a pandas object, and you are then calling an attribute that arrays don't have: iloc belongs to pandas.DataFrame and pandas.Series.
Use y = df_dummies['Churn'] instead.
Reference: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
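A minimal sketch of the two equivalent fixes (keep y as a pandas Series so .iloc works, or index the NumPy array positionally):

# Option 1: keep y as a Series, so y.iloc[...] works
y = df_dummies['Churn']
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Option 2: keep the NumPy array and index it directly
y = df_dummies['Churn'].values
y_train, y_test = y[train_index], y[test_index]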
PS: I don't know how these types of questions get migrated to a sister site. Perhaps someone who knows could migrate this to Cross Validated.
