I am working on a classification model. My data has a Description column, which I tokenize with the BERT tokenizer.
def tokenization_and_encoding(data, model_name, independent_col, target_col,
                              max_length=256, padding=True):
    """Tokenize one text column with a BERT tokenizer and return ids and labels.

    Args:
        data: Object indexable by column name (``data[col]`` must be iterable).
        model_name: Name or path handed to ``BertTokenizerFast.from_pretrained``.
        independent_col: Name of the column holding the text to encode.
        target_col: Name of the column holding the labels.
        max_length: Upper bound on the number of tokens per text; longer
            texts are truncated. Defaults to 256.
        padding: Forwarded to the tokenizer. The default ``True`` pads only
            to the longest text of *this* call, so the width of the returned
            rows depends on the data. Pass ``"max_length"`` to always get
            rows of exactly ``max_length`` ids, which keeps training and
            prediction inputs the same width.

    Returns:
        A ``(input_ids, labels)`` tuple: the tokenizer's ``input_ids`` and
        the labels as a list.
    """
    tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
    texts = list(data[independent_col])
    labels = list(data[target_col])
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=padding,
        max_length=max_length,
    )
    return encodings['input_ids'], labels
model_name = 'uncased_L-12_H-768_A-12/'
data = data[['Description', 'Target']]
# Drop rows without text. Only 'Description' and 'Target' survive the column
# selection above, so the null filter has to use 'Description'.
data = data[data['Description'].notnull()]
# Encode the text column; the classifier is fitted on the raw token ids.
train_encodings, train_labels = tokenization_and_encoding(
    data, model_name, 'Description', 'Target')
calibrated_svc = CalibratedClassifierCV(LinearSVC(), method='sigmoid')
calibrated_svc.fit(train_encodings, train_labels)
# Width of every encoded row; the author observed 51 for their data. The
# fitted model only accepts inputs of exactly this width.
length_of_encoding = len(train_encodings[0])
# The context manager closes the file handle even if pickling fails.
with open(r".\model\bert__" + str(length_of_encoding) + ".pkl", 'wb') as model_file:
    pickle.dump(calibrated_svc, model_file, protocol=4)
#########################################################################
##########################Prediction#####################################
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
# get test text
test_text = list(test_data[independent_col])
# Number of features the saved model was fitted on; prediction inputs must
# have exactly this width.
encoding_length = 51
# Encode text. padding='max_length' pads short texts up to encoding_length and
# truncation=True cuts longer ones down to it, so every row has the same width
# no matter how long the longest test text is. (Padding to the longest text in
# the batch, then only extending rows shorter than 51, left longer rows as-is.)
test_encodings = tokenizer(
    test_text,
    truncation=True,
    padding='max_length',
    max_length=encoding_length,
)
test_encodings = test_encodings['input_ids']
# load model
# NOTE(review): this path differs from the one the training step writes to --
# confirm it points at the model that was fitted with encoding_length features.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
Pkl_Filename = r'\model_new\bert_model.pkl'
with open(Pkl_Filename, 'rb') as file:
    Pickled_svc_Model = pickle.load(file)
# predict
predict_svc_test_pred_bbc = pd.DataFrame(Pickled_svc_Model.predict(test_encodings))
Running the prediction module throws the following error:
ValueError: Number of features of the model must match the input. Model n_features is 51 and input n_features is 55.
When I checked `test_encodings`, each encoding has length 55.
My training data has 105 records and my test data has 5 records.
I am not able to figure out what I need to fix.
Related
I'm trying to fine-tune a T5 model on my own dataset for grammatical error correction, but when I run the model I keep getting all 0's for my results. I'm following the Hugging Face translation tutorial.
enter image description here
I think it's a problem with the preprocess function, but I can't seem to figure out why.
# Text prepended to every source sentence before tokenization (empty here).
prefix = ''
# Token-count caps passed to the tokenizer for inputs and for targets.
max_input_length = 128
max_target_length = 128
# NOTE(review): source_lang / target_lang are not read by the preprocessing
# code shown here -- confirm whether anything else uses them.
source_lang = "ar"
target_lang = "ar"
def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs.

    Reads the ``"original"`` and ``"corrected"`` entries of ``examples``,
    tokenizes the (prefixed) originals as model inputs and the corrected
    sentences as targets, and returns the input encoding with the target
    ids stored under ``"labels"``.
    """
    sources = [prefix + sentence for sentence in examples["original"]]
    references = list(examples["corrected"])

    model_inputs = tokenizer(sources, max_length=max_input_length, truncation=True)

    # Targets are tokenized with the tokenizer switched to target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            references, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs
I trained a machine translation model using huggingface library:
def compute_metrics(eval_preds):
    """Decode predictions and labels, then report BLEU and mean output length.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. When the predictions
            are a tuple, only its first element is used.

    Returns:
        A dict with keys ``"bleu"`` and ``"gen_len"``, both rounded to four
        decimal places.
    """
    preds, labels = eval_preds
    preds = preds[0] if isinstance(preds, tuple) else preds
    pad_id = tokenizer.pad_token_id

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100 entries in the labels cannot be decoded, so swap in the pad id.
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    # Length of a prediction = number of ids that are not the pad id.
    gen_len = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])

    scores = {"bleu": bleu["score"], "gen_len": gen_len}
    return {key: round(value, 4) for key, value in scores.items()}
# Wire the model, training arguments, tokenized splits, collator, tokenizer
# and metric callback into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

# Save the trained model to the output directory.
model_dir = './models/'
trainer.save_model(model_dir)
The code above is taken from this Google Colab notebook. After the training, I can see the trained model is saved to the folder models and the metric is calculated. Now I want to load the trained model and do the prediction on a new dataset, here is what I tried:
# Load the CSV file as a dataset and rebuild the tokenizer from the checkpoint.
dataset = load_dataset('csv', data_files='data/training_data.csv')
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Tokenize the test dataset
# NOTE(review): `train_test` and `preprocess_function_v2` are not defined in
# the code shown here -- confirm where they come from (`dataset` above is
# loaded but never used).
tokenized_datasets = train_test.map(preprocess_function_v2, batched=True)
test_dataset = tokenized_datasets['test']
# Reload the fine-tuned model from the 'models' directory.
model = AutoModelForSeq2SeqLM.from_pretrained('models')
# NOTE(review): this call is the one reported to fail with
# "'Dataset' object has no attribute 'size'"; the model presumably expects
# tensors rather than a Dataset object -- confirm and tokenize to tensors
# before calling the model or model.generate.
model(test_dataset)
It threw the following error:
*** AttributeError: 'Dataset' object has no attribute 'size'
I tried the evaluate() function as well, but it said:
*** torch.nn.modules.module.ModuleAttributeError: 'MarianMTModel' object has no attribute 'evaluate'
And the function eval only prints the configuration of the model.
What is the proper way to evaluate the performance of the trained model on a new dataset?
Turned out that the prediction can be produced using the following code:
# Tokenize the questions into padded, truncated PyTorch tensors and move the
# batch to the GPU.
inputs = tokenizer(
    questions,
    padding=True,
    truncation=True,
    max_length=max_input_length,
    return_tensors='pt',
).to('cuda')
# Generate output token ids for the whole batch.
translation = model.generate(**inputs)
I have been working on an NER problem for a Vietnamese dataset with 15 tags in IO format. I have been using the AllenNLP Interpret toolkit with my model, but I cannot configure it completely.
I used the pre-trained language model "xlm-roberta-base" from Hugging Face. I concatenate the last 4 BERT layers and pass the result through a linear layer. The model architecture is shown in the source below.
class BaseBertSoftmax(nn.Module):
    """Token classifier over the concatenation of four hidden layers.

    The wrapped ``model`` is called with ``input_ids`` and ``attention_mask``;
    the last four entries of ``outputs[1]`` are concatenated on the feature
    axis, passed through dropout and projected to ``num_labels`` scores per
    token by a linear layer.

    Args:
        model: Callable encoder accepting ``input_ids`` and ``attention_mask``
            keyword arguments.
        drop_out: Dropout probability applied before the classifier.
        num_labels: Number of output classes per token.
        hidden_size: Feature width of one hidden layer. Defaults to 768, the
            value that was previously hard-coded; pass the encoder's own
            width when it differs.
    """

    def __init__(self, model, drop_out, num_labels, hidden_size=768):
        super().__init__()
        self.num_labels = num_labels
        self.model = model
        self.dropout = nn.Dropout(drop_out)
        # Four hidden layers are concatenated, hence 4 * hidden_size inputs.
        self.classifier = nn.Linear(4 * hidden_size, num_labels)

    def forward_custom(self, input_ids, attention_mask=None,
                       labels=None, head_mask=None):
        """Compute per-token logits and, when labels are given, the loss.

        Args:
            input_ids: Token ids passed to the wrapped model.
            attention_mask: Optional mask; when given together with
                ``labels``, only positions where the mask equals 1 enter
                the loss.
            labels: Optional target class ids. Targets equal to 0 are
                ignored by the loss (``ignore_index=0``).
            head_mask: Accepted for interface compatibility; not used.

        Returns:
            ``(logits, *extras)`` without labels, or
            ``(loss, logits, *extras)`` with labels, where ``extras`` is
            whatever the wrapped model returned from index 2 onwards.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # NOTE(review): assumes outputs[1] is the sequence of per-layer hidden
        # states -- confirm the wrapped model is configured to return them at
        # index 1 (some encoders put a pooled output there instead).
        hidden_states = outputs[1]
        sequence_output = torch.cat(
            [hidden_states[-k] for k in range(1, 5)], dim=-1)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)  # bsz, seq_len, num_labels

        # Keep any extra outputs of the wrapped model after the logits.
        outputs = (logits,) + outputs[2:]

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            if attention_mask is not None:
                # Restrict the loss to positions the attention mask marks as 1.
                active_loss = attention_mask.view(-1) == 1
                loss = loss_fct(flat_logits[active_loss], flat_labels[active_loss])
            else:
                loss = loss_fct(flat_logits, flat_labels)
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)
What steps do I have to take to integrate this model to AllenNLP Interpret?
Could you please help me with this problem?
I'm currently injecting 77 documents into a gensim model by reading them from a database with a first script, and I save the document on the file system.
I then load another document to check the similarity with a vector.
def read_corpus_bdd(cursor, tokens_only=False):
    """Yield one preprocessed document per row of ``cursor``.

    Each row is unpacked as ``(url_id, url_label, contenu)`` and ``contenu``
    is tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        cursor: Iterable of 3-item rows.
        tokens_only: When true, yield the bare token list. Otherwise yield a
            ``TaggedDocument`` tagged with the integer form of ``url_id``
            and print that tag.
    """
    for url_id, url_label, contenu in cursor:
        tokens = gensim.utils.simple_preprocess(contenu)
        if tokens_only:
            yield tokens
            continue
        # For training data, tag each document with its integer URL id.
        tag = int(str(url_id))
        yield gensim.models.doc2vec.TaggedDocument(tokens, [tag])
        print (tag)
# Materialize every tagged document produced from the database cursor.
targetContentCorpus = list(read_corpus_bdd(cursor))
# Doc2Vec hyper-parameters: 40-dimensional vectors, min_count of 2, 40 epochs.
model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=40)
# Build a vocabulary
model.build_vocab(targetContentCorpus)
###############################################################################
# Train on the same corpus, using the counts and epochs stored on the model.
model.train(targetContentCorpus, total_examples=model.corpus_count, epochs=model.epochs)
# Generate the file name under which the model is saved.
from datetime import date
# NOTE(review): os.getenv returns None when MODEL_BASE_SAVE is unset, which
# would make this concatenation raise TypeError -- confirm the variable is
# always set. `projetId` is not defined in the code shown here.
pathModelSave=os.getenv("MODEL_BASE_SAVE") +'/projet_'+ str(projetId)
When I infer the vector:
# Infer a vector for the first test document.
inferred_vector = model.infer_vector(test_corpus[0])
# Rank against the stored document vectors, asking for as many results as
# len(model.docvecs) reports.
# NOTE(review): with plain-int document tags, gensim presumably sizes the
# doc-vector store by the largest tag (max tag + 1) rather than by the number
# of training documents -- that would explain a count above the number of
# ingested documents; confirm against the gensim Doc2Vec documentation.
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
len(sims) #output 335
So I don't understand where this 335 comes from, and also why
# First field of the top-ranked result (presumably the document tag).
sims[0][0]
returns an ID other than the one tagged in the yield section.
enter code here
I have been trying to build an empirical codon substitution matrix given a multiple sequence alignment in fasta format using Biopython.
It appears to be relatively straightforward for single-nucleotide substitution matrices using the AlignInfo module when the aligned sequences have the same length. Here is what I managed to do using Python 2.7:
#!/usr/bin/env python
import os
import argparse
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio import SubsMat
import sys
# Script metadata.
version = "0.0.1 (23.04.20)"
name = "Aln2SubMatrix.py"

# Command-line interface: the input alignment is mandatory, the output file
# name is optional.
parser = argparse.ArgumentParser(
    description="Outputs a codon substitution matrix given a multi-alignment in FastaFormat. Will raise error if alignments contain dots (\".\"), so replace those with dashes (\"-\") beforehand (e.g. using sed)",
)
parser.add_argument(
    '-i', '--input',
    required=True,
    help="(aligned) input fasta",
)
parser.add_argument(
    '-o', '--output',
    help="Output filename (default = <Input-file>.codonSubmatrix",
)
args = parser.parse_args()

# Without an explicit output name, derive one from the input name.
if not args.output:
    args.output = args.input + ".codonSubmatrix"
def main():
    """Write the replacement-count matrix of the input alignment to a file.

    Reads the FASTA alignment named by ``args.input``, builds the replacement
    dictionary from its summary info, wraps it in a ``SubsMat.SeqMat`` and
    writes the matrix's string form to ``args.output``. Prints a short
    completion message to stderr.
    """
    # The context managers close both files even if parsing or writing fails.
    with open(args.input, "r") as infile, open(args.output, "w") as outfile:
        align = AlignIO.read(infile, "fasta")
        summary_align = AlignInfo.SummaryInfo(align)
        replace_info = summary_align.replacement_dictionary()
        mat = SubsMat.SeqMat(replace_info)
        # Same text as ``print >> outfile, mat`` but valid on Python 2 and 3.
        outfile.write(str(mat) + "\n")
    sys.stderr.write("\nfinished\n")


main()
Using a multiple sequence alignment file in fasta format with sequences of the same length (aln.fa), the output is a half-matrix corresponding to the number of nucleotide substitutions observed in the alignment (note that gaps (-) are allowed):
python Aln2SubMatrix.py -i aln.fa
- 0
a 860 232
c 596 75 129
g 571 186 75 173
t 892 58 146 59 141
- a c g t
What I am aiming to do is to compute a similar empirical substitution matrix, but for all nucleotide triplets (codons) present in a multiple sequence alignment.
I have tried to tweak the _pair_replacement function of the AlignInfo module in order to accept nucleotide triplets by changing:
line 305 to 308
for residue_num in range(len(seq1)):
residue1 = seq1[residue_num]
try:
residue2 = seq2[residue_num]
to
for residue_num in range(0, len(seq1), 3):
residue1 = seq1[residue_num:residue_num+3]
try:
residue2 = seq2[residue_num:residue_num+3]
At this stage it can retrieve the codons from the alignment but complains about the alphabet (the module only accepts single character alphabet?).
Note that
(i) I would like to get a substitution matrix that accounts for the three possible reading frames
Any help is highly appreciated.