Problem with following along with notebook on kaggle "max() received an invalid combination of arguments" issue - huggingface-transformers

For study purposes I am following along with a very popular notebook for sentiment classification with BERT.
Kaggle notebook for sentiment classification with BERT
But instead of training the model as in the notebook, I just load another model:
MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
and want to test it on my data, to get a heatmap and accuracy score like at the end of this notebook.
But when I reach the evaluation step I get
TypeError: max() received an invalid combination of arguments - got (SequenceClassifierOutput, dim=int), but expected one of:
* (Tensor input)
* (Tensor input, Tensor other, *, Tensor out)
* (Tensor input, int dim, bool keepdim, *, tuple of Tensors out)
* (Tensor input, name dim, bool keepdim, *, tuple of Tensors out)
in evaluation function where it says
_, preds = torch.max(outputs, dim=1)
I tried to change this to
_, preds = torch.max(torch.tensor(outputs), dim=1)
But then I got another issue:
RuntimeError: Could not infer dtype of SequenceClassifierOutput
the method for evaluation looks like this:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` and return ``(accuracy, mean_loss)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It may return either a raw logits tensor or an output object that
            exposes a ``logits`` attribute (e.g. ``SequenceClassifierOutput``).
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"``, ``"attention_mask"`` and ``"targets"``.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar
            tensor.
        device: Device every batch tensor is moved to.
        n_examples: Number of examples, used as the accuracy denominator.

    Returns:
        Tuple of the accuracy as a double-precision tensor and the mean of the
        per-batch losses (``nan`` when ``data_loader`` yields no batches).
    """
    model = model.eval()
    losses = []
    # Keep the counter a tensor from the start so the final division also
    # works when the loader is empty (a plain ``0`` has no ``.double()``).
    correct_predictions = torch.zeros((), dtype=torch.double, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get model outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            # ``torch.max`` and the loss function need a tensor. Passing the
            # whole output object is what raises "max() received an invalid
            # combination of arguments", so unwrap ``.logits`` when present
            # and fall back to the value itself for models returning a tensor.
            logits = getattr(outputs, "logits", outputs)

            _, preds = torch.max(logits, dim=1)
            loss = loss_fn(logits, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions / n_examples, mean_loss
And the outputs object itself in the code above looks like this:
SequenceClassifierOutput(loss=None, logits=tensor([[ 2.2241, 1.2025, 0.1638, -1.4620, -1.6424],
[ 3.1578, 1.3957, -0.1131, -1.8141, -1.9536],
[ 0.7273, 1.7851, 1.1237, -0.9063, -2.3822],
[ 0.9843, 0.9711, 0.5067, -0.7553, -1.4547],
[-0.4127, -0.8895, 0.0572, 0.3550, 0.7377],
[-0.4885, 0.6933, 0.8272, -0.3176, -0.7546],
[ 1.3953, 1.4224, 0.7842, -0.9143, -2.2898],
[-2.4618, -1.2675, 0.5480, 1.4326, 1.2893],
[ 2.5044, 0.9191, -0.1483, -1.4413, -1.4156],
[ 1.3901, 1.0331, 0.4259, -0.8006, -1.6999],
[ 4.2252, 2.6539, -0.0392, -2.6362, -3.3261],
[ 1.9750, 1.8845, 0.6779, -1.3163, -2.5570],
[ 5.1688, 2.2360, -0.6230, -2.9657, -2.9031],
[ 1.1857, 0.4277, -0.1837, -0.7163, -0.6682],
[ 2.1133, 1.3829, 0.5750, -1.3095, -2.2234],
[ 2.3258, 0.9406, -0.0115, -1.1673, -1.6775]], device='cuda:0'), hidden_states=None, attentions=None)
How can I make it work?
Kind regards

Related

RayTune HyperOptSearch - fitting resampling into pipeline throws error: All intermediate steps should be transformers and implement fit and transform

I'm getting started with Raytune and trying to set up a HyperOptSearch with imbalanced data.
Fitting a pipeline without RandomOverSampler works fine, but when I add that in, I get the error:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough'
Code sample here, and works fine without the RandomOverSampler step:
# Hyper-parameter search space. Every key carries the ``clf__`` prefix so that
# ``set_params`` routes the sampled value to the pipeline step named 'clf'.
cfg_hgb = {
    'clf__learning_rate': tune.loguniform(0.001, 0.8),
    'clf__max_leaf_nodes': tune.randint(2, 20),
    'clf__min_samples_leaf': tune.randint(50, 500),
    'clf__max_depth': tune.randint(2, 15),
    'clf__l2_regularization': tune.loguniform(0.001, 1000),
    'clf__max_iter': tune.choice([800]),
}

# HyperOpt search algorithm that maximises the reported "mean_auc" metric.
# No ``space`` argument is given here; it was deliberately left disabled.
hyperopt = HyperOptSearch(
    metric="mean_auc",
    mode="max",
    points_to_evaluate=None,
    n_initial_points=20,
    random_state_seed=RANDOM_STATE,
    gamma=0.25,
)
def train_hgb(config):
    """Cross-validate an over-sampled HistGradientBoosting pipeline for one trial.

    Args:
        config: Mapping of pipeline parameters sampled by the tuner; it is
            applied to the pipeline with ``set_params`` (keys such as
            ``clf__learning_rate``).

    Reports:
        ``mean_auc`` and ``std_auc`` of the per-fold ROC AUC scores through
        ``session.report``.
    """
    # A sampler such as RandomOverSampler implements ``fit_resample`` rather
    # than ``fit``/``transform``, so sklearn's Pipeline rejects it with
    # "All intermediate steps should be transformers ...". imblearn's Pipeline
    # accepts samplers as intermediate steps.
    from imblearn.pipeline import Pipeline as ImbPipeline

    # LOAD DATA
    X, y, nominal, ordinal, numeric = load_clean_data()

    # LOAD TRANSFORMERS
    prep = Preprocessor(nominal, ordinal, numeric)

    # CHOOSE CV STRATEGY
    splitter = StratifiedKFold(CV, random_state=RANDOM_STATE, shuffle=True)

    # TRAIN
    scores = []
    for train_ind, val_ind in splitter.split(X, y):
        hgb_os = ImbPipeline(steps=[
            ('coltrans', prep.transformer_ord),
            ('ros', RandomOverSampler(random_state=RANDOM_STATE)),
            ('clf', HistGradientBoostingClassifier(
                categorical_features=prep.cat_feature_mask,
                random_state=RANDOM_STATE)),
        ])
        hgb_os.set_params(**config)
        hgb_os.fit(X.iloc[train_ind], y[train_ind])
        # NOTE(review): AUC is computed from hard ``predict`` labels, as in the
        # original; scoring ``predict_proba`` output may be what is intended.
        y_pred = hgb_os.predict(X.iloc[val_ind])
        scores.append(
            roc_auc_score(y_true=y[val_ind], y_score=y_pred, average="macro")
        )

    # REPORT SCORES
    fold_scores = np.array(scores)
    session.report({
        'mean_auc': fold_scores.mean(),
        'std_auc': fold_scores.std(),
    })
# Tune ``train_hgb`` over ``cfg_hgb``: 10 samples driven by the HyperOpt search
# algorithm, with the run named "experiment_name" and local_dir ./results/hgb.
search_settings = tune.TuneConfig(
    num_samples=10,
    search_alg=hyperopt,
)
run_settings = RunConfig(
    name="experiment_name",
    local_dir="./results/hgb",
)
tuner = tune.Tuner(
    trainable=train_hgb,
    param_space=cfg_hgb,
    tune_config=search_settings,
    run_config=run_settings,
)
results = tuner.fit()
Whereas if using ray.tune.sklearn.TuneSearchCV, RandomOverSampler works fine in the pipeline:
# Search space for the classifier itself. There is no step prefix because the
# parameters are handed straight to TuneSearchCV, which wraps the estimator.
hgb_tune = {
    'learning_rate': tune.loguniform(0.001, 0.15),
    'max_leaf_nodes': tune.randint(2, 4),
    'min_samples_leaf': tune.randint(160, 300),
    'max_depth': tune.randint(2, 7),
    'l2_regularization': tune.loguniform(5, 1000),
    'max_iter': tune.choice([400]),
}

# The tuned classifier is the final pipeline step, placed after the
# column transformer and the over-sampler.
tuned_clf = TuneSearchCV(
    HistGradientBoostingClassifier(
        categorical_features=prep.cat_feature_mask,
        random_state=RANDOM_STATE,
    ),
    param_distributions=hgb_tune,
    cv=CV,
    scoring=SCORER,
    verbose=VERBOSE,
    search_optimization="bayesian",
    n_trials=N_TRIALS,
)
hgb_os = Pipeline(steps=[
    ('trans', prep.transformer_ord),
    ('ros', RandomOverSampler(random_state=RANDOM_STATE)),
    ('clf', tuned_clf),
])
results, params = fit_eval(hgb_os, X_train, X_test, y_train, y_test)
I understand that probably tune is expecting .fit_transform for intermediate steps whereas RandomOverSampler uses .fit_resample. Also note that RandomOverSampler requires imblearn.pipeline.Pipeline rather than sklearn.pipeline.Pipeline so perhaps therein lies the problem.
Is there a way to add any form of resampling with the current Tuner API? Or do I need to part out the pipeline and resample it first outside of this loop?
Thanks in advance.

Properly evaluate a test dataset

I trained a machine translation model using huggingface library:
def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)``. When ``preds`` is a tuple only
            its first element is used.

    Returns:
        Dict with ``"bleu"`` and ``"gen_len"``, both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded, so swap those label positions for the pad id.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    pred_texts, label_texts = postprocess_text(pred_texts, label_texts)

    bleu = metric.compute(predictions=pred_texts, references=label_texts)["score"]
    # Length of each prediction, counted as its non-padding tokens.
    gen_len = np.mean(
        [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    )

    return {"bleu": round(bleu, 4), "gen_len": round(gen_len, 4)}
# Train on the 'train' split, evaluate on the 'test' split with
# ``compute_metrics``, then save the trained model to ./models/.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

model_dir = './models/'
trainer.save_model(model_dir)
The code above is taken from this Google Colab notebook. After the training, I can see the trained model is saved to the folder models and the metric is calculated. Now I want to load the trained model and do the prediction on a new dataset, here is what I tried:
# Load the CSV data and the tokenizer for the original checkpoint.
dataset = load_dataset('csv', data_files='data/training_data.csv')
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Tokenize the test dataset
# NOTE(review): `train_test` is not defined in this snippet and `dataset`
# above is never used - presumably `train_test` is a train/test split of
# `dataset`; confirm.
tokenized_datasets = train_test.map(preprocess_function_v2, batched=True)
test_dataset = tokenized_datasets['test']
# Reload the trained model from the directory it was saved to.
model = AutoModelForSeq2SeqLM.from_pretrained('models')
# Calling the model directly on the Dataset object is the call that raises
# the "'Dataset' object has no attribute 'size'" error quoted below.
model(test_dataset)
It threw the following error:
*** AttributeError: 'Dataset' object has no attribute 'size'
I tried the evaluate() function as well, but it said:
*** torch.nn.modules.module.ModuleAttributeError: 'MarianMTModel' object has no attribute 'evaluate'
And the function eval only prints the configuration of the model.
What is the proper way to evaluate the performance of the trained model on a new dataset?
Turned out that the prediction can be produced using the following code:
inputs = tokenizer(
questions,
max_length=max_input_length,
truncation=True,
return_tensors='pt',
padding=True).to('cuda')
translation = model.generate(**inputs)

How to test a model before fine-tuning in Pytorch Lightning?

Doing things on Google Colab.
transformers: 4.10.2
pytorch-lightning: 1.2.7
import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl
# Four toy samples, each a dict holding a 2-element 'data' tensor and a scalar
# 'labels' tensor, to show how DataLoader collates dict samples into batches.
dataset_for_loader = [
    {'data': torch.tensor([0, 1]), 'labels': torch.tensor(0)},
    {'data': torch.tensor([2, 3]), 'labels': torch.tensor(1)},
    {'data': torch.tensor([4, 5]), 'labels': torch.tensor(2)},
    {'data': torch.tensor([6, 7]), 'labels': torch.tensor(3)},
]
loader = DataLoader(dataset_for_loader, batch_size=2)

# Print every batch. The loop body must be indented to be valid Python.
for idx, batch in enumerate(loader):
    print(f'# batch {idx}')
    print(batch)
category_list = [
'dokujo-tsushin',
'it-life-hack',
'kaden-channel',
'livedoor-homme',
'movie-enter',
'peachy',
'smax',
'sports-watch',
'topic-news'
]
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
max_length = 128
# Build one encoded sample per article; the label is the index of the
# article's category in ``category_list``.
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    # ./text holds many articles grouped by category. They are plain-text
    # files whose content begins on the fourth line.
    for file in glob.glob(f'./text/{category}/{category}*'):
        # Use a context manager so each file handle is closed right away
        # instead of being leaked once per article.
        with open(file) as article:
            lines = article.read().splitlines()
        text = '\n'.join(lines[3:])
        encoding = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        encoding['labels'] = label
        encoding = {k: torch.tensor(v) for k, v in encoding.items()}
        dataset_for_loader.append(encoding)
# Kept so that any existing reference to SEED still resolves.
SEED = lambda: 0.0

# Shuffle with a dedicated, fixed-seed generator so the split is reproducible.
# The former ``random.shuffle(dataset_for_loader, SEED)`` did not shuffle: with
# a "random" function that always returns 0.0, every element is swapped with
# index 0, which merely rotates the list by one position. The samples then
# stay in category order and the train/val/test splits cover different
# categories. The two-argument form was also removed in Python 3.11.
random.Random(0).shuffle(dataset_for_loader)

# 60% train / 20% validation / remainder test.
n = len(dataset_for_loader)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_train + n_val]
dataset_test = dataset_for_loader[n_train + n_val:]
dataloader_train = DataLoader(
dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)
class BertForSequenceClassification_pl(pl.LightningModule):
    """LightningModule wrapping a ``BertForSequenceClassification`` model."""

    def __init__(self, model_name, num_labels, lr):
        """Load the pretrained classifier and record the hyper-parameters.

        Args:
            model_name: Name or path passed to ``from_pretrained``.
            num_labels: Number of target classes.
            lr: Learning rate, read back later as ``self.hparams.lr``.
        """
        super().__init__()
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        """Return the model loss for ``batch`` and log it as 'train_loss'."""
        loss = self.bert_sc(**batch).loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the model loss for ``batch`` as 'val_loss'."""
        self.log('val_loss', self.bert_sc(**batch).loss)

    def test_step(self, batch, batch_idx):
        """Log the batch accuracy as 'accuracy'."""
        # Labels are removed from the batch so the model only produces logits.
        labels = batch.pop('labels')
        predicted = self.bert_sc(**batch).logits.argmax(-1)
        hits = (predicted == labels).sum().item()
        self.log('accuracy', hits / labels.size(0))

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with the stored lr."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
# Checkpoint callback: monitor 'val_loss' (lower is better), keep only the
# single best checkpoint, store weights only, and write it under model/.
checkpoint = pl.callbacks.ModelCheckpoint(
monitor='val_loss',
mode='min',
save_top_k=1,
save_weights_only=True,
dirpath='model/',
)
# Trainer on one GPU for at most 10 epochs, using the checkpoint callback.
trainer = pl.Trainer(
gpus=1,
max_epochs=10,
callbacks = [checkpoint]
)
# 9 labels, one per entry in category_list.
model = BertForSequenceClassification_pl(
MODEL_NAME, num_labels=9, lr=1e-5
)
### (a) ###
# I think this is where I am doing fine-tuning
trainer.fit(model, dataloader_train, dataloader_val)
# This scores the model on the test set after fine-tuning.
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
But I am not really sure how to run a test before fine-tuning, so that I can compare the model before and after fine-tuning and show how effective fine-tuning is.
Inserting the following two lines to ### (a) ###:
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
I got this result:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-c8b2c67f2d5c> in <module>()
9
10 # 6-19
---> 11 test = trainer.test(test_dataloaders=dataloader_test)
12 print(f'Accuracy: {test[0]["accuracy"]:.2f}')
13
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in test(self, model, test_dataloaders, ckpt_path, verbose, datamodule)
896 self.verbose_test = verbose
897
--> 898 self._set_running_stage(RunningStage.TESTING, model or self.lightning_module)
899
900 # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _set_running_stage(self, stage, model_ref)
563 the trainer and the model
564 """
--> 565 model_ref.running_stage = stage
566 self._running_stage = stage
567
AttributeError: 'NoneType' object has no attribute 'running_stage'
I noticed that Trainer.fit() can take None as arguments other than model, so I tried this:
trainer.fit(model)
test=trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
The result:
MisconfigurationException: No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined.
Thanks.
The Trainer needs to call its .fit() in order to set up a lot of things, and only then can you call .test() or other methods.
You are right about putting a .fit() just before .test(), but the fit call needs to be a valid one: you have to feed a dataloader/datamodule to it. Since you don't want to do any training/validation in this fit call, just pass limit_[train/val]_batches=0 when constructing the Trainer.
trainer = Trainer(gpus=..., ..., limit_train_batches=0, limit_val_batches=0)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # without fine-tuning
The fit call here will just set things up for you and skip training/validation, and then the testing follows. Next time, run the same code but without limit_[train/val]_batches; this will do the fine-tuning for you:
trainer = Trainer(gpus=..., ...)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # with fine-tuning
Clarifying a bit about .fit() taking None for all but model: it's not quite true - you must provide either a DataLoader or a DataModule.

How to distribute and fit h2o model by group in sparklyr

I would like to fit a model by group in h2o using some type of distributed apply function.
I tried the following but it doesn't work. Probably due to the fact I cannot pipe the sc object through.
# Fit one Cox proportional-hazards model per `id` group via spark_apply.
# A comma is required between the function argument and `group_by`;
# without it the call does not parse.
# NOTE(review): `sc` and `predictors` are referenced inside the closure but
# defined outside it - presumably they are not available on the workers; confirm.
df %>%
  spark_apply(
    function(e)
      h2o.coxph(x = predictors,
                event_column = "event",
                stop_column = "time_to_next",
                training_frame = as_h2o_frame(sc, e, strict_version_check = FALSE)),
    group_by = "id"
  )
I receive a pretty generic spark error like this:
error : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 23.0 failed 4 times, most recent failure: Lost task 0.3 in stage 23.0 :
I'm not sure you can return an entire H2OCoxPH model from sparklyr::spark_apply(): the error is "no method for coercing this S4 class to a vector" if you set the fetch_result_as_sdf argument to FALSE, and "cannot coerce class ‘structure("H2OCoxPHModel", package = "h2o")’ to a data.frame" if it is set to TRUE.
But if you can make your own vector or dataframe from the relevant parts of the model, I think you can do it.
Here I'll use a sample Cox Proportional Hazards file from H2O Docs Cox Proportional Hazards (CoxPH) and I'll use group_by = "surgery".
heart_hf <- h2o::h2o.importFile("http://s3.amazonaws.com/h2o-public-test-data/smalldata/coxph_test/heart.csv")
##### Convert to Spark DataFrame since I assume that is the use case
heart_sf <- sparklyr::copy_to(sc, heart_hf %>% as.data.frame())
##### Use sparklyr::spark_apply() on Spark DataFrame to "distribute and fit h2o model by group"
# Fit one Cox proportional-hazards model per `surgery` group on the Spark
# DataFrame and return only the concordance of each model as a data frame.
sparklyr::spark_apply(
  x = heart_sf,
  f = function(x) {
    h2o::h2o.init()
    heart_coxph <- h2o::h2o.coxph(x = c("age", "year"),
                                  event_column = "event",
                                  start_column = "start",
                                  stop_column = "stop",
                                  ties = "breslow",
                                  training_frame = h2o::as.h2o(x, strict_version_check = FALSE))
    # `model` is a slot of the S4 model object, so it is accessed with `@`.
    # A `#` in its place would comment out the rest of the line.
    return(data.frame(conc = heart_coxph@model$model_summary$concordance))
  },
  columns = list(surgery = "integer", conc = "numeric"),
  group_by = c("surgery"))
# Source: spark<?> [?? x 2]
surgery conc
<int> <dbl>
1 1 0.588
2 0 0.614

Unable to predict when loading a Tensorflow model in Go

I've loaded a Tensorflow model in Go and cannot get predictions - it keeps complaining about shape mismatch - a simple 2d array. Would appreciate an idea here, thank you so much in advance.
Error running the session with input, err: You must feed a value for placeholder tensor 'theoutput_target' with dtype float
[[Node: theoutput_target = Placeholder[_output_shapes=[[?,?]], dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Input tensor being sent is a [][]float32{ {1.0}, }
// Build a 1x1 float32 input tensor.
a := [][]float32{ {1.0}, }
tensor, terr := tf.NewTensor(a)
if terr != nil {
fmt.Printf("Error creating input tensor: %s\n", terr.Error())
return
}
// Feed the tensor to output 0 of the "theinput" operation and fetch output 0
// of the "theoutput_target" operation.
// NOTE(review): the reported error names 'theoutput_target' as a placeholder
// that must itself be fed, so it is presumably not the prediction output -
// confirm the name of the exported output operation.
result, runErr := model.Session.Run(
map[tf.Output]*tf.Tensor{
model.Graph.Operation("theinput").Output(0): tensor,
},
[]tf.Output{
model.Graph.Operation("theoutput_target").Output(0),
},
nil,
)
and the model is generated via Keras and exported to TF using SavedModelBuilder after:
layer_name_input = "theinput"
layer_name_output = "theoutput"
def get_encoder():
    """Return a Sequential encoder: Dense(5) -> ReLU -> Dense(5)."""
    return Sequential([
        Dense(5, input_dim=1),
        Activation("relu"),
        Dense(5, input_dim=1),
    ])
inputs = Input(shape=(1, ), name=layer_name_input)
encoder = get_encoder()
model = encoder(inputs)
model = Activation("relu")(model)
objective = Dense(1, name=layer_name_output)(model)
model = Model(inputs=[inputs], outputs=objective)
model.compile(loss='mean_squared_error', optimizer='sgd')
EDIT - fixed, it was a problem with exporting from Keras to TF (layer names). Pasting the export here, hopefully helpful for someone else:
def export_to_tf(keras_model_path, export_path, export_version, is_functional=False):
    """Re-export a saved Keras model as a TensorFlow SavedModel.

    Args:
        keras_model_path: Path of the Keras model passed to ``load_model``.
        export_path: Base directory; the export is written to
            ``export_path/<export_version>``.
        export_version: Version used as the sub-directory name.
        is_functional: When equal to ``True`` the model is rebuilt with
            ``Model.from_config``, otherwise with ``Sequential.from_config``.

    Prints a descriptor of the exported input and output names and shapes.
    """
    sess = tf.Session()
    K.set_session(sess)
    K.set_learning_phase(0)

    export_path = os.path.join(export_path, str(export_version))

    # Rebuild the model from its config and copy the weights over.
    loaded = load_model(keras_model_path)
    config = loaded.get_config()
    weights = loaded.get_weights()
    model_cls = Model if is_functional == True else Sequential
    model = model_cls.from_config(config)
    model.set_weights(weights)

    with K.get_session() as sess:
        # Pair each tensor with its name minus the ":<index>" suffix.
        inputs = [(t.name.split(":")[0], t) for t in model.inputs]
        outputs = [(t.name.split(":")[0], t) for t in model.outputs]

        signature = predict_signature_def(inputs=dict(inputs),
                                          outputs=dict(outputs))

        input_descriptor = [
            {'name': name, 'shape': tensor.shape.as_list()}
            for name, tensor in inputs
        ]
        output_descriptor = [
            {'name': name, 'shape': tensor.shape.as_list()}
            for name, tensor in outputs
        ]

        builder = saved_model_builder.SavedModelBuilder(export_path)
        builder.add_meta_graph_and_variables(
            sess=sess,
            tags=[tag_constants.SERVING],
            signature_def_map={
                signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature
            })
        builder.save()

        descriptor = dict()
        descriptor["inputs"] = input_descriptor
        descriptor["outputs"] = output_descriptor
        pprint.pprint(descriptor)
There's something strange in your code and error. TensorFlow is complaining about a missing value for the placeholder named 'theoutput_target', while this placeholder is never defined in the code you posted. Instead, your code defines a placeholder whose name is 'theinput'.
Also, I suggest you use a more complete and easier-to-use wrapper around the TensorFlow API: tfgo

Resources