T5 while doing hyperparameter search shows "ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds" - huggingface-transformers

I am working with the Hugging Face transformers library, training a pretrained byt5-small on my data. I am also trying to do a hyperparameter search using the Trainer API with optuna as the backend. But the following error appears every time. Please help me out; below is the whole code.
transformers version = 4.23.1
from transformers import HfArgumentParser, TensorFlowBenchmark, TensorFlowBenchmarkArguments
import pandas as pd
from transformers import T5ForConditionalGeneration, ByT5Tokenizer
from transformers import TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer
import datasets
import transformers
from tqdm import tqdm
from numba import cuda
device = cuda.get_current_device()
device.reset()
train_df = pd.read_csv("/home/bhavuk/project1/data/train_split.csv")
eval_df = pd.read_csv("/home/bhavuk/project1/data/eval_split.csv")
test_df = pd.read_csv("/home/bhavuk/project1/data/test_split.csv")
train_df = train_df.dropna()
eval_df = eval_df.dropna()
test_df = test_df.dropna(subset=["Hypothesis","Reference"])
train_df.shape, eval_df.shape[0], test_df.shape[0]
args_dict = {
"output_dir": './byt5-small-hp-search',
"overwrite_output_dir": True,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 2,
"gradient_accumulation_steps": 4,
"learning_rate": 1e-1,
"warmup_steps": 2,
"logging_steps": 100,
"evaluation_strategy": "steps",
"eval_steps": 250,
"num_train_epochs": 4,
"do_train": True,
"do_eval": True,
"fp16": False,
"max_steps": 100000,
"load_best_model_at_end":True,
"logging_dir": './logs',
"save_total_limit" : 2,
"weight_decay" : 0.1,
"label_smoothing_factor" : 0.1
}
parser = HfArgumentParser(
(TrainingArguments))
training_args = parser.parse_dict(args_dict)
args = training_args[0]
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "dropout_rate": trial.suggest_float("dropout_rate", 0.1, 0.6, step=0.1),
        "weight_decay": trial.suggest_float("weight_decay", 0.1, 0.3, step=0.1),
        "label_smoothing_factor": trial.suggest_float("label_smoothing_factor", 0.1, 0.3, step=0.1)
    }
config = '/home/bhavuk/project1/notebooks/models--google--byt5-small/snapshots/ce8f3a48ed7676af36476a01fb01f95ea529599c/config.json'
def model_init(trial):
    return T5ForConditionalGeneration.from_pretrained(
        'google/byt5-small',
        config=config,
        dropout_rate=0.1
    )
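A side note on the search space (an assumption on my part, not something stated in the post): the values returned by optuna_hp_space are only mapped onto TrainingArguments fields by the Trainer, so a model-level knob such as dropout_rate never reaches the model unless model_init reads it from the trial itself. A minimal sketch of that, simplified to load by model name only:

def model_init(trial):
    # read the sampled dropout_rate from the Optuna trial; fall back to the
    # default on the initial call, when trial is None
    dropout_rate = 0.1 if trial is None else trial.params.get("dropout_rate", 0.1)
    return T5ForConditionalGeneration.from_pretrained(
        "google/byt5-small",
        dropout_rate=dropout_rate
    )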
tokenizer = ByT5Tokenizer.from_pretrained(
"google/byt5-small",
cache_dir=".",
max_length=512
)
class GPReviewDataset(Dataset):
    def __init__(self, Text, Label):
        self.Text = Text
        self.Label = Label
        # self.tokenizer = tokenizer
        # self.max_len = max_len

    def __len__(self):
        return len(self.Text)

    def __getitem__(self, item):
        Text = str(self.Text[item])
        Label = self.Label[item]
        inputs = tokenizer(Text, padding="max_length", truncation=True, max_length=512)
        outputs = tokenizer(Label, padding="max_length", truncation=True, max_length=512)
        return {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": outputs.input_ids,
            "decoder_attention_mask": outputs.attention_mask,
            # "labels" : lbz
        }
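A side note, not part of the original post: with padding="max_length" the label sequence keeps real pad-token ids, so the loss is also computed over the padding unless those positions are replaced with -100, the ignore index used by Hugging Face models. A minimal sketch of that adjustment:

# inside __getitem__, after tokenizing the reference:
labels = [tok if tok != tokenizer.pad_token_id else -100 for tok in outputs.input_ids]
# then return "labels": labels instead of outputs.input_ids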
ds_train = GPReviewDataset(
    Text=train_df.Hypothesis.to_numpy(),
    Label=train_df.Reference.to_numpy()
)
ds_test = GPReviewDataset(
    Text=eval_df.Hypothesis.to_numpy(),
    Label=eval_df.Reference.to_numpy()
    # tokenizer=tokenizer,
    # max_len=max_len
)
train_dataset = ds_train
valid_dataset = ds_test
trainer = Trainer(
model=None,
args=args,
train_dataset=train_dataset,
eval_dataset=valid_dataset,
tokenizer=tokenizer,
model_init=model_init
)
best_trial = trainer.hyperparameter_search(
direction="minimize",
backend="optuna",
hp_space=optuna_hp_space,
n_trials=20
)
ERROR:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/home/bhavuk/project1/notebooks/byT5small_hp_search_2.ipynb Cell 14 in <cell line: 1>()
----> 1 best_trial = trainer.hyperparameter_search(
2 direction="minimize",
3 backend="optuna",
4 hp_space=optuna_hp_space,
5 n_trials=20
6 )
File ~/anaconda3/envs/cvenv/lib/python3.9/site-packages/transformers/trainer.py:2368, in Trainer.hyperparameter_search(self, hp_space, compute_objective, n_trials, direction, backend, hp_name, **kwargs)
2360 self.compute_objective = default_compute_objective if compute_objective is None else compute_objective
2362 backend_dict = {
2363 HPSearchBackend.OPTUNA: run_hp_search_optuna,
2364 HPSearchBackend.RAY: run_hp_search_ray,
2365 HPSearchBackend.SIGOPT: run_hp_search_sigopt,
2366 HPSearchBackend.WANDB: run_hp_search_wandb,
2367 }
-> 2368 best_run = backend_dict[backend](self, n_trials, direction, **kwargs)
2370 self.hp_search_backend = None
2371 return best_run
File ~/anaconda3/envs/cvenv/lib/python3.9/site-packages/transformers/integrations.py:189, in run_hp_search_optuna(trainer, n_trials, direction, **kwargs)
187 n_jobs = kwargs.pop("n_jobs", 1)
188 study = optuna.create_study(direction=direction, **kwargs)
...
return forward_call(*input, **kwargs)
File "/home/bhavuk/anaconda3/envs/cvenv/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 937, in forward
raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
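For what it's worth, one plausible cause (an assumption on my part, not confirmed in the post): when label_smoothing_factor is set, the Trainer's label smoother pops "labels" out of the batch before the forward pass, so T5 never receives labels and cannot derive decoder_input_ids from them, which is exactly what this ValueError complains about. Two hedged workarounds: drop label_smoothing_factor from args_dict and from the search space, or have the dataset supply decoder_input_ids explicitly. A minimal sketch of the latter inside __getitem__ (ByT5, like T5, uses the pad token as the decoder start token):

def __getitem__(self, item):
    Text = str(self.Text[item])
    Label = self.Label[item]
    inputs = tokenizer(Text, padding="max_length", truncation=True, max_length=512)
    outputs = tokenizer(Label, padding="max_length", truncation=True, max_length=512)
    labels = outputs.input_ids
    # shift the labels right by one position, starting with the pad token,
    # so the decoder no longer depends on the model deriving them from labels
    decoder_input_ids = [tokenizer.pad_token_id] + labels[:-1]
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": labels,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": outputs.attention_mask,
    }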

Related

How to speed up resample with for loop?

I want to get the value of RSI on H1 for each M15 candle; this is how I do it. However, with data of more than 500,000 rows this is very time-consuming. Is there a better way? Note that it is mandatory to resample at each row to get the correct result.
import talib
import pandas as pd
import numpy as np
def Data(df):
    df['RSI1'] = talib.RSI(df['close'], timeperiod=13)
    df['RSI2'] = talib.RSI(df['close'], timeperiod=21)
    return df

# len(df) > 555555
df = pd.read_csv('m15_candle.csv')
for i in range(0, len(df)):
    t = df.at[i, 'time']
    if t.hour == 0 and t.minute == 0:
        df = df[i:]
        break
df = df.set_index('time')
ohlc = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last'
}
rsi1 = [0] * len(df)
rsi2 = [0] * len(df)
for i in range(100000, len(df)):
    h1 = Data(df[:i].resample("1h", offset=0).apply(ohlc).dropna())
    rsi1[i] = h1.iloc[-1]['RSI1']
    rsi2[i] = h1.iloc[-1]['RSI2']
df['RSI1_h1'] = rsi1
df['RSI2_h1'] = rsi2
df = df.reset_index()
df.to_csv("data.csv", index=False)
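A rough, untested sketch of one way to avoid the per-row resample (my assumptions: the goal is a talib/Wilder-style RSI of the hourly series whose last bar is the partial hour up to the current M15 candle, the time column parses as datetimes, and the helper name wilder_rsi_per_row is mine). The Wilder state is computed once per completed hour and then advanced by a single vectorized step per M15 row:

import numpy as np
import pandas as pd

df = pd.read_csv("m15_candle.csv", parse_dates=["time"]).set_index("time")
hour = df.index.floor("1h")
hourly_close = df["close"].groupby(hour).last()  # close of each (eventually completed) hour

def wilder_rsi_per_row(period):
    closes = hourly_close.to_numpy(dtype=float)
    deltas = np.diff(closes)
    gains, losses = np.clip(deltas, 0, None), np.clip(-deltas, 0, None)

    # Wilder-smoothed average gain/loss after each completed hourly close,
    # seeded with a simple mean like talib.RSI, computed once in O(hours)
    avg_gain = np.full(len(closes), np.nan)
    avg_loss = np.full(len(closes), np.nan)
    ag, al = gains[:period].mean(), losses[:period].mean()
    avg_gain[period], avg_loss[period] = ag, al
    for i in range(period + 1, len(closes)):
        ag = (ag * (period - 1) + gains[i - 1]) / period
        al = (al * (period - 1) + losses[i - 1]) / period
        avg_gain[i], avg_loss[i] = ag, al

    # for every M15 row: take the state after the previous completed hour and
    # advance it by one step with the partial hour's close (fully vectorized)
    pos = hourly_close.index.get_indexer(hour)   # hourly position of each M15 row
    prev = np.maximum(pos - 1, 0)
    delta = df["close"].to_numpy() - closes[prev]
    ag_row = (avg_gain[prev] * (period - 1) + np.clip(delta, 0, None)) / period
    al_row = (avg_loss[prev] * (period - 1) + np.clip(-delta, 0, None)) / period
    with np.errstate(divide="ignore", invalid="ignore"):
        rsi = 100.0 - 100.0 / (1.0 + ag_row / al_row)
    return rsi  # NaN until enough completed hours exist for the seed

df["RSI1_h1"] = wilder_rsi_per_row(13)
df["RSI2_h1"] = wilder_rsi_per_row(21)

Note that this uses the current candle's close in the partial hourly bar, whereas df[:i] excludes row i; shift the result by one row if you need to reproduce that behaviour exactly.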

Hyperparameter tuning using GridSearch()

I am trying to do hyperparameter tuning using the following code:
def df_to_new(df, window_size):
    df_as_np = df.to_numpy()
    X = []
    y = []
    for i in range(len(df_as_np) - window_size):
        row = [[a] for a in df_as_np[i:i+window_size]]
        X.append(row)
        label = df_as_np[i+window_size]
        y.append(label)
    return np.array(X), np.array(y)
Xgrid,Xnotuse,ygrid,ynouse = train_test_split(Xtrain,ytrain,test_size=0.8)
input_dim = Xgrid.shape[1]
def define_model(learning_rate=0.01):
    model = Sequential()
    model.add(LSTM(128, input_dim=input_dim))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    optimizer = Adam(lr=learning_rate)
    # compile the model
    model.compile(loss=MeanSquaredError(),
                  optimizer=optimizer,
                  metrics=[RootMeanSquaredError()])
    return model
model = KerasClassifier(build_fn=define_model,
epochs=epochs,
batch_size = batch_size,
verbose=1)
learning_rate = [0.0001, 0.001, 0.01, 0.1]
epochs = [10, 30, 60, 150]
batch_size = [5, 15, 25, 50]
param_grid = dict(learning_rate=learning_rate, epochs=epochs,batch_size=batch_size)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(Xgrid, ygrid)
But I am getting the following error:
ValueError: Invalid parameter learning_rate for estimator KerasClassifier.
This issue can likely be resolved by setting this parameter in the KerasClassifier constructor:
KerasClassifier(learning_rate=0.0001)
Check the list of available parameters with estimator.get_params().keys()
What I'm trying to do is to get the best epochs, batch_size, and learning_rate combination for my LSTM model. Any help?
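In case it is useful, a hedged sketch of one possible fix, assuming the scikeras wrapper is in use (the quoted error text matches its hint; the legacy keras.wrappers.scikit_learn wrapper routed build_fn parameters differently). Tunable parameters are declared on the constructor so GridSearchCV can set them, and define_model, Xgrid and ygrid are reused from the snippet above:

from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV

model = KerasClassifier(
    model=define_model,   # scikeras uses `model=` rather than `build_fn=`
    learning_rate=0.01,   # declared here so it appears in get_params() and can be grid-searched
    epochs=10,
    batch_size=32,
    verbose=1,
)

param_grid = {
    "learning_rate": [0.0001, 0.001, 0.01, 0.1],
    "epochs": [10, 30, 60, 150],
    "batch_size": [5, 15, 25, 50],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(Xgrid, ygrid)

Since the network has a single linear output and an MSE loss, KerasRegressor may be the more appropriate wrapper than KerasClassifier, but the parameter-routing point is the same.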

How to test a model before fine-tuning in Pytorch Lightning?

Doing things on Google Colab.
transformers: 4.10.2
pytorch-lightning: 1.2.7
import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl
dataset_for_loader = [
{'data':torch.tensor([0,1]), 'labels':torch.tensor(0)},
{'data':torch.tensor([2,3]), 'labels':torch.tensor(1)},
{'data':torch.tensor([4,5]), 'labels':torch.tensor(2)},
{'data':torch.tensor([6,7]), 'labels':torch.tensor(3)},
]
loader = DataLoader(dataset_for_loader, batch_size=2)
for idx, batch in enumerate(loader):
    print(f'# batch {idx}')
    print(batch)
category_list = [
'dokujo-tsushin',
'it-life-hack',
'kaden-channel',
'livedoor-homme',
'movie-enter',
'peachy',
'smax',
'sports-watch',
'topic-news'
]
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
max_length = 128
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    # the directory ./text has lots of articles, categorized by category,
    # and they are just plain texts whose content begins from the fourth line
    for file in glob.glob(f'./text/{category}/{category}*'):
        lines = open(file).read().splitlines()
        text = '\n'.join(lines[3:])
        encoding = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        encoding['labels'] = label
        encoding = { k: torch.tensor(v) for k, v in encoding.items() }
        dataset_for_loader.append(encoding)

SEED = lambda: 0.0
# random.shuffle(dataset_for_loader)  # shuffle randomly
random.shuffle(dataset_for_loader, SEED)
n = len(dataset_for_loader)
n_train = int(0.6*n)
n_val = int(0.2*n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_train+n_val]
dataset_test = dataset_for_loader[n_train+n_val:]
dataloader_train = DataLoader(
dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)
class BertForSequenceClassification_pl(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        labels = batch.pop('labels')
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
checkpoint = pl.callbacks.ModelCheckpoint(
monitor='val_loss',
mode='min',
save_top_k=1,
save_weights_only=True,
dirpath='model/',
)
trainer = pl.Trainer(
gpus=1,
max_epochs=10,
callbacks = [checkpoint]
)
model = BertForSequenceClassification_pl(
MODEL_NAME, num_labels=9, lr=1e-5
)
### (a) ###
# I think this is where I am doing fine-tuning
trainer.fit(model, dataloader_train, dataloader_val)
# this is to score after fine-tuning
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
But I am not really sure how to run a test before fine-tuning, so that I can compare the model before and after fine-tuning and show how effective fine-tuning is.
Inserting the following two lines at ### (a) ###:
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
I got this result:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-c8b2c67f2d5c> in <module>()
9
10 # 6-19
---> 11 test = trainer.test(test_dataloaders=dataloader_test)
12 print(f'Accuracy: {test[0]["accuracy"]:.2f}')
13
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in test(self, model, test_dataloaders, ckpt_path, verbose, datamodule)
896 self.verbose_test = verbose
897
--> 898 self._set_running_stage(RunningStage.TESTING, model or self.lightning_module)
899
900 # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _set_running_stage(self, stage, model_ref)
563 the trainer and the model
564 """
--> 565 model_ref.running_stage = stage
566 self._running_stage = stage
567
AttributeError: 'NoneType' object has no attribute 'running_stage'
I noticed that Trainer.fit() can take None as arguments other than model, so I tried this:
trainer.fit(model)
test=trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
The result:
MisconfigurationException: No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined.
Thanks.
The Trainer needs to call its .fit() in order to set up a lot of things; only then can you call .test() or other methods.
You are right about putting a .fit() just before .test(), but the fit call needs to be a valid one: you have to feed a dataloader/datamodule to it. Since you don't want to do any training or validation in this fit call, just pass limit_[train/val]_batches=0 when constructing the Trainer.
trainer = Trainer(gpus=..., ..., limit_train_batches=0, limit_val_batches=0)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # without fine-tuning
The fit call here will just set things up for you and skip training/validation, and then the testing follows. Next, run the same code but without the limit_[train/val]_batches arguments; this will do the actual fine-tuning for you:
trainer = Trainer(gpus=..., ...)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # with fine-tuning
Clarifying a bit about .fit() taking None for everything but the model: it's not quite true, you must provide either a DataLoader or a DataModule.

How to run a transformers bert without pipeline?

I have found myself dealing with an environment that does not support multiprocessing. How do I run my DistilBERT model without the transformers pipeline?
Here is the code right now:
import json
import os
import sys
sys.path.append("/mnt/access")
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers.pipelines import pipeline
def lambda_handler(event, context):
    print("After:", os.listdir("/mnt/access"))
    tokenizer = AutoTokenizer.from_pretrained('/mnt/access/Dis_Save/')
    model = AutoModelForQuestionAnswering.from_pretrained('/mnt/access/Dis_Save/')
    nlp_qa = pipeline('question-answering', tokenizer=tokenizer, model=model)
    context = "tra"
    question = "tra"
    X = nlp_qa(context=context, question=question)
    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }
Error message I get right now:
{
"errorMessage": "[Errno 38] Function not implemented",
"errorType": "OSError",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line 18, in lambda_handler\n X = nlp_qa(context=context, question=question)\n",
" File \"/mnt/access/transformers/pipelines.py\", line 1776, in __call__\n features_list = [\n",
" File \"/mnt/access/transformers/pipelines.py\", line 1777, in <listcomp>\n squad_convert_examples_to_features(\n",
" File \"/mnt/access/transformers/data/processors/squad.py\", line 354, in squad_convert_examples_to_features\n with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:\n",
" File \"/var/lang/lib/python3.8/multiprocessing/context.py\", line 119, in Pool\n return Pool(processes, initializer, initargs, maxtasksperchild,\n",
" File \"/var/lang/lib/python3.8/multiprocessing/pool.py\", line 191, in __init__\n self._setup_queues()\n",
" File \"/var/lang/lib/python3.8/multiprocessing/pool.py\", line 343, in _setup_queues\n self._inqueue = self._ctx.SimpleQueue()\n",
" File \"/var/lang/lib/python3.8/multiprocessing/context.py\", line 113, in SimpleQueue\n return SimpleQueue(ctx=self.get_context())\n",
" File \"/var/lang/lib/python3.8/multiprocessing/queues.py\", line 336, in __init__\n self._rlock = ctx.Lock()\n",
" File \"/var/lang/lib/python3.8/multiprocessing/context.py\", line 68, in Lock\n return Lock(ctx=self.get_context())\n",
" File \"/var/lang/lib/python3.8/multiprocessing/synchronize.py\", line 162, in __init__\n SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)\n",
" File \"/var/lang/lib/python3.8/multiprocessing/synchronize.py\", line 57, in __init__\n sl = self._semlock = _multiprocessing.SemLock(\n"
]
}
Other code:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import json
import sys
sys.path.append("/mnt/access")
tokenizer = AutoTokenizer.from_pretrained("/mnt/access/Dis_Save/")
model = AutoModelForQuestionAnswering.from_pretrained("/mnt/access/Dis_Save/", return_dict=True)
def lambda_handler(event, context):
    text = r"""
    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
    TensorFlow 2.0 and PyTorch.
    """
    questions = ["How many pretrained models are available in 🤗 Transformers?",]
    for question in questions:
        inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
        input_ids = inputs["input_ids"].tolist()[0]
        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer_start_scores, answer_end_scores = model(**inputs).values()
        answer_start = torch.argmax(
            answer_start_scores
        )  # Get the most likely beginning of answer with the argmax of the score
        answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
        print(f"Question: {question}")
        print(f"Answer: {answer}")
    return {
        'statusCode': 200,
        'body': json.dumps(answer)
    }
Edit:
I ran the code. It runs well on its own; however, I get an error when running it through the API itself:
{
"errorMessage": "'tuple' object has no attribute 'values'",
"errorType": "AttributeError",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line 39, in lambda_handler\n answer_start_scores, answer_end_scores = model(**inputs).values()\n"
]
}
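In case it helps, a small sketch of a defensive unpacking (my assumption about the cause: older transformers releases, or a forward call without return_dict, give back a plain tuple rather than an output object, so .values() is not available):

outputs = model(**inputs)
if isinstance(outputs, tuple):
    # older releases return (start_logits, end_logits, ...) as a plain tuple
    answer_start_scores, answer_end_scores = outputs[:2]
else:
    # newer releases return a QuestionAnsweringModelOutput object
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits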

PyTorch custom dataset dataloader returns strings (of keys) not tensors

I am trying to load my own dataset, and I use a custom DataLoader that reads in images and labels and converts them to PyTorch tensors. However, when the DataLoader is iterated, it returns the strings x = "image" and y = "labels", not the actual values or tensors.
print(self.train_loader)  # shows a Tensor object
tic = time.time()
with tqdm(total=self.num_train) as pbar:
    for i, (x, y) in enumerate(self.train_loader):  # x and y are returned as strings (this is where it fails)
        if self.use_gpu:
            x, y = x.cuda(), y.cuda()
        x, y = Variable(x), Variable(y)
This is what dataloader.py looks like:
from __future__ import print_function, division #ds
import numpy as np
from utils import plot_images
import os #ds
import pandas as pd #ds
from skimage import io, transform #ds
import torch
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader #ds
from torchvision import transforms
from torchvision import utils #ds
from torch.utils.data.sampler import SubsetRandomSampler
class CDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.frame)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir,
                                self.frame.iloc[idx, 0] + '.jpg')
        image = io.imread(img_name)
        # image = image.transpose((2, 0, 1))
        labels = np.array(self.frame.iloc[idx, 1])  # .as_matrix() #ds
        # landmarks = landmarks.astype('float').reshape(-1, 2)
        # print(image.shape)
        # print(img_name, labels)
        sample = {'image': image, 'labels': labels}
        if self.transform:
            sample = self.transform(sample)
        return sample


class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, labels = sample['image'], sample['labels']
        # print(image)
        # print(labels)
        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        image = image.transpose((2, 0, 1))
        # print(image.shape)
        # print((torch.from_numpy(image)))
        # print((torch.from_numpy(labels)))
        return {'image': torch.from_numpy(image),
                'labels': torch.from_numpy(labels)}
def get_train_valid_loader(data_dir,
                           batch_size,
                           random_seed,
                           #valid_size=0.1, #ds
                           #shuffle=True,
                           show_sample=False,
                           num_workers=4,
                           pin_memory=False):
    """
    Utility function for loading and returning train and valid
    multi-process iterators over the MNIST dataset. A sample
    9x9 grid of the images can be optionally displayed.
    If using CUDA, num_workers should be set to 1 and pin_memory to True.
    Args
    ----
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - random_seed: fix seed for reproducibility.
    - #ds valid_size: percentage split of the training set used for
      the validation set. Should be a float in the range [0, 1].
      In the paper, this number is set to 0.1.
    - shuffle: whether to shuffle the train/validation indices.
    - show_sample: plot 9x9 sample grid of the dataset.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.
    Returns
    -------
    - train_loader: training set iterator.
    - valid_loader: validation set iterator.
    """
    #ds
    #error_msg = "[!] valid_size should be in the range [0, 1]."
    #assert ((valid_size >= 0) and (valid_size <= 1)), error_msg
    #ds

    # define transforms
    #normalize = transforms.Normalize((0.1307,), (0.3081,))
    trans = transforms.Compose([
        ToTensor(), #normalize,
    ])

    # load train dataset
    #train_dataset = datasets.MNIST(
    #    data_dir, train=True, download=True, transform=trans
    #)
    train_dataset = CDataset(csv_file='/home/Desktop/6June17/util/train.csv',
                             root_dir='/home/caffe/data/images/', transform=trans)

    # load validation dataset
    #valid_dataset = datasets.MNIST( #ds
    #    data_dir, train=True, download=True, transform=trans #ds
    #)
    valid_dataset = CDataset(csv_file='/home/Desktop/6June17/util/eval.csv',
                             root_dir='/home/caffe/data/images/', transform=trans)

    num_train = len(train_dataset)
    train_indices = list(range(num_train))
    #ds split = int(np.floor(valid_size * num_train))
    num_valid = len(valid_dataset) #ds
    valid_indices = list(range(num_valid)) #ds

    #if shuffle:
    #    np.random.seed(random_seed)
    #    np.random.shuffle(indices)

    #ds train_idx, valid_idx = indices[split:], indices[:split]
    train_idx = train_indices #ds
    valid_idx = valid_indices #ds

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )
    print(train_loader)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    # visualize some images
    if show_sample:
        sample_loader = torch.utils.data.DataLoader(
            dataset, batch_size=9, #shuffle=shuffle,
            num_workers=num_workers, pin_memory=pin_memory
        )
        data_iter = iter(sample_loader)
        images, labels = data_iter.next()
        X = images.numpy()
        X = np.transpose(X, [0, 2, 3, 1])
        plot_images(X, labels)

    return (train_loader, valid_loader)
def get_test_loader(data_dir,
                    batch_size,
                    num_workers=4,
                    pin_memory=False):
    """
    Utility function for loading and returning a multi-process
    test iterator over the MNIST dataset.
    If using CUDA, num_workers should be set to 1 and pin_memory to True.
    Args
    ----
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.
    Returns
    -------
    - data_loader: test set iterator.
    """
    # define transforms
    #normalize = transforms.Normalize((0.1307,), (0.3081,))
    trans = transforms.Compose([
        ToTensor(), #normalize,
    ])

    # load dataset
    #dataset = datasets.MNIST(
    #    data_dir, train=False, download=True, transform=trans
    #)
    test_dataset = CDataset(csv_file='/home/Desktop/6June17/util/test.csv',
                            root_dir='/home/caffe/data/images/', transform=trans)

    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    return test_loader

#for i_batch, sample_batched in enumerate(dataloader):
#    print(i_batch, sample_batched['image'].size(),
#          sample_batched['landmarks'].size())
#    # observe 4th batch and stop.
#    if i_batch == 3:
#        plt.figure()
#        show_landmarks_batch(sample_batched)
#        plt.axis('off')
#        plt.ioff()
#        plt.show()
#        break
A minimal working sample would be difficult to post here, but basically I am trying to modify this project, http://torch.ch/blog/2015/09/21/rmva.html, which works smoothly with MNIST. I am just trying to run it on my own dataset with the custom dataloader.py above.
It instantiates a Dataloader like this:
in trainer.py:
if config.is_train:
    self.train_loader = data_loader[0]
    self.valid_loader = data_loader[1]
    self.num_train = len(self.train_loader.sampler.indices)
    self.num_valid = len(self.valid_loader.sampler.indices)
which is run from main.py:
if config.is_train:
    data_loader = get_train_valid_loader(
        config.data_dir, config.batch_size,
        config.random_seed, #config.valid_size,
        #config.shuffle,
        config.show_sample, **kwargs
    )
You are not unpacking the batches properly: each batch from your loader is a dictionary, so (x, y) are currently being assigned its two keys, i.e. the strings "image" and "labels", rather than the tensors. This should solve your problem:
for i, batch in enumerate(self.train_loader):
    x, y = batch["image"], batch["labels"]
    # ...
