How to update training dataset at epoch begin in Huggingface Trainer using Callback? - huggingface-transformers

I want to recreate the training dataset with a function generate_custom_train_set at the beginning of every epoch. Is there a way I could do this with Trainer using a callback?
My trainer looks like this:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

I'm having the same question as I try to implement examples-proportional mixing from the T5 paper; I didn't find built-in support from Hugging Face.
My current solution is to modify the trainer.train_dataset in the on_epoch_begin callback.
Here's an implementation. I'm using this in my own project. Seems to work.
First, implement your per-epoch change in your Dataset; in my case, it's the sample function for examples-proportional mixing.
from typing import List, Optional, Tuple

import torch
from torch.utils.data import Dataset  # any map-style dataset supporting len() and integer indexing works here


class ProportionMixingDataset:
    """
    Examples-proportional mixing from T5
    TODO: failed to find a pytorch working implementation

    Equivalent to, for the larger datasets, a new subset is taken at each epoch,
    then sample in the joined subset once
    """
    def __init__(self, dataset_list: List[Dataset] = None, k: int = None):
        """
        :param dataset_list: Ordered list of datasets
        :param k: Artificial size limit per dataset
        """
        self.dsets = dataset_list
        assert k is not None
        self.k = k
        self.dset_szs = [min(len(d), k) for d in self.dsets]
        self.sz = sum(self.dset_szs)
        self._sampled_idxs: List[Optional[torch.Tensor]] = [None] * len(self.dsets)
        self.sample()

    def sample(self):
        """
        Sub-sample datasets larger than k
        Intended to be called at the start of each epoch
        """
        for i, dset in enumerate(self.dsets):
            sz = len(dset)
            if sz > self.k:
                self._sampled_idxs[i] = torch.randperm(sz)[:self.k]

    def __len__(self):
        return self.sz

    def _idx2dset_idx(self, idx: int) -> Tuple[int, int]:
        """
        Convert a global index to a (dataset index, local index) pair
        """
        for i, sz in enumerate(self.dset_szs):
            if idx < sz:
                return i, idx
            idx -= sz
        raise ValueError('Should not happen')

    def __getitem__(self, idx):
        if not isinstance(idx, int):
            raise ValueError('Batched indexing not supported')
        idx_dset, idx = self._idx2dset_idx(idx)
        dset = self.dsets[idx_dset]
        if self._sampled_idxs[idx_dset] is not None:  # this dataset was sub-sampled
            idx = self._sampled_idxs[idx_dset][idx].item()
        return dset[idx]
Then pass that dataset to Trainer.
Now comes the magic part:
from transformers import Trainer, TrainerCallback, TrainingArguments


class ProportionalMixCallback(TrainerCallback):
    """
    Triggers re-computing the subset for examples-proportional mixing, see `dataset::ProportionMixingDataset`

    A hack that modifies the train dataset, which the Trainer's dataloader still points to
    """
    def __init__(self, trainer: Trainer):
        self.trainer = trainer

    def on_epoch_begin(self, args: TrainingArguments, state, control, **kwargs):
        self.trainer.train_dataset.sample()
Pass this to your trainer as a callback.
This triggers the sample call, which modifies the dataset at exactly the times we need it.
This works because the train dataloader inside the trainer still points to the same train dataset object.
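For reference, here is a minimal wiring sketch (ds_a, ds_b and the k value are placeholders; the rest mirrors the Trainer setup from the question). Trainer.add_callback is used because the callback needs a reference to the already-constructed trainer:
# ds_a, ds_b and k=50_000 are placeholders for your actual datasets and size cap
mixed_train_dataset = ProportionMixingDataset(dataset_list=[ds_a, ds_b], k=50_000)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=mixed_train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.add_callback(ProportionalMixCallback(trainer))
trainer.train()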

Related

TensorFlow - directly calling tf.function much faster than calling tf.function returned from wrapper

I am training a VAE (using federated learning, but that is not so important) and wanted to keep the loss and train functions easy to exchange. The initial approach was to have a tf.function as the loss function and a tf.function as the train function, as follows:
import tensorflow as tf


@tf.function
def kl_reconstruction_loss(model, model_input, beta):
    x, y = model_input
    mean, logvar = model.encode(x, y)
    z = model.reparameterize(mean, logvar)
    x_logit = model.decode(z, y)
    cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)
    reconstruction_loss = tf.reduce_mean(tf.reduce_sum(cross_ent, axis=[1, 2, 3]), axis=0)
    kl_loss = tf.reduce_mean(0.5 * tf.reduce_sum(tf.exp(logvar) + tf.square(mean) - 1. - logvar, axis=-1), axis=0)
    loss = reconstruction_loss + beta * kl_loss
    return loss, kl_loss, reconstruction_loss
@tf.function
def train_fn(model: tf.keras.Model, batch, optimizer, kl_beta):
    """Trains the model on a single batch.

    Args:
        model: The VAE model.
        batch: A batch of inputs [images, labels] for the vae.
        optimizer: The optimizer to train the model.
        kl_beta: Weighting of the KL loss.

    Returns:
        The loss.
    """
    def vae_loss():
        """Does the forward pass and computes losses for the generator."""
        # N.B. The complete pass must be inside loss() for gradient tracing.
        return kl_reconstruction_loss(model, batch, kl_beta)

    with tf.GradientTape() as tape:
        loss, kl_loss, rc_loss = vae_loss()
    grads = tape.gradient(loss, model.trainable_variables)
    grads_and_vars = zip(grads, model.trainable_variables)
    optimizer.apply_gradients(grads_and_vars)
    return loss
For my dataset this results in an epoch duration of approx. 25 seconds. However, since I call those functions directly in my code, I would have to swap in different ones wherever they are called if I wanted to try out different loss/train functions.
So, alternatively, I followed https://github.com/google-research/federated/tree/master/gans and wrapped the loss function in a class and the train function in another function. Now I have:
class VaeKlReconstructionLossFns(AbstractVaeLossFns):

    @tf.function
    def vae_loss(self, model, model_input, labels, global_round):
        # KL Reconstruction loss
        mean, logvar = model.encode(model_input, labels)
        z = model.reparameterize(mean, logvar)
        x_logit = model.decode(z, labels)
        cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=model_input)
        reconstruction_loss = tf.reduce_mean(tf.reduce_sum(cross_ent, axis=[1, 2, 3]), axis=0)
        kl_loss = tf.reduce_mean(0.5 * tf.reduce_sum(tf.exp(logvar) + tf.square(mean) - 1. - logvar, axis=-1), axis=0)
        loss = reconstruction_loss + self._get_beta(global_round) * kl_loss
        if model.losses:
            loss += tf.add_n(model.losses)
        return loss, kl_loss, reconstruction_loss
def create_train_vae_fn(
        vae_loss_fns: vae_losses.AbstractVaeLossFns,
        vae_optimizer: tf.keras.optimizers.Optimizer):
    """Create a function that trains VAE, binding loss and optimizer.

    Args:
        vae_loss_fns: Instance of gan_losses.AbstractVAELossFns interface,
            specifying the VAE training loss.
        vae_optimizer: Optimizer for training the VAE.

    Returns:
        Function that executes one step of VAE training.
    """
    # We check that the optimizer has not been used previously, which ensures
    # that when it is bound the train fn isn't holding onto a different copy of
    # the optimizer variables than the copy that is being exchanged b/w server and
    # clients.
    if vae_optimizer.variables():
        raise ValueError(
            'Expected vae_optimizer to not have been used previously, but '
            'variables were already initialized.')

    @tf.function
    def train_vae_fn(model: tf.keras.Model,
                     model_inputs,
                     labels,
                     global_round,
                     new_optimizer_state=None):
        """Trains the model on a single batch.

        Args:
            model: The VAE model.
            model_inputs: A batch of inputs (usually images) for the VAE.
            labels: A batch of labels corresponding to the inputs.
            global_round: The current global FL round for beta calculation.
            new_optimizer_state: A possible optimizer state to overwrite the current one with.

        Returns:
            The number of examples trained on.
            The loss.
            The updated optimizer state.
        """
        def vae_loss():
            """Does the forward pass and computes losses for the generator."""
            # N.B. The complete pass must be inside loss() for gradient tracing.
            return vae_loss_fns.vae_loss(model, model_inputs, labels, global_round)

        # Set optimizer vars
        optimizer_state = get_optimizer_state(vae_optimizer)
        if new_optimizer_state is not None:
            # if the optimizer is uninitialised, initialise its vars
            try:
                tf.nest.assert_same_structure(optimizer_state, new_optimizer_state)
            except ValueError:
                initialize_optimizer_vars(vae_optimizer, model)
                optimizer_state = get_optimizer_state(vae_optimizer)
                tf.nest.assert_same_structure(optimizer_state, new_optimizer_state)
            tf.nest.map_structure(lambda a, b: a.assign(b), optimizer_state, new_optimizer_state)

        with tf.GradientTape() as tape:
            loss, kl_loss, rc_loss = vae_loss()
        grads = tape.gradient(loss, model.trainable_variables)
        grads_and_vars = zip(grads, model.trainable_variables)
        vae_optimizer.apply_gradients(grads_and_vars)
        return tf.shape(model_inputs)[0], loss, optimizer_state

    return train_vae_fn
This new formulation takes about 86 seconds per epoch.
I am struggling to understand why the second version performs so much worse than the first one. Does anyone have a good explanation for this?
Thanks in advance!
EDIT: My TensorFlow version is 2.5.0

How to test a model before fine-tuning in Pytorch Lightning?

I am doing this on Google Colab.
transformers: 4.10.2
pytorch-lightning: 1.2.7
import glob
import random

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl

dataset_for_loader = [
    {'data': torch.tensor([0, 1]), 'labels': torch.tensor(0)},
    {'data': torch.tensor([2, 3]), 'labels': torch.tensor(1)},
    {'data': torch.tensor([4, 5]), 'labels': torch.tensor(2)},
    {'data': torch.tensor([6, 7]), 'labels': torch.tensor(3)},
]
loader = DataLoader(dataset_for_loader, batch_size=2)

for idx, batch in enumerate(loader):
    print(f'# batch {idx}')
    print(batch)
category_list = [
    'dokujo-tsushin',
    'it-life-hack',
    'kaden-channel',
    'livedoor-homme',
    'movie-enter',
    'peachy',
    'smax',
    'sports-watch',
    'topic-news'
]
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
max_length = 128
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    # directory ./text has lots of articles, categorized by category,
    # and they are just plain texts, whose content begins from the fourth line
    for file in glob.glob(f'./text/{category}/{category}*'):
        lines = open(file).read().splitlines()
        text = '\n'.join(lines[3:])
        encoding = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        encoding['labels'] = label
        encoding = {k: torch.tensor(v) for k, v in encoding.items()}
        dataset_for_loader.append(encoding)

SEED = lambda: 0.0
# random.shuffle(dataset_for_loader)  # shuffle randomly
random.shuffle(dataset_for_loader, SEED)
n = len(dataset_for_loader)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_train + n_val]
dataset_test = dataset_for_loader[n_train + n_val:]

dataloader_train = DataLoader(
    dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)
class BertForSequenceClassification_pl(pl.LightningModule):

    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        labels = batch.pop('labels')
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)


checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

trainer = pl.Trainer(
    gpus=1,
    max_epochs=10,
    callbacks=[checkpoint]
)

model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=9, lr=1e-5
)

### (a) ###
# I think this is where I am doing fine-tuning
trainer.fit(model, dataloader_train, dataloader_val)

# this is to score after fine-tuning
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
But I am not really sure how to run a test before fine-tuning, so that I can compare the model before and after fine-tuning and show how effective fine-tuning is.
Inserting the following two lines at ### (a) ###:
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
I got this result:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-c8b2c67f2d5c> in <module>()
9
10 # 6-19
---> 11 test = trainer.test(test_dataloaders=dataloader_test)
12 print(f'Accuracy: {test[0]["accuracy"]:.2f}')
13
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in test(self, model, test_dataloaders, ckpt_path, verbose, datamodule)
896 self.verbose_test = verbose
897
--> 898 self._set_running_stage(RunningStage.TESTING, model or self.lightning_module)
899
900 # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _set_running_stage(self, stage, model_ref)
563 the trainer and the model
564 """
--> 565 model_ref.running_stage = stage
566 self._running_stage = stage
567
AttributeError: 'NoneType' object has no attribute 'running_stage'
I noticed that Trainer.fit() can take None as arguments other than model, so I tried this:
trainer.fit(model)
test=trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
The result:
MisconfigurationException: No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined.
Thanks.
The Trainer needs to have its .fit() called first in order to set up a lot of things; only then can you call .test() or other methods.
You are right about putting a .fit() just before .test(), but the fit call needs to be a valid one: you have to feed a dataloader/datamodule to it. Since you don't want to do any training/validation in this fit call, just pass limit_[train/val]_batches=0 when constructing the Trainer.
trainer = Trainer(gpus=..., ..., limit_train_batches=0, limit_val_batches=0)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # without fine-tuning
The fit call here will just set things up for you and skip training/validation. Then the testing follows. Next time, run the same code but without the limit_[train/val]_batches arguments; this will do the fine-tuning for you:
trainer = Trainer(gpus=..., ...)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # with fine-tuning
Clarifying a bit about .fit() taking None for all arguments but model: that's not quite true - you must provide either a DataLoader or a DataModule.
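Putting the two runs together, a minimal sketch (untested, reusing the objects defined in the question) could look like this:
# Run 1: no training, just set things up and test the pretrained model
trainer_pre = pl.Trainer(gpus=1, max_epochs=10, limit_train_batches=0, limit_val_batches=0)
trainer_pre.fit(model, dataloader_train, dataloader_val)
trainer_pre.test(model, dataloader_test)   # accuracy before fine-tuning

# Run 2: actual fine-tuning, then test again
trainer_ft = pl.Trainer(gpus=1, max_epochs=10, callbacks=[checkpoint])
trainer_ft.fit(model, dataloader_train, dataloader_val)
trainer_ft.test(model, dataloader_test)    # accuracy after fine-tuning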

Using multiple validation sets with keras

I am training a model with keras using the model.fit() method.
I would like to use multiple validation sets that are validated on separately after each training epoch, so that I get one loss value for each validation set. If possible, they should both be displayed during training and also be returned by the keras.callbacks.History() callback.
I am thinking of something like this:
history = model.fit(train_data, train_targets,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=[
                        (validation_data1, validation_targets1),
                        (validation_data2, validation_targets2)],
                    shuffle=True)
I currently have no idea how to implement this. Is it possible to achieve this by writing my own Callback? Or how else would you approach this problem?
I ended up writing my own Callback based on the History callback to solve the problem. I'm not sure if this is the best approach, but the following Callback records losses and metrics for the training and validation set like the History callback does, as well as losses and metrics for the additional validation sets passed to the constructor.
from keras.callbacks import Callback


class AdditionalValidationSets(Callback):
    def __init__(self, validation_sets, verbose=0, batch_size=None):
        """
        :param validation_sets:
            a list of 3-tuples (validation_data, validation_targets, validation_set_name)
            or 4-tuples (validation_data, validation_targets, sample_weights, validation_set_name)
        :param verbose:
            verbosity mode, 1 or 0
        :param batch_size:
            batch size to be used when evaluating on the additional datasets
        """
        super(AdditionalValidationSets, self).__init__()
        self.validation_sets = validation_sets
        for validation_set in self.validation_sets:
            if len(validation_set) not in [3, 4]:
                raise ValueError()
        self.epoch = []
        self.history = {}
        self.verbose = verbose
        self.batch_size = batch_size

    def on_train_begin(self, logs=None):
        self.epoch = []
        self.history = {}

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.epoch.append(epoch)

        # record the same values as History() as well
        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        # evaluate on the additional validation sets
        for validation_set in self.validation_sets:
            if len(validation_set) == 3:
                validation_data, validation_targets, validation_set_name = validation_set
                sample_weights = None
            elif len(validation_set) == 4:
                validation_data, validation_targets, sample_weights, validation_set_name = validation_set
            else:
                raise ValueError()

            results = self.model.evaluate(x=validation_data,
                                          y=validation_targets,
                                          verbose=self.verbose,
                                          sample_weight=sample_weights,
                                          batch_size=self.batch_size)

            for metric, result in zip(self.model.metrics_names, results):
                valuename = validation_set_name + '_' + metric
                self.history.setdefault(valuename, []).append(result)
which I am then using like this:
history = AdditionalValidationSets([(validation_data2, validation_targets2, 'val2')])
model.fit(train_data, train_targets,
          epochs=epochs,
          batch_size=batch_size,
          validation_data=(validation_data1, validation_targets1),
          callbacks=[history],
          shuffle=True)
I tested this on TensorFlow 2 and it worked. You can evaluate on as many validation sets as you want at the end of each epoch:
class MyCustomCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        res_eval_1 = self.model.evaluate(X_test_1, y_test_1, verbose=0)
        res_eval_2 = self.model.evaluate(X_test_2, y_test_2, verbose=0)
        print(res_eval_1)
        print(res_eval_2)
And later:
my_val_callback = MyCustomCallback()
# Your model creation code
model.fit(..., callbacks=[my_val_callback])
Considering the current Keras docs, you can pass callbacks to evaluate and evaluate_generator, so you can call evaluate multiple times with different datasets.
I have not tested it, so I would be happy if you comment your experiences with it below.
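As a minimal, untested sketch of the "call evaluate multiple times" idea, reusing the variable names from the question:
# Score each validation set separately once training has finished
model.fit(train_data, train_targets, epochs=epochs, batch_size=batch_size)

for name, (x_v, y_v) in [('val1', (validation_data1, validation_targets1)),
                         ('val2', (validation_data2, validation_targets2))]:
    results = model.evaluate(x_v, y_v, verbose=0)
    print(name, dict(zip(model.metrics_names, results)))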

What is the difference between gensim LabeledSentence and TaggedDocument

Please help me understand the difference between how TaggedDocument and LabeledSentence of gensim work. My ultimate goal is text classification using a Doc2Vec model and any classifier. I am following this blog!
class MyLabeledSentences(object):
    def __init__(self, dirname, dataDct={}, sentList=[]):
        self.dirname = dirname
        self.dataDct = {}
        self.sentList = []

    def ToArray(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname)) as fin:
                for item_no, sentence in enumerate(fin):
                    self.sentList.append(LabeledSentence(
                        [w for w in sentence.lower().split() if w in stopwords.words('english')],
                        [fname.split('.')[0].strip() + '_%s' % item_no]))
        return self.sentList


class MyTaggedDocument(object):
    def __init__(self, dirname, dataDct={}, sentList=[]):
        self.dirname = dirname
        self.dataDct = {}
        self.sentList = []

    def ToArray(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname)) as fin:
                for item_no, sentence in enumerate(fin):
                    self.sentList.append(TaggedDocument(
                        [w for w in sentence.lower().split() if w in stopwords.words('english')],
                        [fname.split('.')[0].strip() + '_%s' % item_no]))
        return self.sentList
sentences = MyLabeledSentences(some_dir_name)
model_l = Doc2Vec(min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=7)
sentences_l = sentences.ToArray()
model_l.build_vocab(sentences_l)
for epoch in range(15):
    random.shuffle(sentences_l)
    model_l.train(sentences_l)
    model_l.alpha -= 0.002  # decrease the learning rate
    model_l.min_alpha = model_l.alpha

sentences = MyTaggedDocument(some_dir_name)
model_t = Doc2Vec(min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=7)
sentences_t = sentences.ToArray()
model_t.build_vocab(sentences_t)
for epoch in range(15):
    random.shuffle(sentences_t)
    model_t.train(sentences_t)
    model_t.alpha -= 0.002  # decrease the learning rate
    model_t.min_alpha = model_t.alpha
My question is: is model_l.docvecs['some_word'] the same as model_t.docvecs['some_word']?
Can you provide web links to good sources for getting a grasp on how TaggedDocument or LabeledSentence works?
LabeledSentence is an older, deprecated name for the same simple object-type to encapsulate a text-example that is now called TaggedDocument. Any objects that have words and tags properties, each a list, will do. (words is always a list of strings; tags can be a mix of integers and strings, but in the common and most-efficient case, is just a list with a single id integer, starting at 0.)
model_l and model_t will serve the same purposes, having trained on the same data with the same parameters, using just different names for the objects. But the vectors they'll return for individual word-tokens (model['some_word']) or document-tags (model.docvecs['somefilename_NN']) will likely be different – there's randomness in Word2Vec/Doc2Vec initialization and training-sampling, and introduced by ordering-jitter from multithreaded training.
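For a quick illustration of the object itself (a made-up example, not from the question's corpus), a TaggedDocument is just a words list plus a tags list:
from gensim.models.doc2vec import TaggedDocument

# One text-example: a list of word tokens plus a single-id tag list
doc = TaggedDocument(words=['some', 'word', 'tokens'], tags=[0])
print(doc.words)  # ['some', 'word', 'tokens']
print(doc.tags)   # [0]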

PySide TableView and Variant data - Display Role Not Displaying

I have this code, which when I run it in PyQt works totally fine, but when I run it in PySide things get weird. I get all the columns and rows I'm supposed to, and if I go to them via scripting and get the data, each cell says what it should. However, even though I set these as display roles, NO text shows in the table.
None in the headers, none in any of the cells. I'm at a loss!
(For those wondering, NullVariant() just returns either None or QVariant() depending on whether we're on PySide or PyQt.)
This model is meant to take a list of dicts in addRows, and uses the dict keys to make columns.
class CustomTableModel(QtCore.QAbstractTableModel):
    def __init__(self, parent=None, parentTable=None):
        """
        Custom data model for holding table data.
        :param parent: The parent widget/layout so that this data model gets deleted properly on close.
        :param parentTable: the table that is using this data. This is used to get the font metrics of the table
            display font.
        """
        super(CustomTableModel, self).__init__(parent)
        self.parent_table = parentTable
        self.auto_resize = False
        self._avg_font_w = 5
        self._resize_data = defaultdict(int)
        self.items = []
        self.headers = []

    def setParentTable(self, widget):
        """
        Sets the parent table widget so that we can get its font metrics for setting our column width with autoResize.
        :param widget: TableViewWidget
        :raise TypeError:
        """
        if not isinstance(widget, QtGui.QTableView):
            raise TypeError('Must be a TableView item')
        self.parent_table = widget

    def setAutoResize(self, b):
        """
        Turns on or off auto resize for the table. This gathers the font metrics of the parent table, and then loops
        over any current data, or newly added data (including table headers) to get the widest item, and sets the
        column width to fit this.
        :param b: bool
        :raise AttributeError:
        """
        if not self.parent_table:
            raise AttributeError('You must call setParentTable first to set the parent TableView item')
        self.auto_resize = b
        if b:
            self._autoAllResizeData()
            self._doColumnResize()
        else:
            self._resize_data = dict()

    def updateSize(self):
        """
        Force the table size to update to the current size data.
        """
        self._doColumnResize()

    def updateSizeData(self):
        """
        Force an update/regathering of all the size data for each row and column.
        """
        self._autoAllResizeData(True)
        self._doColumnResize()

    def _doColumnResize(self):
        for i in range(len(self.headers)):
            txt = self.headers[i]
            self.parent_table.setColumnWidth(i, self._resize_data.get(txt))

    def _getKeyList(self):
        if self.headers:
            return self.headers
        elif self.items:
            return sorted(self.items[0].keys())

    def _getTableFontWidth(self):
        self._avg_font_w = self.parent_table.fontMetrics().averageCharWidth()

    def _autoAllResizeData(self, reset=False):
        if not self._resize_data or reset is True:
            self._resize_data = defaultdict(int)
        key_list = self._getKeyList()
        for header in key_list:
            header_width = len(header) * (self._avg_font_w * 1.55)
            if header_width > self._resize_data[header]:
                self._resize_data[header] = header_width
            for item in self.items:
                value = item.get(header)
                width = len(str(value)) * self._avg_font_w
                if width > self._resize_data[header]:
                    self._resize_data[header] = width

    def _autoSingleResizeData(self, data):
        key_list = self._getKeyList()
        for header in key_list:
            value = data.get(header)
            if value:
                width = len(str(value)) * self._avg_font_w
                if width > self._resize_data[header]:
                    self._resize_data[header] = width

    def setHeaders(self, items):
        """
        This allows you to set your header item text
        :param items: a list of header text, ie ['Name', 'Email', 'Department']
        """
        lastCount = self.columnCount(QtCore.QModelIndex())
        self.headers = items
        self.beginRemoveColumns(QtCore.QModelIndex(), 0, lastCount)
        for x in range(lastCount):
            self.removeColumn(x)
        self.endRemoveColumns()
        self.beginInsertColumns(QtCore.QModelIndex(), 0, len(items) - 1)
        self.endInsertColumns()

    def addRow(self, data):
        """
        Accepts a dict of data to add to the data model.
        :param data: dict (this should match the same key length/names as the other data in the table.)
        """
        row = len(self.items)
        self.beginInsertRows(QtCore.QModelIndex(), row, row)
        self.items.append(data)
        self.endInsertRows()
        if self.auto_resize:
            self._autoSingleResizeData(data)
            self._doColumnResize()

    def addRows(self, data):
        """
        Accepts a list of dicts to add them all to the table, with each list index being a row, and each dict key
        a column.
        :param data: list of dicts
        :raise ValueError:
        """
        if not isinstance(data, list) or not isinstance(data[0], dict):
            raise ValueError('input must be a list of dicts!')
        start_row = len(self.items)
        end_row = len(data) + start_row - 1
        self.beginInsertRows(QtCore.QModelIndex(), start_row, end_row)
        self.items.extend(data)
        self.endInsertRows()
        if self.auto_resize:
            for item in data:
                self._autoSingleResizeData(item)
            self._doColumnResize()

    def removeRow(self, row):
        """
        Remove the row at index 'row'.
        :param row: int
        """
        self.beginRemoveRows(QtCore.QModelIndex(), row, row)
        self.items.pop(row)
        self.endRemoveRows()

    def clear(self):
        """
        Clear all table data and start fresh.
        """
        rows = self.rowCount(QtCore.QModelIndex())
        self.beginRemoveRows(QtCore.QModelIndex(), 0, rows)
        self.items = []
        self.endRemoveRows()
        cols = self.columnCount(QtCore.QModelIndex())
        self.beginRemoveColumns(QtCore.QModelIndex(), 0, cols)
        self.headers = []
        self.endRemoveColumns()

    def rowCount(self, QModelIndex):
        """
        Return the row count.
        :param QModelIndex:
        :return:
        """
        return len(self.items)

    def columnCount(self, QModelIndex):
        """
        Return the column count (default 1)
        :param QModelIndex:
        :return:
        """
        try:
            return len(self.items[0].keys())
        except:
            return 1

    def data(self, index, role):
        """
        Accepts a QModelIndex and a Qt.Role and returns the data at the given modelIndex.
        :param index: QModelIndex
        :param role: QtCore.Qt.<Role>
        :return:
        """
        row = index.row()
        col = index.column()
        if role == QtCore.Qt.DisplayRole:
            key_list = self._getKeyList()
            return QtCore.QVariant(str(self.items[row][key_list[col]]))
        return NullVariant()

    def intGetData(self, row, col):
        """
        Gets the data at 'row' and 'col'.
        :param row: int
        :param col: int
        :return: QVariant() data.
        """
        try:
            key_list = self._getKeyList()
            return QtCore.QVariant(str(self.items[row][key_list[col]]))
        except:
            return NullVariant()

    def headerData(self, section, orientation, role):
        """
        Sets the header data based on our header key list.
        :param section: section header
        :param orientation: orientation
        :param role: Qt<Role>
        :return:
        """
        if role == QtCore.Qt.DisplayRole:
            if orientation == QtCore.Qt.Horizontal:
                if not self.items:
                    if section == 0:
                        return QtCore.QVariant(str("Column 1"))
                else:
                    key_list = self._getKeyList()
                    try:
                        return QtCore.QVariant(str(key_list[section]))
                    except:
                        return QtCore.QVariant('No Data')
        return NullVariant()
class CustomSortModel(QtGui.QSortFilterProxyModel):
    def __init__(self, parent=None):
        """
        Custom QSortFilterProxyModel to allow sorting and filtering of our custom data model.
        :param parent: parent so that this model is deleted properly upon close.
        """
        super(CustomSortModel, self).__init__(parent)
        self.countAllColumns = False
        self._sortingColumn = 0

    def filterAcceptsRow(self, sourceRow, sourceParent):
        """
        Overriding how we choose what rows match our input filter text.
        :param sourceRow: row index in question
        :param sourceParent: QModelIndex
        :return: bool (accepted or not)
        """
        txt = ''
        if self.countAllColumns:
            for x in range(len(self.sourceModel().headers)):
                txt += self.sourceModel().intGetData(sourceRow, x).toString()
        else:
            txt = self.sourceModel().intGetData(sourceRow, self._sortingColumn).toString()
        if self.filterRegExp().pattern():
            b = bool(re.search(str(self.filterRegExp().pattern()), str(txt)))
        else:
            b = bool(re.search('.*', str(txt)))
        return b

    def setFilterKeyColumn(self, col):
        """
        Sets which column index you want the filter to apply to. -1 or less means we search all columns - otherwise,
        the filter rules apply to the column index given.
        :param col: signed int
        :return:
        """
        if col <= -1:
            self.countAllColumns = True
            return
        self.countAllColumns = False
        self._sortingColumn = col
        super(CustomSortModel, self).setFilterKeyColumn(col)
Edit:
I was getting a weird error when I tried to delete this question, but I have added a newer one, with a better cut-down example for testing here:
https://stackoverflow.com/questions/34074825/pyside-qtableview-not-displaying-text-like-pyqt-does
Running your code in PySide gives a bunch of errors:
AttributeError: 'module' object has no attribute 'QVariant'
That's because there is no QVariant in PySide any more. Replacing all QVariant usages with regular Python types fixes the code.
For example
return QtCore.QVariant('No Data')
becomes
return "No Data"
