Discrepancies in gensim doc2vec embedding vectors - gensim

I use gensim Doc2Vec package to train doc2vec embeddings. I would expect that two models trained with the identical parameters and data would have very close values of the doc2vec vectors. However, in my experience it is only true with doc2vec trained in the PV-DBOW without training word embedding (dbow_words = 0).
For PV-DM and for PV-DBOW with dbow_words = 1, i.e. in every case where the word embeddings are trained along with doc2vec, the doc2vec embedding vectors of identically trained models are fairly different.
Here is my code
from sklearn.datasets import fetch_20newsgroups
from gensim import models
import scipy.spatial.distance as distance
import numpy as np
from nltk.corpus import stopwords
from string import punctuation
def clean_text(texts, min_length = 2):
    """Normalize raw documents into space-joined strings of filtered tokens.

    Every punctuation character except the apostrophe is replaced by a
    space, the text is split on whitespace, and a token is kept only if it
    is purely alphabetic, is not an English stop word (after lower-casing)
    and has at least ``min_length`` characters.

    Args:
        texts: iterable of raw document strings.
        min_length: minimum number of characters a token needs to be kept.

    Returns:
        A list with one cleaned, space-joined string per input text.
    """
    # don't remove apostrophes: in the source table the apostrophe is swapped
    # for a space, so it maps space -> space and the apostrophe is untouched.
    # Tokens still containing one are dropped below, since isalpha() is False.
    translator = str.maketrans(punctuation.replace('\'', ' '), ' ' * len(punctuation))
    # Load the stop words once, as a set, instead of rebuilding a list (and
    # scanning it linearly for every token) on each document.
    stop_words = set(stopwords.words('english'))
    clean = []
    for text in texts:
        # remove non-alphabetic tokens and lower-case the rest
        tokens = [word.lower() for word in text.translate(translator).split() if word.isalpha()]
        # filter out stop words and short tokens
        tokens = [w for w in tokens if w not in stop_words and len(w) >= min_length]
        clean.append(' '.join(tokens))
    return clean
def tag_text(all_text, tag_type =''):
    """Wrap each text in a TaggedDocument tagged ``<tag_type>_<index>``.

    Args:
        all_text: iterable of whitespace-separated token strings.
        tag_type: prefix placed before the underscore in every tag.

    Returns:
        A list of ``models.doc2vec.TaggedDocument`` objects, one per text,
        each carrying the split words and a single-element tag list.
    """
    return [
        models.doc2vec.TaggedDocument(text.split(), [tag_type + '_' + str(i)])
        for i, text in enumerate(all_text)
    ]
def train_docvec(dm, dbow_words, min_count, epochs, training_data):
    """Build and train a Doc2Vec model on ``training_data``.

    Args:
        dm: passed to ``models.Doc2Vec`` as ``dm``.
        dbow_words: passed to ``models.Doc2Vec`` as ``dbow_words``.
        min_count: passed to ``models.Doc2Vec`` as ``min_count``.
        epochs: number of epochs handed to ``model.train``.
        training_data: sized collection of tagged documents; used both to
            build the vocabulary and to train.

    Returns:
        The trained model.
    """
    model = models.Doc2Vec(dm=dm, dbow_words = dbow_words, min_count = min_count)
    # Build the vocabulary from the argument, not from the module-level
    # ``tagged_data``, so the function works for any corpus it is given.
    model.build_vocab(training_data)
    model.train(training_data, total_examples=len(training_data), epochs=epochs)
    return model
def compare_vectors(vector1, vector2):
    """Print the median and standard deviation of index-wise cosine distances.

    For every index ``i`` in ``range(len(vector1))`` the cosine distance
    between ``vector1[i]`` and ``vector2[i]`` is computed; the median is
    printed first, then the standard deviation. Nothing is returned.
    """
    cos_distances = [
        distance.cosine(vector1[idx], vector2[idx])
        for idx in range(len(vector1))
    ]
    print (np.median(cos_distances))
    print (np.std(cos_distances))
# Fetch the 20 newsgroups data with shuffle=True and random_state=1, asking
# for headers, footers and quotes to be removed.
dataset = fetch_20newsgroups(shuffle=True, random_state=1,remove=('headers', 'footers', 'quotes'))
n_samples = len(dataset.data)
# Clean every document, then wrap each one as a TaggedDocument
# (tag_type is left at its default, the empty string).
data = clean_text(dataset.data)
tagged_data = tag_text(data)
data_labels = dataset.target
data_label_names = dataset.target_names
# train_docvec arguments are (dm, dbow_words, min_count, epochs, training_data).
# Each configuration is trained twice so identically configured runs can be compared.
# dm=0, dbow_words=0
model_dbow1 = train_docvec(0, 0, 4, 30, tagged_data)
model_dbow2 = train_docvec(0, 0, 4, 30, tagged_data)
# dm=0, dbow_words=1
model_dbow3 = train_docvec(0, 1, 4, 30, tagged_data)
model_dbow4 = train_docvec(0, 1, 4, 30, tagged_data)
# dm=1, dbow_words=0
model_dm1 = train_docvec(1, 0, 4, 30, tagged_data)
model_dm2 = train_docvec(1, 0, 4, 30, tagged_data)
compare_vectors(model_dbow1.docvecs, model_dbow2.docvecs)
> 0.07795828580856323
> 0.02610614028793008
compare_vectors(model_dbow1.docvecs, model_dbow3.docvecs)
> 0.6476179957389832
> 0.14797587172616306
compare_vectors(model_dbow3.docvecs, model_dbow4.docvecs)
> 0.19878000020980835
> 0.06362519480831186
compare_vectors(model_dm1.docvecs, model_dm2.docvecs)
> 0.13536489009857178
> 0.045365127475424386
compare_vectors(model_dbow1.docvecs, model_dm1.docvecs)
> 0.6358324736356735
> 0.15150255674571805
UPDATE
I tried, as suggested by gojomo, to compare the differences between the vectors, and, unfortunately, those are even worse:
def compare_vector_differences(vector1, vector2):
    """Print median and std of cosine distances between consecutive differences.

    For each collection the difference ``v[i+1] - v[i]`` is taken for every
    consecutive pair; the i-th difference of ``vector1`` is then compared with
    the i-th difference of ``vector2`` by cosine distance. The median of those
    distances is printed first, then their standard deviation.

    Args:
        vector1: indexable, sized collection of vectors supporting ``-``.
        vector2: second collection, with at least as many vectors as ``vector1``.

    Returns:
        None; the two statistics are printed.
    """
    diff1 = [vector1[i + 1] - vector1[i] for i in range(len(vector1) - 1)]
    # The original indexed into the still-empty list (``diff2[i].append``),
    # which raises IndexError on the first iteration; append to the list itself.
    diff2 = [vector2[i + 1] - vector2[i] for i in range(len(vector2) - 1)]
    cos_distances = [
        distance.cosine(diff1[i], diff2[i]) for i in range(len(diff1))
    ]
    print (np.median(cos_distances))
    print (np.std(cos_distances))
compare_vector_differences(model_dbow1.docvecs, model_dbow2.docvecs)
> 0.1134452223777771
> 0.02676398444178949
compare_vector_differences(model_dbow1.docvecs, model_dbow3.docvecs)
> 0.8464127033948898
> 0.11423789350773429
compare_vector_differences(model_dbow4.docvecs, model_dbow3.docvecs)
> 0.27400463819503784
> 0.05984108730423529
SECOND UPDATE
This time, after I finally understood gojomo, the things look fine.
def compare_distance_differences(vector1, vector2):
    """Print median and std of how much neighbour distances differ.

    Within each collection the cosine distance between every consecutive
    pair ``(v[i+1], v[i])`` is computed. The absolute difference between the
    i-th such distance of ``vector1`` and of ``vector2`` is collected; the
    median of those differences is printed first, then the standard deviation.
    """
    def _neighbour_distances(vectors):
        # Cosine distance between each vector and the one before it.
        return [
            distance.cosine(vectors[k + 1], vectors[k])
            for k in range(len(vectors) - 1)
        ]

    first = _neighbour_distances(vector1)
    second = _neighbour_distances(vector2)
    diff_distances = [abs(first[k] - second[k]) for k in range(len(first))]
    print (np.median(diff_distances))
    print (np.std(diff_distances))
compare_distance_differences(model_dbow1.docvecs, model_dbow2.docvecs)
>0.017469733953475952
>0.01659284710785352
compare_distance_differences(model_dbow1.docvecs, model_dbow3.docvecs)
>0.0786697268486023
>0.06092163158218411
compare_distance_differences(model_dbow3.docvecs, model_dbow4.docvecs)
>0.02321992814540863
>0.023095123172320778

The doc-vectors (or word-vectors) of Doc2Vec & Word2Vec models are only meaningfully comparable to other vectors that were co-trained, in the same interleaved training sessions.
Otherwise, randomness introduced by the algorithms (random-initialization & random-sampling) and by slight differences in training ordering (from multithreading) will cause the trained positions of individual vectors to wander to arbitrarily different positions. Their relative distances/directions, to other vectors that shared interleaved training, should be about as equally-useful from one model to the next.
But there's no one right place for such a vector, and measuring the differences between the vector for document '1' (or word 'foo') in one model, and the corresponding vector in another model, isn't reflective of anything the models/algorithms are trained to provide.
There's more information in the Gensim FAQ:
Q11: I've trained my Word2Vec/Doc2Vec/etc model repeatedly using the exact same text corpus, but the vectors are different each time. Is there a bug or have I made a mistake?

Related

Understanding the distance metric in company name matching using KNN

I am trying to understand the following code that I found for matching a messy list of company names to a clean list of company names. My question is how the 'Ratio' metric is calculated. It appears that the ratio comes from scorer = fuzz.token_sort_ratio, which I understand is part of the fuzzywuzzy package and is therefore a Levenshtein distance calculation, correct? I'm trying to understand why the author uses this as the scorer rather than the distance output from KNN. When I try changing the metric inside NearestNeighbors, it doesn't appear to change the results. Does the metric in NearestNeighbors matter then?
Original article:
https://audhiaprilliant.medium.com/fuzzy-string-matching-optimization-using-tf-idf-and-knn-b07fce69b58f
def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 4),
    n_neighbors: int = 1,
    **kwargs
) -> Tuple:
    """Fit a TF-IDF vectorizer and a cosine nearest-neighbour index on ``clean``.

    Args:
        clean: series of reference strings; its values are cast to unicode.
        analyzer: forwarded to ``TfidfVectorizer``.
        ngram_range: forwarded to ``TfidfVectorizer``.
        n_neighbors: default neighbour count for the ``NearestNeighbors`` index.
        **kwargs: extra keyword arguments forwarded to ``TfidfVectorizer``.

    Returns:
        ``(vectorizer, nbrs)``: the fitted vectorizer and the fitted index.
    """
    vectorizer = TfidfVectorizer(analyzer = analyzer, ngram_range = ngram_range, **kwargs)
    corpus = clean.values.astype('U')
    tfidf_matrix = vectorizer.fit_transform(corpus)
    # Index the TF-IDF rows of the clean corpus using cosine distance.
    neighbor_index = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine')
    neighbor_index.fit(tfidf_matrix)
    return vectorizer, neighbor_index
# String matching - KNN
def tfidf_nn(
    messy,
    clean,
    n_neighbors = 1,
    **kwargs
):
    """Find, for each messy string, its nearest clean strings in TF-IDF space.

    The vectorizer and neighbour index are fitted on ``clean`` via
    ``build_vectorizer`` (``**kwargs`` are forwarded to it); ``messy`` is only
    transformed.

    Returns:
        ``(nearest_values, distances)``: ``np.array(clean)`` indexed by the
        neighbour indices, and the distances returned by ``kneighbors``.
    """
    vectorizer, neighbor_index = build_vectorizer(clean, n_neighbors = n_neighbors, **kwargs)
    query_vectors = vectorizer.transform(messy)
    distances, indices = neighbor_index.kneighbors(query_vectors, n_neighbors = n_neighbors)
    return np.array(clean)[indices], distances
# String matching - match fuzzy
def find_matches_fuzzy(
    row,
    match_candidates,
    limit = 5
):
    """Score ``row`` against candidates with ``fuzz.token_sort_ratio``.

    The candidates are handed to ``process.extract`` as an index -> candidate
    dict, asking for at most ``limit`` matches.

    Returns:
        A list of ``(row, match[0], match[1])`` tuples, one per match returned
        by ``process.extract``.
    """
    indexed_candidates = dict(enumerate(match_candidates))
    row_matches = process.extract(
        row,
        indexed_candidates,
        scorer = fuzz.token_sort_ratio,
        limit = limit
    )
    result = []
    for match in row_matches:
        result.append((row, match[0], match[1]))
    return result
# String matching - TF-IDF
def fuzzy_nn_match(
    messy,
    clean,
    column,
    col,
    n_neighbors = 100,
    limit = 5, **kwargs):
    """Shortlist candidates with TF-IDF neighbours, then score them fuzzily.

    ``tfidf_nn`` supplies ``n_neighbors`` clean candidates per messy entry
    (its distances are discarded); ``find_matches_fuzzy`` then keeps up to
    ``limit`` matches per entry.

    Returns:
        A DataFrame with columns ``[column, col, 'Ratio']`` holding the messy
        value, the matched clean value and the fuzzy score.
    """
    nearest_values, _ = tfidf_nn(messy, clean, n_neighbors, **kwargs)
    results = []
    for i, row in enumerate(messy):
        results.append(find_matches_fuzzy(row, nearest_values[i], limit))
    flattened = itertools.chain.from_iterable(results)
    return pd.DataFrame(flattened, columns = [column, col, 'Ratio'])
# String matching - Fuzzy
def fuzzy_tf_idf(
    df: pd.DataFrame,
    column: str,
    clean: pd.Series,
    mapping_df: pd.DataFrame,
    col: str,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 3)
) -> pd.DataFrame:
    """Match the distinct values of ``df[column]`` against ``clean``.

    Both sides are de-duplicated; the messy values additionally drop NaN,
    are cast to ``str`` and passed through ``preprocess_string``. Matching
    is delegated to ``fuzzy_nn_match`` with ``n_neighbors = 1``.

    Args:
        df: frame holding the messy values.
        column: name of the messy column in ``df``; also the name of the
            first output column.
        clean: series of reference values.
        mapping_df: currently unused.
        col: name of the output column holding the matched clean value.
        analyzer: currently unused (not forwarded to the vectorizer).
        ngram_range: currently unused (not forwarded to the vectorizer).

    Returns:
        The DataFrame produced by ``fuzzy_nn_match`` (columns
        ``[column, col, 'Ratio']``). The annotation previously said
        ``pd.Series``, which did not match what is returned.
    """
    # NOTE(review): analyzer / ngram_range / mapping_df are accepted but never
    # used, so build_vectorizer's own defaults apply. Forwarding them would
    # change the default results - confirm intent before wiring them through.
    clean = clean.drop_duplicates().reset_index(drop = True)
    messy_prep = df[column].drop_duplicates().dropna().reset_index(drop = True).astype(str)
    messy = messy_prep.apply(preprocess_string)
    result = fuzzy_nn_match(messy = messy, clean = clean, column = column, col = col, n_neighbors = 1)
    return result

How can I get the score from Question-Answer Pipeline? Is there a bug when Question-answer pipeline is used?

When I run the following code
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
# Load the SQuAD-finetuned BERT tokenizer and question-answering model.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
text = r"""
As checked Dis is not yet on boarded to ARB portal, hence we cannot upload the invoices in portal
"""
questions = [
    "Dis asked if it is possible to post the two invoice in ARB.I have not access so I wanted to check if you would be able to do it.",
]
for question in questions:
    # Encode question and context together as one sequence pair.
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # Index the output instead of tuple-unpacking it: a plain tuple and a
    # dict-like ModelOutput both support integer indexing, whereas unpacking
    # a ModelOutput iterates over its keys rather than the logits.
    outputs = model(**inputs)
    answer_start_scores, answer_end_scores = outputs[0], outputs[1]
    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")
The answer that I get here is:
Question: Dis asked if it is possible to post the two invoice in ARB.I have not access so I wanted to check if you would be able to do it.
Answer: dis is not yet on boarded to ARB portal
How do I get a score for this answer? The score I mean is similar to the one I get when I run the Question-Answer pipeline.
I have to take this approach since Question-Answer pipeline when used is giving me Key Error for the below code
from transformers import pipeline
# Create a question-answering pipeline; no model name is passed.
nlp = pipeline("question-answering")
# Raw triple-quoted context, including its leading and trailing newlines.
context = r"""
As checked Dis is not yet on boarded to ARB portal, hence we cannot upload the invoices in portal.
"""
# Run the pipeline on one question/context pair and print whatever it returns.
print(nlp(question="Dis asked if it is possible to post the two invoice in ARB?", context=context))
This is my attempt to get the score. I cannot figure out what feature.p_mask is, so at the moment I cannot remove the non-context indexes that contribute to the softmax.
# ... assuming imports and question and context
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
inputs = tokenizer(question, context,
                   add_special_tokens=True,
                   return_tensors='pt')
input_ids = inputs['input_ids'].tolist()[0]
outputs = model(**inputs)
# used to compute score; both arrays have a leading batch axis of size 1
start = outputs.start_logits.detach().numpy()
end = outputs.end_logits.detach().numpy()
# Ensure padded tokens, special tokens & question tokens cannot belong to the
# set of candidate answers. sequence_ids() labels each token with None
# (special token), 0 (first sequence = question) or 1 (second sequence =
# context); it is only available on fast tokenizers.
# NOTE(review): this stands in for the pipeline's feature.p_mask; the
# pipeline may treat some special tokens differently - confirm if exact
# score parity with the pipeline is required.
sequence_ids = inputs.sequence_ids(0)
context_tokens = np.array([sid == 1 for sid in sequence_ids])
attended_tokens = inputs['attention_mask'].numpy()[0].astype(bool)
# Generate mask: True where a token must NOT be part of an answer
undesired_tokens_mask = ~(context_tokens & attended_tokens)
# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start)
end_ = np.where(undesired_tokens_mask, -10000.0, end)
# Normalize logits and spans to retrieve the answer (softmax over the token axis)
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))
# Compute the score of each tuple(start, end) to be the real answer
outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))
# Remove candidate with end < start and end - start > max_answer_len
max_answer_len = 15
candidates = np.tril(np.triu(outer), max_answer_len - 1)
scores_flat = candidates.flatten()
idx_sort = [np.argmax(scores_flat)]
# Drop the batch coordinate; keep the (start, end) token coordinates.
start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
end += 1  # make the end index exclusive for slicing
score = candidates[0, start, end-1]
start, end, score = start.item(), end.item(), score.item()
print(tokenizer.decode(input_ids[start:end]))
print(score)
See more source code

Get position of token in berts output layer

We are interested in the bert vectors for each token. With bert vector we mean the word vector for a specific token in berts output layer. So we would like to find out which token produces which bert vector. We wrote some code but we are not sure if it is correct or how to test it.
So in the code we process a sentence with bert. We construct a list of position ids and hand them to the model. Afterwards we use the same position ids to map the tokens to the output layer. Then there is some code that calculates the character offsets of each vector in the input sentence.
Is this the correct way how to use position_ids to generate
from transformers import BertModel, BertConfig, BertTokenizer
import torch
# Module-level tokenizer and model for 'bert-base-uncased'; both are read as
# globals by sentence_to_vector below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def sentence_to_vector(input_sentence):
    """Return one record per BERT token with its output vector and offsets.

    The sentence is encoded with the module-level ``tokenizer`` (special
    tokens included) and run through the module-level ``model`` as a batch
    of one. Row ``i`` of the last hidden state belongs to token ``i`` of the
    encoded sequence.

    Args:
        input_sentence: the raw sentence string.

    Returns:
        A list of dicts with keys ``"token"``, ``"pos_begin"``, ``"pos_end"``
        and ``"vector"``. The offsets index into ``input_sentence`` and are
        ``None`` for ``[CLS]``/``[SEP]`` and for any token whose text cannot
        be located in the sentence.
    """
    tokens_encoded = tokenizer.encode(input_sentence, add_special_tokens=True)
    input_ids = torch.tensor(tokens_encoded).unsqueeze(0)  # Batch size 1
    seq_length = input_ids.size(1)
    # code to construct position_ids from here:
    # https://github.com/huggingface/transformers/blob/8da280ebbeca5ebd7561fd05af78c65df9161f92/pytorch_pretrained_bert/modeling.py#L188:L189
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    outputs = model(input_ids, position_ids=position_ids)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # from the BertModel documentation (example at the bottom):
    # The last hidden-state is the first element of the output tuple
    # https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
    last_hidden_state = outputs[0][0]
    # Tokens may have been lower-cased by the tokenizer, so search
    # case-insensitively.
    # NOTE(review): assumes lower-casing does not change string length.
    haystack = input_sentence.lower()
    data = []
    last_offset = 0
    # position_ids is just 0..seq_length-1, so indexing through it was the
    # same as indexing with i directly.
    for i, token in enumerate(tokens):
        pos_begin = None
        pos_end = None
        if token not in ("[CLS]", "[SEP]"):
            # Strip the "##" continuation prefix: it is not in the sentence.
            piece = token[2:] if token.startswith("##") else token
            found = haystack.find(piece.lower(), last_offset)
            # find() returns -1 when the token text is absent (e.g. an
            # unknown-token placeholder); leave the offsets as None then and
            # do not move the search cursor.
            if found != -1:
                pos_begin = found
                pos_end = found + len(piece)
                last_offset = pos_end
        data.append({
            "token": token,
            "pos_begin": pos_begin,
            "pos_end": pos_end,
            "vector": last_hidden_state[i]
        })
    return data
# Demo: print token, begin offset, end offset and the first three vector
# components, tab-separated, one line per token.
input_sentence = "do the chicken dance!"
data = sentence_to_vector(input_sentence)
for token in data:
    fields = [
        token["token"],
        str(token["pos_begin"]),
        str(token["pos_end"]),
        str(token["vector"][0:3]),
    ]
    print("\t".join(fields) + "...")

How to do parallel processing in pytorch

I am working on a deep learning problem. I am solving it using pytorch. I have two GPU's which are on the same machine (16273MiB,12193MiB). I want to use both the GPU's for my training (video dataset).
I get a warning:
There is an imbalance between your GPUs. You may want to exclude GPU 1 which
has less than 75% of the memory or cores of GPU 0. You can do so by setting
the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
environment variable.
warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
I also get an error:
raise TypeError('Broadcast function not implemented for CPU tensors')
TypeError: Broadcast function not implemented for CPU tensors
# Training entry point. NOTE(review): the indentation of this snippet was lost,
# so the comments below describe statements in order without asserting nesting.
if __name__ == '__main__':
# Build the list of scales: start at initial_scale and multiply by scale_step
# until there are n_scales entries.
opt.scales = [opt.initial_scale]
for i in range(1, opt.n_scales):
opt.scales.append(opt.scales[-1] * opt.scale_step)
opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
opt.mean = get_mean(opt.norm_value)
opt.std = get_std(opt.norm_value)
print("opt",opt)
# Persist the options as JSON under result_path.
with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
json.dump(vars(opt), opt_file)
torch.manual_seed(opt.manual_seed)
model, parameters = generate_model(opt)
#print(model)
# Count only parameters that require gradients.
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of trainable parameters: ", pytorch_total_params)
# Define Class weights
if opt.weighted:
print("Weighted Loss is created")
if opt.n_finetune_classes == 2:
weight = torch.tensor([1.0, 3.0])
else:
weight = torch.ones(opt.n_finetune_classes)
else:
weight = None
# NOTE(review): `weight` is computed above but not passed to the loss here,
# so the criterion is unweighted even when opt.weighted is set.
criterion = nn.CrossEntropyLoss()
if not opt.no_cuda:
criterion = nn.DataParallel(criterion.cuda())
# Choose input normalization from the no_mean_norm / std_norm flags.
if opt.no_mean_norm and not opt.std_norm:
norm_method = Normalize([0, 0, 0], [1, 1, 1])
elif not opt.std_norm:
norm_method = Normalize(opt.mean, [1, 1, 1])
else:
norm_method = Normalize(opt.mean, opt.std)
# NOTE(review): `training_data` is not defined in this snippet; presumably it
# is created in code that was omitted.
train_loader = torch.utils.data.DataLoader(
training_data,
batch_size=opt.batch_size,
shuffle=True,
num_workers=opt.n_threads,
pin_memory=True)
# Per-epoch and per-batch loggers writing under result_path.
train_logger = Logger(
os.path.join(opt.result_path, 'train.log'),
['epoch', 'loss', 'acc', 'precision','recall','lr'])
train_batch_logger = Logger(
os.path.join(opt.result_path, 'train_batch.log'),
['epoch', 'batch', 'iter', 'loss', 'acc', 'precision', 'recall', 'lr'])
# Dampening is forced to 0 when Nesterov momentum is requested.
if opt.nesterov:
dampening = 0
else:
dampening = opt.dampening
optimizer = optim.SGD(
parameters,
lr=opt.learning_rate,
momentum=opt.momentum,
dampening=dampening,
weight_decay=opt.weight_decay,
nesterov=opt.nesterov)
# scheduler = lr_scheduler.ReduceLROnPlateau(
# optimizer, 'min', patience=opt.lr_patience)
# Validation-time transform; it is not used further in this snippet.
if not opt.no_val:
spatial_transform = Compose([
Scale(opt.sample_size),
CenterCrop(opt.sample_size),
ToTensor(opt.norm_value), norm_method
])
print('run')
# Epoch loop from begin_epoch to n_epochs inclusive.
for i in range(opt.begin_epoch, opt.n_epochs + 1):
if not opt.no_train:
adjust_learning_rate(optimizer, i, opt.lr_steps)
train_epoch(i, train_loader, model, criterion, optimizer, opt,
train_logger, train_batch_logger)
I have also made changes in my train file:
# NOTE(review): `model()` calls `model` with no arguments before wrapping the
# result, and .cuda() is applied to the DataParallel wrapper afterwards.
model = nn.DataParallel(model(),device_ids=[0,1]).cuda()
# Forward pass through the wrapped model.
outputs = model(inputs)
It does not seem to work properly and gives an error. Please advise; I am new to PyTorch.
Thanks
As mentioned in this link, you have to do model.cuda() before passing it to nn.DataParallel.
net = nn.DataParallel(model.cuda(), device_ids=[0,1])
https://github.com/pytorch/pytorch/issues/17065

What is the difference between gensim LabeledSentence and TaggedDocument

Please help me in understanding the difference between how TaggedDocument and LabeledSentence of gensim works. My ultimate goal is Text Classification using Doc2Vec model and any classifier. I am following this blog!
class MyLabeledSentences(object):
    """Read every line of every file in a directory into LabeledSentence objects."""

    def __init__(self, dirname, dataDct=None, sentList=None):
        """Store the directory name and start with empty containers.

        ``dataDct`` and ``sentList`` are accepted for backward compatibility
        but, as before, ignored. Their defaults are now ``None`` so that no
        mutable default object is shared between calls.
        """
        self.dirname = dirname
        self.dataDct = {}
        self.sentList = []

    def ToArray(self):
        """Append one LabeledSentence per line of each file and return the list.

        Each sentence is tagged ``<file name up to the first '.'>_<line index>``.
        Results accumulate in ``self.sentList``, so calling this twice appends
        every sentence again.

        Returns:
            ``self.sentList``. The original returned the undefined name
            ``sentList``, which raised NameError.
        """
        # Load the stop words once as a set instead of once per word.
        english_stop_words = set(stopwords.words('english'))
        for fname in os.listdir(self.dirname):
            tag_prefix = fname.split('.')[0].strip()
            with open(os.path.join(self.dirname, fname)) as fin:
                for item_no, sentence in enumerate(fin):
                    # NOTE(review): this keeps ONLY stop words (`in`, not
                    # `not in`); behavior is preserved, but the condition
                    # looks inverted - confirm the intent.
                    words = [w for w in sentence.lower().split() if w in english_stop_words]
                    self.sentList.append(LabeledSentence(words, [tag_prefix + '_%s' % item_no]))
        return self.sentList
class MyTaggedDocument(object):
    """Read every line of every file in a directory into TaggedDocument objects."""

    def __init__(self, dirname, dataDct=None, sentList=None):
        """Store the directory name and start with empty containers.

        ``dataDct`` and ``sentList`` are accepted for backward compatibility
        but, as before, ignored. Their defaults are now ``None`` so that no
        mutable default object is shared between calls.
        """
        self.dirname = dirname
        self.dataDct = {}
        self.sentList = []

    def ToArray(self):
        """Append one TaggedDocument per line of each file and return the list.

        Each document is tagged ``<file name up to the first '.'>_<line index>``.
        Results accumulate in ``self.sentList``, so calling this twice appends
        every document again.

        Returns:
            ``self.sentList``. The original returned the undefined name
            ``sentList``, which raised NameError.
        """
        # Load the stop words once as a set instead of once per word.
        english_stop_words = set(stopwords.words('english'))
        for fname in os.listdir(self.dirname):
            tag_prefix = fname.split('.')[0].strip()
            with open(os.path.join(self.dirname, fname)) as fin:
                for item_no, sentence in enumerate(fin):
                    # NOTE(review): this keeps ONLY stop words (`in`, not
                    # `not in`); behavior is preserved, but the condition
                    # looks inverted - confirm the intent.
                    words = [w for w in sentence.lower().split() if w in english_stop_words]
                    self.sentList.append(TaggedDocument(words, [tag_prefix + '_%s' % item_no]))
        return self.sentList
# Train a Doc2Vec model on LabeledSentence inputs.
sentences = MyLabeledSentences(some_dir_name)
model_l = Doc2Vec(min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=7)
sentences_l = sentences.ToArray()
model_l.build_vocab(sentences_l )
for epoch in range(15):
    random.shuffle(sentences_l )
    # Train and decay the model that was just built; the original referred
    # to an undefined name `model` here.
    model_l.train(sentences_l )
    model_l.alpha -= 0.002 # decrease the learning rate
    model_l.min_alpha = model_l.alpha
# Train a Doc2Vec model on TaggedDocument inputs.
sentences = MyTaggedDocument(some_dir_name)
model_t = Doc2Vec(min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=7)
sentences_t = sentences.ToArray()
# Build the vocabulary on model_t itself; the original built it on model_l
# and then trained an undefined `model`, so model_t was never trained.
model_t.build_vocab(sentences_t)
for epoch in range(15):
    random.shuffle(sentences_t)
    model_t.train(sentences_t)
    model_t.alpha -= 0.002 # decrease the learning rate
    model_t.min_alpha = model_t.alpha
My question is model_l.docvecs['some_word'] is same as model_t.docvecs['some_word']?
Can you provide me weblink of good sources to get a grasp on how TaggedDocument or LabeledSentence works.
LabeledSentence is an older, deprecated name for the same simple object-type to encapsulate a text-example that is now called TaggedDocument. Any objects that have words and tags properties, each a list, will do. (words is always a list of strings; tags can be a mix of integers and strings, but in the common and most-efficient case, is just a list with a single id integer, starting at 0.)
model_l and model_t will serve the same purposes, having trained on the same data with the same parameters, using just different names for the objects. But the vectors they'll return for individual word-tokens (model['some_word']) or document-tags (model.docvecs['somefilename_NN']) will likely be different – there's randomness in Word2Vec/Doc2Vec initialization and training-sampling, and introduced by ordering-jitter from multithreaded training.

Resources