Understanding the distance metric in company name matching using KNN

I am trying to understand the following code, which I found for matching a messy list of company names against a clean list of company names. My question is how the 'Ratio' metric is calculated. It appears that the ratio comes from scorer = fuzz.token_sort_ratio, which I understand is part of the fuzzywuzzy package and is therefore a Levenshtein distance calculation, correct? I'm also trying to understand why the author uses this as the scorer rather than the distance output from KNN. When I try changing the metric inside NearestNeighbors, it doesn't appear to change the results. Does the metric in NearestNeighbors matter, then?
Original article:
https://audhiaprilliant.medium.com/fuzzy-string-matching-optimization-using-tf-idf-and-knn-b07fce69b58f
def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 4),
    n_neighbors: int = 1,
    **kwargs
) -> Tuple:
    # Create vectorizer
    vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, **kwargs)
    X = vectorizer.fit_transform(clean.values.astype('U'))
    # Fit nearest neighbors corpus
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(X)
    return vectorizer, nbrs

# String matching - KNN
def tfidf_nn(
    messy,
    clean,
    n_neighbors=1,
    **kwargs
):
    # Fit clean data and transform messy data
    vectorizer, nbrs = build_vectorizer(clean, n_neighbors=n_neighbors, **kwargs)
    input_vec = vectorizer.transform(messy)
    # Determine best possible matches
    distances, indices = nbrs.kneighbors(input_vec, n_neighbors=n_neighbors)
    nearest_values = np.array(clean)[indices]
    return nearest_values, distances

# String matching - match fuzzy
def find_matches_fuzzy(
    row,
    match_candidates,
    limit=5
):
    row_matches = process.extract(
        row, dict(enumerate(match_candidates)),
        scorer=fuzz.token_sort_ratio,
        limit=limit
    )
    result = [(row, match[0], match[1]) for match in row_matches]
    return result

# String matching - TF-IDF
def fuzzy_nn_match(
    messy,
    clean,
    column,
    col,
    n_neighbors=100,
    limit=5,
    **kwargs
):
    nearest_values, _ = tfidf_nn(messy, clean, n_neighbors, **kwargs)
    results = [find_matches_fuzzy(row, nearest_values[i], limit) for i, row in enumerate(messy)]
    df = pd.DataFrame(itertools.chain.from_iterable(results),
                      columns=[column, col, 'Ratio'])
    return df

# String matching - Fuzzy
def fuzzy_tf_idf(
    df: pd.DataFrame,
    column: str,
    clean: pd.Series,
    mapping_df: pd.DataFrame,
    col: str,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 3)
) -> pd.Series:
    # Create vectorizer
    clean = clean.drop_duplicates().reset_index(drop=True)
    messy_prep = df[column].drop_duplicates().dropna().reset_index(drop=True).astype(str)
    messy = messy_prep.apply(preprocess_string)
    result = fuzzy_nn_match(messy=messy, clean=clean, column=column, col=col, n_neighbors=1)
    # Map value from messy to clean
    return result
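For intuition about the two different scores involved, here is a small, self-contained sketch (my own toy example, not from the article) that prints both the cosine distance the NearestNeighbors step works with and the token_sort_ratio used by the fuzzywuzzy re-ranking step, for a made-up pair of names:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz

# Toy data, purely for illustration
clean = pd.Series(['International Business Machines', 'Apple Inc'])
messy = ['IBM International Business Machines Corp']

vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4))
X = vectorizer.fit_transform(clean)
nbrs = NearestNeighbors(n_neighbors=1, metric='cosine').fit(X)

distances, indices = nbrs.kneighbors(vectorizer.transform(messy))
best = clean.iloc[indices[0][0]]
print('cosine distance from KNN:', distances[0][0])                 # 0.0 would mean identical TF-IDF vectors
print('token_sort_ratio:', fuzz.token_sort_ratio(messy[0], best))   # 100 would mean identical after sorting tokens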

Related

How to update training dataset at epoch begin in Huggingface Trainer using Callback?

I want to recreate the training dataset with a function generate_custom_train_set at the beginning of every epoch. Is there a way I could do this with Trainer using a callback?
My trainer looks like this:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
I have the same question as I try to implement examples-proportional mixing from the T5 paper. I didn't find built-in support for it in Hugging Face.
My current solution is to modify the trainer.train_dataset in the on_epoch_begin callback.
Here's an implementation. I'm using this in my own project. Seems to work.
First, implement your per-epoch change in your Dataset; in my case, it's the sample function for examples-proportional mixing.
class ProportionMixingDataset:
    """
    Examples-proportional mixing from T5
    TODO: failed to find a pytorch working implementation

    Equivalent to, for the larger datasets, a new subset is taken at each epoch,
    then sample in the joined subset once
    """
    def __init__(self, dataset_list: List[Dataset] = None, k: int = None):
        """
        :param dataset_list: Ordered list of datasets
        :param k: Artificial limit
        """
        self.dsets = dataset_list
        assert k is not None
        self.k = k
        self.dset_szs = [min(len(d), k) for d in self.dsets]
        self.sz = sum(self.dset_szs)

        self._sampled_idxs: List[Optional[torch.Tensor]] = [None] * len(self.dsets)
        self.sample()

    def sample(self):
        """
        Sub-sample datasets larger than k

        Intended to call in each epoch
        """
        for i, dset in enumerate(self.dsets):
            sz = len(dset)
            if sz > self.k:
                self._sampled_idxs[i] = torch.randperm(sz)[:self.k]

    def __len__(self):
        return self.sz

    def _idx2dset_idx(self, idx: int) -> Tuple[int, int]:
        """
        Convert a global index to a dataset index
        """
        for i, sz in enumerate(self.dset_szs):
            if idx < sz:
                return i, idx
            idx -= sz
        raise ValueError('Should not happen')

    def __getitem__(self, idx):
        if not isinstance(idx, int):
            raise ValueError('Batched indexing not supported')
        idx_dset, idx = self._idx2dset_idx(idx)
        dset = self.dsets[idx_dset]
        if self._sampled_idxs[idx_dset] is not None:  # A sub-sample index
            idx = self._sampled_idxs[idx_dset][idx].item()
        return dset[idx]
Then pass that dataset to Trainer.
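For instance (a sketch with placeholder names, not from the original post), the mixed dataset can be built from your own tokenized datasets and handed to the Trainer as its train_dataset:

# ds_a and ds_b are placeholders for your own tokenized datasets; k is an illustrative limit
mixed_train = ProportionMixingDataset(dataset_list=[ds_a, ds_b], k=10000)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=mixed_train,  # Trainer only needs __len__ and __getitem__ from it
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)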
Now comes the magic part:
class ProportionalMixCallback(TrainerCallback):
    """
    Trigger re-computing the subsets for examples-proportional mixing, see `dataset::ProportionMixingDataset`

    A hack that modifies the train dataset pointed to by Trainer's dataloader
    """
    def __init__(self, trainer: Trainer):
        self.trainer = trainer

    def on_epoch_begin(self, args: TrainingArguments, state, control, **kwargs):
        self.trainer.train_dataset.sample()
Pass this to your trainer as a callback.
This triggers the sample call which modifies the dataset at the times we need it.
This works because the train dataloader in the trainer still points to the same train dataset object.
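As a minimal sketch of the wiring (reusing the placeholder mixed_train and trainer from the snippet above): because the callback needs a reference to the trainer itself, it is easiest to register it after the trainer is constructed, e.g. with add_callback:

trainer.add_callback(ProportionalMixCallback(trainer))  # re-samples the oversized datasets at every epoch start
trainer.train()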

How to test a model before fine-tuning in Pytorch Lightning?

I am running this on Google Colab.
transformers: 4.10.2
pytorch-lightning: 1.2.7
import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl

dataset_for_loader = [
    {'data': torch.tensor([0, 1]), 'labels': torch.tensor(0)},
    {'data': torch.tensor([2, 3]), 'labels': torch.tensor(1)},
    {'data': torch.tensor([4, 5]), 'labels': torch.tensor(2)},
    {'data': torch.tensor([6, 7]), 'labels': torch.tensor(3)},
]
loader = DataLoader(dataset_for_loader, batch_size=2)

for idx, batch in enumerate(loader):
    print(f'# batch {idx}')
    print(batch)
category_list = [
    'dokujo-tsushin',
    'it-life-hack',
    'kaden-channel',
    'livedoor-homme',
    'movie-enter',
    'peachy',
    'smax',
    'sports-watch',
    'topic-news',
]
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
max_length = 128
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    # the ./text directory has lots of articles, categorized by category;
    # they are plain text files whose content begins on the fourth line
    for file in glob.glob(f'./text/{category}/{category}*'):
        lines = open(file).read().splitlines()
        text = '\n'.join(lines[3:])
        encoding = tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        encoding['labels'] = label
        encoding = {k: torch.tensor(v) for k, v in encoding.items()}
        dataset_for_loader.append(encoding)
SEED = lambda: 0.0
# random.shuffle(dataset_for_loader)  # shuffle randomly
random.shuffle(dataset_for_loader, SEED)
n = len(dataset_for_loader)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_train + n_val]
dataset_test = dataset_for_loader[n_train + n_val:]

dataloader_train = DataLoader(
    dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)
class BertForSequenceClassification_pl(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        labels = batch.pop('labels')
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)
trainer = pl.Trainer(
    gpus=1,
    max_epochs=10,
    callbacks=[checkpoint]
)
model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=9, lr=1e-5
)
### (a) ###
# I think this is where I am doing fine-tuning
trainer.fit(model, dataloader_train, dataloader_val)
# this is to score after fine-tuning
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
But I am not really sure how to run a test before fine-tuning, so that I can compare the model before and after fine-tuning and show how effective fine-tuning is.
Inserting the following two lines to ### (a) ###:
test = trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
I got this result:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-13-c8b2c67f2d5c> in <module>()
9
10 # 6-19
---> 11 test = trainer.test(test_dataloaders=dataloader_test)
12 print(f'Accuracy: {test[0]["accuracy"]:.2f}')
13
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in test(self, model, test_dataloaders, ckpt_path, verbose, datamodule)
896 self.verbose_test = verbose
897
--> 898 self._set_running_stage(RunningStage.TESTING, model or self.lightning_module)
899
900 # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _set_running_stage(self, stage, model_ref)
563 the trainer and the model
564 """
--> 565 model_ref.running_stage = stage
566 self._running_stage = stage
567
AttributeError: 'NoneType' object has no attribute 'running_stage'
I noticed that Trainer.fit() can take None for all arguments other than model, so I tried this:
trainer.fit(model)
test=trainer.test(test_dataloaders=dataloader_test)
print(f'Accuracy: {test[0]["accuracy"]:.2f}')
The result:
MisconfigurationException: No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined.
Thanks.
The Trainer needs its .fit() to be called first in order to set up a lot of things; only then can you call .test() or other methods.
You are right about putting a .fit() just before .test(), but the fit call needs to be a valid one: you have to feed a dataloader/datamodule to it. Since you don't want to do any training/validation in this fit call, just pass limit_[train/val]_batches=0 when constructing the Trainer.
trainer = Trainer(gpus=..., ..., limit_train_batches=0, limit_val_batches=0)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # without fine-tuning
The fit call here will just set things up for you and skip training/validation, and then the testing follows. Next time, run the same code but without the limit_[train/val]_batches arguments; this will do the actual fine-tuning for you.
trainer = Trainer(gpus=..., ...)
trainer.fit(model, dataloader_train, dataloader_val)
trainer.test(model, dataloader_test) # with fine-tuning
Clarifying a bit about .fit() taking None for all arguments but model: it's not quite true; you must provide either a DataLoader or a DataModule.

How to add symbols to the data for multiple stocks

I have scraped data; below is my code. Now I want to add a column of symbols to the respective company data. Please guide me on how the symbol can be added to each firm's data.
Code below:
from time import sleep
import pandas as pd
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

browser = webdriver.Chrome(ChromeDriverManager().install())

symbols = ['FATIMA', 'SSGC', 'FCCL', 'ISL', 'KEL', 'NCL', 'DGKC', 'SNGP', 'NML', 'ENGRO',
           'HUMNL', 'CHCC', 'ATRL', 'HUBC', 'ASTL', 'PIBTL', 'OGDC', 'EFERT', 'FFC', 'NCPL',
           'KTML', 'PSO', 'LUCK', 'SEARL', 'KOHC', 'ABOT', 'AICL', 'HASCOL', 'PTC', 'KAPCO',
           'PIOC', 'POL', 'SHEL', 'GHGL', 'HCAR', 'DCR', 'BWCL', 'MTL', 'GLAXO', 'PKGS',
           'SHFA', 'MARI', 'ICI', 'ACPL', 'PSMC', 'SPWL', 'THALL', 'BNWM', 'EFUG', 'GADT',
           'AABS']

company = 1
for ThisSymbol in symbols:
    # Get first symbol from the above python list
    company = 2
    # In the URL, make the symbol a variable
    url = 'http://www.scstrade.com/stockscreening/SS_CompanySnapShotYF.aspx?symbol=' + ThisSymbol
    browser.get(url)
    sleep(2)
    # The command below gets all the contents from the url
    html = browser.execute_script("return document.documentElement.outerHTML")
    # Supply the contents to Beautiful Soup and tell it to treat the text as HTML
    soup = BeautifulSoup(html, "html.parser")
    for rn in range(0, 9):
        plist = []
        r = soup.find_all('tr')[rn]
        # Condition: if first row, then th, otherwise td
        if rn == 0:
            celltag = 'th'
        else:
            celltag = 'td'
        # Now use the celltag instead of using a fixed td or th
        col = r.find_all(celltag)
        print()
        if col[i] == 0:
            print("")
        else:
            for i in range(0, 4):
                cell = col[i].text
                clean = cell.replace('\xa0 ', '')
                clean = clean.replace(' ', '')
                plist.append(clean)
        # If first row, create df, otherwise add to it
        if rn == 0:
            df = pd.DataFrame(plist)
        else:
            df2 = pd.DataFrame(plist)
            colname = 'y' + str(2019 - rn)
            df[colname] = df2
    if company == 1:
        dft = df.T
        # Get header column
        head = dft.iloc[0]
        # Exclude first row from the data
        dft = dft[1:]
        dft.columns = head
        dft = dft.reset_index()
        # Assign headers
        dft = dft.drop(['index'], axis='columns')
    else:
        dft2 = df.T
        # Get header column
        head = dft2.iloc[0]
        # Exclude first row from the data
        dft2 = dft2[1:]
        dft2.columns = head
        dft2 = dft2.reset_index()
        # Assign headers
        dft2 = dft2.drop(['index'], axis='columns')
        dft['Symbol'] = ThisSymbol
        dft = dft.append(dft2, sort=['Year', 'Symbol'])
    company = company + 1
dft
My output looks like this; I want a Symbol column for each respective firm's data. I have added
dft['Symbol'] = ThisSymbol
but it just adds the first company from the list to all the companies' data.
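Not the original author's code, but a hedged sketch of the usual pattern: give each per-symbol DataFrame its own Symbol column while you still know which symbol it was scraped under, then combine the frames at the end (scrape_one_symbol is a hypothetical helper standing in for the scraping loop above):

all_frames = []
for ThisSymbol in symbols:
    df_sym = scrape_one_symbol(ThisSymbol)  # hypothetical helper that returns one firm's table
    df_sym['Symbol'] = ThisSymbol           # tag the rows before they are mixed with other firms
    all_frames.append(df_sym)
dft = pd.concat(all_frames, ignore_index=True)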

Discrepancies in gensim doc2vec embedding vectors

I use the gensim Doc2Vec package to train doc2vec embeddings. I would expect that two models trained with identical parameters and data would have very close values of the doc2vec vectors. However, in my experience this is only true for doc2vec trained in the PV-DBOW mode without training word embeddings (dbow_words = 0).
For PV-DM, and for PV-DBOW with dbow_words = 1, i.e. every case where the word embeddings are trained along with doc2vec, the doc2vec embedding vectors for identically trained models are fairly different.
Here is my code
from sklearn.datasets import fetch_20newsgroups
from gensim import models
import scipy.spatial.distance as distance
import numpy as np
from nltk.corpus import stopwords
from string import punctuation

def clean_text(texts, min_length=2):
    clean = []
    # don't remove apostrophes
    translator = str.maketrans(punctuation.replace('\'', ' '), ' ' * len(punctuation))
    for text in texts:
        text = text.translate(translator)
        tokens = text.split()
        # remove not alphabetic tokens
        tokens = [word.lower() for word in tokens if word.isalpha()]
        # filter out stop words
        stop_words = stopwords.words('english')
        tokens = [w for w in tokens if not w in stop_words]
        # filter out short tokens
        tokens = [word for word in tokens if len(word) >= min_length]
        tokens = ' '.join(tokens)
        clean.append(tokens)
    return clean

def tag_text(all_text, tag_type=''):
    tagged_text = []
    for i, text in enumerate(all_text):
        tag = tag_type + '_' + str(i)
        tagged_text.append(models.doc2vec.TaggedDocument(text.split(), [tag]))
    return tagged_text

def train_docvec(dm, dbow_words, min_count, epochs, training_data):
    model = models.Doc2Vec(dm=dm, dbow_words=dbow_words, min_count=min_count)
    model.build_vocab(tagged_data)
    model.train(training_data, total_examples=len(training_data), epochs=epochs)
    return model

def compare_vectors(vector1, vector2):
    cos_distances = []
    for i in range(len(vector1)):
        d = distance.cosine(vector1[i], vector2[i])
        cos_distances.append(d)
    print(np.median(cos_distances))
    print(np.std(cos_distances))

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
n_samples = len(dataset.data)
data = clean_text(dataset.data)
tagged_data = tag_text(data)
data_labels = dataset.target
data_label_names = dataset.target_names

model_dbow1 = train_docvec(0, 0, 4, 30, tagged_data)
model_dbow2 = train_docvec(0, 0, 4, 30, tagged_data)
model_dbow3 = train_docvec(0, 1, 4, 30, tagged_data)
model_dbow4 = train_docvec(0, 1, 4, 30, tagged_data)
model_dm1 = train_docvec(1, 0, 4, 30, tagged_data)
model_dm2 = train_docvec(1, 0, 4, 30, tagged_data)
compare_vectors(model_dbow1.docvecs, model_dbow2.docvecs)
> 0.07795828580856323
> 0.02610614028793008
compare_vectors(model_dbow1.docvecs, model_dbow3.docvecs)
> 0.6476179957389832
> 0.14797587172616306
compare_vectors(model_dbow3.docvecs, model_dbow4.docvecs)
> 0.19878000020980835
> 0.06362519480831186
compare_vectors(model_dm1.docvecs, model_dm2.docvecs)
> 0.13536489009857178
> 0.045365127475424386
compare_vectors(model_dbow1.docvecs, model_dm1.docvecs)
> 0.6358324736356735
> 0.15150255674571805
UPDATE
I tried, as suggested by gojomo, to compare the differences between the vectors, and, unfortunately, those are even worse:
def compare_vector_differences(vector1, vector2):
    diff1 = []
    diff2 = []
    for i in range(len(vector1) - 1):
        diff1.append(vector1[i + 1] - vector1[i])
    for i in range(len(vector2) - 1):
        diff2.append(vector2[i + 1] - vector2[i])
    cos_distances = []
    for i in range(len(diff1)):
        d = distance.cosine(diff1[i], diff2[i])
        cos_distances.append(d)
    print(np.median(cos_distances))
    print(np.std(cos_distances))
compare_vector_differences(model_dbow1.docvecs, model_dbow2.docvecs)
> 0.1134452223777771
> 0.02676398444178949
compare_vector_differences(model_dbow1.docvecs, model_dbow3.docvecs)
> 0.8464127033948898
> 0.11423789350773429
compare_vector_differences(model_dbow4.docvecs, model_dbow3.docvecs)
> 0.27400463819503784
> 0.05984108730423529
SECOND UPDATE
This time, after I finally understood gojomo's suggestion, things look fine.
def compare_distance_differences(vector1, vector2):
    diff1 = []
    diff2 = []
    for i in range(len(vector1) - 1):
        diff1.append(distance.cosine(vector1[i + 1], vector1[i]))
    for i in range(len(vector2) - 1):
        diff2.append(distance.cosine(vector2[i + 1], vector2[i]))
    diff_distances = []
    for i in range(len(diff1)):
        diff_distances.append(abs(diff1[i] - diff2[i]))
    print(np.median(diff_distances))
    print(np.std(diff_distances))
compare_distance_differences(model_dbow1.docvecs, model_dbow2.docvecs)
>0.017469733953475952
>0.01659284710785352
compare_distance_differences(model_dbow1.docvecs, model_dbow3.docvecs)
>0.0786697268486023
>0.06092163158218411
compare_distance_differences(model_dbow3.docvecs, model_dbow4.docvecs)
>0.02321992814540863
>0.023095123172320778
The doc-vectors (or word-vectors) of Doc2Vec & Word2Vec models are only meaningfully comparable to other vectors that were co-trained, in the same interleaved training sessions.
Otherwise, randomness introduced by the algorithms (random initialization & random sampling) and by slight differences in training ordering (from multithreading) will cause the trained positions of individual vectors to wander to arbitrarily different positions. Their relative distances/directions, to other vectors that shared interleaved training, should be about equally useful from one model to the next.
But there's no one right place for such a vector, and measuring the differences between the vector for document '1' (or word 'foo') in one model, and the corresponding vector in another model, isn't reflective of anything the models/algorithms are trained to provide.
There's more information in the Gensim FAQ:
Q11: I've trained my Word2Vec/Doc2Vec/etc model repeatedly using the exact same text corpus, but the vectors are different each time. Is there a bug or have I made a mistake?
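As a hedged illustration of comparing relationships rather than raw coordinates (written against the pre-4.0 docvecs API that the question's code already uses), one can check how much the nearest-neighbour lists for the same document tag overlap between two separately trained models:

def neighbor_overlap(model_a, model_b, tag, topn=10):
    # tag_text() above builds tags of the form '_0', '_1', ...
    neighbors_a = {t for t, _ in model_a.docvecs.most_similar(tag, topn=topn)}
    neighbors_b = {t for t, _ in model_b.docvecs.most_similar(tag, topn=topn)}
    return len(neighbors_a & neighbors_b) / topn

print(neighbor_overlap(model_dbow1, model_dbow2, '_0'))  # fraction of shared neighbours, not identical coordinates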

You are given a list of names and a list of email addresses. How would you automatically assign the 'best' email?

I am having a bit of trouble with the runtime of an algorithm that matches names with the most likely email address. The function itself works well (in that it pairs the name and email address correctly), but the runtime is so long that it is difficult to use on large data sets. I am a beginner at coding and would love to hear what solutions you might offer.
Quick note: I implemented Levenshtein's algorithm here. If there are more efficient algorithms, comment below!
from string import digits
import copy
import re

# Levenshtein algorithm found on https://www.python-course.eu/levenshtein_distance.php
def call_counter(func):
    def helper(*args, **kwargs):
        helper.calls += 1
        return func(*args, **kwargs)
    helper.calls = 0
    helper.__name__ = func.__name__
    return helper

def memoize(func):
    mem = {}
    def memoizer(*args, **kwargs):
        key = str(args) + str(kwargs)
        if key not in mem:
            mem[key] = func(*args, **kwargs)
        return mem[key]
    return memoizer

@call_counter
@memoize
def levenshtein(s, t):
    if s == "":
        return len(t)
    if t == "":
        return len(s)
    if s[-1] == t[-1]:
        cost = 0
    else:
        cost = 1
    res = min([levenshtein(s[:-1], t) + 1,
               levenshtein(s, t[:-1]) + 1,
               levenshtein(s[:-1], t[:-1]) + cost])
    return res

def emailmatch(emails_file, name_file):
    name_email_match = {}  # store the matching emails in a dictionary
    with open(name_file, 'r') as names:
        match_name = 0
        for individual in names:
            with open(emails_file, 'r') as address_emails:
                first_name = individual[:individual.index(" ")].lower()
                last_name = individual[individual.rindex(" "):].lower()
                full_name = (first_name + last_name).lower()
                full_name_period = (first_name + "." + last_name).lower()
                best_match = ""  # this holds the best matching email
                minimum = 999
                for emails in address_emails:
                    email = emails[0:emails.index('@')]
                    temp = min(levenshtein(last_name, email),
                               levenshtein(first_name, email),
                               levenshtein(full_name, email),
                               levenshtein(full_name_period, email))
                    if temp < minimum:
                        minimum = temp
                        best_match = emails
                name_email_match[individual] = best_match
    return name_email_match

emailmatch('emails.txt', 'names.txt')
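One well-known speed-up, independent of the name/email matching logic above: replace the exponential recursive Levenshtein with the standard iterative dynamic-programming version, which runs in O(len(s) * len(t)) time and needs no memoization. A minimal sketch:

def levenshtein_dp(s, t):
    # prev[j] holds the edit distance between the current prefix of s and t[:j]
    prev = list(range(len(t) + 1))
    for i, cs in enumerate(s, start=1):
        curr = [i] + [0] * len(t)
        for j, ct in enumerate(t, start=1):
            cost = 0 if cs == ct else 1
            curr[j] = min(prev[j] + 1,          # deletion
                          curr[j - 1] + 1,      # insertion
                          prev[j - 1] + cost)   # substitution
        prev = curr
    return prev[-1]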
