PySide TableView and Variant data - Display Role Not Displaying - pyside

I have this code, which works totally fine when I run it in PyQt, but when I run it in PySide things get weird. I get all the columns and rows I'm supposed to, and if I go to them via scripting and get the data, each cell says what it should. However, even though I set these as display roles, NO text shows in the table.
None in the headers, none in any of the cells. I'm at a loss!
(For those wondering, NullVariant() just returns either None or QVariant(), depending on whether we're on PySide or PyQt.)
This model is meant to take a List of Dicts to addRows, and uses dict keys to make columns.
class CustomTableModel(QtCore.QAbstractTableModel):
    """
    Table model backed by a list of dicts: one dict per row, one dict key per column.
    Column order is ``self.headers`` when set, otherwise the sorted keys of the first row.
    """

    def __init__(self, parent=None, parentTable=None):
        """
        Custom data model for holding table data.
        :param parent: The parent widget/layout so that this data model gets deleted properly on close.
        :param parentTable: the table that is using this data. This is used to get the font metrics of the table
        display font.
        """
        super(CustomTableModel, self).__init__(parent)
        self.parent_table = parentTable
        self.auto_resize = False
        self._avg_font_w = 5
        self._resize_data = defaultdict(int)
        self.items = []
        self.headers = []

    @staticmethod
    def _wrap(value):
        """
        Wrap ``value`` in a QVariant when the Qt binding provides a usable one (PyQt); return it unchanged
        otherwise. PySide has no QtCore.QVariant, and calling it there is what left the view blank.
        :param value: plain Python value
        :return: QVariant(value) or value
        """
        variant = getattr(QtCore, 'QVariant', None)
        if variant is None:
            return value
        try:
            return variant(value)
        except TypeError:
            # Some bindings expose QVariant as a non-instantiable mapped type.
            return value

    def setParentTable(self, widget):
        """
        Sets the parent table widget so that we can get its font metrics for setting our column width with autoResize.
        :param widget: TableViewWidget
        :raise TypeError:
        """
        if not isinstance(widget, QtGui.QTableView):
            raise TypeError('Must be a TableView item')
        self.parent_table = widget

    def setAutoResize(self, b):
        """
        Turns on or off auto resize for the table. This gathers the font metrics of the parent table, and then loops
        over any current data, or newly added data (including table headers) to get the widest item, and sets the
        column width to fit this.
        :param b: bool
        :raise AttributeError:
        """
        if not self.parent_table:
            raise AttributeError('You must call setParentTable first to set the parent TableView item')
        self.auto_resize = b
        if b:
            # Refresh the average character width; it was previously never read from the table.
            self._getTableFontWidth()
            self._autoAllResizeData()
            self._doColumnResize()
        else:
            # Keep a defaultdict so later width lookups can never raise KeyError.
            self._resize_data = defaultdict(int)

    def updateSize(self):
        """
        Force the table size to update to the current size data.
        """
        self._doColumnResize()

    def updateSizeData(self):
        """
        Force an update/regathering of all the size data for each row and column.
        """
        self._autoAllResizeData(True)
        self._doColumnResize()

    def _doColumnResize(self):
        """Apply the gathered widths to the parent table; columns without a recorded width are left alone."""
        if self.parent_table is None:
            return
        for col, key in enumerate(self._getKeyList()):
            width = self._resize_data.get(key)
            if width:
                # setColumnWidth needs an int; header widths are computed as floats.
                self.parent_table.setColumnWidth(col, int(width))

    def _getKeyList(self):
        """
        :return: list of column keys in display order (empty list when there are no headers and no rows).
        """
        if self.headers:
            return self.headers
        if self.items:
            return sorted(self.items[0].keys())
        return []

    def _getTableFontWidth(self):
        """Cache the average character width of the parent table's font."""
        self._avg_font_w = self.parent_table.fontMetrics().averageCharWidth()

    def _autoAllResizeData(self, reset=False):
        """
        Recompute the widest entry per column over the headers and every stored row.
        :param reset: bool, discard previously gathered widths first
        """
        if not self._resize_data or reset is True:
            self._resize_data = defaultdict(int)
        for header in self._getKeyList():
            # Headers get extra room (factor 1.55) compared with cell text.
            header_width = len(str(header)) * (self._avg_font_w * 1.55)
            if header_width > self._resize_data[header]:
                self._resize_data[header] = header_width
            for item in self.items:
                width = len(str(item.get(header))) * self._avg_font_w
                if width > self._resize_data[header]:
                    self._resize_data[header] = width

    def _autoSingleResizeData(self, data):
        """
        Widen the recorded column widths where a single new row needs more room.
        :param data: dict, the row that was added
        """
        for header in self._getKeyList():
            value = data.get(header)
            if value:
                width = len(str(value)) * self._avg_font_w
                if width > self._resize_data[header]:
                    self._resize_data[header] = width

    def setHeaders(self, items):
        """
        This allows you to set your header item text
        :param items: a list of header text, ie ['Name', 'Email', 'Department']
        """
        # Changing the key list changes what every column shows, so reset the model instead of emitting
        # remove/insert column ranges (the old ranges were off by one and removeColumn() was a no-op).
        self.beginResetModel()
        self.headers = list(items)
        self.endResetModel()

    def _appendItems(self, new_items):
        """
        Append rows and notify attached views.
        :param new_items: list of dicts
        """
        if not self.items:
            # The first row defines the columns, so the column count changes as well: a rows-inserted
            # signal alone would leave views with a stale column count.
            self.beginResetModel()
            self.items.extend(new_items)
            self.endResetModel()
            return
        first = len(self.items)
        self.beginInsertRows(QtCore.QModelIndex(), first, first + len(new_items) - 1)
        self.items.extend(new_items)
        self.endInsertRows()

    def addRow(self, data):
        """
        Accepts a dict of data to add to the data model.
        :param data: dict (this should match the same key length/names as the other data in the table.)
        """
        self._appendItems([data])
        if self.auto_resize:
            self._autoSingleResizeData(data)
            self._doColumnResize()

    def addRows(self, data):
        """
        Accepts a list of dicts to add them all to the table, with each list index being a row, and each dict key
        a column. An empty list is a no-op.
        :param data: list of dicts
        :raise ValueError:
        """
        if not isinstance(data, list) or not all(isinstance(entry, dict) for entry in data):
            raise ValueError('input must be a list of dicts!')
        if not data:
            return
        self._appendItems(data)
        if self.auto_resize:
            for item in data:
                self._autoSingleResizeData(item)
            self._doColumnResize()

    def removeRow(self, row, parent=None):
        """
        Remove the row at index 'row'.
        :param row: int
        :param parent: unused, accepted for compatibility with QAbstractItemModel.removeRow
        :return: bool, True when a row was removed
        """
        if not 0 <= row < len(self.items):
            return False
        if len(self.items) == 1:
            # Removing the last row also changes the column count (back to the default).
            self.beginResetModel()
            self.items.pop(row)
            self.endResetModel()
        else:
            self.beginRemoveRows(QtCore.QModelIndex(), row, row)
            self.items.pop(row)
            self.endRemoveRows()
        return True

    def clear(self):
        """
        Clear all table data and start fresh.
        """
        self.beginResetModel()
        self.items = []
        self.headers = []
        self.endResetModel()

    def rowCount(self, QModelIndex=None):
        """
        Return the row count.
        :param QModelIndex: parent index (unused)
        :return: int
        """
        return len(self.items)

    def columnCount(self, QModelIndex=None):
        """
        Return the column count (default 1)
        :param QModelIndex: parent index (unused)
        :return: int
        """
        if self.items:
            return len(self.items[0])
        return 1

    def _cellValue(self, row, col):
        """
        :param row: int
        :param col: int
        :return: the cell text (wrapped via _wrap), or NullVariant() when the cell does not exist.
        """
        try:
            key = self._getKeyList()[col]
            return self._wrap(str(self.items[row][key]))
        except (IndexError, KeyError, TypeError):
            return NullVariant()

    def data(self, index, role=QtCore.Qt.DisplayRole):
        """
        Accepts a QModelIndex and a Qt.Role and returns the data at the given modelIndex.
        :param index: QModelIndex
        :param role: QtCore.Qt.<Role>
        :return: cell text for the display role, NullVariant() otherwise
        """
        if role == QtCore.Qt.DisplayRole and index.isValid():
            return self._cellValue(index.row(), index.column())
        return NullVariant()

    def intGetData(self, row, col):
        """
        Gets the data at 'row' and 'col'.
        :param row: int
        :param col: int
        :return: cell text (a QVariant where the binding has one), or NullVariant() when unavailable.
        """
        return self._cellValue(row, col)

    def headerData(self, section, orientation, role=QtCore.Qt.DisplayRole):
        """
        Sets the header data based on our header key list.
        :param section: section header
        :param orientation: orientation
        :param role: Qt<Role>
        :return: header text for horizontal display-role requests, NullVariant() otherwise
        """
        if role == QtCore.Qt.DisplayRole and orientation == QtCore.Qt.Horizontal:
            if not self.items:
                if section == 0:
                    return self._wrap("Column 1")
            else:
                try:
                    return self._wrap(str(self._getKeyList()[section]))
                except IndexError:
                    return self._wrap('No Data')
        return NullVariant()
class CustomSortModel(QtGui.QSortFilterProxyModel):
    def __init__(self, parent=None):
        """
        Custom QSortFilterProxyModel to allow sorting and filtering of our custom data model.
        :param parent: parent so that this model is deleted properly upon close.
        """
        super(CustomSortModel, self).__init__(parent)
        self.countAllColumns = False
        self._sortingColumn = 0

    @staticmethod
    def _toText(value):
        """
        Convert whatever the source model hands back into a Python string.
        Handles None (PySide null value), objects with toString() (PyQt4 API v1 QVariant), objects with
        value() (e.g. PyQt5 QVariant) and plain strings.
        :param value: cell value returned by intGetData
        :return: str
        """
        if value is None:
            return ''
        if hasattr(value, 'toString'):
            value = value.toString()
        elif hasattr(value, 'value'):
            value = value.value()
        return '' if value is None else str(value)

    def filterAcceptsRow(self, sourceRow, sourceParent):
        """
        Overriding how we choose what rows match our input filter text.
        :param sourceRow: row index in question
        :param sourceParent: QModelIndex
        :return: bool (accepted or not)
        """
        source = self.sourceModel()
        if self.countAllColumns:
            # Use the model's column count rather than len(headers), which is empty unless setHeaders was called.
            columns = range(source.columnCount(QtCore.QModelIndex()))
            txt = ''.join(self._toText(source.intGetData(sourceRow, col)) for col in columns)
        else:
            txt = self._toText(source.intGetData(sourceRow, self._sortingColumn))
        pattern = str(self.filterRegExp().pattern())
        if not pattern:
            # An empty filter accepts every row.
            return True
        return bool(re.search(pattern, txt))

    def setFilterKeyColumn(self, col):
        """
        Sets which column index you want the filter to apply to. -1 or less means we search all columns - otherwise,
        the filter rules apply to the column index given.
        :param col: signed int
        :return:
        """
        if col <= -1:
            self.countAllColumns = True
            # The base-class setter is skipped here, so trigger the re-filter ourselves.
            self.invalidateFilter()
            return
        self.countAllColumns = False
        self._sortingColumn = col
        super(CustomSortModel, self).setFilterKeyColumn(col)
Edit:
I was getting a weird error when I tried to delete this question, so I have added a newer one, with a better cut-down example for testing, here:
https://stackoverflow.com/questions/34074825/pyside-qtableview-not-displaying-text-like-pyqt-does

Running your code in PySide gives a bunch of errors :
AttributeError: 'module' object has no attribute 'QVariant'
That's because there is no QVariant in PySide any more. Replacing every QVariant with a regular Python type fixes the code.
For example
return QtCore.QVariant('No Data')
becomes
return "No Data"

Related

How to update training dataset at epoch begin in Huggingface Trainer using Callback?

I want to recreate the training dataset by a function generate_custom_train_set at the beginning of every epoch, however, is there a way I could do it with Trainer using callback?
My trainer looks like
# Build the Trainer with the training and validation datasets.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
I'm having the same question as I try to implement Examples-proportional mixing from the T5 paper. I didn't find support from hugging face.
My current solution is to modify the trainer.train_dataset in the on_epoch_begin callback.
Here's an implementation. I'm using this in my own project. Seems to work.
First, implement your per-epoch change in your Dataset, in my case, it's the sample function for Examples-Proportional Mixing.
class ProportionMixingDataset:
    """
    Examples-proportional mixing from T5
    TODO: failed to find a pytorch working implementation
    Equivalent to, for the larger datasets, a new subset is taken at each epoch,
    then sample in the joined subset once
    """

    def __init__(self, dataset_list: List[Dataset] = None, k: int = None):
        """
        :param dataset_list: Ordered list of datasets (None is treated as an empty list)
        :param k: Artificial limit; datasets longer than k contribute only k sampled items
        :raise ValueError: if k is not given
        """
        if k is None:
            # Explicit raise instead of assert, which is stripped under ``python -O``.
            raise ValueError('k must be provided')
        self.dsets = dataset_list if dataset_list is not None else []
        self.k = k
        self.dset_szs = [min(len(d), k) for d in self.dsets]
        self.sz = sum(self.dset_szs)
        # One slot per dataset; stays None for datasets that are not sub-sampled.
        self._sampled_idxs: List[Optional[torch.Tensor]] = [None] * len(self.dsets)
        self.sample()

    def sample(self):
        """
        Sub-sample datasets larger than k
        Intended to call in each epoch
        """
        for i, dset in enumerate(self.dsets):
            sz = len(dset)
            if sz > self.k:
                self._sampled_idxs[i] = torch.randperm(sz)[:self.k]

    def __len__(self):
        return self.sz

    def _idx2dset_idx(self, idx: int) -> Tuple[int, int]:
        """
        Convert a global index to a (dataset index, index within that dataset) pair
        """
        for i, sz in enumerate(self.dset_szs):
            if idx < sz:
                return i, idx
            idx -= sz
        raise ValueError('Should not happen')

    def __getitem__(self, idx):
        """
        :param idx: int, global index; negative values count from the end
        :raise ValueError: for non-int indices
        :raise IndexError: when idx is out of range (this is what ends ``for item in dataset`` loops)
        """
        if not isinstance(idx, int):
            raise ValueError('Batched indexing not supported')
        if idx < 0:
            idx += self.sz
        if not 0 <= idx < self.sz:
            raise IndexError('dataset index out of range')
        idx_dset, idx = self._idx2dset_idx(idx)
        dset = self.dsets[idx_dset]
        if self._sampled_idxs[idx_dset] is not None:  # A sub-sample index
            idx = self._sampled_idxs[idx_dset][idx].item()
        return dset[idx]
Then pass that dataset to Trainer.
Now comes the magic part:
class ProportionalMixCallback(TrainerCallback):
    """
    Callback that re-draws the training subset at the start of every epoch.

    It keeps a reference to the Trainer and calls ``sample()`` on the trainer's ``train_dataset``,
    see `dataset::ProportionMixingDataset`.
    """

    def __init__(self, trainer: Trainer):
        # The trainer is stored so its train_dataset can be reached from the epoch hook.
        self.trainer = trainer

    def on_epoch_begin(self, args: TrainingArguments, state, control, **kwargs):
        train_set = self.trainer.train_dataset
        train_set.sample()
Pass this to your trainer as a callback.
This triggers the sample call which modifies the dataset at the times we need it.
This works because the train dataloader in the Trainer still points to the same train dataset object.

QtableView column sorting with floats formatted as strings

I need to use a custom QtableView in python to display and format data.
The example app below shows a table with in the first column floats formatted as strings to get proper number of decimals, second column are pure float displayed so without formatting and the third one are strings.
When clicking on columns I want to sort my data which works fine for strings and floats (columns #2 and #3) but not for my column #1 with formatted floats as strings where it's sorted alphabetically rather than numerically.
I have been googling for a while without finding a way to get this working with QTableView.
Any clue on how to get both float sorting and decimal formatting?
Thanks & cheers
Stephane
import sys
from PyQt5 import QtCore, QtWidgets
from PyQt5.QtCore import *
# Table model
class TableModel(QtCore.QAbstractTableModel):
    """Table model over a list of rows (each row a list of cell values)."""

    def __init__(self, data):
        super(TableModel, self).__init__()
        self._data = data
        # Set columns headers
        self.horizontalHeaders = [''] * 3
        self.setHeaderData(0, Qt.Horizontal, "Col #1\nfloats as string")
        self.setHeaderData(1, Qt.Horizontal, "Col #2\nfloats")
        self.setHeaderData(2, Qt.Horizontal, "Col #3\nstrings")

    def data(self, index, role=Qt.DisplayRole):
        """Return the display text or alignment for ``index``; None for invalid indexes and other roles."""
        if not index.isValid():
            return None
        value = self._data[index.row()][index.column()]
        if role == Qt.DisplayRole:
            # convert col #1 from floats to string to get proper number of decimal formatting
            if index.column() == 0:
                return '%.4f' % value
            # otherwise display floats or strings for col #2 and #3
            return value
        # Align values right
        if role == Qt.TextAlignmentRole:
            return int(Qt.AlignVCenter | Qt.AlignRight)
        return None

    def rowCount(self, index):
        # The length of the outer list.
        return len(self._data)

    def columnCount(self, index):
        # The following takes the first sub-list, and returns
        # the length (only works if all rows are an equal length).
        # An empty model has no columns instead of raising IndexError.
        return len(self._data[0]) if self._data else 0

    def setHeaderData(self, section, orientation, data, role=Qt.EditRole):
        """Store a horizontal header label; returns False when ``section`` is out of range."""
        if orientation == Qt.Horizontal and role in (Qt.DisplayRole, Qt.EditRole):
            try:
                self.horizontalHeaders[section] = data
            except IndexError:
                return False
            # Let attached views know the label changed.
            self.headerDataChanged.emit(orientation, section, section)
            return True
        return super().setHeaderData(section, orientation, data, role)

    def headerData(self, section, orientation, role=Qt.DisplayRole):
        """Return the stored horizontal label, falling back to the base-class header otherwise."""
        if orientation == Qt.Horizontal and role == Qt.DisplayRole:
            try:
                return self.horizontalHeaders[section]
            except IndexError:
                # Unknown section: fall through to the default header.
                pass
        return super().headerData(section, orientation, role)
class MainWindow(QtWidgets.QMainWindow):
    """Main window showing the sample data in a QTableView with sorting enabled."""

    def __init__(self):
        super().__init__()

        # The view is a QTableView (model/view), not a QTableWidget.
        self.table = QtWidgets.QTableView()

        # Sample rows, one inner list per table row.
        rows = [
            [4.2, 9.6, 1],
            [42.1, 0.0, 11],
            [3.1, 5.55, 2],
            [30.0, 3.55, 2222],
            [7.99, 8.99, 33],
        ]

        # Attach the source model and make the view the central widget.
        self.model = TableModel(rows)
        self.table.setModel(self.model)
        self.setCentralWidget(self.table)

        # Route the view through a stock sort/filter proxy.
        sort_proxy = QSortFilterProxyModel()
        sort_proxy.setSourceModel(self.model)
        self.table.setModel(sort_proxy)
        self.table.setSortingEnabled(True)

        # Hide the vertical header.
        self.table.verticalHeader().setVisible(False)

        # Header styling and alternating row colours.
        self.table.setStyleSheet("::section{Background-color:rgb(171,178,185);font-weight:bold}")
        self.table.setAlternatingRowColors(True)
if __name__ == '__main__':
    # Create the application, configure and show the window, then run the event loop.
    application = QtWidgets.QApplication(sys.argv)
    main_window = MainWindow()
    main_window.setWindowTitle('Sorting column example')
    main_window.setMinimumSize(350, 250)
    main_window.show()
    application.exec_()
Thanks to a colleague I've found an implementation which works.
Basically, instead of using the stock QSortFilterProxyModel, you subclass it and override its comparison function (lessThan).
The view then calls this override whenever it sorts, so the comparison can use the raw values instead of the formatted strings.
Here is the modified code, which now works fine for any type of data.
import sys
from PyQt5 import QtCore, QtWidgets
from PyQt5.QtCore import *
# Table model
class TableModel(QtCore.QAbstractTableModel):
    """Table model over a list of rows; exposes raw values under Qt.UserRole for sorting."""

    def __init__(self, data):
        super(TableModel, self).__init__()
        self._data = data
        # Set columns headers
        self.horizontalHeaders = [''] * 3
        self.setHeaderData(0, Qt.Horizontal, "Col #1\nfloats as string")
        self.setHeaderData(1, Qt.Horizontal, "Col #2\nfloats")
        self.setHeaderData(2, Qt.Horizontal, "Col #3\nstrings")

    def data(self, index, role=Qt.DisplayRole):
        """Return display text, raw value (UserRole) or alignment; None for invalid indexes and other roles."""
        if not index.isValid():
            return None
        value = self._data[index.row()][index.column()]
        if role == Qt.DisplayRole:
            # convert col #1 from floats to string to get proper number of decimal formatting
            if index.column() == 0:
                return '%.4f' % value
            # otherwise display floats or strings for col #2 and #3
            return value
        # Unformatted value, used by the sorting proxy
        if role == Qt.UserRole:
            return value
        # Align values right
        if role == Qt.TextAlignmentRole:
            return int(Qt.AlignVCenter | Qt.AlignRight)
        return None

    def rowCount(self, index):
        # The length of the outer list.
        return len(self._data)

    def columnCount(self, index):
        # The following takes the first sub-list, and returns
        # the length (only works if all rows are an equal length).
        # An empty model has no columns instead of raising IndexError.
        return len(self._data[0]) if self._data else 0

    def setHeaderData(self, section, orientation, data, role=Qt.EditRole):
        """Store a horizontal header label; returns False when ``section`` is out of range."""
        if orientation == Qt.Horizontal and role in (Qt.DisplayRole, Qt.EditRole):
            try:
                self.horizontalHeaders[section] = data
            except IndexError:
                return False
            # Let attached views know the label changed.
            self.headerDataChanged.emit(orientation, section, section)
            return True
        return super().setHeaderData(section, orientation, data, role)

    def headerData(self, section, orientation, role=Qt.DisplayRole):
        """Return the stored horizontal label, falling back to the base-class header otherwise."""
        if orientation == Qt.Horizontal and role == Qt.DisplayRole:
            try:
                return self.horizontalHeaders[section]
            except IndexError:
                # Unknown section: fall through to the default header.
                pass
        return super().headerData(section, orientation, role)
class mysortingproxy(QSortFilterProxyModel):
    """Proxy model that sorts on the source model's raw Qt.UserRole values instead of the display text."""

    def __init__(self, parent=None):
        # parent is optional so existing ``mysortingproxy()`` calls keep working.
        super(mysortingproxy, self).__init__(parent)

    def lessThan(self, left: QModelIndex, right: QModelIndex) -> bool:
        """Compare two source indexes by their Qt.UserRole data."""
        source = self.sourceModel()
        left_value = source.data(left, Qt.UserRole)
        right_value = source.data(right, Qt.UserRole)
        if left_value is None or right_value is None:
            # No raw value available: defer to the default comparison.
            return super().lessThan(left, right)
        try:
            return bool(left_value < right_value)
        except TypeError:
            # Values of incomparable types (e.g. str vs float): defer to the default comparison.
            return super().lessThan(left, right)
class MainWindow(QtWidgets.QMainWindow):
    """Main window showing the sample data in a QTableView sorted through ``mysortingproxy``."""

    def __init__(self):
        super().__init__()

        # The view is a QTableView (model/view), not a QTableWidget.
        self.table = QtWidgets.QTableView()

        # Sample rows, one inner list per table row.
        rows = [
            [4.2, 9.6, 1],
            [42.1, 0.0, 11],
            [3.1, 5.55, 2],
            [30.0, 3.55, 2222],
            [7.99, 8.99, 33],
        ]

        # Attach the source model and make the view the central widget.
        self.model = TableModel(rows)
        self.table.setModel(self.model)
        self.setCentralWidget(self.table)

        # Route the view through the custom sorting proxy rather than a stock QSortFilterProxyModel.
        sort_proxy = mysortingproxy()
        sort_proxy.setSourceModel(self.model)
        self.table.setModel(sort_proxy)
        self.table.setSortingEnabled(True)

        # Hide the vertical header.
        self.table.verticalHeader().setVisible(False)

        # Header styling and alternating row colours.
        self.table.setStyleSheet("::section{Background-color:rgb(171,178,185);font-weight:bold}")
        self.table.setAlternatingRowColors(True)
if __name__ == '__main__':
    # Create the application, configure and show the window, then run the event loop.
    application = QtWidgets.QApplication(sys.argv)
    main_window = MainWindow()
    main_window.setWindowTitle('Sorting column example')
    main_window.setMinimumSize(350, 250)
    main_window.show()
    application.exec_()

PyYAML loader with duplicate keys

I am using PyYAML to load a (large) YAML file which has duplicate keys. I would like to preserve all keys and rename the duplicates according to my project's needs. However, PyYAML silently overwrites earlier values with the last occurrence of a key, so I never get a chance to modify them (information is lost), resulting in this dict: {'blocks':{'a':'b2:11 c2:22'}}
simple example YAML:
import yaml
# Example document: the key 'a' appears twice under 'blocks'.
given_str = '''
blocks:
  a:
    b1:1
    c1:2
  a:
    b2:11
    c2:22'''
# safe_load instead of yaml.load(): calling load() without an explicit Loader is deprecated
# and raises a TypeError in PyYAML 6.
p = yaml.safe_load(given_str)
How can I load the YAML with duplicate keys so that I get a chance to recursively traverse it and modify keys as my need. I need to load YAML and then transfer it into a database.
Assuming your input YAML has no merge keys ('<<'), no tags and no comments you want
to preserve, you can use the following:
import sys
import ruamel.yaml
from pathlib import Path
from collections.abc import Hashable
file_in = Path('input.yaml')
class MyConstructor(ruamel.yaml.constructor.SafeConstructor):
    """SafeConstructor that keeps duplicate mapping keys by renaming them ``<key>_undup_<n>``."""

    def construct_mapping(self, node, deep=False):
        """deep is True when creating an object/mapping recursively,
        in that case want the underlying elements available during construction

        Raises ConstructorError for non-mapping nodes and for keys that cannot be made hashable.
        """
        # Qualified name: ConstructorError is not imported at module level.
        constructor_error = ruamel.yaml.constructor.ConstructorError
        if not isinstance(node, ruamel.yaml.nodes.MappingNode):
            raise constructor_error(
                None, None, f'expected a mapping node, but found {node.id!s}', node.start_mark,
            )
        total_mapping = self.yaml_base_dict_type()
        if getattr(node, 'merge', None) is not None:
            todo = [(node.merge, False), (node.value, False)]
        else:
            todo = [(node.value, True)]
        for values, _check in todo:
            mapping = self.yaml_base_dict_type()
            for key_node, value_node in values:
                # keys can be list -> deep
                key = self.construct_object(key_node, deep=True)
                # lists are not hashable, but tuples are
                if not isinstance(key, Hashable):
                    if isinstance(key, list):
                        key = tuple(key)
                if not isinstance(key, Hashable):
                    raise constructor_error(
                        'while constructing a mapping',
                        node.start_mark,
                        'found unhashable key',
                        key_node.start_mark,
                    )
                value = self.construct_object(value_node, deep=deep)
                if key in mapping:
                    # Duplicate: find the first free '<key>_undup_<n>' name. An f-string is used so that
                    # non-string keys and keys containing '{' or '}' work as well.
                    index = 0
                    while True:
                        nkey = f'{key}_undup_{index}'
                        if nkey not in mapping:
                            key = nkey
                            break
                        index += 1
                mapping[key] = value
            total_mapping.update(mapping)
        return total_mapping
# Safe-mode YAML instance that builds mappings with MyConstructor.
yaml = ruamel.yaml.YAML(typ='safe')
yaml.Constructor = MyConstructor
yaml.default_flow_style = False
# Load the input file and write the resulting document to stdout.
data = yaml.load(file_in)
yaml.dump(data, sys.stdout)
which gives:
blocks:
a: b1:1 c1:2
a_undup_0: b2:11 c2:22
Please note that the values for both a keys are multiline plain scalars. For b1 and c1 to be a key
the mapping value indicator (:, the colon) needs to be followed by a whitespace character:
a:
b1: 1
c1: 2
After reading many forums, I think the best solution is to create a wrapper around the YAML loader that renames duplicate keys. @Anthon - any comment?
import yaml
from collections import defaultdict, Counter
####### Preserving Duplicate ###################
def parse_preserving_duplicates(input_file):
    """Load YAML from ``input_file``, renaming duplicated mapping keys instead of overwriting them.

    Within one mapping, every key that occurs more than once is replaced by ``f'{key}{c}'``, where ``c``
    is a counter shared by all duplicated keys of that mapping; unique keys are kept as they are.

    :param input_file: YAML text or an open stream, passed straight to ``yaml.load``
    :return: the loaded document, with mappings as plain dicts
    """
    # NOTE(review): CLoader/Loader are the full (non-safe) loaders and can construct arbitrary Python
    # objects from tagged input - switch to CSafeLoader/SafeLoader if the file is not trusted.
    # CLoader only exists when PyYAML was built with libyaml, so fall back to the pure-Python Loader.
    base_loader = getattr(yaml, 'CLoader', yaml.Loader)

    class PreserveDuplicatesLoader(base_loader):
        pass

    def map_constructor(loader, node, deep=False):
        """Walk tree, removing degeneracy in any duplicate keys"""
        keys = [loader.construct_object(key_node, deep=deep) for key_node, _ in node.value]
        vals = [loader.construct_object(value_node, deep=deep) for _, value_node in node.value]
        key_count = Counter(keys)
        # Plain dict: a defaultdict would silently create entries on every missing-key lookup.
        data = {}
        c = 0
        for key, value in zip(keys, vals):
            if key_count[key] > 1:
                data[f'{key}{c}'] = value
                c += 1
            else:
                data[key] = value
        return data

    PreserveDuplicatesLoader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
                                             map_constructor)
    return yaml.load(input_file, PreserveDuplicatesLoader)
##########################################################
# Parse the opened stream; the original passed an undefined name (input_file) instead of the handle.
with open(inputf, 'r') as file:
    fp = parse_preserving_duplicates(file)

Understanding the distance metric in company name matching using KNN

I am trying to understand the following code, which I found for matching a messy list of company names to a clean list of company names. My question is how the 'Ratio' metric is calculated. It appears that the ratio comes from scorer = fuzz.token_sort_ratio, which I understand is part of the fuzzywuzzy package and therefore a Levenshtein distance calculation, correct? I'm trying to understand why the author uses this as the scorer rather than the distance output from KNN. When I try changing the metric inside NearestNeighbors, it doesn't appear to change the results. Does the metric in NearestNeighbors matter then?
Original article:
https://audhiaprilliant.medium.com/fuzzy-string-matching-optimization-using-tf-idf-and-knn-b07fce69b58f
def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 4),
    n_neighbors: int = 1,
    **kwargs
) -> Tuple:
    """Fit a TF-IDF vectorizer on ``clean`` and a cosine nearest-neighbour index on the result.

    Extra keyword arguments are passed on to ``TfidfVectorizer``.
    Returns the pair ``(vectorizer, nbrs)``.
    """
    # TF-IDF over the clean values, cast to unicode strings
    tfidf = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, **kwargs)
    corpus = clean.values.astype('U')
    corpus_matrix = tfidf.fit_transform(corpus)
    # Nearest-neighbour index over the TF-IDF matrix
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    fitted = knn.fit(corpus_matrix)
    return tfidf, fitted
# String matching - KNN
def tfidf_nn(
        messy,
        clean,
        n_neighbors=1,
        **kwargs
):
    """Return, for every messy entry, its ``n_neighbors`` nearest clean values and their distances.

    Extra keyword arguments are passed on to ``build_vectorizer``.
    """
    # Fit on the clean data, then vectorize the messy data with the same vocabulary
    vectorizer, nbrs = build_vectorizer(clean, n_neighbors=n_neighbors, **kwargs)
    query_matrix = vectorizer.transform(messy)
    # Look up the closest clean rows and map their indices back to values
    distances, indices = nbrs.kneighbors(query_matrix, n_neighbors=n_neighbors)
    clean_values = np.array(clean)
    return clean_values[indices], distances
# String matching - match fuzzy
def find_matches_fuzzy(
        row,
        match_candidates,
        limit=5
):
    """Score ``row`` against ``match_candidates`` with ``fuzz.token_sort_ratio``.

    Returns a list of ``(row, candidate, score)`` tuples, at most ``limit`` long.
    """
    # Candidates are handed over as an {position: candidate} dict
    indexed_candidates = dict(enumerate(match_candidates))
    scored = process.extract(
        row,
        indexed_candidates,
        scorer=fuzz.token_sort_ratio,
        limit=limit,
    )
    matches = []
    for hit in scored:
        matches.append((row, hit[0], hit[1]))
    return matches
# String matching - TF-IDF
def fuzzy_nn_match(
        messy,
        clean,
        column,
        col,
        n_neighbors=100,
        limit=5, **kwargs):
    """Shortlist candidates per messy entry with TF-IDF nearest neighbours, then score them fuzzily.

    The nearest-neighbour distances are discarded; the 'Ratio' column holds the score returned by
    ``find_matches_fuzzy``. Returns a DataFrame with the columns ``[column, col, 'Ratio']``.
    """
    # Only the neighbour values are used from here on
    candidates, _ = tfidf_nn(messy, clean, n_neighbors, **kwargs)
    per_row = []
    for position, entry in enumerate(messy):
        per_row.append(find_matches_fuzzy(entry, candidates[position], limit))
    flattened = itertools.chain.from_iterable(per_row)
    return pd.DataFrame(flattened, columns=[column, col, 'Ratio'])
# String matching - Fuzzy
def fuzzy_tf_idf(
    df: pd.DataFrame,
    column: str,
    clean: pd.Series,
    mapping_df: pd.DataFrame,
    col: str,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 3)
) -> pd.DataFrame:
    """Match the distinct values of ``df[column]`` against the distinct ``clean`` values.

    The messy values are de-duplicated, stripped of NaN, cast to str and run through
    ``preprocess_string`` before matching with a single nearest neighbour.

    NOTE(review): ``mapping_df``, ``analyzer`` and ``ngram_range`` are accepted but not used below, so
    the vectorizer defaults of ``build_vectorizer`` apply - confirm whether they should be forwarded.

    Returns the DataFrame produced by ``fuzzy_nn_match`` (columns ``[column, col, 'Ratio']``); the
    return annotation previously claimed a Series.
    """
    # Distinct clean and messy values
    clean = clean.drop_duplicates().reset_index(drop=True)
    messy_prep = df[column].drop_duplicates().dropna().reset_index(drop=True).astype(str)
    messy = messy_prep.apply(preprocess_string)
    # Map value from messy to clean
    return fuzzy_nn_match(messy=messy, clean=clean, column=column, col=col, n_neighbors=1)

You are given a list of names and a list of email addresses. How would you automatically assign the 'best' email?

I am having a bit of trouble with the runtime of an algorithm that matches names with the most likely email address. The function itself works well (in that it pairs the name and email address correctly), but the runtime is so large that it is difficult to use on large data sets. I am a beginner at coding and would love to hear what solutions you might offer.
Quick note: I implemented Levenshtein's algorithm here. If there are more efficient algorithms, comment below!
from string import digits
import copy
import re
# levenshtein algorithm found on https://www.python-course.eu/levenshtein_distance.php
def call_counter(func):
    """Decorator that counts how often ``func`` is called.

    The running total is available as the ``calls`` attribute of the returned wrapper.
    """
    import functools

    # wraps() copies __name__, __doc__, __module__ etc., not just the name.
    @functools.wraps(func)
    def helper(*args, **kwargs):
        helper.calls += 1
        return func(*args, **kwargs)

    helper.calls = 0
    return helper
def memoize(func):
    """Decorator that caches ``func`` results per argument combination.

    The cache key is built from the repr of the arguments, so unhashable arguments work too.
    Keyword arguments are sorted by name so that ``f(a=1, b=2)`` and ``f(b=2, a=1)`` share one entry.
    """
    import functools

    mem = {}

    @functools.wraps(func)
    def memoizer(*args, **kwargs):
        key = (repr(args), repr(sorted(kwargs.items())))
        if key not in mem:
            mem[key] = func(*args, **kwargs)
        return mem[key]

    return memoizer
# The recursive version needed call-counting/memoizing decorators to be usable at all; this
# iterative version needs neither.
def levenshtein(s, t):
    """Return the Levenshtein (edit) distance between the strings ``s`` and ``t``.

    Row-by-row dynamic programming: O(len(s) * len(t)) time and O(min(len(s), len(t))) memory,
    instead of the exponential time of the naive recursion.
    """
    if s == t:
        return 0
    # The distance is symmetric; keep the shorter string as the row to minimise memory.
    if len(s) < len(t):
        s, t = t, s
    # previous[j] is the distance between the processed prefix of s and t[:j].
    previous = list(range(len(t) + 1))
    for i, s_char in enumerate(s, 1):
        current = [i]
        for j, t_char in enumerate(t, 1):
            cost = 0 if s_char == t_char else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution / match
        previous = current
    return previous[-1]
def emailmatch(emails_file, name_file):
    """Pair every name in ``name_file`` with the closest e-mail address in ``emails_file``.

    Both files hold one entry per line. A name is compared against the local part of each address
    (the text before '@', lower-cased) using the last name, the first name, "firstlast" and
    "first.last"; the address with the smallest Levenshtein distance wins, the first one on ties.

    :param emails_file: path of the file with e-mail addresses
    :param name_file: path of the file with names
    :return: dict mapping each name to its best matching address, both stripped of surrounding
        whitespace ("" when there are no addresses)
    """
    # Read the addresses once instead of re-opening the file for every name.
    with open(emails_file, 'r') as address_emails:
        addresses = [line.strip() for line in address_emails if line.strip()]
    # partition() keeps the whole line when an address has no '@' instead of raising.
    local_parts = [address.partition('@')[0].lower() for address in addresses]

    name_email_match = {}  # store the matching emails in a dictionary
    with open(name_file, 'r') as names:
        for line in names:
            individual = line.strip()
            if not individual:
                continue
            parts = individual.lower().split()
            first_name, last_name = parts[0], parts[-1]
            if len(parts) == 1:
                # Single-word name: nothing to combine.
                candidates = (first_name,)
            else:
                candidates = (last_name, first_name,
                              first_name + last_name,
                              first_name + "." + last_name)
            best_match = ""  # this holds the best matching email
            minimum = float('inf')
            for address, local_part in zip(addresses, local_parts):
                distance = min(levenshtein(candidate, local_part) for candidate in candidates)
                if distance < minimum:
                    minimum = distance
                    best_match = address
            name_email_match[individual] = best_match
    return name_email_match
emailmatch('emails.txt', 'names.txt')

Resources