Related
I am trying to understand the following code, which I found for matching a messy list of company names to a clean list of company names. My question is how the 'Ratio' metric is calculated. It appears that the ratio comes from scorer = fuzz.token_sort_ratio, which I understand is part of the fuzzywuzzy package and is therefore a Levenshtein distance calculation, correct? I'm trying to understand why the author uses this as the scorer rather than the distance output from KNN. When I try changing the metric inside NearestNeighbors, it doesn't appear to change the results. Does the metric in NearestNeighbors matter, then?
Original article:
https://audhiaprilliant.medium.com/fuzzy-string-matching-optimization-using-tf-idf-and-knn-b07fce69b58f
def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 4),
    n_neighbors: int = 1,
    **kwargs
) -> Tuple:
    """Fit a TF-IDF vectorizer and a nearest-neighbour index on the clean names.

    Args:
        clean: Series of reference strings; its values are cast to unicode
            before vectorizing.
        analyzer: Passed to ``TfidfVectorizer`` as ``analyzer``.
        ngram_range: Passed to ``TfidfVectorizer`` as ``ngram_range``.
        n_neighbors: Default neighbour count for the ``NearestNeighbors`` index.
        **kwargs: Extra keyword arguments forwarded to ``TfidfVectorizer``.

    Returns:
        A ``(vectorizer, neighbours)`` pair, both fitted on ``clean``.
    """
    tfidf = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, **kwargs)
    corpus = clean.values.astype('U')
    tfidf_matrix = tfidf.fit_transform(corpus)
    # The index is built with the cosine metric over the TF-IDF rows.
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    knn.fit(tfidf_matrix)
    return tfidf, knn
# String matching - KNN
def tfidf_nn(
    messy,
    clean,
    n_neighbors = 1,
    **kwargs
):
    """Look up the nearest clean strings for every messy string.

    The vectorizer and neighbour index are fitted on ``clean`` via
    ``build_vectorizer`` (extra ``kwargs`` are forwarded to it); ``messy`` is
    only transformed with the fitted vectorizer.

    Returns:
        ``(nearest_values, distances)``: the entries of ``clean`` selected by
        the neighbour indices, and the distances reported by ``kneighbors``.
    """
    fitted_vectorizer, neighbor_index = build_vectorizer(
        clean, n_neighbors=n_neighbors, **kwargs
    )
    messy_matrix = fitted_vectorizer.transform(messy)
    distances, indices = neighbor_index.kneighbors(
        messy_matrix, n_neighbors=n_neighbors
    )
    # Fancy-index the clean values with the neighbour index matrix.
    clean_array = np.array(clean)
    return clean_array[indices], distances
# String matching - match fuzzy
def find_matches_fuzzy(
    row,
    match_candidates,
    limit = 5
):
    """Score one query string against its candidate matches.

    The candidates are passed to ``process.extract`` as an index-keyed dict and
    scored with ``fuzz.token_sort_ratio``; at most ``limit`` hits are requested.

    Returns:
        A list of ``(row, hit[0], hit[1])`` tuples, one per hit returned by
        ``process.extract`` (presumably candidate and score -- verify against
        the fuzzywuzzy version in use).
    """
    indexed_candidates = dict(enumerate(match_candidates))
    scored_hits = process.extract(
        row,
        indexed_candidates,
        scorer=fuzz.token_sort_ratio,
        limit=limit,
    )
    triples = []
    for hit in scored_hits:
        triples.append((row, hit[0], hit[1]))
    return triples
# String matching - TF-IDF
def fuzzy_nn_match(
    messy,
    clean,
    column,
    col,
    n_neighbors = 100,
    limit = 5, **kwargs):
    """Shortlist candidates with TF-IDF neighbours, then score them fuzzily.

    For each messy string the ``n_neighbors`` nearest clean strings are fetched
    with ``tfidf_nn`` (``kwargs`` are forwarded to it) and re-scored by
    ``find_matches_fuzzy``, keeping at most ``limit`` hits per string.

    Returns:
        A DataFrame with columns ``[column, col, 'Ratio']``, one row per hit.
    """
    candidates, _distances = tfidf_nn(messy, clean, n_neighbors, **kwargs)
    per_row_hits = []
    for position, name in enumerate(messy):
        per_row_hits.append(find_matches_fuzzy(name, candidates[position], limit))
    # Flatten the per-string hit lists into a single stream of rows.
    flattened = itertools.chain.from_iterable(per_row_hits)
    return pd.DataFrame(flattened, columns=[column, col, 'Ratio'])
# String matching - Fuzzy
def fuzzy_tf_idf(
    df: pd.DataFrame,
    column: str,
    clean: pd.Series,
    mapping_df: pd.DataFrame,
    col: str,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 3)
) -> pd.DataFrame:
    """Match the distinct values of ``df[column]`` against the clean names.

    Args:
        df: Frame holding the messy strings.
        column: Name of the messy column in ``df``; also the name of the first
            output column.
        clean: Series of reference strings (duplicates are dropped).
        mapping_df: Not used by this function; kept so existing callers keep
            working.
        col: Name of the output column holding the matched clean string.
        analyzer: Forwarded to the TF-IDF vectorizer.
        ngram_range: Forwarded to the TF-IDF vectorizer.

    Returns:
        The DataFrame produced by ``fuzzy_nn_match`` with columns
        ``[column, col, 'Ratio']``.
    """
    clean = clean.drop_duplicates().reset_index(drop=True)
    # Distinct, non-null messy values as strings, then normalised.
    messy_prep = (
        df[column].drop_duplicates().dropna().reset_index(drop=True).astype(str)
    )
    messy = messy_prep.apply(preprocess_string)
    # analyzer / ngram_range were previously accepted but never passed on, so
    # the vectorizer silently ran with build_vectorizer's own defaults.
    result = fuzzy_nn_match(
        messy=messy,
        clean=clean,
        column=column,
        col=col,
        n_neighbors=1,
        analyzer=analyzer,
        ngram_range=ngram_range,
    )
    return result
#I have scraped the data; my code is below. Now I want to add a column of symbols to the respective company data. Please guide me on how the symbol can be added to the respective firm's data.
#code below
from time import sleep
import pandas as pd
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
def _clean_cell(text):
    """Remove the non-breaking-space padding and every plain space from a cell."""
    return text.replace('\xa0 ', '').replace(' ', '')


def _snapshot_frame(html, symbol, n_rows=9, n_cols=4):
    """Turn one company page into a DataFrame tagged with its symbol.

    The first ``n_rows`` <tr> elements of the page are read: the first one
    supplies the column headers (<th> cells), the rest supply the data (<td>
    cells). Only the first ``n_cols`` cells of each row are kept; short rows
    are padded with empty strings instead of raising IndexError.

    Returns None when the page contains no table rows at all.
    """
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.find_all('tr')[:n_rows]
    if not rows:
        return None
    table = []
    for rn, row in enumerate(rows):
        # Condition: if first row, then th, otherwise td
        celltag = 'th' if rn == 0 else 'td'
        cells = [_clean_cell(cell.text) for cell in row.find_all(celltag)[:n_cols]]
        cells += [''] * (n_cols - len(cells))
        table.append(cells)
    frame = pd.DataFrame(table[1:], columns=table[0])
    # Tag this company's rows only, before they are merged with the others.
    frame['Symbol'] = symbol
    return frame


browser = webdriver.Chrome(ChromeDriverManager().install())
symbols = [
    'FATIMA', 'SSGC', 'FCCL', 'ISL', 'KEL', 'NCL', 'DGKC', 'SNGP', 'NML',
    'ENGRO', 'HUMNL', 'CHCC', 'ATRL', 'HUBC', 'ASTL', 'PIBTL', 'OGDC',
    'EFERT', 'FFC', 'NCPL', 'KTML', 'PSO', 'LUCK', 'SEARL', 'KOHC', 'ABOT',
    'AICL', 'HASCOL', 'PTC', 'KAPCO', 'PIOC', 'POL', 'SHEL', 'GHGL', 'HCAR',
    'DCR', 'BWCL', 'MTL', 'GLAXO', 'PKGS', 'SHFA', 'MARI', 'ICI', 'ACPL',
    'PSMC', 'SPWL', 'THALL', 'BNWM', 'EFUG', 'GADT', 'AABS',
]

company_frames = []
try:
    for ThisSymbol in symbols:
        # In the URL, make symbol as variable
        url = 'http://www.scstrade.com/stockscreening/SS_CompanySnapShotYF.aspx?symbol=' + ThisSymbol
        browser.get(url)
        sleep(2)
        # The below command will get all the contents from the url
        html = browser.execute_script("return document.documentElement.outerHTML")
        frame = _snapshot_frame(html, ThisSymbol)
        if frame is not None:
            company_frames.append(frame)
finally:
    # Always release the browser, even if one page fails to load or parse.
    browser.quit()

# One concat at the end replaces the removed DataFrame.append and keeps each
# company's own Symbol value.
dft = pd.concat(company_frames, ignore_index=True) if company_frames else pd.DataFrame()
dft
My output looks like this; I want to have a Symbol column for each respective firm's data.
To add the Symbol column, I have added
dft['Symbol'] = ThisSymbol
but it adds just the first company from the list to all companies' data.
enter image description here
I was wondering if there's a way to only OCR the document on the right (ignoring the left) without having to split the images in PS or any other image editor?
The problem is that sometimes there is text on the images, and it is polluting my results, as I only need to read the right-hand side.
Kind regards,
O.
## PREPROCESSING (load and read images to OCR and transform them into a DataFrame)
import pytesseract as tess
from tesserocr import PyTessBaseAPI, RIL
import os
from PIL import Image
import pandas as pd
import re
import tesserocr
path = "/Users/oliviervandhuynslager/PycharmProjects/Design Centre/assets/img/" ##path to directory (folder) where the images are located


def _extract_field(text, label, terminator=None):
    """Return the text following ``label:`` in one OCR result, or None.

    The search starts at the first occurrence of ``label``. When
    ``terminator`` is given, only the part before its first occurrence is
    considered. The value is whatever sits between the first and second ":"
    of that part. None is returned when the label is absent or no ":"
    follows it.
    """
    found = re.search(re.escape(label) + r'[\s\S]+', text)
    if found is None:
        return None
    segment = found.group()
    if terminator is not None:
        segment = segment.split(terminator)[0]
    pieces = segment.split(":")
    return pieces[1] if len(pieces) > 1 else None


fileName = [] #create empty list that will contain the original filenames
fullText = [] #create empty list to store the OCR results per file
with PyTessBaseAPI(lang='eng') as api:
    # A single directory pass keeps every file name paired with its own text.
    for imageName in os.listdir(path):
        inputPath = os.path.join(path, imageName)
        api.SetImageFile(inputPath)
        text = api.GetUTF8Text()
        print(api.AllWordConfidences())
        fileName.append(imageName)
        fullText.append(text)
count = len(fileName)
df = pd.DataFrame({"FILENAME": fileName, "OCR": fullText})

# (output column, label searched for in the OCR text, end-of-value marker)
FIELD_SPECS = [
    ("PRODUCT", "Product", "\n"),
    ("MANUFACTURER", "Manufacturer", "\n"),
    ("DESIGNER", "Designer", "\n"),
    ("MATERIAL", "Material", "\n"),
    ("DESCRIPTION", "Description", None),
    ("DIMENSIONS", "Dimensions", "\n"),
    ("PACKING", "Packing", "\n\n"),
    ("PRICE", "Price", "\n"),
    ("DELIVERY", "Delivery", "\n\n"),
]

# Number of rows in which each field was found, keyed by output column.
match_counts = {}
for column, label, terminator in FIELD_SPECS:
    extracted = [_extract_field(ocr_text, label, terminator) for ocr_text in df["OCR"]]
    match_counts[column] = sum(value is not None for value in extracted)
    # Exactly one entry per row, so the column length always matches df.
    df[column] = ["" if value is None else value for value in extracted]

df.drop(columns="OCR", inplace=True)
print(df)
If the layout of text on your image is fixed then you can simply read the full Image but pass only half of that image array to tesseract.
import cv2
# Crop the right-hand half of the image and OCR only that part.
# Relies on inputPath, api and fullText from the surrounding OCR loop.
img = cv2.imread(inputPath)
_, width, _ = img.shape
half = width // 2
# Keep every row, and only the columns from the midpoint to the right edge.
cut = img[:, half:]
# Placeholder: must be a writable file path ending in an image extension
# (e.g. .png), otherwise cv2.imwrite cannot pick an encoder.
temp_path = r'path/where/you/want/your/cropped/image/to/be/saved'
cv2.imwrite(temp_path, cut)
# OCR the cropped file, not the original image.
api.SetImageFile(temp_path)
text = api.GetUTF8Text()
print(api.AllWordConfidences())
fullText.append(text)
os.remove(temp_path) #removing cut image from the directory
Alternate Approach
You can pass the image array cut to tesseract directly instead of saving it and then removing it. In that case, remember to convert the image array cut to RGB format, since OpenCV uses BGR format by default when reading images.
# Reorder the cropped array's channels from BGR to RGB before passing it to tesseract.
rgb_arr = cv2.cvtColor(cut, cv2.COLOR_BGR2RGB)
All these things can be done with PIL also. In PIL you can use crop() to extract the required part of the image. Also by default, it reads images in the RGB format and can be passed directly to tesseract if you are following the alternate approach as mentioned above
You can call api.SetRectangle method passing the coordinates of the right half before recognition.
I have 6 subplots that need 2 dynamic title components and I can code for 1 but I'm not sure how to change my code below to add a 2nd dynamic title component on the same line after searching the literature. Here is my for loop to generate the 6 subplots with the "plt.title.." line below:
import calendar

# Row positions of rmax to plot; renamed from `list` so the builtin is not shadowed.
plot_rows = [0, 1, 2, 3, 4, 5]
now = datetime.datetime.now()
currm = now.month
fig, ax = plt.subplots(6)
for x in plot_rows:
    # iterate by index of column "1" or the years
    dam = DS.where(DS['time.year'] == rmax.iloc[x, 1]).groupby('time.month').mean()
    dam = dam.sel(month=3)  # current month mean 500
    dam = dam.sel(level=500)
    damc = dam.to_array()
    lats = damc['lat'].data
    lons = damc['lon'].data
    # plot data
    ax = plt.axes(projection=ccrs.PlateCarree())
    ax.coastlines(lw=1)
    damc = damc.squeeze()
    cnplot = plt.contour(lons, lats, damc, cmap='jet')
    plt.title('Mean 500mb Hgt + Phase {} 2020'.format(calendar.month_name[currm-1]))
    plt.show()
    #plt.clf()
I need to add one of each from this list in the loop to the "plt.title.." between the "+" and the word "Phase" line above...?
tindices = ['SOI','AO','NAO','PNA','EPO','PDO']
Thank you for any help with this!
Try accessing the tindices one by one and passing them to the title
# Fill the first placeholder with the x-th entry of tindices and the second with the month name.
plt.title('Mean 500mb Hgt + {} Phase {} 2020'.format(tindices[x],
calendar.month_name[currm-1]))
From a file I import lines. In each line, an (escaped) string is part of the line:
DP,0,"021",257
DP,1,"022",257
DP,2,"023",513
DP,3,"024",513
DP,4,"025",1025
DP,5,"026",1025
DP,6,"081",257
DP,7,"082",257
DP,8,"083",513
DP,9,"084",513
DP,10,"085",1025
DP,11,"086",1025
DP,12,"087",1025
DP,13,"091",257
DP,14,"092",513
DP,15,"093",1025
IS,0,"FIX",0
IS,1,"KARIN02",0
IS,2,"KARUIT02",0
IS,3,"KARIN02HOV",0
IS,4,"KARUIT02HOV",0
IS,5,"KARIN08",0
IS,6,"KARUIT08",0
IS,7,"KARIN08HOV",0
IS,8,"KARUIT08HOV",0
IS,9,"KARIN09",0
IS,10,"KARUIT09",0
IS,11,"KARIN09HOV",0
IS,12,"KARUIT09HOV",0
IS,13,"KARIN10",0
IS,14,"KARUIT10",0
IS,15,"KARIN10HOV",0
I get the following Objects (if DP) :
index - parts1 (int)
name - parts2 (string)
ref - parts3 (int)
I tried using a regex to remove the escape sequence from the lines, but to no effect.
#name_to_ID = {}
kruising = 2007
# Walk the config file line by line; File.foreach closes the file when done.
File.foreach(cfgFile) do |line|
  parts = line.chomp.split(",")
  # Only DP records are processed here.
  next unless parts[0] == "DP"

  index = parts[1].to_i
  # Two-digit, upper-case hexadecimal form of the record index.
  hex = index.to_s(16).upcase.rjust(2, '0')
  cname = parts[2].to_s
  # The field holds literal double-quote characters (no backslashes), so
  # remove the quote itself; the old pattern '\\"' looked for backslash + quote
  # and never matched.
  tname = cname.delete('"')
  p "cname= #{cname} (#{cname.length})"
  p "tname= #{tname} (#{tname.length})"
  p cname == tname
  #name_to_ID[tname] = kruising.to_s + "-" + hex.to_s
end
teststring = "021"
p #name_to_ID[teststring]
> "021" (5)
> "021" (5)
> true
> nil
The problem came to light when looking up the hash with another string reference (length 3):
hash[key] does not match, because the string "021" including its quotes (length 5) is not equal to the string 021 (length 3).
Is there any method that actually removes the characters I need removed?
EDIT: I used
# Print every character of cname separately to see exactly what the string contains.
cname.each_char{|c|
p c
}
> "\""
> "0"
> "2"
> "1"
> "\""
EDIT: requested outcome update:
# Current output:
#name_to_ID["021"] = 2007-00 "021".length = 5
#name_to_ID["022"] = 2007-01 "022".length = 5
#name_to_ID["081"] = 2007-06 "081".length = 5
#name_to_ID["082"] = 2007-07 "082".length = 5
#name_to_ID["091"] = 2007-0D "091".length = 5
#name_to_ID["101"] = 2007-10 "101".length = 5
# -------------
# Expected output:
#name_to_ID["021"] = 2007-00 "021".length = 3
#name_to_ID["022"] = 2007-01 "022".length = 3
#name_to_ID["081"] = 2007-06 "081".length = 3
#name_to_ID["082"] = 2007-07 "082".length = 3
#name_to_ID["091"] = 2007-0D "091".length = 3
#name_to_ID["101"] = 2007-10 "101".length = 3
Your problem is that you don't know the actual characters in your string. What is printed may not be the same as the characters the string contains.
Try parts[2].to_s.bytes to check exactly what is the character code of that unexpected character. For example:
> "asd".bytes
=> [205, 184, 97, 115, 100]
Alternatively, you can delete the first and the last characters, if you are sure that every part of the string has the same format:
# Keep everything except the first and the last character of the field.
cname = parts[2].to_s[1..-2]
Or you can remove all special characters in the string if you know that the string will not contain any special character
# Remove every character that is not an ASCII digit or letter.
cname = parts[2].to_s.gsub(/[^0-9A-Za-z]/, '')