When I run the following code
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
text = r"""
As checked Dis is not yet on boarded to ARB portal, hence we cannot upload the invoices in portal
"""
questions = [
"Dis asked if it is possible to post the two invoice in ARB.I have not access so I wanted to check if you would be able to do it.",
]
for question in questions:
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(
answer_start_scores
) # Get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1 # Get the most likely end of answer with the argmax of the score
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
print(f"Question: {question}")
print(f"Answer: {answer}\n")
The answer that I get here is:
Question: Dis asked if it is possible to post the two invoice in ARB.I have not access so I wanted to check if you would be able to do it.
Answer: dis is not yet on boarded to ARB portal
How do I get a score for this answer? Score here is very similar to what is I get when I run Question-Answer pipeline .
I have to take this approach since Question-Answer pipeline when used is giving me Key Error for the below code
from transformers import pipeline
nlp = pipeline("question-answering")
context = r"""
As checked Dis is not yet on boarded to ARB portal, hence we cannot upload the invoices in portal.
"""
print(nlp(question="Dis asked if it is possible to post the two invoice in ARB?", context=context))
This is my attempt to get the score. It appears that I cannot figure out what feature.p_mask. So I could not remove the non-context indexes that contribute to the softmax at the moment.
# ... assuming imports and question and context
model_name="deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
inputs = tokenizer(question, context,
add_special_tokens=True,
return_tensors='pt')
input_ids = inputs['input_ids'].tolist()[0]
outputs = model(**inputs)
# used to compute score
start = outputs.start_logits.detach().numpy()
end = outputs.end_logits.detach().numpy()
# from source code
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
#?? undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask
# Generate mask
undesired_tokens = inputs['attention_mask']
undesired_tokens_mask = undesired_tokens == 0.0
# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start)
end_ = np.where(undesired_tokens_mask, -10000.0, end)
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))
# Compute the score of each tuple(start, end) to be the real answer
outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))
# Remove candidate with end < start and end - start > max_answer_len
max_answer_len = 15
candidates = np.tril(np.triu(outer), max_answer_len - 1)
scores_flat = candidates.flatten()
idx_sort = [np.argmax(scores_flat)]
start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
end += 1
score = candidates[0, start, end-1]
start, end, score = start.item(), end.item(), score.item()
print(tokenizer.decode(input_ids[start:end]))
print(score)
See more source code
I was wondering if there's a way to only OCR the document on the right (ignoring the left) without having to split the images in PS or any other image editor?
The problem is that sometimes there is text on the images. However, they are polluting my results as I only need to rear the right-hand side.
Kind regards,
O.
## PREPROCESSING (load and read images to OCR and transform them into a DataFrame)
import pytesseract as tess
from tesserocr import PyTessBaseAPI, RIL
import os
from PIL import Image
import pandas as pd
import re
import tesserocr
path = "/Users/oliviervandhuynslager/PycharmProjects/Design Centre/assets/img/" ##path to directory (folder) where the images are located
count = 0
fileName = [] #create empty list that will contain the original filenames
fullText = [] #create empty list to store the OCR results per file
for imageName in os.listdir(path):
count = count + 1
fileName.append(imageName)
# fileName.sort()#generate list from texts.
with PyTessBaseAPI(lang='eng') as api:
for imageName in os.listdir(path):
inputPath = os.path.join(path, imageName)
api.SetImageFile(inputPath)
text = api.GetUTF8Text()
print(api.AllWordConfidences())
fullText.append(text)
d = {"FILENAME":fileName, "OCR": fullText}
df = pd.DataFrame(d)
##Generate empty lists
search_material = []
search_product = []
search_manufacturer = []
search_designer = []
search_description = []
search_dimensions = []
search_packing = []
search_price = []
search_delivery = []
## -_-_-_-_-_-_-_-_-_-_-_-_-_-
count_material = 0
count_product = 0
count_maufacturer = 0
count_designer = 0
count_description = 0
count_dimension = 0
count_packing = 0
count_price = 0
## search for PRODUCT (NAME/TITLE)
for values in df["OCR"]:
try:
search_product.append((re.search(r'Product[\s\S]+', values).group()).split("\n")[0].split(":")[1])
count_product = count_product + 1
except:
search_product.append("")
df["PRODUCT"] = search_product
## search for MANUFACTURER
for values in df["OCR"]:
try:
search_manufacturer.append((re.search(r'Manufacturer[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_maufacturer = count_maufacturer + 1
except:
search_manufacturer.append("")
df["MANUFACTURER"] = search_manufacturer
## search for DESIGNER
for values in df["OCR"]:
try:
search_designer.append((re.search(r'Designer[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
count_designer = count_designer + 1
except:
search_designer.append("")
df["DESIGNER"] = search_designer
## search for MATERIALS
for values in df["OCR"]:
try:
search_material.append((re.search(r'Material[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
count_material = count_material + 1
except:
search_material.append("")
df["MATERIAL"] = search_material
#search for DESCRIPTION:
for values in df["OCR"]:
try:
search_description.append((re.search(r'Description[\S\s]+', values).group()).split(":")[1])
count_description = count_description + 1
except:
search_description.append("")
df["DESCRIPTION"] = search_description
#search for DIMENSIONS
for values in df["OCR"]:
try:
search_dimensions.append((re.search(r'Dimensions[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_dimension = count_dimension + 1
except:
search_dimensions.append("")
df["DIMENSIONS"] = search_dimensions
#search for PACKING
for values in df["OCR"]:
try:
search_packing.append((re.search(r'Packing[\S\s]+', values).group()).split('\n\n')[0].split(":")[1])
count_packing = count_packing + 1
except:
search_packing.append("")
df["PACKING"] = search_packing
#search for PRICE
for values in df["OCR"]:
try:
search_price.append((re.search(r'Price[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_price = count_price + 1
except:
search_price.append("")
df["PRICE"] = search_price
#search for DELIVERY DAYS
for values in df["OCR"]:
try:
search_delivery.append((re.search(r'Delivery[\S\s]+', values).group()).split("\n\n")[0].split(":")[1])
count_delivery = count_delivery + 1
except:
search_delivery.append("")
df["DELIVERY"] = search_delivery
df.drop(columns="OCR", inplace=True)
print(df)
If the layout of text on your image is fixed then you can simply read the full Image but pass only half of that image array to tesseract.
import cv2
img = cv2.imread(inputPath)
_, width, _ = img.shape
half = width//2
cut = img[: half: , :]
temp_path = r'path/where/you/want/your/cropped/image/to/be/saved'
cv2.imwrite(temp_path, cut)
api.SetImageFile(inputPath)
text = api.GetUTF8Text()
print(api.AllWordConfidences())
fullText.append(text)
os.remove(temp_path) #removing cut image from the directory
Alternate Approach
You can pass the image array cut to the tesseract instead of saving it and then removing it. In that case, remember to convert the image array cut to RGB format since open cv uses BGR format by default while reading images.
rgb_arr = cv2.cvtColor(cut, cv2.COLOR_BGR2RGB)
All these things can be done with PIL also. In PIL you can use crop() to extract the required part of the image. Also by default, it reads images in the RGB format and can be passed directly to tesseract if you are following the alternate approach as mentioned above
You can call api.SetRectangle method passing the coordinates of the right half before recognition.
I have a pandas multilevel dataframe df to contain the quarterly financial report data for about 2000+ stocks from year 2006 to 2012 . And I am trying to figure out a way to quickly calculate the 'average' values for each data point.
demo_data() is the function to generate the demo data (df = demo_data(stk_qty=2000, col_num=200) can be used to simulate the financial report data):
def demo_data(stk_qty, col_num):
''' generate demo data, return multilevel dataframe '''
import random
import pandas as pd
rpt_date_template = [(yr+qt) for yr in map(str, range(2006, 2013)) for qt in ['0331','0630','0930','1231']]
stk_id_list = ['STK'+str(x).zfill(3) for x in range(0, stk_qty)]
stk_id_column, rpt_date_column = [], []
for i in range(stk_qty):
stk_rpt_date_list = rpt_date_template[random.randint(0,8):] # rpt dates with random start
stk_id_column.extend([stk_id_list[i]] * len(stk_rpt_date_list))
rpt_date_column.extend(stk_rpt_date_list)
index_name = ['STK_ID', 'RPT_Date']
col_name = ['COL'+str(x).zfill(3) for x in range(col_num)]
first_level_dt = stk_id_column
second_level_dt = rpt_date_column
dt = pd.DataFrame(np.random.randn(len(stk_id_column), col_num), columns=col_name)
dt[index_name[0]] = first_level_dt
dt[index_name[1]] = second_level_dt
multilevel_df = dt.set_index(index_name, drop=True, inplace=False)
return multilevel_df
Here is a sample data. (note: sw() is a method to display the four corners data of a big dataframe, source code is at: How to preview a part of a large pandas DataFrame? )
>>> df = demo_data(5,3)
>>> df.sw()
COL000 COL001 COL002
STK_ID RPT_Date
STK000 20060630 1.8196 0.9519 -1.0526
20060930 -0.4074 -0.9025 1.3562
20061231 -1.1750 0.4190 -1.2976
20070331 -0.5609 1.5190 0.4893
20070630 0.4580 -0.3804 0.3705
20070930 -0.4711 -1.1953 -0.0609
20071231 0.3363 1.1949 1.2802
20080331 1.6359 0.8355 -0.2763
20080630 0.2697 -0.8236 -1.7095
20080930 0.6178 -0.3742 -1.1646
.......................................
STK004 20111231 -0.3198 1.6972 -1.3281
20120331 -1.1905 -0.4597 0.3695
20120630 -0.8253 -0.0502 -0.2862
20120930 0.0059 -1.8535 -1.2107
20121231 0.5762 -0.2872 0.0993
Index : ['STK_ID', 'RPT_Date']
Column: COL000,COL001,COL002
row: 117 col: 3
The customized average function I want is named as my_avg() and defined as below rules:
1. Q1's average value is (Q4_of_previous_yr + Q1)/2
2. Q2's average value is (Q4_of_previous_yr + Q1 + Q2)/3
3. Q3's average value is (Q4_of_previous_yr + Q1 + Q2 + Q3)/4
4. Q4's average value is (Q4_of_previous_yr + Q1 + Q2 + Q3 + Q4)/5
5. if some of the data points are not provided, just calculate the normal average of available data points
so the my_avg(df) will have below output for each STK_ID:
STK_ID RPT_Date COL000 COL001 COL002
STK000 20060630 1.819619705 0.951918984 -1.052639309
20060930 0.706112476 0.024688028 0.151757352
20061231 0.079077767 0.156125083 -0.331359614
20070331 -0.867930112 0.969000466 -0.404129827
20070630 -0.425943376 0.519205768 -0.145929753
20070930 -0.437234418 0.090579744 -0.124681449
20071231 -0.282524858 0.3114374 0.156297097
20080331 0.986121631 1.015202552 0.501971496
.......................................
STK004 20111231 xxxxx xxxxxxx xxxxxxx
How to write the code for my_avg() ?
Reference:
I try to write a temp_solution_avg() function. But it has three issues:
1. the average calculation not include 'Q4_of_previous_yr' data point, so the result is not what I want.
2. data's 'RPT_Date' must start with Q1 ('xxxx0331'), otherwise first yr's data is wrong
3. the calculation speed is very very slow.
In [3]: df = demo_data(500,100)
In [4]: timeit temp_solution_avg(df)
1 loops, best of 3: 66.3 s per loop
def temp_solution_avg(df):
''' return the average , Q1: not change, Q2 : (df.Q1 + df.Q2)/2 ,
Q3: (df.Q1 + df.Q2 + df.Q3)/3, Q4 : (df.Q1 + df.Q2 + df.Q3 + df.Q4)/4
data's 'RPT_Date' must start with Q1 ('xxxx0331'), otherwise first yr's
data is wrong .
'''
dt = df.reset_index()
dt['yr'] = dt['RPT_Date'].str[0:4]
dt['temp_stk_id'] = dt['STK_ID']
dt = dt.set_index(['STK_ID','RPT_Date'], drop=True, inplace=False)
rst = dt.groupby(['temp_stk_id','yr']).transform(pd.expanding_mean)
return rst
I'm using ReportLab to write tables in PDF documents and am very pleased with the results (despite not having a total grasp on flowables just yet).
However, I have not been able to figure out how to make a table that spans a page break have its column headings repeated.
The code below creates a test.pdf in C:\Temp that has a heading row followed by 99 rows of data.
The heading row looks great on the first page but I would like that to repeat at the top of the second and third pages.
I'm keen to hear of any approaches that have been used to accomplish that using the SimpleDocTemplate.
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Frame, Spacer
from reportlab.lib import colors
from reportlab.lib.units import cm
from reportlab.lib.pagesizes import A3, A4, landscape, portrait
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT, TA_RIGHT, TA_CENTER, TA_JUSTIFY
from reportlab.pdfgen import canvas
pdfReportPages = "C:\\Temp\\test.pdf"
doc = SimpleDocTemplate(pdfReportPages, pagesize=A4)
# container for the "Flowable" objects
elements = []
styles=getSampleStyleSheet()
styleN = styles["Normal"]
# Make heading for each column
column1Heading = Paragraph("<para align=center>COLUMN ONE HEADING</para>",styles['Normal'])
column2Heading = Paragraph("<para align=center>COLUMN TWO HEADING</para>",styles['Normal'])
row_array = [column1Heading,column2Heading]
tableHeading = [row_array]
tH = Table(tableHeading, [6 * cm, 6 * cm]) # These are the column widths for the headings on the table
tH.hAlign = 'LEFT'
tblStyle = TableStyle([('TEXTCOLOR',(0,0),(-1,-1),colors.black),
('VALIGN',(0,0),(-1,-1),'TOP'),
('BOX',(0,0),(-1,-1),1,colors.black),
('BOX',(0,0),(0,-1),1,colors.black)])
tblStyle.add('BACKGROUND',(0,0),(-1,-1),colors.lightblue)
tH.setStyle(tblStyle)
elements.append(tH)
# Assemble rows of data for each column
for i in range(1,100):
column1Data = Paragraph("<para align=center> " + "Row " + str(i) + " Column 1 Data" + "</font> </para>",styles['Normal'])
column2Data = Paragraph("<para align=center> " + "Row " + str(i) + " Column 2 Data" + "</font> </para>",styles['Normal'])
row_array = [column1Data,column2Data]
tableRow = [row_array]
tR=Table(tableRow, [6 * cm, 6 * cm])
tR.hAlign = 'LEFT'
tR.setStyle(TableStyle([('BACKGROUND',(0,0),(-1,-1),colors.white),
('TEXTCOLOR',(0,0),(-1,-1),colors.black),
('VALIGN',(0,0),(-1,-1),'TOP'),
('BOX',(0,0),(-1,-1),1,colors.black),
('BOX',(0,0),(0,-1),1,colors.black)]))
elements.append(tR)
del tR
elements.append(Spacer(1, 0.3 * cm))
doc.build(elements)
From the documentation (yes, I know, but it's sometimes hard to locate this stuff in the manual):
The repeatRows argument specifies the number of leading rows that
should be repeated when the Table is asked to split itself.
So when you create the table, this is one of the arguments you can pass, and it will turn the first n rows into header rows that repeat. You'll find this part of the text on page 77, but the section relating to creating a Table starts on page 76.
http://www.reportlab.com/docs/reportlab-userguide.pdf
This is the code I developed, after following Gordon's advice to reconsider using repeatRows, and it works!
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Frame, Spacer
from reportlab.lib import colors
from reportlab.lib.units import cm
from reportlab.lib.pagesizes import A3, A4, landscape, portrait
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT, TA_RIGHT, TA_CENTER, TA_JUSTIFY
from reportlab.pdfgen import canvas
pdfReportPages = "C:\\Temp\\test.pdf"
doc = SimpleDocTemplate(pdfReportPages, pagesize=A4)
# container for the "Flowable" objects
elements = []
styles=getSampleStyleSheet()
styleN = styles["Normal"]
# Make heading for each column and start data list
column1Heading = "COLUMN ONE HEADING"
column2Heading = "COLUMN TWO HEADING"
# Assemble data for each column using simple loop to append it into data list
data = [[column1Heading,column2Heading]]
for i in range(1,100):
data.append([str(i),str(i)])
tableThatSplitsOverPages = Table(data, [6 * cm, 6 * cm], repeatRows=1)
tableThatSplitsOverPages.hAlign = 'LEFT'
tblStyle = TableStyle([('TEXTCOLOR',(0,0),(-1,-1),colors.black),
('VALIGN',(0,0),(-1,-1),'TOP'),
('LINEBELOW',(0,0),(-1,-1),1,colors.black),
('BOX',(0,0),(-1,-1),1,colors.black),
('BOX',(0,0),(0,-1),1,colors.black)])
tblStyle.add('BACKGROUND',(0,0),(1,0),colors.lightblue)
tblStyle.add('BACKGROUND',(0,1),(-1,-1),colors.white)
tableThatSplitsOverPages.setStyle(tblStyle)
elements.append(tableThatSplitsOverPages)
doc.build(elements)
Use the repeatRows=1 when you create the Table...
from reportlab.platypus import Table
Table(data,repeatRows=1)
I always like to have something you can cut & paste into a .py file to run and test. So here it is...
import os
import pandas as pd
import numpy as np
import reportlab.platypus
import reportlab.lib.styles
from reportlab.lib import colors
from reportlab.lib.units import mm
from reportlab.lib.pagesizes import letter, landscape
reportoutputfilepath = os.path.join('.\\test.pdf')
pdf_file = reportlab.platypus.SimpleDocTemplate(
reportoutputfilepath,
pagesize=landscape(letter),
rightMargin=10,
leftMargin=10,
topMargin=38,
bottomMargin=23
)
ts_tables = [
('ALIGN', (4,0), (-1,-1), 'RIGHT'),
('LINEBELOW', (0,0), (-1,0), 1, colors.purple),
('FONT', (0,0), (-1,0), 'Times-Bold'),
('LINEABOVE', (0,-1), (-1,-1), 1, colors.purple),
('FONT', (0,-1), (-1,-1), 'Times-Bold'),
('BACKGROUND',(1,1),(-2,-2),colors.white),
('TEXTCOLOR',(0,0),(1,-1),colors.black),
('FONTSIZE', (0,0),(-1,-1), 8),
]
df = pd.DataFrame(np.random.randint(0,1000,size=(1000, 4)), columns=list('ABCD'))
lista = [df.columns[:,].values.astype(str).tolist()] + df.values.tolist()
#Here is where you put repeatRows=1
table = reportlab.platypus.Table(lista, colWidths=(20*mm, 20*mm, 20*mm, 20*mm),repeatRows=1)
table_style = reportlab.platypus.TableStyle(ts_tables)
table.setStyle(table_style)
elements = []
elements.append(table)
# Build the PDF
pdf_file.build(elements)
print reportoutputfilepath
t1 = Table(lista, colWidths=220, rowHeights=20, repeatRows=1)
just type repeatRows=1
I found this solution to repeat easily the header on a table which is on two pages. Add this line in your CSS for your table:
-fs-table-paginate: paginate;
I also found a class for FPDF which seems powerful (i don't need it for the moment, so I didn't test it)
http://interpid.eu/fpdf-table