Steganography program - converting python 2 to 3, syntax error in: base64.b64decode("".join(chars)) - ascii

I have problem with the syntax in the last part of steg program. I tried to convert python 2 version (of the working code) to python 3, and this is the last part of it:
flag = base64.b64decode("".join(chars)) <- error
print(flag)
The program 1. encrypts the message in the Last Significiant Bits of the image as saves it as a new image. Then 2.decrypts the message, which is stored in "flag", and prints it.
* can the error be caused by the wrong type of input?:
message = input("Your message: ")
BELOW: UNHIDING PROGRAM
#coding: utf-8
import base64
from PIL import Image
image = Image.open("after.png")
extracted = ''
pixels = image.load()
#Iterating in 1st row
for x in range(0,image.width):
r,g,b = pixels[x,0]
# Storing LSB of each color
extracted += bin(r)[-1]
extracted += bin(g)[-1]
extracted += bin(b)[-1]
chars = []
for i in range(len(extracted)/8):
byte = extracted[i*8:(i+1)*8]
chars.append(chr(int(''.join([str(bit) for bit in byte]), 2)))
flag = base64.b64decode(''.join(chars))
print flag
BELOW: HIDING PROGRAM:
import bitarray
import base64
from PIL import Image
with Image.open('before.png') as im:
pixels=im.load()
message = input("Your message: ")
encoded_message = base64.b64encode(message.encode('utf-8'))
#Convert the message into an array of bits
ba = bitarray.bitarray()
ba.frombytes(encoded_message)
bit_array = [int(i) for i in ba]
#Duplicate the original picture
im = Image.open("before.png")
im.save("after.png")
im = Image.open("after.png")
width, height = im.size
pixels = im.load()
#Hide message in the first row
i = 0
for x in range(0,width):
r,g,b = pixels[x,0]
#print("[+] Pixel : [%d,%d]"%(x,0))
#print("[+] \tBefore : (%d,%d,%d)"%(r,g,b))
#Default values in case no bit has to be modified
new_bit_red_pixel = 255
new_bit_green_pixel = 255
new_bit_blue_pixel = 255
if i<len(bit_array):
#Red pixel
r_bit = bin(r)
r_last_bit = int(r_bit[-1])
r_new_last_bit = r_last_bit & bit_array[i]
new_bit_red_pixel = int(r_bit[:-1]+str(r_new_last_bit),2)
i += 1
if i<len(bit_array):
#Green pixel
g_bit = bin(g)
g_last_bit = int(g_bit[-1])
g_new_last_bit = g_last_bit & bit_array[i]
new_bit_green_pixel = int(g_bit[:-1]+str(g_new_last_bit),2)
i += 1
if i<len(bit_array):
#Blue pixel
b_bit = bin(b)
b_last_bit = int(b_bit[-1])
b_new_last_bit = b_last_bit & bit_array[i]
new_bit_blue_pixel = int(b_bit[:-1]+str(b_new_last_bit),2)
i += 1
pixels[x,0] = (new_bit_red_pixel,new_bit_green_pixel,new_bit_blue_pixel)
#print("[+] \tAfter: (%d,%d,%d)"%(new_bit_red_pixel,new_bit_green_pixel,new_bit_blue_pixel))
im.save('after.png')
error
ValueError: string argument should contain only ASCII characters

help for base64.b64decode says:
b64decode(s, altchars=None, validate=False)
Decode the Base64 encoded bytes-like object or ASCII string s.
...
Considering that in Python 2 there were "normal" strs and unicode-strs (u-prefixed), I suggest taking closer look at what produce "".join(chars). Does it contain solely ASCII characters?
I suggest adding:
print("Codes:",[ord(c) for c in chars])
directly before:
flag = base64.b64decode("".join(chars))
If there will be number >127 inside codes, that mean it might not work as it is fit only for pure ASCII strs.

Related

how to add symbols to the multiple stock data

#i have scraped data below is my code, now i want to add a column of symbols to the respective company data, plz guide me how the symbol can be added to the respective firm data
#code below
from time import sleep
import pandas as pd
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
browser = webdriver.Chrome(ChromeDriverManager().install())
symbols =['FATIMA',
'SSGC',
'FCCL',
'ISL',
'KEL',
'NCL',
'DGKC',
'SNGP',
'NML',
'ENGRO',
'HUMNL',
'CHCC',
'ATRL',
'HUBC',
'ASTL',
'PIBTL',
'OGDC',
'EFERT',
'FFC',
'NCPL',
'KTML',
'PSO',
'LUCK',
'SEARL',
'KOHC',
'ABOT',
'AICL',
'HASCOL',
'PTC',
'KAPCO',
'PIOC',
'POL',
'SHEL',
'GHGL',
'HCAR',
'DCR',
'BWCL',
'MTL',
'GLAXO',
'PKGS',
'SHFA','MARI',
'ICI',
'ACPL',
'PSMC',
'SPWL',
'THALL',
'BNWM',
'EFUG',
'GADT',
'AABS']
company = 1
for ThisSymbol in symbols :
# Get first symbol from the above python list
company = 2
# In the URL, make symbol as variable
url = 'http://www.scstrade.com/stockscreening/SS_CompanySnapShotYF.aspx?symbol=' + ThisSymbol
browser.get(url)
sleep(2)
# The below command will get all the contents from the url
html = browser.execute_script("return document.documentElement.outerHTML")
# So we will supply the contents to beautiful soup and we tell to consider this text as a html, with the following command
soup = BeautifulSoup (html, "html.parser")
for rn in range(0,9) :
plist = []
r = soup.find_all('tr')[rn]
# Condition: if first row, then th, otherwise td
if (rn==0) :
celltag = 'th'
else :
celltag = 'td'
# Now use the celltag instead of using fixed td or th
col = r.find_all(celltag)
print()
if col[i] == 0:
print ("")
else:
for i in range(0,4) :
cell = col[i].text
clean = cell.replace('\xa0 ', '')
clean = clean.replace (' ', '')
plist.append(clean)
# If first row, create df, otherwise add to it
if (rn == 0) :
df = pd.DataFrame(plist)
else :
df2 = pd.DataFrame(plist)
colname = 'y' + str(2019-rn)
df[colname] = df2
if (company == 1):
dft = df.T
# Get header Column
head = dft.iloc[0]
# Exclude first row from the data
dft = dft[1:]
dft.columns = head
dft = dft.reset_index()
# Assign Headers
dft = dft.drop(['index'], axis = 'columns')
else:
dft2 = df.T
# Get header Column
head = dft2.iloc[0]
# Exclude first row from the data
dft2 = dft2[1:]
dft2.columns = head
dft2 = dft2.reset_index()
# Assign Headers
dft2 = dft2.drop(['index'], axis = 'columns')
dft['Symbol'] = ThisSymbol
dft = dft.append(dft2, sort=['Year','Symbol'])
company = company +1
dft
my output looks this, i want to have a symbol column to each respective firm data
Symbol,i have added
dft['Symbol'] = ThisSymbol
but it add just first company from the list to all companies data
enter image description here

OCR only image on one side

I was wondering if there's a way to only OCR the document on the right (ignoring the left) without having to split the images in PS or any other image editor?
The problem is that sometimes there is text on the images. However, they are polluting my results as I only need to rear the right-hand side.
Kind regards,
O.
## PREPROCESSING (load and read images to OCR and transform them into a DataFrame)
import pytesseract as tess
from tesserocr import PyTessBaseAPI, RIL
import os
from PIL import Image
import pandas as pd
import re
import tesserocr
path = "/Users/oliviervandhuynslager/PycharmProjects/Design Centre/assets/img/" ##path to directory (folder) where the images are located
count = 0
fileName = [] #create empty list that will contain the original filenames
fullText = [] #create empty list to store the OCR results per file
for imageName in os.listdir(path):
count = count + 1
fileName.append(imageName)
# fileName.sort()#generate list from texts.
with PyTessBaseAPI(lang='eng') as api:
for imageName in os.listdir(path):
inputPath = os.path.join(path, imageName)
api.SetImageFile(inputPath)
text = api.GetUTF8Text()
print(api.AllWordConfidences())
fullText.append(text)
d = {"FILENAME":fileName, "OCR": fullText}
df = pd.DataFrame(d)
##Generate empty lists
search_material = []
search_product = []
search_manufacturer = []
search_designer = []
search_description = []
search_dimensions = []
search_packing = []
search_price = []
search_delivery = []
## -_-_-_-_-_-_-_-_-_-_-_-_-_-
count_material = 0
count_product = 0
count_maufacturer = 0
count_designer = 0
count_description = 0
count_dimension = 0
count_packing = 0
count_price = 0
## search for PRODUCT (NAME/TITLE)
for values in df["OCR"]:
try:
search_product.append((re.search(r'Product[\s\S]+', values).group()).split("\n")[0].split(":")[1])
count_product = count_product + 1
except:
search_product.append("")
df["PRODUCT"] = search_product
## search for MANUFACTURER
for values in df["OCR"]:
try:
search_manufacturer.append((re.search(r'Manufacturer[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_maufacturer = count_maufacturer + 1
except:
search_manufacturer.append("")
df["MANUFACTURER"] = search_manufacturer
## search for DESIGNER
for values in df["OCR"]:
try:
search_designer.append((re.search(r'Designer[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
count_designer = count_designer + 1
except:
search_designer.append("")
df["DESIGNER"] = search_designer
## search for MATERIALS
for values in df["OCR"]:
try:
search_material.append((re.search(r'Material[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
count_material = count_material + 1
except:
search_material.append("")
df["MATERIAL"] = search_material
#search for DESCRIPTION:
for values in df["OCR"]:
try:
search_description.append((re.search(r'Description[\S\s]+', values).group()).split(":")[1])
count_description = count_description + 1
except:
search_description.append("")
df["DESCRIPTION"] = search_description
#search for DIMENSIONS
for values in df["OCR"]:
try:
search_dimensions.append((re.search(r'Dimensions[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_dimension = count_dimension + 1
except:
search_dimensions.append("")
df["DIMENSIONS"] = search_dimensions
#search for PACKING
for values in df["OCR"]:
try:
search_packing.append((re.search(r'Packing[\S\s]+', values).group()).split('\n\n')[0].split(":")[1])
count_packing = count_packing + 1
except:
search_packing.append("")
df["PACKING"] = search_packing
#search for PRICE
for values in df["OCR"]:
try:
search_price.append((re.search(r'Price[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_price = count_price + 1
except:
search_price.append("")
df["PRICE"] = search_price
#search for DELIVERY DAYS
for values in df["OCR"]:
try:
search_delivery.append((re.search(r'Delivery[\S\s]+', values).group()).split("\n\n")[0].split(":")[1])
count_delivery = count_delivery + 1
except:
search_delivery.append("")
df["DELIVERY"] = search_delivery
df.drop(columns="OCR", inplace=True)
print(df)
If the layout of text on your image is fixed then you can simply read the full Image but pass only half of that image array to tesseract.
import cv2
img = cv2.imread(inputPath)
_, width, _ = img.shape
half = width//2
cut = img[: half: , :]
temp_path = r'path/where/you/want/your/cropped/image/to/be/saved'
cv2.imwrite(temp_path, cut)
api.SetImageFile(inputPath)
text = api.GetUTF8Text()
print(api.AllWordConfidences())
fullText.append(text)
os.remove(temp_path) #removing cut image from the directory
Alternate Approach
You can pass the image array cut to the tesseract instead of saving it and then removing it. In that case, remember to convert the image array cut to RGB format since open cv uses BGR format by default while reading images.
rgb_arr = cv2.cvtColor(cut, cv2.COLOR_BGR2RGB)
All these things can be done with PIL also. In PIL you can use crop() to extract the required part of the image. Also by default, it reads images in the RGB format and can be passed directly to tesseract if you are following the alternate approach as mentioned above
You can call api.SetRectangle method passing the coordinates of the right half before recognition.

Get position of token in berts output layer

We are interested in the bert vectors for each token. With bert vector we mean the word vector for a specific token in berts output layer. So we would like to find out which token produces which bert vector. We wrote some code but we are not sure if it is correct or how to test it.
So in the code we process a sentence with bert. We construct a list of position ids and hand them to the model. Afterwards we use the same position ids to map the tokens to the output layer. Then there is some code that produces calculates the character offsets of each vector in the input sentence.
Is this the correct way how to use position_ids to generate
from transformers import BertModel, BertConfig, BertTokenizer
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def sentence_to_vector(input_sentence):
tokens_encoded = tokenizer.encode(input_sentence, add_special_tokens=True)
input_ids = torch.tensor(tokens_encoded).unsqueeze(0) # Batch size 1
seq_length = input_ids.size(1)
# code to construct position_ids from here:
# https://github.com/huggingface/transformers/blob/8da280ebbeca5ebd7561fd05af78c65df9161f92/pytorch_pretrained_bert/modeling.py#L188:L189
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
outputs = model(input_ids, position_ids=position_ids)
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
# from the BertModel documentation (example at the bottom):
# The last hidden-state is the first element of the output tuple
# https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
#ttv = {} # token to vector
#for i in position_ids[0]:
# ttv[tokens[i]] = outputs[0][0][position_ids[0][i]]
data = []
last_offset = 0
for i in range(0, len(position_ids[0])):
token = tokens[position_ids[0][i]]
vector = outputs[0][0][position_ids[0][i]]
pos_begin = None
pos_end = None
if not token == "[CLS]" and not token == "[SEP]":
pos_begin = input_sentence.find(token, last_offset)
pos_end = pos_begin + len(token)
last_offset = pos_end
data.append({
"token": token,
"pos_begin": pos_begin,
"pos_end": pos_end,
"vector": vector
})
return data
input_sentence = "do the chicken dance!"
data = sentence_to_vector(input_sentence)
for token in data:
print(token["token"] + "\t" + str(token["pos_begin"]) + "\t" + str(token["pos_end"]) + "\t" + str(token["vector"][0:3]) + "..." )

Why spatial transformer network (STN) is not working on image

I am trying to subject a one image array to STN using code from https://github.com/kevinzakka/spatial-transformer-network :
def STNfn(x):
import tensorflow as tf
print(x.shape)
B,W,H,C = x.shape
# identity transform
initial = np.array([[1., 0, 0], [0, 1., 0]])
initial = initial.astype('float32').flatten()
# localization network
n_fc = 6
W_fc1 = tf.Variable(tf.zeros([H*W*C, n_fc]), name='W_fc1')
b_fc1 = tf.Variable(initial_value=initial, name='b_fc1')
h_fc1 = tf.matmul(tf.zeros([B, H*W*C]), W_fc1) + b_fc1
# spatial transformer layer
from stn import spatial_transformer_network as transformer
h_trans = transformer(x, h_fc1)
return h_trans
fname = 'testimage.jpg'
img = plt.imread(fname)
img = STNfn(np.array([img]))
However, I am getting following error:
TypeError: Input 'y' of 'Mul' Op has type uint8
that does not match type float32 of argument 'x'.
I have tried to replace float32 with np.uint8, but it does not help.
Where is the problem and how can it be solved?
n_fc = 6 has to be a float32 maybe? Not familar with Python, in Java it is like 6.0f for float and just 6 is integer.

PySide: Formatted text in QComboBox

I have this snippet of code:
import PySide.QtGui
app = PySide.QtGui.QApplication('')
wnd = PySide.QtGui.QWidget()
mly = PySide.QtGui.QVBoxLayout()
combo = PySide.QtGui.QComboBox()
table = [ ('Lorem','ipsum','dolor','sit','amet'),
('Aliquam','sodales','nunc','eget','lorem'),
('Vivamus','et','sapien','mattis','vulputate'),
('Integer','ac','dolor','commodo','cursus'),
('Sed','sed','erat','non','magna'),
('Duis','vestibulum','eu','tortor','euismod') ]
combo.clear()
for (one,two,three,four,five) in table:
combo.addItem('%-12s | %-12s | %-12s | %-12s | %-12s' % (one,two,three,four,five))
mly.addWidget(combo)
mly.addStretch(1)
wnd.setLayout(mly)
wnd.show()
app.exec_()
I have obtained this (1) and I'm looking for something like (2): all columns tabulated.
The combobox have the standard proportional font (MS Shell Dlg 2 from QtDesigner). I don't want to use monospaced font.
I have tried to calculate the maximum width in pixels of every column with combo.fontMetrics().boundingRect(text).width() and fill every column with spaces:
borde = ' '
unspc = ' '
maxwdt = {0:0, 1:0, 2:0, 3:0, 4:0}
for lstlin in table:
for (ndx,val) in enumerate(lstlin):
unwdt = combo.fontMetrics().boundingRect(borde + val + borde).width()
if (unwdt > maxwdt[ndx]):
maxwdt[ndx] = unwdt
combo.clear()
for lstlin in table:
txtlin = ''
for (ndx,val) in enumerate(lstlin):
txtcmp = borde + val + borde
while (combo.fontMetrics().boundingRect(txtcmp).width() < maxwdt[ndx]):
txtcmp += unspc
txtlin += txtcmp + '|'
combo.addItem(txtlin)
and I have obtained (3).
There are any other method to format a text with proportional font for use in a QComboBox?. Thanks.
Your algorithm is fine, but it can only be as accurate as the width of a standard space in the proportional font you are using.
To get more accurate results, use the thinnest possible whitespace character available. For fonts with good unicode support, this will be the HAIR SPACE U+200A.
On Linux (using the DejaVu Sans font) I can exactly reproduce (3) by changing the following two lines in your example script:
# hair-space
unspc = '\u200a'
borde = unspc * 10

Resources