I want to download images after crawling multiple pages

I want to download the images after crawling multiple pages. However, not all of the images can be saved, because they are overwritten inside the for loop.
Below is my code. What is wrong?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq

for page in range(2,4):
    baseUrl = 'https://onepiecetreasurecruise.fr/Artwork/index.php?page=index'
    plusUrl = baseUrl + str(page)
    html = urlopen(plusUrl).read()
    soup = BeautifulSoup(html, 'html.parser')
    img = soup.find_all(class_='card-img-top')
    listimg = []
    for i in img:
        listimg.append(i['src'])
    n = 1
    for index, img_link in enumerate(listimg):
        img_data = rq.get(img_link).content
        with open('./onepiece/' + str(index+1) + '.png', 'wb+') as f:
            f.write(img_data)
            n += 1

Here is another way to download all the pictures, using simplified_scrapy.
from simplified_scrapy import Spider, SimplifiedDoc, utils, SimplifiedMain

class ImageSpider(Spider):
    name = 'onepiecetreasurecruise'
    start_urls = ['https://onepiecetreasurecruise.fr/Artwork/index.php?page=index']
    # refresh_urls = True
    concurrencyPer1s = 0.5  # set download speed
    imgPath = 'images/'

    def __init__(self):
        Spider.__init__(self, self.name)  # necessary
        utils.createDir(self.imgPath)  # create image dir

    def afterResponse(self, response, url, error=None, extra=None):
        try:  # save images
            flag = utils.saveResponseAsFile(response, self.imgPath, 'image')
            if flag:
                return None
        except Exception as err:
            print(err)
        return Spider.afterResponse(self, response, url, error, extra)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        # image urls
        urls = doc.body.getElements('p', value='card-text').a
        if urls:
            for u in urls:
                u['header'] = {'Referer': url['url']}
            self.saveUrl(urls)
        # next page urls
        u = doc.body.getElementByText('Suivant', tag='a')
        if u:
            u['href'] = utils.absoluteUrl(url.url, u.href)
            self.saveUrl(u)
        return True

SimplifiedMain.startThread(ImageSpider())  # start download

I fixed the indents in your code. This works for me. It downloads 30 images.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq

listimg = []  # all images
for page in range(2,4):
    baseUrl = 'https://onepiecetreasurecruise.fr/Artwork/index.php?page=index'
    plusUrl = baseUrl + str(page)
    html = urlopen(plusUrl).read()
    soup = BeautifulSoup(html, 'html.parser')
    img = soup.find_all(class_='card-img-top')
    for i in img:
        listimg.append(i['src'])

n = 1
for index, img_link in enumerate(listimg):
    img_data = rq.get(img_link).content
    with open('./onepiece/' + str(index+1) + '.png', 'wb+') as f:
        f.write(img_data)
    n += 1

Related

How to get all installed font paths?

How can I get the paths of all installed fonts with pywin32?
I can only find a way that uses a registry key, but I would prefer to use GDI or DirectWrite directly.
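For reference, this is roughly the registry-based approach I found (a minimal sketch; it only looks at the machine-wide key, and the key path and the fallback to the Windows Fonts folder are assumptions):

import os
import winreg

def get_fonts_from_registry():
    fonts_dir = os.path.join(os.environ["WINDIR"], "Fonts")
    paths = []
    key = winreg.OpenKey(
        winreg.HKEY_LOCAL_MACHINE,
        r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Fonts",
    )
    try:
        value_count = winreg.QueryInfoKey(key)[1]  # number of values under the key
        for i in range(value_count):
            _, filename, _ = winreg.EnumValue(key, i)
            # Values may be absolute paths; otherwise they are relative to the Fonts folder.
            paths.append(filename if os.path.isabs(filename) else os.path.join(fonts_dir, filename))
    finally:
        winreg.CloseKey(key)
    return paths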
Edit:
I am not sure, but from what I can see, here is how it might be possible with DirectWrite:
Create Factory: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-dwritecreatefactory
GetSystemFontCollection: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefactory-getsystemfontcollection
Do a for loop with GetFontFamilyCount: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontcollection-getfontfamilycount
GetFontFamily: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontcollection-getfontfamily
GetMatchingFonts (the weight, stretch, and style parameters can be anything; they only seem to change the order of the returned list): https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontfamily-getmatchingfonts
Do a for loop with GetFontCount: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontlist-getfontcount
GetFont: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontlist-getfont
CreateFontFace: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefont-createfontface
GetFiles: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontface-getfiles
GetReferenceKey: https://learn.microsoft.com/en-us/windows/win32/api/dwrite/nf-dwrite-idwritefontfile-getreferencekey
Again DWriteCreateFactory but with uuidof IDWriteLocalFontFileLoader
GetFilePathFromKey: https://learn.microsoft.com/en-us/windows/win32/directwrite/idwritelocalfontfileloader-getfilepathfromkey
I found a solution using the DirectWrite API.
This code depends on the pyglet library.
import sys
import time
from ctypes import byref, c_uint32, create_unicode_buffer

from pyglet.font.directwrite import (
    DWriteCreateFactory,
    DWRITE_FACTORY_TYPE_ISOLATED,
    IDWriteFactory,
    IDWriteFont,
    IDWriteFontCollection,
    IDWriteFontFace,
    IDWriteFontFamily,
    IDWriteFontFile,
    IDWriteFontFileLoader,
    IDWriteLocalFontFileLoader,
    IID_IDWriteFactory,
    IID_IDWriteLocalFontFileLoader,
)
from pyglet.libs.win32.types import c_void_p
from typing import List


def get_fonts_filepath() -> List[str]:
    """
    Return a list of the file paths of all installed fonts.
    """
    write_factory = IDWriteFactory()
    DWriteCreateFactory(
        DWRITE_FACTORY_TYPE_ISOLATED, IID_IDWriteFactory, byref(write_factory)
    )
    fonts_path = set()
    sys_collection = IDWriteFontCollection()
    write_factory.GetSystemFontCollection(byref(sys_collection), 0)
    collection_count = sys_collection.GetFontFamilyCount()
    for i in range(collection_count):
        family = IDWriteFontFamily()
        sys_collection.GetFontFamily(i, byref(family))
        font_count = family.GetFontCount()
        for j in range(font_count):
            font = IDWriteFont()
            family.GetFont(j, byref(font))
            font_face = IDWriteFontFace()
            font.CreateFontFace(byref(font_face))
            file_ct = c_uint32()
            font_face.GetFiles(byref(file_ct), None)
            font_files = (IDWriteFontFile * file_ct.value)()
            font_face.GetFiles(byref(file_ct), font_files)
            pff = font_files[0]
            key_data = c_void_p()
            ff_key_size = c_uint32()
            pff.GetReferenceKey(byref(key_data), byref(ff_key_size))
            loader = IDWriteFontFileLoader()
            pff.GetLoader(byref(loader))
            try:
                local_loader = IDWriteLocalFontFileLoader()
                loader.QueryInterface(
                    IID_IDWriteLocalFontFileLoader, byref(local_loader)
                )
            except OSError:  # E_NOTIMPL
                font.Release()
                font_face.Release()
                loader.Release()
                pff.Release()
                continue
            path_len = c_uint32()
            local_loader.GetFilePathLengthFromKey(
                key_data, ff_key_size, byref(path_len)
            )
            buffer = create_unicode_buffer(path_len.value + 1)
            local_loader.GetFilePathFromKey(key_data, ff_key_size, buffer, len(buffer))
            font.Release()
            font_face.Release()
            loader.Release()
            local_loader.Release()
            pff.Release()
            fonts_path.add(buffer.value)
        family.Release()
    sys_collection.Release()
    write_factory.Release()
    return list(fonts_path)


def main():
    start = time.time()
    fonts_path_dwrite = get_fonts_filepath()
    print(time.time() - start)
    print(fonts_path_dwrite)


if __name__ == "__main__":
    sys.exit(main())

Python multiprocessing result does not save on disk

def get_rstname_links(pref_urls, pref_name):
    rstname_links = []
    for link in tqdm.tqdm(pref_urls):
        HEADERS = {"User-Agent": random.choice(AGENT_LIST)}
        try:
            html_content = requests.get(
                link, proxies=proxies, headers=HEADERS, timeout=100, verify=False
            ).text
            soup = BeautifulSoup(html_content, "html.parser")
            hyperlinks = soup.find_all("div", {"class": "rstname"})
            for hyperlink in hyperlinks:
                links = hyperlink.find_all("a")
                for link in links:
                    href = link.get("href")
                    print(href)
                    rstname_links.append(href)
        except:
            pass
    with open(f"{ABSOLUTE_PATH}output/urls/{pref_name}.txt", "w") as f:
        for link in rstname_links:
            f.write("https://example.com" + link + "\n")
    print(f"{pref_name} URL EXTRACTION DONE!")
multiprocessing code:
chunk_size = math.ceil(len(pref_urls) / (mp.cpu_count() * 20))
range_of_chunk = list(
    chunks(range(0, len(pref_urls)), chunk_size)
)
processes = []
for chunk in range_of_chunk:
    process = multiprocessing.Process(
        target=get_rstname_links,
        args=(
            pref_urls[chunk[0] : chunk[-1]],
            pref_name,
        ),
    )
    processes.append(process)
    process.start()
for idx, proc in enumerate(processes):
    proc.join()
Here, after all the processes complete, the files are supposed to be written to disk by the code in the first function, but nothing is saved to disk.
Any solution to this problem?
Okay, I actually solved the problem like this:
def get_rstname_links(pref_urls):
    rstname_links = []
    for link in tqdm.tqdm(pref_urls):
        HEADERS = {"User-Agent": random.choice(AGENT_LIST)}
        try:
            html_content = requests.get(
                link, proxies=proxies, headers=HEADERS, timeout=100, verify=False
            ).text
            # parse the html content
            soup = BeautifulSoup(html_content, "html.parser")
            # get all the hyperlinks in the html content list <li>
            hyperlinks = soup.find_all("div", {"class": "rstname"})
            for hyperlink in hyperlinks:
                links = hyperlink.find_all("a")
                for link in links:
                    href = link.get("href")
                    print(href)
                    rstname_links.append(href)
        except:
            pass
    return rstname_links
changed multiprocessing code:
def url_download(pref_name):
    # parse the arguments
    base_links = get_hyperlinks(f"https://example.com/{pref_name}/")
    pref_urls = visit_links(base_links)
    # rstname_links = get_rstname_links(pref_urls)
    pool = Pool(mp.cpu_count() * 50)
    rstname_links = list(
        tqdm.tqdm(
            pool.imap(get_rstname_links, chunks(pref_urls, 100)),
            total=math.ceil(len(pref_urls) / 100),
        )
    )
    pool.close()
    pool.join()
    rstname_links = [item for sublist in rstname_links for item in sublist]
    print(f"{pref_name} : {len(rstname_links)}")
    return rstname_links
Then, in the main method, write the rstname_links data to disk, as in the sketch below.
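A minimal sketch of that last step, assuming the same ABSOLUTE_PATH, output directory, and base URL as the original function (pref_name here is just an example value):

if __name__ == "__main__":
    pref_name = "tokyo"  # hypothetical prefecture name
    rstname_links = url_download(pref_name)
    # Write the collected links to disk in the parent process,
    # mirroring what the original worker function tried to do itself.
    with open(f"{ABSOLUTE_PATH}output/urls/{pref_name}.txt", "w") as f:
        for link in rstname_links:
            f.write("https://example.com" + link + "\n")
    print(f"{pref_name} URL EXTRACTION DONE!")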

How to add symbols to multiple stocks' data

I have scraped data; below is my code. Now I want to add a column of symbols to the respective company data. Please guide me on how the symbol can be added to the respective firm's data.
Code below:
from time import sleep
import pandas as pd
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
browser = webdriver.Chrome(ChromeDriverManager().install())
symbols =['FATIMA',
'SSGC',
'FCCL',
'ISL',
'KEL',
'NCL',
'DGKC',
'SNGP',
'NML',
'ENGRO',
'HUMNL',
'CHCC',
'ATRL',
'HUBC',
'ASTL',
'PIBTL',
'OGDC',
'EFERT',
'FFC',
'NCPL',
'KTML',
'PSO',
'LUCK',
'SEARL',
'KOHC',
'ABOT',
'AICL',
'HASCOL',
'PTC',
'KAPCO',
'PIOC',
'POL',
'SHEL',
'GHGL',
'HCAR',
'DCR',
'BWCL',
'MTL',
'GLAXO',
'PKGS',
'SHFA','MARI',
'ICI',
'ACPL',
'PSMC',
'SPWL',
'THALL',
'BNWM',
'EFUG',
'GADT',
'AABS']
company = 1
for ThisSymbol in symbols:
    # Get first symbol from the above python list
    company = 2
    # In the URL, make symbol as variable
    url = 'http://www.scstrade.com/stockscreening/SS_CompanySnapShotYF.aspx?symbol=' + ThisSymbol
    browser.get(url)
    sleep(2)
    # The below command will get all the contents from the url
    html = browser.execute_script("return document.documentElement.outerHTML")
    # So we will supply the contents to beautiful soup and we tell to consider this text as a html, with the following command
    soup = BeautifulSoup(html, "html.parser")
    for rn in range(0, 9):
        plist = []
        r = soup.find_all('tr')[rn]
        # Condition: if first row, then th, otherwise td
        if (rn == 0):
            celltag = 'th'
        else:
            celltag = 'td'
        # Now use the celltag instead of using fixed td or th
        col = r.find_all(celltag)
        print()
        if col[i] == 0:
            print("")
        else:
            for i in range(0, 4):
                cell = col[i].text
                clean = cell.replace('\xa0 ', '')
                clean = clean.replace(' ', '')
                plist.append(clean)
        # If first row, create df, otherwise add to it
        if (rn == 0):
            df = pd.DataFrame(plist)
        else:
            df2 = pd.DataFrame(plist)
            colname = 'y' + str(2019 - rn)
            df[colname] = df2
    if (company == 1):
        dft = df.T
        # Get header Column
        head = dft.iloc[0]
        # Exclude first row from the data
        dft = dft[1:]
        dft.columns = head
        dft = dft.reset_index()
        # Assign Headers
        dft = dft.drop(['index'], axis='columns')
    else:
        dft2 = df.T
        # Get header Column
        head = dft2.iloc[0]
        # Exclude first row from the data
        dft2 = dft2[1:]
        dft2.columns = head
        dft2 = dft2.reset_index()
        # Assign Headers
        dft2 = dft2.drop(['index'], axis='columns')
        dft['Symbol'] = ThisSymbol
        dft = dft.append(dft2, sort=['Year', 'Symbol'])
    company = company + 1

dft
My output looks like this; I want to have a Symbol column for each respective firm's data.
I have added
dft['Symbol'] = ThisSymbol
but it only applies the first company's symbol from the list to all of the companies' data.
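A minimal sketch of the usual fix, assuming one transposed DataFrame per company (as the question's code builds with df.T): tag each company's frame with its own symbol before combining, instead of assigning dft['Symbol'] once at the end.

import pandas as pd

frames = []
for ThisSymbol in symbols:
    dft_one = scrape_one_company(ThisSymbol)  # hypothetical helper returning df.T for one symbol
    dft_one['Symbol'] = ThisSymbol            # tag this company's rows before combining
    frames.append(dft_one)

# Combine all companies into one table; every row keeps its own symbol.
dft = pd.concat(frames, ignore_index=True)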

OCR only one side of the image

I was wondering if there's a way to OCR only the document on the right (ignoring the left) without having to split the images in Photoshop or any other image editor?
The problem is that sometimes there is text on the images, and it pollutes my results, as I only need to read the right-hand side.
Kind regards,
O.
## PREPROCESSING (load and read images to OCR and transform them into a DataFrame)

import pytesseract as tess
from tesserocr import PyTessBaseAPI, RIL
import os
from PIL import Image
import pandas as pd
import re
import tesserocr

path = "/Users/oliviervandhuynslager/PycharmProjects/Design Centre/assets/img/"  ## path to directory (folder) where the images are located

count = 0
fileName = []  # create empty list that will contain the original filenames
fullText = []  # create empty list to store the OCR results per file

for imageName in os.listdir(path):
    count = count + 1
    fileName.append(imageName)
    # fileName.sort()  # generate list from texts.

with PyTessBaseAPI(lang='eng') as api:
    for imageName in os.listdir(path):
        inputPath = os.path.join(path, imageName)
        api.SetImageFile(inputPath)
        text = api.GetUTF8Text()
        print(api.AllWordConfidences())
        fullText.append(text)

d = {"FILENAME": fileName, "OCR": fullText}
df = pd.DataFrame(d)

## Generate empty lists
search_material = []
search_product = []
search_manufacturer = []
search_designer = []
search_description = []
search_dimensions = []
search_packing = []
search_price = []
search_delivery = []

## -_-_-_-_-_-_-_-_-_-_-_-_-_-

count_material = 0
count_product = 0
count_maufacturer = 0
count_designer = 0
count_description = 0
count_dimension = 0
count_packing = 0
count_price = 0
count_delivery = 0

## search for PRODUCT (NAME/TITLE)
for values in df["OCR"]:
    try:
        search_product.append((re.search(r'Product[\s\S]+', values).group()).split("\n")[0].split(":")[1])
        count_product = count_product + 1
    except:
        search_product.append("")
df["PRODUCT"] = search_product

## search for MANUFACTURER
for values in df["OCR"]:
    try:
        search_manufacturer.append((re.search(r'Manufacturer[\S\s]+', values).group()).split("\n")[0].split(":")[1])
        count_maufacturer = count_maufacturer + 1
    except:
        search_manufacturer.append("")
df["MANUFACTURER"] = search_manufacturer

## search for DESIGNER
for values in df["OCR"]:
    try:
        search_designer.append((re.search(r'Designer[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
        count_designer = count_designer + 1
    except:
        search_designer.append("")
df["DESIGNER"] = search_designer

## search for MATERIALS
for values in df["OCR"]:
    try:
        search_material.append((re.search(r'Material[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
        count_material = count_material + 1
    except:
        search_material.append("")
df["MATERIAL"] = search_material

# search for DESCRIPTION
for values in df["OCR"]:
    try:
        search_description.append((re.search(r'Description[\S\s]+', values).group()).split(":")[1])
        count_description = count_description + 1
    except:
        search_description.append("")
df["DESCRIPTION"] = search_description

# search for DIMENSIONS
for values in df["OCR"]:
    try:
        search_dimensions.append((re.search(r'Dimensions[\S\s]+', values).group()).split("\n")[0].split(":")[1])
        count_dimension = count_dimension + 1
    except:
        search_dimensions.append("")
df["DIMENSIONS"] = search_dimensions

# search for PACKING
for values in df["OCR"]:
    try:
        search_packing.append((re.search(r'Packing[\S\s]+', values).group()).split('\n\n')[0].split(":")[1])
        count_packing = count_packing + 1
    except:
        search_packing.append("")
df["PACKING"] = search_packing

# search for PRICE
for values in df["OCR"]:
    try:
        search_price.append((re.search(r'Price[\S\s]+', values).group()).split("\n")[0].split(":")[1])
        count_price = count_price + 1
    except:
        search_price.append("")
df["PRICE"] = search_price

# search for DELIVERY DAYS
for values in df["OCR"]:
    try:
        search_delivery.append((re.search(r'Delivery[\S\s]+', values).group()).split("\n\n")[0].split(":")[1])
        count_delivery = count_delivery + 1
    except:
        search_delivery.append("")
df["DELIVERY"] = search_delivery

df.drop(columns="OCR", inplace=True)
print(df)
If the layout of text in your image is fixed, then you can simply read the full image but pass only half of that image array to tesseract.
import cv2

img = cv2.imread(inputPath)
_, width, _ = img.shape      # shape is (height, width, channels)
half = width // 2
cut = img[:, half:, :]       # keep only the right half of the image

temp_path = r'path/where/you/want/your/cropped/image/to/be/saved'
cv2.imwrite(temp_path, cut)

api.SetImageFile(temp_path)  # OCR the cropped image rather than the original
text = api.GetUTF8Text()
print(api.AllWordConfidences())
fullText.append(text)

os.remove(temp_path)  # removing cut image from the directory
Alternate Approach
You can pass the image array cut to tesseract directly instead of saving it and then removing it. In that case, remember to convert the image array cut to RGB format, since OpenCV reads images in BGR format by default:
rgb_arr = cv2.cvtColor(cut, cv2.COLOR_BGR2RGB)
All of this can also be done with PIL. In PIL you can use crop() to extract the required part of the image. PIL also reads images in RGB format by default, so the cropped image can be passed directly to tesseract if you are following the alternate approach mentioned above; a sketch of that variant follows.
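A minimal sketch of the PIL variant, reusing the path, fullText list, and PyTessBaseAPI loop from the question (those names are assumptions carried over from it):

import os
from PIL import Image
from tesserocr import PyTessBaseAPI

fullText = []  # as in the question
with PyTessBaseAPI(lang='eng') as api:
    for imageName in os.listdir(path):
        inputPath = os.path.join(path, imageName)
        img = Image.open(inputPath)  # PIL loads images in RGB order
        width, height = img.size
        right_half = img.crop((width // 2, 0, width, height))  # box is (left, upper, right, lower)
        api.SetImage(right_half)  # tesserocr accepts a PIL image directly
        text = api.GetUTF8Text()
        fullText.append(text)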
You can call the api.SetRectangle method, passing the coordinates of the right half, before recognition.
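A minimal sketch of that approach, again reusing the question's loop; PIL is used here only to read the image dimensions needed to compute the right-half rectangle (an assumption about how you obtain them):

import os
from PIL import Image
from tesserocr import PyTessBaseAPI

with PyTessBaseAPI(lang='eng') as api:
    for imageName in os.listdir(path):
        inputPath = os.path.join(path, imageName)
        width, height = Image.open(inputPath).size
        api.SetImageFile(inputPath)
        # Restrict recognition to the right half: SetRectangle(left, top, width, height)
        api.SetRectangle(width // 2, 0, width - width // 2, height)
        text = api.GetUTF8Text()
        print(text)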

Defining URL list for crawler, syntax issues

I'm currently running the following code:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin

def hltv_match_list(max_offset):
    offset = 0
    while offset < max_offset:
        url = 'http://www.hltv.org/?pageid=188&offset=' + str(offset)
        base = "http://www.hltv.org/"
        soup = BeautifulSoup(requests.get("http://www.hltv.org/?pageid=188&offset=0").content, 'html.parser')
        cont = soup.select("div.covMainBoxContent a[href*=matchid=]")
        href = urljoin(base, (a["href"] for a in cont))
        # print([urljoin(base, a["href"]) for a in cont])
        get_hltv_match_data(href)
        offset += 50

def get_hltv_match_data(matchid_url):
    source_code = requests.get(matchid_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for teamid in soup.findAll("div.covSmallHeadline a[href*=teamid=]"):
        print teamid.string

hltv_match_list(5)
Errors:
File "C:/Users/mdupo/PycharmProjects/HLTVCrawler/Crawler.py", line 12, in hltv_match_list
href = urljoin(base, (a["href"] for a in cont))
File "C:\Python27\lib\urlparse.py", line 261, in urljoin
urlparse(url, bscheme, allow_fragments)
File "C:\Python27\lib\urlparse.py", line 143, in urlparse
tuple = urlsplit(url, scheme, allow_fragments)
File "C:\Python27\lib\urlparse.py", line 182, in urlsplit
i = url.find(':')
AttributeError: 'generator' object has no attribute 'find'
Process finished with exit code 1
I think I'm having trouble with the href = urljoin(base, (a["href"] for a in cont)) part, as I'm trying to create a URL list I can feed into get_hltv_match_data to then capture various items within that page. Am I going about this wrong?
Cheers
You need to join each href as per your commented code:
urls = [urljoin(base,a["href"]) for a in cont]
You are trying to join the base url to a generator, i.e. (a["href"] for a in cont), which makes no sense.
You should also be passing url to requests, or you are going to be requesting the same page over and over:
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
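Putting both fixes together, a sketch of the corrected loop might look like this (keeping the question's Python 2 style and selectors):

def hltv_match_list(max_offset):
    offset = 0
    base = "http://www.hltv.org/"
    while offset < max_offset:
        # request the page for the current offset, not a hard-coded one
        url = 'http://www.hltv.org/?pageid=188&offset=' + str(offset)
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        cont = soup.select("div.covMainBoxContent a[href*=matchid=]")
        # build absolute match URLs and crawl each one
        for match_url in [urljoin(base, a["href"]) for a in cont]:
            get_hltv_match_data(match_url)
        offset += 50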
