Using scrapy to download google images from multiple urls - image

I am trying to download images from multiple URLs from a search in Google Images.
However, I only want 15 images from each URL.
class imageSpider(BaseSpider):
    name = "image"
    start_urls = [
        'https://google.com/search?q=simpsons&tbm=isch'
        'https://google.com/search?q=futurama&tbm=isch'
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        images = hxs.select("//div[@id='ires']//div//a[@href]")
        count = 0
        for image in images:
            count += 1
            item = ImageItem()
            image_url = image.select(".//img[@src]")[0].extract()

            import urlparse
            image_absolute_url = urlparse.urljoin(response.url, image_url.strip())
            index = image_absolute_url.index("src")
            changedUrl = image_absolute_url[index+5:len(image_absolute_url)-2]
            item['image_urls'] = [changedUrl]
            index1 = site['url'].index("search?q=")
            index2 = site['url'].index("&tbm=isch")
            imageName = site['url'][index1+9:index2]
            download(changedUrl, imageName + str(count) + ".png")
            items.append(item)
            if count == 15:
                break
        return items
The download function downloads the images (I have code for that; that's not the problem).
The problem is that when I break, it stops at the first URL and never continues on to the next URL. How can I make it download 15 images for the first URL and then 15 images for the second URL? I am using break because there are about 1000 images on every Google Images page and I don't want that many.

The problem is not the break statement; you have missed a comma in start_urls.
It should be like this:
start_urls = [
    'http://google.com/search?q=simpsons&tbm=isch',
    'http://google.com/search?q=futurama&tbm=isch'
]
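Without that comma, Python's implicit string-literal concatenation silently joins the two adjacent literals into one (invalid) URL, so the spider only ever sees a single start URL:

start_urls = [
    'https://google.com/search?q=simpsons&tbm=isch'
    'https://google.com/search?q=futurama&tbm=isch'
]
print(len(start_urls))   # 1
print(start_urls[0])
# 'https://google.com/search?q=simpsons&tbm=ischhttps://google.com/search?q=futurama&tbm=isch'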

Related

Extracting web address and using for loop

I am trying to extract the websites of the members from https://www.mhi.org/members. So, I wrote code to visit the member pages one by one and extract the web addresses. I am using the BeautifulSoup library for the extraction.
However, my problem is not the implementation of BeautifulSoup but the for loop. The above link has 15 members per page. When I try to run the code for one page, it returns the web address of only one member. I have imported all the required libraries.
url = input('Enter URL:')
#position = int(input('Enter position:'))-1
html = urlopen(url).read()

lst1 = list()
lst = list()
lst2 = list()
lst3 = list()
lst4 = list()

soup = BeautifulSoup(html, "html.parser")
url1 = "https://www.mhi.org"

conn = sqlite3.connect('list.sqlite')
cur = conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS weblinks (URL TEXT UNIQUE)''')
cur.execute('''CREATE TABLE IF NOT EXISTS weblinks1 (website TEXT UNIQUE)''')
cur.execute('''CREATE TABLE IF NOT EXISTS weblinks2 (website1 TEXT UNIQUE)''')

#print(soup)
for link in soup.find_all('a'):
    lst.append(link.get('href'))

for links in lst:
    if 'members' in links:
        url2 = urllib.parse.urljoin(url1, links)
        lst2.append(url2)

#print(lst2)
url3 = lst2[4:18]
print(url3)

for x in url3:
    html1 = urlopen(x).read()
    soup1 = BeautifulSoup(html1, "html.parser")
    for url4 in soup1.find_all('a'):
        url4 = url4.get('href')
        if url4 not in lst3:
            lst3.append(url4)

url6 = lst3.pop(108)
print(url6)
So, the last "for loop" where url6 is the desired output. It just prints output one web address.
Please advise what am I missing here.
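If the intent is to collect one website per member page, here is a minimal sketch of gathering the addresses inside the loop instead of popping a single index afterwards; it reuses url3 from the question, and the rule for spotting the member's own website (first external link on the page) is only an assumption:

from urllib.request import urlopen
from bs4 import BeautifulSoup

member_sites = []
for x in url3:
    soup1 = BeautifulSoup(urlopen(x).read(), "html.parser")
    for a in soup1.find_all('a'):
        href = a.get('href')
        # assumption: the member's own website is the first absolute link not pointing back to mhi.org
        if href and href.startswith('http') and 'mhi.org' not in href:
            member_sites.append(href)
            break

print(member_sites)  # one entry per member page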

Iterating through Beautiful Soup

I have a list of URLs and I am trying to scrape each of them to produce a single data frame. My code is below:
res = []
for link in url_list:
    html = urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')
    headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]
    headers = headers[1:]
    rows = soup.findAll('tr')[2:]
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
                    for i in range(len(rows))]
    stats = pd.DataFrame(player_stats, columns=headers)
I only get output for one web page. Why is that?
You are overwriting your dataframe each time in your for loop. If you have a lot of pages, you can collect the individual dataframes and concatenate them.
df_hold_list = []
for link in url_list:
    html = urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')
    headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]
    headers = headers[1:]
    rows = soup.findAll('tr')[2:]
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
                    for i in range(len(rows))]
    df_hold_list.append(pd.DataFrame(player_stats, columns=headers))

stats = pd.concat(df_hold_list)
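If the per-page row numbering doesn't matter, pd.concat can also renumber the combined frame:

stats = pd.concat(df_hold_list, ignore_index=True)  # rows re-indexed 0..N-1 across all pages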

OCR only image on one side

I was wondering if there's a way to only OCR the document on the right (ignoring the left) without having to split the images in PS or any other image editor?
The problem is that sometimes there is text on the images, and it pollutes my results, as I only need to read the right-hand side.
Kind regards,
O.
## PREPROCESSING (load and read images to OCR and transform them into a DataFrame)
import pytesseract as tess
from tesserocr import PyTessBaseAPI, RIL
import os
from PIL import Image
import pandas as pd
import re
import tesserocr
path = "/Users/oliviervandhuynslager/PycharmProjects/Design Centre/assets/img/" ##path to directory (folder) where the images are located
count = 0
fileName = [] #create empty list that will contain the original filenames
fullText = [] #create empty list to store the OCR results per file
for imageName in os.listdir(path):
    count = count + 1
    fileName.append(imageName)

# fileName.sort()  # generate list from texts.

with PyTessBaseAPI(lang='eng') as api:
    for imageName in os.listdir(path):
        inputPath = os.path.join(path, imageName)
        api.SetImageFile(inputPath)
        text = api.GetUTF8Text()
        print(api.AllWordConfidences())
        fullText.append(text)

d = {"FILENAME": fileName, "OCR": fullText}
df = pd.DataFrame(d)
##Generate empty lists
search_material = []
search_product = []
search_manufacturer = []
search_designer = []
search_description = []
search_dimensions = []
search_packing = []
search_price = []
search_delivery = []
## -_-_-_-_-_-_-_-_-_-_-_-_-_-
count_material = 0
count_product = 0
count_maufacturer = 0
count_designer = 0
count_description = 0
count_dimension = 0
count_packing = 0
count_price = 0
count_delivery = 0
## search for PRODUCT (NAME/TITLE)
for values in df["OCR"]:
try:
search_product.append((re.search(r'Product[\s\S]+', values).group()).split("\n")[0].split(":")[1])
count_product = count_product + 1
except:
search_product.append("")
df["PRODUCT"] = search_product
## search for MANUFACTURER
for values in df["OCR"]:
try:
search_manufacturer.append((re.search(r'Manufacturer[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_maufacturer = count_maufacturer + 1
except:
search_manufacturer.append("")
df["MANUFACTURER"] = search_manufacturer
## search for DESIGNER
for values in df["OCR"]:
try:
search_designer.append((re.search(r'Designer[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
count_designer = count_designer + 1
except:
search_designer.append("")
df["DESIGNER"] = search_designer
## search for MATERIALS
for values in df["OCR"]:
try:
search_material.append((re.search(r'Material[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
count_material = count_material + 1
except:
search_material.append("")
df["MATERIAL"] = search_material
#search for DESCRIPTION:
for values in df["OCR"]:
try:
search_description.append((re.search(r'Description[\S\s]+', values).group()).split(":")[1])
count_description = count_description + 1
except:
search_description.append("")
df["DESCRIPTION"] = search_description
#search for DIMENSIONS
for values in df["OCR"]:
try:
search_dimensions.append((re.search(r'Dimensions[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_dimension = count_dimension + 1
except:
search_dimensions.append("")
df["DIMENSIONS"] = search_dimensions
#search for PACKING
for values in df["OCR"]:
try:
search_packing.append((re.search(r'Packing[\S\s]+', values).group()).split('\n\n')[0].split(":")[1])
count_packing = count_packing + 1
except:
search_packing.append("")
df["PACKING"] = search_packing
#search for PRICE
for values in df["OCR"]:
try:
search_price.append((re.search(r'Price[\S\s]+', values).group()).split("\n")[0].split(":")[1])
count_price = count_price + 1
except:
search_price.append("")
df["PRICE"] = search_price
#search for DELIVERY DAYS
for values in df["OCR"]:
try:
search_delivery.append((re.search(r'Delivery[\S\s]+', values).group()).split("\n\n")[0].split(":")[1])
count_delivery = count_delivery + 1
except:
search_delivery.append("")
df["DELIVERY"] = search_delivery
df.drop(columns="OCR", inplace=True)
print(df)
If the layout of the text on your image is fixed, then you can simply read the full image but pass only the right half of the image array to tesseract.

import cv2

img = cv2.imread(inputPath)
_, width, _ = img.shape           # OpenCV arrays are (height, width, channels)
half = width // 2
cut = img[:, half:, :]            # keep only the right half of the page

temp_path = r'path/where/you/want/your/cropped/image/to/be/saved'
cv2.imwrite(temp_path, cut)

api.SetImageFile(temp_path)       # OCR the cropped image instead of the original
text = api.GetUTF8Text()
print(api.AllWordConfidences())
fullText.append(text)

os.remove(temp_path)  # removing the cropped image from the directory
Alternate Approach
You can pass the image array cut to tesseract directly instead of saving it and then removing it. In that case, remember to convert the array cut to RGB format, since OpenCV uses BGR format by default when reading images.
rgb_arr = cv2.cvtColor(cut, cv2.COLOR_BGR2RGB)
All of this can also be done with PIL. In PIL you can use crop() to extract the required part of the image. PIL also reads images in RGB format by default, so they can be passed directly to tesseract if you are following the alternate approach mentioned above.
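For instance, a minimal sketch of that alternate approach, assuming the api and cut variables from the code above (tesserocr's SetImage accepts a PIL image):

from PIL import Image
import cv2

rgb_arr = cv2.cvtColor(cut, cv2.COLOR_BGR2RGB)   # cut is the BGR right-half array from above
api.SetImage(Image.fromarray(rgb_arr))           # no temporary file needed
text = api.GetUTF8Text()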
You can call the api.SetRectangle method, passing the coordinates of the right half, before recognition.
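A rough sketch of that suggestion, assuming the api and inputPath variables from the question (SetRectangle takes left, top, width, height in pixels and restricts recognition to that region):

from PIL import Image

width, height = Image.open(inputPath).size
api.SetImageFile(inputPath)
api.SetRectangle(width // 2, 0, width - width // 2, height)  # OCR only the right half
text = api.GetUTF8Text()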

normalize-space not working on scrapy

I am trying to extract chapter titles and their subtitles from the web page at the URL below. This is my spider:
import scrapy
from ..items import ContentsPageSFBItem

class BasicSpider(scrapy.Spider):
    name = "contentspage_sfb"
    #allowed_domains = ["web"]
    start_urls = [
        'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
    ]

    def parse(self, response):
        item = ContentsPageSFBItem()
        item['content_item'] = response.xpath('normalize-space(//ol[@class="detail-toc"]//*/text())').extract();
        length = len(response.xpath('//ol[@class="detail-toc"]//*/text()').extract()); #extract()
        full_url_list = list();
        title_list = list();
        for i in range(1,length+1):
            full_url_list.append(response.url)
        item["full_url"] = full_url_list
        title = response.xpath('//title[1]/text()').extract();
        for j in range(1,length+1):
            title_list.append(title)
        item["title"] = title_list
        return item
Even though I use the normalize-space function in my XPath to remove the spaces, I get the following result in my CSV:
content_item,full_url,title
"
,Chapter 1,
,
,
,Instructor Introduction,
,00:01:00,
,
,
,Course Overview,
How do I get the result with at most only one new line after each entry?
If you want to get all the text within the Table of Contents section, you need to change the XPath expression in item['content_item'] to:
item['content_item'] = response.xpath('//ol[@class="detail-toc"]//a/text()').extract()
You can rewrite your spider code like this:
import scrapy

class BasicSpider(scrapy.Spider):
    name = "contentspage_sfb"
    start_urls = [
        'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
    ]

    def parse(self, response):
        item = dict()  # change dict to your scrapy item
        for link in response.xpath('//ol[@class="detail-toc"]//a'):
            item['link_text'] = link.xpath('text()').extract_first()
            item['link_url'] = response.urljoin(link.xpath('@href').extract_first())
            yield item
# Output:
{'link_text': 'About This E-Book', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/pref00.html#pref00'}
{'link_text': 'Title Page', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/title.html#title'}
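As an aside, normalize-space() applied to a node-set only normalizes the string value of the first node, which is why it seemed to have no effect. If you prefer to keep the original item layout, one option (a sketch, not part of the answer above) is to strip each extracted title in Python:

item['content_item'] = [
    t.strip()
    for t in response.xpath('//ol[@class="detail-toc"]//a/text()').extract()
    if t.strip()
]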

Detect duplicate videos from YouTube

In consideration of my M.Tech project,
I want to know if there is any algorithm to detect duplicate videos on YouTube.
For example (here are links to two videos):
random user upload
upload by official channel
Of these, the second is the official video, and T-Series holds its copyright.
Is YouTube officially doing something to remove duplicate videos?
Not only videos; duplicate YouTube channels exist as well.
Sometimes the original video has fewer views than the pirated version.
So, while searching, I found this
(see page [49] of the PDF)
What I learnt from the given link:
A classifier is used to distinguish the original video from copyright-infringing copies.
Given a query, the top k search results are first retrieved. Thereafter, three parameters are used to classify the videos:
Number of subscribers
User profile
Username popularity
On the basis of these parameters, the original video is identified as described in the link.
EDIT 1:
There are basically two different objectives:
To identify the original video with the above method
To eliminate the duplicate videos
Obviously, identifying the original video is easier than finding all the duplicate videos, so I preferred to first find the original video.
The approach I can think of so far to improve the accuracy:
First find the original videos with the above method.
Then use the most popular publicized frames (maybe multiple) of that video to search on Google Images. This retrieves a list of duplicate videos from the Google Images search results.
After getting these duplicate videos, we can once again compare frame by frame and reach a level of confidence that the retrieved videos are "exact" or "almost" duplicate copies of the original video.
Will this approach work? If not, is there any better algorithm to improve upon the given method?
Please write in the comments section if I am unable to explain my approach clearly. I will soon add some more details.
I've recently hacked together a small tool for that purpose. It's still a work in progress but usually pretty accurate. The idea is simply to compare the time between brightness maxima in the center of the video, so it should work across different resolutions, frame rates, and rotations of the video.
ffmpeg is used for decoding, imageio as the bridge to Python, numpy/scipy for the maxima computation, and some k-nearest-neighbor library (annoy, cyflann, hnsw) for comparison.
At the moment it's not polished at all, so you should know a little Python to run it or simply copy the idea.
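The tool itself isn't shown here, but a rough sketch of the described idea (my own illustration, assuming imageio's ffmpeg reader and scipy's peak finder; the function name is made up) could look like this:

import imageio
import numpy as np
from scipy.signal import find_peaks

def brightness_peak_times(video_path, crop_frac=0.5):
    # decode with ffmpeg via imageio and track the mean brightness of the center crop
    reader = imageio.get_reader(video_path, 'ffmpeg')
    fps = reader.get_meta_data()['fps']
    brightness = []
    for frame in reader:
        h, w = frame.shape[:2]
        ch, cw = int(h * crop_frac), int(w * crop_frac)
        center = frame[(h - ch) // 2:(h + ch) // 2, (w - cw) // 2:(w + cw) // 2]
        brightness.append(center.mean())
    peaks, _ = find_peaks(np.asarray(brightness), distance=int(fps))  # at most ~one peak per second
    return peaks / fps  # timestamps in seconds

# videos whose gaps between brightness peaks nearly match are duplicate candidates;
# the actual tool compares these gap sequences with a k-nearest-neighbor index
gaps_a = np.diff(brightness_peak_times("video_a.mp4"))
gaps_b = np.diff(brightness_peak_times("video_b.mp4"))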
I had the same problem, so I wrote a program myself.
The problem was that I had videos of various formats and resolutions, so I needed to take a hash of each video frame and compare them.
https://github.com/gklc811/duplicate_video_finder
You can just change the directories at the top and you are good to go.
from os import path, walk, makedirs, rename
from time import clock
from imagehash import average_hash
from PIL import Image
from cv2 import VideoCapture, CAP_PROP_FRAME_COUNT, CAP_PROP_FRAME_WIDTH, CAP_PROP_FRAME_HEIGHT, CAP_PROP_FPS
from json import dump, load
from multiprocessing import Pool, cpu_count

input_vid_dir = r'C:\Users\gokul\Documents\data\\'
json_dir = r'C:\Users\gokul\Documents\db\\'
analyzed_dir = r'C:\Users\gokul\Documents\analyzed\\'
duplicate_dir = r'C:\Users\gokul\Documents\duplicate\\'

if not path.exists(json_dir):
    makedirs(json_dir)

if not path.exists(analyzed_dir):
    makedirs(analyzed_dir)

if not path.exists(duplicate_dir):
    makedirs(duplicate_dir)


def write_to_json(filename, data):
    file_full_path = json_dir + filename + ".json"
    with open(file_full_path, 'w') as file_pointer:
        dump(data, file_pointer)
    return


def video_to_json(filename):
    file_full_path = input_vid_dir + filename
    start = clock()
    size = round(path.getsize(file_full_path) / 1024 / 1024, 2)
    video_pointer = VideoCapture(file_full_path)
    frame_count = int(VideoCapture.get(video_pointer, int(CAP_PROP_FRAME_COUNT)))
    width = int(VideoCapture.get(video_pointer, int(CAP_PROP_FRAME_WIDTH)))
    height = int(VideoCapture.get(video_pointer, int(CAP_PROP_FRAME_HEIGHT)))
    fps = int(VideoCapture.get(video_pointer, int(CAP_PROP_FPS)))

    success, image = video_pointer.read()
    video_hash = {}
    while success:
        frame_hash = average_hash(Image.fromarray(image))
        video_hash[str(frame_hash)] = filename
        success, image = video_pointer.read()

    stop = clock()
    time_taken = stop - start
    print("Time taken for ", file_full_path, " is : ", time_taken)

    data_dict = dict()
    data_dict['size'] = size
    data_dict['time_taken'] = time_taken
    data_dict['fps'] = fps
    data_dict['height'] = height
    data_dict['width'] = width
    data_dict['frame_count'] = frame_count
    data_dict['filename'] = filename
    data_dict['video_hash'] = video_hash

    write_to_json(filename, data_dict)
    return


def multiprocess_video_to_json():
    files = next(walk(input_vid_dir))[2]
    processes = cpu_count()
    print(processes)
    pool = Pool(processes)
    start = clock()
    pool.starmap_async(video_to_json, zip(files))
    pool.close()
    pool.join()
    stop = clock()
    print("Time Taken : ", stop - start)


def key_with_max_val(d):
    max_value = 0
    required_key = ""
    for k in d:
        if d[k] > max_value:
            max_value = d[k]
            required_key = k
    return required_key


def duplicate_analyzer():
    files = next(walk(json_dir))[2]
    data_dict = {}
    for file in files:
        filename = json_dir + file
        with open(filename) as f:
            data = load(f)
        video_hash = data['video_hash']
        count = 0
        duplicate_file_dict = dict()
        for key in video_hash:
            count += 1
            if key in data_dict:
                if data_dict[key] in duplicate_file_dict:
                    duplicate_file_dict[data_dict[key]] = duplicate_file_dict[data_dict[key]] + 1
                else:
                    duplicate_file_dict[data_dict[key]] = 1
            else:
                data_dict[key] = video_hash[key]
        if duplicate_file_dict:
            duplicate_file = key_with_max_val(duplicate_file_dict)
            duplicate_percentage = ((duplicate_file_dict[duplicate_file] / count) * 100)
            if duplicate_percentage > 50:
                file = file[:-5]
                print(file, " is dup of ", duplicate_file)
                src = analyzed_dir + file
                tgt = duplicate_dir + file
                if path.exists(src):
                    rename(src, tgt)
                # else:
                #     print("File already moved")


def mv_analyzed_file():
    files = next(walk(json_dir))[2]
    for filename in files:
        filename = filename[:-5]
        src = input_vid_dir + filename
        tgt = analyzed_dir + filename
        if path.exists(src):
            rename(src, tgt)
        # else:
        #     print("File already moved")


if __name__ == '__main__':
    mv_analyzed_file()
    multiprocess_video_to_json()
    mv_analyzed_file()
    duplicate_analyzer()
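As a side note on why per-frame hashes work here: imagehash's average_hash is a perceptual hash, so visually similar frames produce identical or near-identical hashes even at different resolutions. A tiny illustration (the file names are hypothetical):

from PIL import Image
from imagehash import average_hash

h1 = average_hash(Image.open("frame_720p.png"))
h2 = average_hash(Image.open("frame_1080p.png"))  # same frame at a different resolution
print(h1 == h2)   # often True for the same visual content
print(h1 - h2)    # Hamming distance; small values indicate near-duplicates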
