def get_rstname_links(pref_urls, pref_name):
    rstname_links = []
    for link in tqdm.tqdm(pref_urls):
        HEADERS = {"User-Agent": random.choice(AGENT_LIST)}
        try:
            html_content = requests.get(
                link, proxies=proxies, headers=HEADERS, timeout=100, verify=False
            ).text
            soup = BeautifulSoup(html_content, "html.parser")
            hyperlinks = soup.find_all("div", {"class": "rstname"})
            for hyperlink in hyperlinks:
                links = hyperlink.find_all("a")
                for link in links:
                    href = link.get("href")
                    print(href)
                    rstname_links.append(href)
        except:
            pass
    with open(f"{ABSOLUTE_PATH}output/urls/{pref_name}.txt", "w") as f:
        for link in rstname_links:
            f.write("https://example.com" + link + "\n")
    print(f"{pref_name} URL EXTRACTION DONE!")
Multiprocessing code:
chunk_size = math.ceil(len(pref_urls) / (mp.cpu_count() * 20))
range_of_chunk = list(
    chunks(range(0, len(pref_urls)), chunk_size)
)
processes = []
for chunk in range_of_chunk:
    process = multiprocessing.Process(
        target=get_rstname_links,
        args=(
            pref_urls[chunk[0] : chunk[-1]],
            pref_name,
        ),
    )
    processes.append(process)
    process.start()
for idx, proc in enumerate(processes):
    proc.join()
Here, after all the processes complete, the files are supposed to be written to disk by the code in the first function, but nothing is saved on disk.
Is there a solution to this problem?
Okay, I actually solved the problem by having each worker return its results and writing everything to disk once, in the parent process:
def get_rstname_links(pref_urls):
    rstname_links = []
    for link in tqdm.tqdm(pref_urls):
        HEADERS = {"User-Agent": random.choice(AGENT_LIST)}
        try:
            html_content = requests.get(
                link, proxies=proxies, headers=HEADERS, timeout=100, verify=False
            ).text
            # parse the html content
            soup = BeautifulSoup(html_content, "html.parser")
            # get all the hyperlinks inside <div class="rstname"> elements
            hyperlinks = soup.find_all("div", {"class": "rstname"})
            for hyperlink in hyperlinks:
                links = hyperlink.find_all("a")
                for link in links:
                    href = link.get("href")
                    print(href)
                    rstname_links.append(href)
        except:
            pass
    return rstname_links
Changed multiprocessing code:
def url_download(pref_name):
    # parse the arguments
    base_links = get_hyperlinks(f"https://example.com/{pref_name}/")
    pref_urls = visit_links(base_links)
    # rstname_links = get_rstname_links(pref_urls)
    pool = Pool(mp.cpu_count() * 50)
    rstname_links = list(
        tqdm.tqdm(
            pool.imap(get_rstname_links, chunks(pref_urls, 100)),
            total=math.ceil(len(pref_urls) / 100),
        )
    )
    pool.close()
    pool.join()
    rstname_links = [item for sublist in rstname_links for item in sublist]
    print(f"{pref_name} : {len(rstname_links)}")
    return rstname_links
Then, in the main method, write the rstname_links data to disk.
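For completeness, here is a minimal sketch of what that main method and the chunks helper (used by both snippets but not shown above) might look like. The if __name__ guard, the example pref_name value, and the body of chunks are assumptions for illustration; the output path and URL prefix come from the first snippet.

def chunks(seq, size):
    # assumed helper: yield successive slices of `size` items from `seq`
    seq = list(seq)
    for i in range(0, len(seq), size):
        yield seq[i:i + size]


if __name__ == "__main__":
    pref_name = "tokyo"  # hypothetical prefecture name
    rstname_links = url_download(pref_name)
    # write once, in the parent process, after all workers have finished
    with open(f"{ABSOLUTE_PATH}output/urls/{pref_name}.txt", "w") as f:
        for link in rstname_links:
            f.write("https://example.com" + link + "\n")
    print(f"{pref_name} URL EXTRACTION DONE!")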
I want to read a docx/pdf/txt file from a URL, but if the file contains non-English characters, reading breaks down. I am able to read every file type through the URL, but I have a problem handling non-English characters: if there is a non-English character in the file, that character gets skipped.
So, is there any way to read a file through a URL without skipping non-English words?
Here is my code for reading the file on Windows.
request = requests.get("url of file")
resp = request.json()
file_path = resp["data"]["file_path"]
_id = resp["data"]["_id"]
number_of_question = resp["data"]["number_of_question"]
file_type = resp["data"]["file_type"]
def get_context(total_pages):
    page_data = []
    for i in range(0, total_pages):
        page = read_pdf.pages[i]
        parts = []

        def visitor_body(text, cm, tm, fontDict, fontSize):
            y = tm[5]
            if y > 70 and y < 780:
                parts.append(text)

        page.extract_text(visitor_text=visitor_body)
        text_body = "".join(parts).strip()
        page_data.append({"page_no": i + 1, "page_content": text_body})
    return page_data
if file_type == "txt":
    response = urllib.request.urlopen(file_path)
    html = response.read()
    Text = html.decode('utf8')
elif file_type == "pdf":
    response = requests.get(file_path)
    my_raw_data = response.content
    with open("my_pdf.pdf", 'wb') as my_data:
        my_data.write(my_raw_data)
    open_pdf_file = open("my_pdf.pdf", 'rb')
    read_pdf = PyPDF2.PdfReader(open_pdf_file)
    total_pages = len(read_pdf.pages)
    Text = get_context(total_pages)
    print(Text)
elif file_type == "docx":
    response = requests.get(file_path)
    my_raw_data = response.content
    with open("my_doc.docx", "wb") as text_file:
        text_file.write(my_raw_data)
    pythoncom.CoInitialize()
    docx2pdf.convert("my_doc.docx")
    open_pdf_file = open("my_doc.pdf", 'rb')
    read_pdf = PyPDF2.PdfReader("my_doc.pdf")
    total_pages = len(read_pdf.pages)
    Text = get_context(total_pages)
else:
    print("Invalid File Type")
Here is the output that I am getting:
[{'page_no': 1, 'page_content': ''}]
Example of a non-English file:
https://www.dropbox.com/s/utq3y82qrbenisg/%E0%A4%95%E0%A4%B2-%E0%A4%95%E0%A5%80-%E0%A4%95%E0%A4%B2-%E0%A4%B8%E0%A5%8B%E0%A4%9A%E0%A5%87%E0%A4%82%E0%A4%97%E0%A5%87-.pdf?dl=0
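The empty page_content above points at the extraction step rather than the download: PyPDF2's extract_text frequently returns an empty string for PDFs that use complex scripts such as Devanagari. Below is a minimal fallback sketch, assuming pdfplumber is installed and the PDF has already been saved as my_pdf.pdf as in the code above (the y-coordinate filtering from visitor_body is not reproduced here).

import pdfplumber


def get_context_fallback(pdf_path="my_pdf.pdf"):
    # pdfplumber often recovers Unicode text that PyPDF2 extracts as empty
    page_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text_body = (page.extract_text() or "").strip()
            page_data.append({"page_no": i + 1, "page_content": text_body})
    return page_data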
I want to crawl multiple pages for images and then download them. However, not all images get downloaded, because they are overwritten inside the for loop.
Below is my code. What is wrong?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq

for page in range(2, 4):
    baseUrl = 'https://onepiecetreasurecruise.fr/Artwork/index.php?page=index'
    plusUrl = baseUrl + str(page)
    html = urlopen(plusUrl).read()
    soup = BeautifulSoup(html, 'html.parser')
    img = soup.find_all(class_='card-img-top')
    listimg = []
    for i in img:
        listimg.append(i['src'])
    n = 1
    for index, img_link in enumerate(listimg):
        img_data = rq.get(img_link).content
        with open('./onepiece/' + str(index+1) + '.png', 'wb+') as f:
            f.write(img_data)
        n += 1
Here is another way to download all the pictures.
from simplified_scrapy import Spider, SimplifiedDoc, utils, SimplifiedMain

class ImageSpider(Spider):
    name = 'onepiecetreasurecruise'
    start_urls = ['https://onepiecetreasurecruise.fr/Artwork/index.php?page=index']
    # refresh_urls = True
    concurrencyPer1s = 0.5  # set download speed
    imgPath = 'images/'

    def __init__(self):
        Spider.__init__(self, self.name)  # necessary
        utils.createDir(self.imgPath)  # create image dir

    def afterResponse(self, response, url, error=None, extra=None):
        try:  # save images
            flag = utils.saveResponseAsFile(response, self.imgPath, 'image')
            if flag: return None
        except Exception as err:
            print(err)
        return Spider.afterResponse(self, response, url, error, extra)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        # image urls
        urls = doc.body.getElements('p', value='card-text').a
        if (urls):
            for u in urls:
                u['header'] = {'Referer': url['url']}
            self.saveUrl(urls)
        # next page urls
        u = doc.body.getElementByText('Suivant', tag='a')
        if (u):
            u['href'] = utils.absoluteUrl(url.url, u.href)
            self.saveUrl(u)
        return True

SimplifiedMain.startThread(ImageSpider())  # start download
I fixed the indentation in your code: the image URLs from every page are collected into one list first and downloaded afterwards, so each file gets a unique index and later pages no longer overwrite earlier ones. This works for me; it downloads 30 images.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq

listimg = []  # all images
for page in range(2, 4):
    baseUrl = 'https://onepiecetreasurecruise.fr/Artwork/index.php?page=index'
    plusUrl = baseUrl + str(page)
    html = urlopen(plusUrl).read()
    soup = BeautifulSoup(html, 'html.parser')
    img = soup.find_all(class_='card-img-top')
    for i in img:
        listimg.append(i['src'])

n = 1
for index, img_link in enumerate(listimg):
    img_data = rq.get(img_link).content
    with open('./onepiece/' + str(index+1) + '.png', 'wb+') as f:
        f.write(img_data)
    n += 1
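One practical note on the download loop: open() does not create the ./onepiece/ directory, so if it is missing the script raises FileNotFoundError before writing a single image. A one-liner (an addition, not part of the original answer) creates it up front:

import os

os.makedirs('./onepiece', exist_ok=True)  # create the target folder if it does not exist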
I am trying to extract chapter titles and their subtitles from the web page at the URL below. This is my spider:
import scrapy
from ..items import ContentsPageSFBItem

class BasicSpider(scrapy.Spider):
    name = "contentspage_sfb"
    # allowed_domains = ["web"]
    start_urls = [
        'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
    ]

    def parse(self, response):
        item = ContentsPageSFBItem()
        item['content_item'] = response.xpath('normalize-space(//ol[@class="detail-toc"]//*/text())').extract()
        length = len(response.xpath('//ol[@class="detail-toc"]//*/text()').extract())
        full_url_list = list()
        title_list = list()
        for i in range(1, length + 1):
            full_url_list.append(response.url)
        item["full_url"] = full_url_list
        title = response.xpath('//title[1]/text()').extract()
        for j in range(1, length + 1):
            title_list.append(title)
        item["title"] = title_list
        return item
Even though I use the normalize-space function in my XPath to remove the whitespace, I get the following result in my CSV:
content_item,full_url,title
"
,Chapter 1,
,
,
,Instructor Introduction,
,00:01:00,
,
,
,Course Overview,
How do I get the result with at most one newline after each entry?
If you want to get all the text within the Table of Contents section, you need to change your XPath expression for item['content_item'] to:
item['content_item'] = response.xpath('//ol[@class="detail-toc"]//a/text()').extract()
You can rewrite your spider code like this:
import scrapy

class BasicSpider(scrapy.Spider):
    name = "contentspage_sfb"
    start_urls = [
        'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
    ]

    def parse(self, response):
        item = dict()  # change dict to your scrapy item
        for link in response.xpath('//ol[@class="detail-toc"]//a'):
            item['link_text'] = link.xpath('text()').extract_first()
            item['link_url'] = response.urljoin(link.xpath('@href').extract_first())
            yield item
# Output:
{'link_text': 'About This E-Book', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/pref00.html#pref00'}
{'link_text': 'Title Page', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/title.html#title'}
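A brief usage note (assuming you export with Scrapy's built-in feed exporter): running the spider with, for example,

scrapy crawl contentspage_sfb -o toc.csv

writes one row per yielded item, so each link text and link URL lands on its own line instead of one cell full of embedded newlines.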
I'm currently running the following code:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin

def hltv_match_list(max_offset):
    offset = 0
    while offset < max_offset:
        url = 'http://www.hltv.org/?pageid=188&offset=' + str(offset)
        base = "http://www.hltv.org/"
        soup = BeautifulSoup(requests.get("http://www.hltv.org/?pageid=188&offset=0").content, 'html.parser')
        cont = soup.select("div.covMainBoxContent a[href*=matchid=]")
        href = urljoin(base, (a["href"] for a in cont))
        # print([urljoin(base, a["href"]) for a in cont])
        get_hltv_match_data(href)
        offset += 50

def get_hltv_match_data(matchid_url):
    source_code = requests.get(matchid_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for teamid in soup.findAll("div.covSmallHeadline a[href*=teamid=]"):
        print teamid.string

hltv_match_list(5)
Errors:
File "C:/Users/mdupo/PycharmProjects/HLTVCrawler/Crawler.py", line 12, in hltv_match_list
href = urljoin(base, (a["href"] for a in cont))
File "C:\Python27\lib\urlparse.py", line 261, in urljoin
urlparse(url, bscheme, allow_fragments)
File "C:\Python27\lib\urlparse.py", line 143, in urlparse
tuple = urlsplit(url, scheme, allow_fragments)
File "C:\Python27\lib\urlparse.py", line 182, in urlsplit
i = url.find(':')
AttributeError: 'generator' object has no attribute 'find'
Process finished with exit code 1
I think I'm having trouble with the href = urljoin(base, (a["href"] for a in cont)) part, as I'm trying to create a URL list that I can feed into get_hltv_match_data to then capture various items within each page. Am I going about this wrong?
Cheers
You need to join each href as per your commented code:
urls = [urljoin(base,a["href"]) for a in cont]
You are trying to join the base URL to a generator, i.e. (a["href"] for a in cont), which makes no sense.
You should also be passing url to requests, or you are going to be requesting the same page over and over.
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
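Putting those two fixes together, hltv_match_list might look like the sketch below; it keeps the question's Python 2 style and assumes get_hltv_match_data is defined as above.

def hltv_match_list(max_offset):
    base = "http://www.hltv.org/"
    offset = 0
    while offset < max_offset:
        # request the page for the current offset instead of offset=0 every time
        url = 'http://www.hltv.org/?pageid=188&offset=' + str(offset)
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        cont = soup.select("div.covMainBoxContent a[href*=matchid=]")
        # join each href individually, then visit each match page
        urls = [urljoin(base, a["href"]) for a in cont]
        for match_url in urls:
            get_hltv_match_data(match_url)
        offset += 50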
My goal is to crawl image URLs and image alt tags using Scrapy. I tried many combinations but still didn't achieve it.
Here is what I tried:
def parse_item(self, response):
    sel = Selector(response)
    item = imageItem()
    item['crawl_time'] = time.asctime( time.localtime(time.time()))
    item['crawl_date'] = time.asctime( time.localtime(time.strftime("%Y%m%d")))
    item['url'] = response.url
    for img in hxs.select('//img'):
        item['title'] = node.xpath("@alt").extract()
        item['iurl'] = node.xpath("@src").extract()
        if response.meta['depth'] == 1:
            exit
    return item
Some issues there:
You already have the sel selector, but you use hxs in the loop.
In the loop, you are using node instead of img.
Doesn't it make more sense for each loop iteration to yield one image item?
This is my tested and working code:
def parse_item(self, response):
    sel = Selector(response)
    images = sel.xpath('//img')
    for img in images:
        item = imageItem()
        item['url'] = response.url
        title = img.xpath('./@alt').extract() or ''
        item_title = title[0] if title else ''
        item['title'] = item_title
        iurl = img.xpath('./@src').extract() or ''
        item_iurl = iurl[0] if iurl else ''
        item['iurl'] = item_iurl
        yield item
Here is the code with which I achieved the result, but the depth is still 1:
class MySpider(CrawlSpider):
    name = 'imageaggr'
    start_urls = ['http://www.dmoz.org/', 'http://timesofindia.indiatimes.com/', 'http://www.nytimes.com', 'http://www.washingtonpost.com/', 'http://www.jpost.com', 'http://www.rediff.com/']
    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(SgmlLinkExtractor(allow=('', ), deny=('defghi\.txt')), callback='parse_item'),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        # Rule(SgmlLinkExtractor(allow=('\.cms','\.html' )), deny=('parse_item\.html'))),
        # Rule(SgmlLinkExtractor(allow=('news', )), callback='parse_item'),
    )

    def parse_item(self, response):
        sel = Selector(response)
        images = sel.xpath('//img')
        image_count = len(images)
        count = 0
        while count < image_count:
            item = imageItem()
            item['url'] = response.url
            title = sel.xpath('//img/@alt').extract()[count] or ''
            if title == '':
                break
            item['title'] = title
            iurl = sel.xpath('//img/@src').extract()[count] or ''
            item['iurl'] = iurl
            item['crawl_time'] = time.asctime( time.localtime(time.time()))
            crawl_date = time.strftime("%Y%m%d")
            item['crawl_date'] = crawl_date
            count = count + 1
        return item