Scrapy: how to use xpaths in a for-loop

I can't figure out what is wrong with the xpath expressions in the for-loop in my parse_reviews function. Why is there no output?
import scrapy

class BookingSpider(scrapy.Spider):
    name = 'booking-hotel-spider'
    allowed_domains = ['booking.com']
    start_urls = [
        'https://www.booking.com/hotel/ch/vision-apartment-milita-rstrasse.de.html?aid=356980;label=gog235jc-1FCAIoLDgcSAdYA2gsiAEBmAEHuAEHyAEP2AEB6AEB-AECiAIBqAIDuAK7q7DyBcACAQ;sid=9132b14809ec97a2f9b60ecaf2954252;breadcrumb=hotel;srpvid=ca2699ebdc4e00cf&'
    ]

    # get the reviews page of a hotel
    def parse(self, response):
        reviewsurl = response.xpath('//a[@class="hp_nav_reviews_link toggle_review track_review_link_zh"]/@href')
        url = response.urljoin(reviewsurl[0].extract())
        url = url.replace('blockdisplay4', 'tab-reviews')
        yield scrapy.Request(url, callback=self.parse_reviews)

    # parse its reviews
    def parse_reviews(self, response):
        for rev in response.xpath('//li[starts-with(@class,"review_list_new_item")]'):
            author = rev.xpath('.//span[@class="bui-avatar-block__title"]/text()').extract()
            print(author)
            authorcountry = rev.xpath('.//span[@class="bui-avatar-block__subtitle"]/text()').extract()
            print(authorcountry)
            title = rev.xpath('.//div[@class="bui-grid__column-10"]//h3/text()').extract()
            print(title)
EDIT:
Desired output:
['Maike', 'Eduard', 'Andrearick', 'Alexander', 'Elena', 'Katia', 'Chris', 'Marianna', 'Kam', 'Rachel', 'Maike', 'Eduard', 'Andrearick', 'Alexander', 'Elena', 'Katia', 'Chris', 'Marianna', 'Kam', 'Rachel', 'Maike', 'Eduard', 'Andrearick', 'Alexander', 'Elena', 'Katia', 'Chris', 'Marianna', 'Kam', 'Rachel']
['Deutschland', 'Deutschland', 'Deutschland', 'Österreich', 'Bulgarien', 'Großbritannien', 'Großbritannien', 'Italien', 'Hongkong', 'Malaysia', 'Deutschland', 'Deutschland', 'Deutschland', 'Österreich', 'Bulgarien', 'Großbritannien', 'Großbritannien', 'Italien', 'Hongkong', 'Malaysia', 'Deutschland', 'Deutschland', 'Deutschland', 'Österreich', 'Bulgarien', 'Großbritannien', 'Großbritannien', 'Italien', 'Hongkong', 'Malaysia']
So far I haven't been able to find the right xpath for title.

Try changing your title to:
title = rev.xpath('.//div[@class="c-review-block__row"]//h3/text()')
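Putting it together, parse_reviews can yield items instead of printing; a minimal sketch, assuming the class names shown in the question and answer are still what booking.com serves:

def parse_reviews(self, response):
    for rev in response.xpath('//li[starts-with(@class,"review_list_new_item")]'):
        yield {
            # extract_first() returns None instead of [] when nothing matches
            'author': rev.xpath('.//span[@class="bui-avatar-block__title"]/text()').extract_first(),
            'authorcountry': rev.xpath('.//span[@class="bui-avatar-block__subtitle"]/text()').extract_first(),
            'title': rev.xpath('.//div[@class="c-review-block__row"]//h3/text()').extract_first(),
        }

Running it with scrapy crawl booking-hotel-spider -o reviews.json makes the output easy to inspect.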

Related

Expand lines with multiple data entries into individual lines with one piece of data each

I have a file in which the first column is an identifier and the rest of each line contains zero or more numeric entries separated by single spaces.
For example:
SOAP.k35.scaffold280 0003723
SOAP.k35.scaffold421
SOAP.k35.scaffold429 0004930 0016021
TRINITY_DN23171_c1_g1_i2 0006457 0005509 0030246 0051082 0005788
SOAP.k35.scaffold599 0007411 0033627 0035001 0016321 0007507 0035011 0007498 0045886 0030155 0030334 0045995 0034446 0005102 0030424 0005604 0030054 0036062 0008021
I would like each trailing numeric entry on its own line, led by the appropriate first-column identifier (i.e. SOAP... or TRINITY...), with " = " between the identifier and the number. I'd also like to drop lines that contain no numbers after the identifier.
For example, the result of processing the text above should be:
SOAP.k35.scaffold280 = 0003723
SOAP.k35.scaffold429 = 0004930
SOAP.k35.scaffold429 = 0016021
TRINITY_DN23171_c1_g1_i2 = 0006457
TRINITY_DN23171_c1_g1_i2 = 0005509
TRINITY_DN23171_c1_g1_i2 = 0030246
...
and so forth.
My primary difficulty is how to hold on to the first-column identifier so I can prepend it to each new line I create while splitting a line by its numeric entries.
Any help is greatly appreciated.
simply,
$ awk '{for(i=2;i<=NF;i++) print $1,"=",$i}' file
SOAP.k35.scaffold280 = 0003723
SOAP.k35.scaffold429 = 0004930
SOAP.k35.scaffold429 = 0016021
TRINITY_DN23171_c1_g1_i2 = 0006457
TRINITY_DN23171_c1_g1_i2 = 0005509
TRINITY_DN23171_c1_g1_i2 = 0030246
TRINITY_DN23171_c1_g1_i2 = 0051082
TRINITY_DN23171_c1_g1_i2 = 0005788
...
Could you please try the following.
awk '(/^SOAP/ || /^TRINITY/){for(i=2;i<=NF;i++){print $1" = "$i}}' Input_file
In case you don't want to restrict the awk program to lines starting with the string SOAP or TRINITY, try the following.
awk '{for(i=2;i<=NF;i++){print $1" = "$i}}' Input_file
Output will be as follows.
SOAP.k35.scaffold280 = 0003723
SOAP.k35.scaffold429 = 0004930
SOAP.k35.scaffold429 = 0016021
TRINITY_DN23171_c1_g1_i2 = 0006457
TRINITY_DN23171_c1_g1_i2 = 0005509
TRINITY_DN23171_c1_g1_i2 = 0030246
TRINITY_DN23171_c1_g1_i2 = 0051082
TRINITY_DN23171_c1_g1_i2 = 0005788
SOAP.k35.scaffold599 = 0007411
SOAP.k35.scaffold599 = 0033627
SOAP.k35.scaffold599 = 0035001
SOAP.k35.scaffold599 = 0016321
SOAP.k35.scaffold599 = 0007507
SOAP.k35.scaffold599 = 0035011
SOAP.k35.scaffold599 = 0007498
SOAP.k35.scaffold599 = 0045886
SOAP.k35.scaffold599 = 0030155
SOAP.k35.scaffold599 = 0030334
SOAP.k35.scaffold599 = 0045995
SOAP.k35.scaffold599 = 0034446
SOAP.k35.scaffold599 = 0005102
SOAP.k35.scaffold599 = 0030424
SOAP.k35.scaffold599 = 0005604
SOAP.k35.scaffold599 = 0030054
SOAP.k35.scaffold599 = 0036062
SOAP.k35.scaffold599 = 0008021
You can try Perl as well:
$ perl -ne ' ($x)=$_=~m/(^\S+)/; while( /\s(\d+)/g ) { print "$x = $1\n" } ' scottc.txt
SOAP.k35.scaffold280 = 0003723
SOAP.k35.scaffold429 = 0004930
SOAP.k35.scaffold429 = 0016021
TRINITY_DN23171_c1_g1_i2 = 0006457
TRINITY_DN23171_c1_g1_i2 = 0005509
TRINITY_DN23171_c1_g1_i2 = 0030246
TRINITY_DN23171_c1_g1_i2 = 0051082
TRINITY_DN23171_c1_g1_i2 = 0005788
SOAP.k35.scaffold599 = 0007411
SOAP.k35.scaffold599 = 0033627
SOAP.k35.scaffold599 = 0035001
SOAP.k35.scaffold599 = 0016321
SOAP.k35.scaffold599 = 0007507
SOAP.k35.scaffold599 = 0035011
. . . . .
. . . . .
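For comparison, the same expansion in plain Python; a minimal sketch that takes the input file name on the command line:

import sys

# Usage: python expand.py Input_file
with open(sys.argv[1]) as fh:
    for line in fh:
        fields = line.split()
        if len(fields) < 2:
            continue  # skip blank lines and identifiers with no numeric entries
        ident = fields[0]
        for entry in fields[1:]:
            print(ident, "=", entry)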

scrapy / python 3.5 : targeting and filtering

I want to extract the following fields: the movie's, director's, and actors' names
from a page on allocine.fr.
This will help me build a template for further scraping.
Here is my badly working code (inside the spiders directory):
from scrapy.contrib.spiders import CrawlSpider, Rule
from cinefil.items import Article
# from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor ==> deprecated
from scrapy.linkextractors import LinkExtractor
from scrapy import log

class CinefilSpider(CrawlSpider):
    name = "cinefil"
    allowed_domains = ["allocine.fr"]
    start_urls = ["http://www.allocine.fr/film/fichefilm_gen_cfilm=29007.html"]
    rules = [
        Rule(LinkExtractor(allow=('(/film/)((?!:).)*$'),), callback="parse_item", follow=False)
    ]

    def parse_item(self, response):
        ROOTPATH = '//div[@class="meta-body-item"]'
        item = Article()
        casiers = response.xpath(ROOTPATH).extract()
        for matos in casiers:
            print("\n----- ------ ------ -------- ---------")
            print(matos)
        return item
For extracting the movie's, director's, and actors' names from the allocine.fr page:
Movie name
#get from <div class="titlebar-title titlebar-title-lg">
>>> movie=response.xpath('//div[@class="titlebar-title titlebar-title-lg"]/text()').extract_first()
>>> movie
u'Spider-Man'
Director name
#start from
#<span itemprop="director">
#<a>
#<span itemprop="name">
>>> director=response.xpath('//span[@itemprop="director"]/a/span[@itemprop="name"]/text()').extract_first()
>>> director
u'Sam Raimi'
Actors name
#Take the word "Avec" as landmark and get its siblings <spans>
>>> movie_stars=response.xpath('//span[contains(text(),"Avec")]/following-sibling::span/text()').extract()
>>> movie_stars
[u'Tobey Maguire', u'Willem Dafoe', u'Kirsten Dunst', u' plus ']
#remove last item 'plus'
>>> movie_stars.pop()
u' plus '
>>> movie_stars
[u'Tobey Maguire', u'Willem Dafoe', u'Kirsten Dunst']
And the items.py should be declared as:
import scrapy

class Movie(scrapy.Item):
    name = scrapy.Field()
    director = scrapy.Field()
    actors = scrapy.Field()
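Tying the three expressions together, a parse_item sketch that fills the Movie item (the selectors come from the answer above and may need updating whenever allocine.fr changes its markup):

def parse_item(self, response):
    item = Movie()
    item['name'] = response.xpath(
        '//div[@class="titlebar-title titlebar-title-lg"]/text()').extract_first()
    item['director'] = response.xpath(
        '//span[@itemprop="director"]/a/span[@itemprop="name"]/text()').extract_first()
    actors = response.xpath(
        '//span[contains(text(),"Avec")]/following-sibling::span/text()').extract()
    # drop the trailing " plus " link text and surrounding whitespace
    item['actors'] = [a.strip() for a in actors if a.strip() != 'plus']
    return item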

normalize-space not working on scrapy

I am trying to extract chapter titles and their subtitles from a web page in the url. This is my spider
import scrapy
from ..items import ContentsPageSFBItem

class BasicSpider(scrapy.Spider):
    name = "contentspage_sfb"
    # allowed_domains = ["web"]
    start_urls = [
        'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
    ]

    def parse(self, response):
        item = ContentsPageSFBItem()
        item['content_item'] = response.xpath('normalize-space(//ol[@class="detail-toc"]//*/text())').extract()
        length = len(response.xpath('//ol[@class="detail-toc"]//*/text()').extract())
        full_url_list = list()
        title_list = list()
        for i in range(1, length + 1):
            full_url_list.append(response.url)
        item["full_url"] = full_url_list
        title = response.xpath('//title[1]/text()').extract()
        for j in range(1, length + 1):
            title_list.append(title)
        item["title"] = title_list
        return item
Even though I use normalize-space() in my xpath to remove the whitespace, I get the following result in my csv:
content_item,full_url,title
"
,Chapter 1,
,
,
,Instructor Introduction,
,00:01:00,
,
,
,Course Overview,
How do I get the result with at most only one new line after each entry?
Note that normalize-space() takes a single string: applied to a node-set, it normalizes only the string value of the first node, which is why it doesn't clean up every entry. If you want to get all the text within the Table of Contents section, change the xpath expression in item['content_item'] to:
item['content_item'] = response.xpath('//ol[@class="detail-toc"]//a/text()').extract()
You can rewrite your spider code like this:
import scrapy

class BasicSpider(scrapy.Spider):
    name = "contentspage_sfb"
    start_urls = [
        'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
    ]

    def parse(self, response):
        for link in response.xpath('//ol[@class="detail-toc"]//a'):
            item = dict()  # change dict to your scrapy item
            item['link_text'] = link.xpath('text()').extract_first()
            item['link_url'] = response.urljoin(link.xpath('@href').extract_first())
            yield item
# Output:
{'link_text': 'About This E-Book', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/pref00.html#pref00'}
{'link_text': 'Title Page', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/title.html#title'}
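If individual strings still carry stray whitespace, collapsing it in Python is easier than doing it in xpath:

def clean(text):
    # collapse runs of spaces and newlines into single spaces
    return ' '.join(text.split())

print(clean('  Chapter 1\n\n  Instructor Introduction  '))  # Chapter 1 Instructor Introduction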

Finding correct xpath

I want to find the correct xpath on the page for the string
"Untersuchung polymerischer Stabilsation in kolloidalen Systemen unter Verwendung neuartiger feldtheoretischer Simulationen" for one item; for the other item the string is "Entwicklung hämatopoietischer Stammzellen aus humanen ES- und iPS-Zellen".
name = 'lianjia'
allowed_domains = ["gepris.dfg.de/gepris/"]
start_urls =['http://gepris.dfg.de/gepris/institution/5000', 'http://gepris.dfg.de/gepris/institution/5008']
My Scrapy program is:
def parse(self, response):
    for sel in response.xpath("//div[@id='detailseite']"):
        lianjia = lianjiaItem()
        lianjia['item1'] = sel.xpath("").extract()
        lianjia['item2'] = sel.xpath("").extract()
        lianjia['item1'] = sel.xpath("//li[@id=4]/ul/li/ul/li[1]/div/div/a/text()").extract()
        lianjia['item2'] = sel.xpath("//li[@id=4]/ul/li/ul/li[2]/div/div/a/text()").extract()
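Two things are worth noting about these expressions. //li inside the loop searches the whole document rather than the current sel; prefix the path with ./ to keep it relative. And anchoring on text you know is on the page is usually more robust than a long positional path; a sketch, where the contains() arguments are only assumptions about the live gepris.dfg.de markup:

def parse(self, response):
    for sel in response.xpath("//div[@id='detailseite']"):
        lianjia = lianjiaItem()
        # anchor on a stable fragment of the link text instead of li positions
        lianjia['item1'] = sel.xpath(
            './/a[contains(text(), "Untersuchung polymerischer")]/text()').extract_first()
        lianjia['item2'] = sel.xpath(
            './/a[contains(text(), "Entwicklung hämatopoietischer")]/text()').extract_first()
        yield lianjia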

How to scrape all image urls and their alt tags using scrapy

My goal is to crawl image URLs and their alt tags using Scrapy. I have tried many combinations but still haven't achieved it.
Here is what I tried:
def parse_item(self, response):
    sel = Selector(response)
    item = imageItem()
    item['crawl_time'] = time.asctime(time.localtime(time.time()))
    item['crawl_date'] = time.asctime(time.localtime(time.strftime("%Y%m%d")))
    item['url'] = response.url
    for img in hxs.select('//img'):
        item['title'] = node.xpath("@alt").extract()
        item['iurl'] = node.xpath("@src").extract()
        if response.meta['depth'] == 1:
            exit
    return item
Some issues there:
You already have a sel selector, but you use hxs in the loop.
In the loop, you are using node instead of img.
It makes more sense for each loop iteration to yield one image item.
This is my tested and working code:
def parse_item(self, response):
    sel = Selector(response)
    images = sel.xpath('//img')
    for img in images:
        item = imageItem()
        item['url'] = response.url
        title = img.xpath('./@alt').extract()
        item['title'] = title[0] if title else ''
        iurl = img.xpath('./@src').extract()
        item['iurl'] = iurl[0] if iurl else ''
        yield item
Here is the code with which I achieved the result, but the depth is still 1:
class MySpider(CrawlSpider):
    name = 'imageaggr'
    start_urls = ['http://www.dmoz.org/', 'http://timesofindia.indiatimes.com/', 'http://www.nytimes.com',
                  'http://www.washingtonpost.com/', 'http://www.jpost.com', 'http://www.rediff.com/']
    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(SgmlLinkExtractor(allow=('', ), deny=('defghi\.txt')), callback='parse_item'),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        # Rule(SgmlLinkExtractor(allow=('\.cms','\.html' )), deny=('parse_item\.html'))),
        # Rule(SgmlLinkExtractor(allow=('news', )), callback='parse_item'),
    )

    def parse_item(self, response):
        sel = Selector(response)
        images = sel.xpath('//img')
        image_count = len(images)
        count = 0
        while count < image_count:
            item = imageItem()
            item['url'] = response.url
            title = sel.xpath('//img/@alt').extract()[count] or ''
            if title == '':
                break
            item['title'] = title
            iurl = sel.xpath('//img/@src').extract()[count] or ''
            item['iurl'] = iurl
            item['crawl_time'] = time.asctime(time.localtime(time.time()))
            crawl_date = time.strftime("%Y%m%d")
            item['crawl_date'] = crawl_date
            count = count + 1
        return item
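Two notes on that last spider. The return at the end hands back only a single item; yielding each item inside the loop, as in the tested code above, emits them all. And crawl depth is governed by Scrapy's DEPTH_LIMIT setting, so if the crawl stops at depth 1, check settings.py, for example:

# settings.py
DEPTH_LIMIT = 3  # 0 (the default) means no depth limit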
