Scrapy doesn't get all data - xpath

I'm trying to scrape this page:
http://binpar.caicyt.gov.ar/cgi-bin/koha/opac-detail.pl?biblionumber=98723
with this code:
def parse_web9(self, response):  # Conicet!!
    for publication in response.css('div#wrap > div.main > div.container-fluid > div.row-fluid > div.span9 > div#catalogue_detail_biblio > div.record'):
        pubtitle = publication.xpath('./h1[@class="title"]/text()').extract_first()
        author = publication.xpath('./span[@class="results_summary publisher"]/span/span/a/text()').extract()
        isxn = publication.xpath('./span[@class="results_summary issn"]/span/text()').re(r'\d+-\d+')
        yield {
            'titulo_publicacion': pubtitle,
            'anio_publicacion': None,
            'isbn': isxn,
            'nombre_autor': author,
            'url_link': None
        }
But I'm only getting the title of the publication, and I'm not sure why.
Cheers!

You should select the inner fields by their property attributes:
$ scrapy shell http://binpar.caicyt.gov.ar/cgi-bin/koha/opac-detail.pl?biblionumber=98723
>>> for publication in response.css('div#wrap > div.main > div.container-fluid > div.row-fluid > div.span9 > div#catalogue_detail_biblio > div.record'):
... author = publication.css("span[property=contributor] span[property=name]::text").extract_first()
... title = publication.css("h1[property=name]::text").extract_first()
... issn = publication.css("span[property=issn]::text").extract_first()
... print(author, title, issn)
...
(u'Asociaci\xf3n Filat\xe9lica de la Rep\xfablica Argentina', u'AFRA, bolet\xedn informativo. ', u'0001-1193.')
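To tie that back to the original item fields, a minimal sketch of parse_web9 built on these property-based selectors could look like the following (the shortened container selector is an assumption, and anio_publicacion / url_link stay None as in the question):
def parse_web9(self, response):
    for publication in response.css('div#catalogue_detail_biblio > div.record'):
        yield {
            'titulo_publicacion': publication.css('h1[property=name]::text').extract_first(),
            'anio_publicacion': None,
            'isbn': publication.css('span[property=issn]::text').re(r'\d+-\d+'),
            'nombre_autor': publication.css('span[property=contributor] span[property=name]::text').extract(),
            'url_link': None
        }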

Related

How do I get href links for elements that have no href?

I want to get the video name and link from https://bubble.io/videos but I cannot find href links. Any ideas?
WebAutomation.LaunchChrome.LaunchChrome Url: $'''https://bubble.io/videos''' WindowState: WebAutomation.BrowserWindowState.Normal ClearCache: False ClearCookies: False WaitForPageToLoadTimeout: 60 Timeout: 60 BrowserInstance=> Browser
LOOP FOREACH CurrentItem IN DataFromWebPage
WebAutomation.ExtractData.ExtractListUsingPagingFromNumberOfPages BrowserInstance: Browser Control: $'''html > body > div:eq(1) > div:eq(3) > div > div:eq(1) > div:eq(0) > div''' ExtractionParameters: {[$'''div > div > div > div:eq(1)''', $'''Own Text''', $''''''] } MaxWebPagesToProcess: 50000 PagerCssSelector: $'''html > body > div:eq(1) > div:eq(3) > div > div:eq(1) > div:eq(1) > div:eq(2)''' PostProcessData: True TimeoutInSeconds: 60 ExtractedData=> DataFromWebPage
WebAutomation.GoToWebPage.GoToWebPage BrowserInstance: Browser Url: DataFromWebPage WaitForPageToLoadTimeout: 60
END
File.WriteToCSVFile.WriteCSV VariableToWrite: DataFromWebPage CSVFile: $'''C:\\Users\\myuser\\Documents\\bubble.csv''' CsvFileEncoding: File.CSVEncoding.UTF8 IncludeColumnNames: False IfFileExists: File.IfFileExists.Overwrite ColumnsSeparator: File.CSVColumnsSeparator.SystemDefault

TomlDecodeError: Key name found without value. Reached end of line When deploying on Heroku

I'm getting this error message when deploying a Streamlit app to Heroku and don't know how to troubleshoot it. I have deployed apps using this setup previously, and removing the config.toml file has always fixed it. But this time is different.
I went looking for help and have had no luck so far: https://discuss.streamlit.io/t/tomldecodeerror-key-name-found-without-value-reached-end-of-line-when-deploying-on-heroku/12285
It looks like this question has left others hanging as well, since this question and this one have the same error message and no answer so far.
Here is my setup.sh file, which I believe is causing the problem:
mkdir -p ~/.streamlit/
echo "\
[general]\n\
email = \"myemail@gmail.com\"\n\
" > ~/.streamlit/credentials.toml
echo "\
[server]\n\
headless = true\n\
enableCORS=false\n\
port = $PORT\n\
" > ~/.streamlit/config.toml
Here is my Procfile:
web: sh setup.sh && streamlit run app.py
Here is the application file itself, app.py:
import streamlit as st
import pandas as pd
import pickle
import plotly.express as px

@st.cache
def load_data(n_rows=3000):
    data = pd.read_csv('https://raw.githubusercontent.com/JonathanBechtel/dat-02-22/main/ClassMaterial/Unit3/data/ks2.csv', nrows=n_rows)
    return data

@st.cache
def group_data(x_axis, y_axis):
    result = data.groupby(x_axis)[y_axis].mean()
    return result

@st.cache
def load_model():
    with open('mod.pkl', 'rb') as mod:
        pipe = pickle.load(mod)
    return pipe

st.title("Understanding Kickstarter Applications -- See How Easy It Is Cindy?")

section = st.sidebar.radio('Section', ['Data Explorer', 'Model Predictions'])
n_rows = st.sidebar.number_input("Enter Number of Rows To Load", min_value=1000, max_value=100000, step=1000)

data = load_data(n_rows)

if section == 'Data Explorer':
    chart_type = st.sidebar.selectbox('Chart Type', ['Bar', 'Line', 'Strip'])
    st.write(data)
    x_axis = st.sidebar.selectbox('Choose Column for X-Axis', ['category', 'main_category', 'country'])
    y_axis = st.sidebar.selectbox('Choose Column for y-axis', ['state', 'goal'])
    st.header(f"Average value for {y_axis} for column {x_axis}")
    if chart_type == 'Bar':
        result = group_data(x_axis, y_axis)
        st.bar_chart(result)
    elif chart_type == 'Line':
        result = group_data(x_axis, y_axis)
        st.line_chart(result)
    else:
        result = data[[x_axis, y_axis]]
        st.plotly_chart(px.strip(result, x=x_axis, y=y_axis, color=x_axis))

elif section == 'Model Predictions':
    with open('mod.pkl', 'rb') as mod:
        pipe = pickle.load(mod)
    print(pipe)
    category = st.sidebar.selectbox('Select A Category', data['category'].unique().tolist())
    main_category = st.sidebar.selectbox('Select a Main Category', data['main_category'].unique().tolist())
    funding_amount = st.sidebar.number_input('Enter Your Funding Amount', min_value=0, value=1000, step=500)
    sample = pd.DataFrame({
        'category': [category],
        'main_category': [main_category],
        'funding_amount': [funding_amount]
    })
    prediction = pipe.predict_proba(sample)
    st.header(f"Predicted Probability of Campaign Success: {prediction[0][1]:.2%}")
My requirements.txt file reads like this:
streamlit == 0.67.0
scikit-learn
pandas
category_encoders == 2.*
xgboost == 1.3.*
protobuf == 3.13.0
plotly == 4.12.0
Any recommendations are welcome, because all wells have been run dry at this point.
You can try:
mkdir -p ~/.streamlit/
echo "[server]\nheadless = true\nport = $PORT\nenableCORS = false\n" > ~/.streamlit/config.toml

normalize-space not working on scrapy

I am trying to extract chapter titles and their subtitles from the web page in the URL below. This is my spider:
import scrapy
from ..items import ContentsPageSFBItem

class BasicSpider(scrapy.Spider):
    name = "contentspage_sfb"
    # allowed_domains = ["web"]
    start_urls = [
        'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
    ]

    def parse(self, response):
        item = ContentsPageSFBItem()
        item['content_item'] = response.xpath('normalize-space(//ol[@class="detail-toc"]//*/text())').extract()
        length = len(response.xpath('//ol[@class="detail-toc"]//*/text()').extract())
        full_url_list = list()
        title_list = list()
        for i in range(1, length + 1):
            full_url_list.append(response.url)
        item["full_url"] = full_url_list
        title = response.xpath('//title[1]/text()').extract()
        for j in range(1, length + 1):
            title_list.append(title)
        item["title"] = title_list
        return item
Even though I use the normalize-space function in my XPath to remove the whitespace, I get the following result in my CSV:
content_item,full_url,title
"
,Chapter 1,
,
,
,Instructor Introduction,
,00:01:00,
,
,
,Course Overview,
How do I get the result with at most only one new line after each entry?
If you want to get all the text within the Table of Contents section, you need to change the XPath expression for item['content_item'] to:
item['content_item'] = response.xpath('//ol[@class="detail-toc"]//a/text()').extract()
You can rewrite your spider code like this:
import scrapy

class BasicSpider(scrapy.Spider):
    name = "contentspage_sfb"
    start_urls = [
        'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
    ]

    def parse(self, response):
        item = dict()  # change dict to your scrapy item
        for link in response.xpath('//ol[@class="detail-toc"]//a'):
            item['link_text'] = link.xpath('text()').extract_first()
            item['link_url'] = response.urljoin(link.xpath('@href').extract_first())
            yield item
# Output:
{'link_text': 'About This E-Book', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/pref00.html#pref00'}
{'link_text': 'Title Page', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/title.html#title'}
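For context on why normalize-space appeared to do nothing: in XPath 1.0, wrapping a whole node-set in normalize-space() returns a single normalized string built from the first matched node only, not a cleaned copy of every text node. If you do want trimmed text per entry, one option (a sketch reusing the selector above) is to normalize per node, or simply strip in Python:
for link in response.xpath('//ol[@class="detail-toc"]//a'):
    text = link.xpath('normalize-space(text())').extract_first()
    # equivalent: text = (link.xpath('text()').extract_first() or '').strip()
    yield {'link_text': text}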

Scrapy Pagination Fails

Hello, this is my first ever post.
I am trying to make a web spider that will follow the links on invia.cz and copy all the hotel titles.
import scrapy

y = 0

class invia(scrapy.Spider):
    name = 'Kreta'
    start_urls = ['https://dovolena.invia.cz/?d_start_from=13.01.2017&sort=nl_sell&page=1']

    def parse(self, response):
        for x in range(1, 9):
            yield {
                'titles': response.css("#main > div > div > div > div.col.col-content > div.product-list > div > ul > li:nth-child(%d) > div.head > h2 > a > span.name::text" % (x)).extract(),
            }

        if response.css('#main > div > div > div > div.col.col-content > div.product-list > div > p > a.next').extract_first():
            y = y + 1
            go = ["https://dovolena.invia.cz/d_start_from=13.01.2017&sort=nl_sell&page=%d" % y]
            print go
            yield scrapy.Request(
                response.urljoin(go),
                callback=self.parse
            )
On this website pages load with AJAX, so I change the URL value manually, incrementing it by one only if the next button appears on the page.
In the Scrapy shell, when I test whether the button appears, the condition works fine, but when I start the spider it only crawls the first page.
It's my first spider ever, so thanks in advance.
Also see the error log screenshots: Error Log1, Error Log.
Your usage of the "global" y variable is not only peculiar, it won't work either.
You're using y to count how many times parse was called. Ideally you don't want to access anything outside of the function's scope, so you can achieve the same thing using the request.meta attribute:
def parse(self, response):
    y = response.meta.get('index', 1)  # default is page 1
    y += 1
    # ...
    # next page
    url = 'http://example.com/?p={}'.format(y)
    yield Request(url, self.parse, meta={'index': y})
Regarding your pagination issue: your next-page CSS selector is incorrect, since the <a> node you're selecting doesn't have an absolute href attached to it. This approach also makes the y counter obsolete. To solve it, try:
def parse(self, response):
    next_page = response.css("a.next::attr(data-page)").extract_first()
    # replace the "page=1" part of the url with the next page number
    url = re.sub(r'page=\d+', 'page=' + next_page, response.url)
    yield Request(url, self.parse)
EDIT: Here's the whole working spider:
import scrapy
import re

class InviaSpider(scrapy.Spider):
    name = 'invia'
    start_urls = ['https://dovolena.invia.cz/?d_start_from=13.01.2017&sort=nl_sell&page=1']

    def parse(self, response):
        names = response.css('span.name::text').extract()
        for name in names:
            yield {'name': name}

        # next page
        next_page = response.css("a.next::attr(data-page)").extract_first()
        url = re.sub(r'page=\d+', 'page=' + next_page, response.url)
        yield scrapy.Request(url, self.parse)
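One caveat (my assumption, not something the answer covers): on the last results page a.next presumably disappears, so extract_first() returns None and 'page=' + next_page would raise a TypeError. A small guard avoids that:
# only follow the next page if the pager link is present
next_page = response.css("a.next::attr(data-page)").extract_first()
if next_page:
    url = re.sub(r'page=\d+', 'page=' + next_page, response.url)
    yield scrapy.Request(url, self.parse)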

How to scrape all the image url and alt tag within it using scrapy

My goal is to crawl the image URL and the image alt tag using Scrapy. I tried many combinations but still didn't achieve it.
Here is what I tried:
def parse_item(self, response):
    sel = Selector(response)
    item = imageItem()
    item['crawl_time'] = time.asctime(time.localtime(time.time()))
    item['crawl_date'] = time.asctime(time.localtime(time.strftime("%Y%m%d")))
    item['url'] = response.url
    for img in hxs.select('//img'):
        item['title'] = node.xpath("@alt").extract()
        item['iurl'] = node.xpath("@src").extract()
        if response.meta['depth'] == 1:
            exit
    return item
Some issues there:
You already have the sel selector, but you use hxs in the loop.
In the loop, you are using node instead of img.
It makes more sense for each loop iteration to yield one image item.
This is my tested and working code:
def parse_item(self, response):
    sel = Selector(response)
    images = sel.xpath('//img')
    for img in images:
        item = imageItem()
        item['url'] = response.url
        title = img.xpath('./@alt').extract() or ''
        item_title = title[0] if title else ''
        item['title'] = item_title
        iurl = img.xpath('./@src').extract() or ''
        item_iurl = iurl[0] if iurl else ''
        item['iurl'] = item_iurl
        yield item
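A small refinement you may want (an addition on my part, not part of the original answer): src attributes are often relative, so joining them against the response URL gives a usable absolute image URL:
src = img.xpath('./@src').extract_first()
item['iurl'] = response.urljoin(src) if src else ''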
Here is the code with which I achieved the result, but the depth is still 1:
class MySpider(CrawlSpider):
    name = 'imageaggr'
    start_urls = ['http://www.dmoz.org/', 'http://timesofindia.indiatimes.com/', 'http://www.nytimes.com',
                  'http://www.washingtonpost.com/', 'http://www.jpost.com', 'http://www.rediff.com/']

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(SgmlLinkExtractor(allow=('',), deny=('defghi\.txt')), callback='parse_item'),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        # Rule(SgmlLinkExtractor(allow=('\.cms', '\.html')), deny=('parse_item\.html'))),
        # Rule(SgmlLinkExtractor(allow=('news',)), callback='parse_item'),
    )

    def parse_item(self, response):
        sel = Selector(response)
        images = sel.xpath('//img')
        image_count = len(images)
        count = 0
        while count < image_count:
            item = imageItem()
            item['url'] = response.url
            title = sel.xpath('//img/@alt').extract()[count] or ''
            if title == '':
                break
            item['title'] = title
            iurl = sel.xpath('//img/@src').extract()[count] or ''
            item['iurl'] = iurl
            item['crawl_time'] = time.asctime(time.localtime(time.time()))
            crawl_date = time.strftime("%Y%m%d")
            item['crawl_date'] = crawl_date
            count = count + 1
        return item
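As for the depth staying at 1, a likely cause (an assumption, since the settings are not shown) is that a CrawlSpider Rule with a callback defaults to follow=False, so links found on the scraped pages are not followed any further. A sketch of the rule with follow=True, plus the optional DEPTH_LIMIT setting if you want to cap the crawl:
rules = (
    # follow=True keeps extracting links from the pages passed to parse_item,
    # so the crawl goes past depth 1
    Rule(SgmlLinkExtractor(allow=('',), deny=('defghi\.txt')),
         callback='parse_item', follow=True),
)
# optionally, in settings.py:
# DEPTH_LIMIT = 3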
