xpath could not recognize predicate for a tag - xpath

I try to use scrapy xpath to scrape a page, but it seems it cannot capture the tag with predicates when I use a for loop,
# This package will contain the spiders of your Scrapy project
from cunyfirst.items import CunyfirstSectionItem
import scrapy
import json
class CunyfristsectionSpider(scrapy.Spider):
name = "cunyfirst-section-spider"
start_urls = ["file:///Users/haowang/Desktop/section.htm"]
def parse(self, response):
url = response.url
yield scrapy.Request(url, self.parse_page)
def parse_page(self, response):
n = -1
for section in response.xpath("//a[contains(#name,'MTG_CLASS_NBR')]"):
print(response.xpath("//a[#name ='MTG_CLASSNAME$10']/text()"))
n += 1
class_num = section.xpath('text()').extract_first()
# print(class_num)
classname = "MTG_CLASSNAME$" + str(n)
date = "MTG_DAYTIME$" + str(n)
instr = "MTG_INSTR$" + str(n)
print(classname)
class_name = response.xpath("//a[#name = classname]/text()")
I am looking for a tags with name as "MTG_CLASSNAME$" + str(n), with n being 0,1,2..., and I am getting empty output from my xpath query. Not sure why...
PS.
I am basically trying to scrape course and their info from https://hrsa.cunyfirst.cuny.edu/psc/cnyhcprd/GUEST/HRMS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL?FolderPath=PORTAL_ROOT_OBJECT.HC_CLASS_SEARCH_GBL&IsFolder=false&IgnoreParamTempl=FolderPath%252cIsFolder&PortalActualURL=https%3a%2f%2fhrsa.cunyfirst.cuny.edu%2fpsc%2fcnyhcprd%2fGUEST%2fHRMS%2fc%2fCOMMUNITY_ACCESS.CLASS_SEARCH.GBL&PortalContentURL=https%3a%2f%2fhrsa.cunyfirst.cuny.edu%2fpsc%2fcnyhcprd%2fGUEST%2fHRMS%2fc%2fCOMMUNITY_ACCESS.CLASS_SEARCH.GBL&PortalContentProvider=HRMS&PortalCRefLabel=Class%20Search&PortalRegistryName=GUEST&PortalServletURI=https%3a%2f%2fhome.cunyfirst.cuny.edu%2fpsp%2fcnyepprd%2f&PortalURI=https%3a%2f%2fhome.cunyfirst.cuny.edu%2fpsc%2fcnyepprd%2f&PortalHostNode=ENTP&NoCrumbs=yes
with filter applied: Kingsborough CC, fall 18, BIO
Thanks!

Well... I've visited the website you put in the question description, I used element inspection and searched for "MTG_CLASSNAME" and I got 0 matches...
So I will give you some tools:
In your settings.py set that:
LOG_FILE = "log.txt"
LOG_STDOUT=True
then print the response body ( response.body ) where you should ( in the top of parse_page function in this case ) and search it in log.txt
Check there if there is what you are looking for.
If there is, use this https://www.freeformatter.com/xpath-tester.html (
or similar ) to check your xpath statement.
In addition, change for section in response.xpath("//a[contains(#name,'MTG_CLASS_NBR')]"):
by for section in response.xpath("//a[contains(#name,'MTG_CLASS_NBR')]").extract():, this will raise an error when you get the data that you are looking for.

Related

Human readable iterables in Sphinx documentation

Sphinx-autodoc flattens dicts, lists, and tuples - making long ones barely readable. Pretty-print format isn't always desired either, as some nested containers are better kept flattened than columned. Is there a way to display iterables as typed in source code?
Get it straight from source, and add an .rst command for it:
# conf.py
from importlib import import_module
from docutils import nodes
from sphinx import addnodes
from inspect import getsource
from docutils.parsers.rst import Directive
class PrettyPrintIterable(Directive):
required_arguments = 1
def run(self):
def _get_iter_source(src, varname):
# 1. identifies target iterable by variable name, (cannot be spaced)
# 2. determines iter source code start & end by tracking brackets
# 3. returns source code between found start & end
start = end = None
open_brackets = closed_brackets = 0
for i, line in enumerate(src):
if line.startswith(varname):
if start is None:
start = i
if start is not None:
open_brackets += sum(line.count(b) for b in "([{")
closed_brackets += sum(line.count(b) for b in ")]}")
if open_brackets > 0 and (open_brackets - closed_brackets == 0):
end = i + 1
break
return '\n'.join(src[start:end])
module_path, member_name = self.arguments[0].rsplit('.', 1)
src = getsource(import_module(module_path)).split('\n')
code = _get_iter_source(src, member_name)
literal = nodes.literal_block(code, code)
literal['language'] = 'python'
return [addnodes.desc_name(text=member_name),
addnodes.desc_content('', literal)]
def setup(app):
app.add_directive('pprint', PrettyPrintIterable)
Example .rst and result:
(:autodata: with empty :annotation: is to exclude the original flattened dictionary).
Some code borrowed from this answer.

normalize-space not working on scrapy

I am trying to extract chapter titles and their subtitles from a web page in the url. This is my spider
import scrapy
from ..items import ContentsPageSFBItem
class BasicSpider(scrapy.Spider):
name = "contentspage_sfb"
#allowed_domains = ["web"]
start_urls = [
'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
]
def parse(self, response):
item = ContentsPageSFBItem()
item['content_item'] = response.xpath('normalize-space(//ol[#class="detail-toc"]//*/text())').extract();
length = len(response.xpath('//ol[#class="detail-toc"]//*/text()').extract()); #extract()
full_url_list = list();
title_list = list();
for i in range(1,length+1):
full_url_list.append(response.url)
item["full_url"] = full_url_list
title = response.xpath('//title[1]/text()').extract();
for j in range(1,length+1):
title_list.append(title)
item["title"] = title_list
return item
Even though I use the normalise fucntion in my xpath to remove the spaces, I get the following result in my csv
content_item,full_url,title
"
,Chapter 1,
,
,
,Instructor Introduction,
,00:01:00,
,
,
,Course Overview,
How do I get the result with at most only one new line after each entry?
If you want to get all text within Table of Contents section you need to change your xpath expression in item['content_item'] to:
item['content_item'] = response.xpath('//ol[#class="detail-toc"]//a/text()').extract()
You can rewrite you spider code like this:
import scrapy
class BasicSpider(scrapy.Spider):
name = "contentspage_sfb"
start_urls = [
'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
]
def parse(self, response):
item = dict() # change dict to your scrapy item
for link in response.xpath('//ol[#class="detail-toc"]//a'):
item['link_text'] = link.xpath('text()').extract_first()
item['link_url'] = response.urljoin(link.xpath('#href').extract_first())
yield item
# Output:
{'link_text': 'About This E-Book', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/pref00.html#pref00'}
{'link_text': 'Title Page', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/title.html#title'}

For loop while using scrapy

I am trying to crawl a website. I want to do it for different dates. So i am storing date in a list. But while trying to access items of list, crawler works only for 1st value in list. Please help. following is my code:
class SpidyQuotesViewStateSpider(scrapy.Spider):
name = 'retail_price'
def start_requests(self):
print "start request"
urls = "http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx"
yield scrapy.Request(url=urls, callback=self.parse)
def parse(self, response):
dated = ["05/03/2017","04/03/2017"]
urls = "http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx"
#frmdata =
cookies1 ={}
val = response.headers.getlist('Set-Cookie')
print "login session values" ,response.headers.getlist('Set-Cookie')
if(len(val) != 0):
cookies1['ASP.NET_SessionId'] = str(response.headers.getlist('Set-Cookie')[0].split(";")[0].split("=")[1])
cookies1['path'] = str(response.headers.getlist('Set-Cookie')[0].split(";")[1].split("=")[1])
print cookies1;
for i in range(len(dated)):
yield scrapy.FormRequest(url=urls, callback=self.parse1, formdata={'ctl00$MainContent$btn_getdata1':"Get Data",
'ctl00$MainContent$Txt_FrmDate':dated[i],
'ctl00$MainContent$Ddl_Rpt_Option0':"Daily Prices",
'ctl00$MainContent$Rbl_Rpt_type':"Price report",
'ctl00$MainContent$ddl_Language':"English",
'ctl00$MainContent$Ddl_Rpt_type':"Retail",
'__EVENTVALIDATION':"IwZyKgfTXVzxiHxiPXGk/W8XQZBDb0EOPxJh6s8hofq0ffqOpiHSH77CafcxySF3PbkYgSMNFCJhLM2cGnL6SxT0PJuGDCJtV0V8Y4a94UErUCiSANiin+4uKckk9v9Ux8JqTVeaipppmlH+wyks2U9SgPfkNUsqw4eHCkDyB5akNNZImRIixOHHVY3JSXGkwXn7ueK9w+AgnqJzpXaWdMr9J1++M4VAFImSNF8brFSfPHe5kb/qzkGIwUr/KRouaRYK8WLWZh/Mbl9xwREwhDSxWJSOdihSE0WWoaqSMtpaR99rDDCsD3mdJqfu0aPIlREupTZRzlrmztXU0eS3949YW+ywdTRvykaMNgOW2Q4saYP5j/niKbRW6GiDnaLV2A38X/HW80+trrsjwJr9tjTKVFyikf6s/3gzyiTp11ivSkwIY2b3hutjYn7OfTDo",
#'__EVENTVALIDATION':"HqVo2xHk04clYwnBposXbZGhbIr181A7RbyeZv74Cia7rXSKmpOpbeSnn3XXnoDJKRxMK0W9nxKZFfkNje+P/K7gE5HVjHJr9Gr0Gs46TntzKDsvzyii8jZ7e0fdZgQCJKoXxQNgR2vNkWqChKcEldBuMHCOgJRqCNCF/JPFKpdKZoIWr7GU8rhzwLijf/Gkm+FuTULs/fl2HHK6Z1QQEozzEHFsDwzl0G4IiN//eNYfHuUBXKZ3wdZzPqG0s53WHEuSBzhqBC9AtCJOs4ZZhdtwFh8iyTJ4PlsLP9DLHYHRCOAd72UO0UH8gT7gAkKVo1I4L540DilowOR9SttH7MM/oOs9qhKlnG61FgqkYGW8zGzF/yNEXO+beVAK1RVvuO+FDnuq/g36TRnUieei5GpAZ+96CSoCIxykdvHx8R+smTNF/5erlowV4ci+tcI7",
'__VIEWSTATEENCRYPTED':"",
'__VIEWSTATEGENERATOR':"85862B00",
'__VIEWSTATE':"+a+3jrBEKxDdkPOzx2wXwKaTMWvCB60WPaHRfJUAZQrdFIpxSqFr5VseTclpGzeHXdxaFnxJe/PkxDKYa7sj3Wiv/os1bNeX0IEB3s45eFsHYWGiU8cvsXCGa5z7rrGRDL5hotg7k/MuUWj8w27xXZO423MN5OsHS+wh+tC/5/Xix+w3zxuQhi8jR5DnreimHbhGZn1sYaKYIGCc8mDIDRNl+w1OZ058F+3LAx96QUu5BYiMYOmrlyxrb9b2yPTmmIrI4NtC4ClBQlxuST5wMDP3vUqqWMhn4auk8ev5gHyPestCRrsAXWs07wDNnikemMwo/4wPiTEbnZQV6SLcDUw0gZpXjXwLI7mhsVjEyVNaQnJp6+Wi6FLsAEEMlFYmQut3JecpVIUkjF9uYSN2GLIbXHPs37AiEXPeQ8E/GyBMx3z1X5l8sw/xSNmFgYQC3riajn8V0+SdkuV2PbNbYKtc+uoSCNLppLYCqiOv5eWanGvAQro2Q67FBA4w2xY+V/K8mzHaGMLoDBxJxLslWyJpL5cX0C6qoXVUu8B028auAQM4eVzH1YPF5qrJiCDo",
#'__VIEWSTATE':"W+m8kNAS6QHiRPo+zFj00EDs/Dbq+y/XvtCmSNwOIkGKlikAlphT8HBAWQDskSm1vdNterBuo0Hy7m4xPbXMOnyEm6IlseXO3jPw+ofnI2WHAKknLil+GeS0IfMWGeoD5aNyiz3zh1jZkKU7R7hQsxwARoHRyjhf8UCooFbkVvL6ddHVYZbH5LcocmCF1BTOCqYN5y5yzfDfYbp3KNW9kH53pdmwCsjiEirdxxUGDoG1Ke3JBEXfSl+4XubirHSR8z+VlFmPPXZGU8mMogwq9Eg822RYjvbwvZG74djcf7kdfB9KXCPO9u6cWIjLiW+cfXHSXD+1XYFVf9ATU2/NV4YbUzsI4PJRwoGD4BryUNIm2JFeT4c8F4REYTA16shxz5mDTFQ6rbmg6SmqP8G9gAc2Hr9ABD8+2BUNabGhNZ8wDIZArfYS4pl5DNrlPlpqeCjhmvv0znKAJSOac3pCUej8G90ZGwQKOPORWbNVzQShoH7QvrXV8pCklcia6psuAGO+Oj72oDWPxedE4DjdjX5TbLoW4bzsk/YNfUv4JpjGR8DWpG8IFYJG9CCjMEYb",
'__LASTFOCUS':"",
'__EVENTARGUMENT':"",
'__EVENTTARGET':"",
'ctl00_MainContent_ToolkitScriptManager1_HiddenField':";;AjaxControlToolkit,+Version=4.1.51116.0,+Culture=neutral,+PublicKeyToken=28f01b0e84b6d53e:en-US:fd384f95-1b49-47cf-9b47-2fa2a921a36a:475a4ef5:addc6819:5546a2b:d2e10b12:effe2a26:37e2e5c9:5a682656:c7029a2:e9e598a9"},method='POST',cookies = cookies1)
def parse1(self, response):
path1 = "id('Panel1')"
value1 = response.xpath(path1).extract_first()
print value1
First of all, you are sending the spider more time on the same site, though with different form parameters. You have therefore to use dont_filter=True in the request, otherwise Scrapy blocks duplicate calls.
Then it seems to me that the site you are scraping don't allow you to make more than one request during the same session. Try for example to go to http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx with your browser, compile the form, get the data and than to go back to the initial page: It's impossible. So you have to modify your spider. Here's a very rough code just to give an idea. It works for me, but please don't use it in production!
class SpidyQuotesViewStateSpider(scrapy.Spider):
name = 'retail_price'
urls = "http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx"
def start_requests(self):
dated = ["01/03/2017","05/03/2017","04/03/2017"]
for i in dated:
request = scrapy.Request(url=self.urls, dont_filter=True, callback=self.parse)
request.meta['question'] = i
yield request
def parse(self, response):
thedate = response.meta['question']
cookies1 ={}
val = response.headers.getlist('Set-Cookie')
print("login session values" ,response.headers.getlist('Set-Cookie'))
if(len(val) != 0):
cookies1['ASP.NET_SessionId'] = str(str(response.headers.getlist('Set-Cookie')[0]).split(";")[0].split("=")[1])
cookies1['path'] = str(str(response.headers.getlist('Set-Cookie')[0]).split(";")[1].split("=")[1])
yield scrapy.FormRequest(url=self.urls, dont_filter=True, callback=self.parse1, formdata={'ctl00$MainContent$btn_getdata1':"Get Data",
'ctl00$MainContent$Txt_FrmDate': thedate,
'ctl00$MainContent$Ddl_Rpt_Option0':"Daily Prices",
'ctl00$MainContent$Rbl_Rpt_type':"Price report",
'ctl00$MainContent$ddl_Language':"English",
'ctl00$MainContent$Ddl_Rpt_type':"Retail",
'__EVENTVALIDATION':"IwZyKgfTXVzxiHxiPXGk/W8XQZBDb0EOPxJh6s8hofq0ffqOpiHSH77CafcxySF3PbkYgSMNFCJhLM2cGnL6SxT0PJuGDCJtV0V8Y4a94UErUCiSANiin+4uKckk9v9Ux8JqTVeaipppmlH+wyks2U9SgPfkNUsqw4eHCkDyB5akNNZImRIixOHHVY3JSXGkwXn7ueK9w+AgnqJzpXaWdMr9J1++M4VAFImSNF8brFSfPHe5kb/qzkGIwUr/KRouaRYK8WLWZh/Mbl9xwREwhDSxWJSOdihSE0WWoaqSMtpaR99rDDCsD3mdJqfu0aPIlREupTZRzlrmztXU0eS3949YW+ywdTRvykaMNgOW2Q4saYP5j/niKbRW6GiDnaLV2A38X/HW80+trrsjwJr9tjTKVFyikf6s/3gzyiTp11ivSkwIY2b3hutjYn7OfTDo",
#'__EVENTVALIDATION':"HqVo2xHk04clYwnBposXbZGhbIr181A7RbyeZv74Cia7rXSKmpOpbeSnn3XXnoDJKRxMK0W9nxKZFfkNje+P/K7gE5HVjHJr9Gr0Gs46TntzKDsvzyii8jZ7e0fdZgQCJKoXxQNgR2vNkWqChKcEldBuMHCOgJRqCNCF/JPFKpdKZoIWr7GU8rhzwLijf/Gkm+FuTULs/fl2HHK6Z1QQEozzEHFsDwzl0G4IiN//eNYfHuUBXKZ3wdZzPqG0s53WHEuSBzhqBC9AtCJOs4ZZhdtwFh8iyTJ4PlsLP9DLHYHRCOAd72UO0UH8gT7gAkKVo1I4L540DilowOR9SttH7MM/oOs9qhKlnG61FgqkYGW8zGzF/yNEXO+beVAK1RVvuO+FDnuq/g36TRnUieei5GpAZ+96CSoCIxykdvHx8R+smTNF/5erlowV4ci+tcI7",
'__VIEWSTATEENCRYPTED':"",
'__VIEWSTATEGENERATOR':"85862B00",
'__VIEWSTATE':"+a+3jrBEKxDdkPOzx2wXwKaTMWvCB60WPaHRfJUAZQrdFIpxSqFr5VseTclpGzeHXdxaFnxJe/PkxDKYa7sj3Wiv/os1bNeX0IEB3s45eFsHYWGiU8cvsXCGa5z7rrGRDL5hotg7k/MuUWj8w27xXZO423MN5OsHS+wh+tC/5/Xix+w3zxuQhi8jR5DnreimHbhGZn1sYaKYIGCc8mDIDRNl+w1OZ058F+3LAx96QUu5BYiMYOmrlyxrb9b2yPTmmIrI4NtC4ClBQlxuST5wMDP3vUqqWMhn4auk8ev5gHyPestCRrsAXWs07wDNnikemMwo/4wPiTEbnZQV6SLcDUw0gZpXjXwLI7mhsVjEyVNaQnJp6+Wi6FLsAEEMlFYmQut3JecpVIUkjF9uYSN2GLIbXHPs37AiEXPeQ8E/GyBMx3z1X5l8sw/xSNmFgYQC3riajn8V0+SdkuV2PbNbYKtc+uoSCNLppLYCqiOv5eWanGvAQro2Q67FBA4w2xY+V/K8mzHaGMLoDBxJxLslWyJpL5cX0C6qoXVUu8B028auAQM4eVzH1YPF5qrJiCDo",
#'__VIEWSTATE':"W+m8kNAS6QHiRPo+zFj00EDs/Dbq+y/XvtCmSNwOIkGKlikAlphT8HBAWQDskSm1vdNterBuo0Hy7m4xPbXMOnyEm6IlseXO3jPw+ofnI2WHAKknLil+GeS0IfMWGeoD5aNyiz3zh1jZkKU7R7hQsxwARoHRyjhf8UCooFbkVvL6ddHVYZbH5LcocmCF1BTOCqYN5y5yzfDfYbp3KNW9kH53pdmwCsjiEirdxxUGDoG1Ke3JBEXfSl+4XubirHSR8z+VlFmPPXZGU8mMogwq9Eg822RYjvbwvZG74djcf7kdfB9KXCPO9u6cWIjLiW+cfXHSXD+1XYFVf9ATU2/NV4YbUzsI4PJRwoGD4BryUNIm2JFeT4c8F4REYTA16shxz5mDTFQ6rbmg6SmqP8G9gAc2Hr9ABD8+2BUNabGhNZ8wDIZArfYS4pl5DNrlPlpqeCjhmvv0znKAJSOac3pCUej8G90ZGwQKOPORWbNVzQShoH7QvrXV8pCklcia6psuAGO+Oj72oDWPxedE4DjdjX5TbLoW4bzsk/YNfUv4JpjGR8DWpG8IFYJG9CCjMEYb",
'__LASTFOCUS':"",
'__EVENTARGUMENT':"",
'__EVENTTARGET':"",
'ctl00_MainContent_ToolkitScriptManager1_HiddenField':";;AjaxControlToolkit,+Version=4.1.51116.0,+Culture=neutral,+PublicKeyToken=28f01b0e84b6d53e:en-US:fd384f95-1b49-47cf-9b47-2fa2a921a36a:475a4ef5:addc6819:5546a2b:d2e10b12:effe2a26:37e2e5c9:5a682656:c7029a2:e9e598a9"},method='POST',cookies = cookies1)
def parse1(self, response):
path1 = "id('Panel1')"
value1 = response.xpath(path1).extract_first()[:574]
print(value1)

Scrapy Pagination Fails

Hello this is my first ever post ,
So I am trying to make a Web Spider that will follow the links in invia.cz and copy all the titles from the hotel.
import scrapy
y=0
class invia(scrapy.Spider):
name = 'Kreta'
start_urls = ['https://dovolena.invia.cz/?d_start_from=13.01.2017&sort=nl_sell&page=1']
def parse(self, response):
for x in range (1, 9):
yield {
'titles':response.css("#main > div > div > div > div.col.col-content > div.product-list > div > ul > li:nth-child(%d)>div.head>h2>a>span.name::text"%(x)).extract() ,
}
if (response.css('#main > div > div > div > div.col.col-content >
div.product-list > div > p >
a.next').extract_first()):
y=y+1
go = ["https://dovolena.invia.cz/d_start_from=13.01.2017&sort=nl_sell&page=%d" % y]
print go
yield scrapy.Request(
response.urljoin(go),
callback=self.parse
)
In this website pages are loading with AJAX so I change the value of the URL manually, incremented by one only if the next button appears in the page.
In the scrapy shell when I test if the button appears and the conditions everything is good but when I start the spider it only crawls the first page.
It's my first spider ever so thanks in advance.
Also the errol log Error Log1 Error Log
Your usage of "global" y variable is not only peculiar but won't work either
You're using y to calculate how many times parse was called. Ideally you don't want to access anything outside of the functions scope, so you can achieve the same thing with using request.meta attribute:
def parse(self, response):
y = response.meta.get('index', 1) # default is page 1
y += 1
# ...
#next page
url = 'http://example.com/?p={}'.format(y)
yield Request(url, self.parse, meta={'index':y})
Regarding your pagination issue, your next page url css selector is incorrect since the <a> node you're selecting doesn't have a absolute href attached to it, also this issue makes your y issue obsolete. To solve this try:
def parse(self, response):
next_page = response.css("a.next::attr(data-page)").extract_first()
# replace "page=1" part of the url with next number
url = re.sub('page=\d+', 'page=' + next_page, response.url)
yield Request(url, self.parse, meta={'index':y})
EDIT: Here's the whole working spider:
import scrapy
import re
class InviaSpider(scrapy.Spider):
name = 'invia'
start_urls = ['https://dovolena.invia.cz/?d_start_from=13.01.2017&sort=nl_sell&page=1']
def parse(self, response):
names = response.css('span.name::text').extract()
for name in names:
yield {'name': name}
# next page
next_page = response.css("a.next::attr(data-page)").extract_first()
url = re.sub('page=\d+', 'page=' + next_page, response.url)
yield scrapy.Request(url, self.parse)

Defining URL list for crawler, syntax issues

I'm currently running the following code:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin
def hltv_match_list(max_offset):
offset = 0
while offset < max_offset:
url = 'http://www.hltv.org/?pageid=188&offset=' + str(offset)
base = "http://www.hltv.org/"
soup = BeautifulSoup(requests.get("http://www.hltv.org/?pageid=188&offset=0").content, 'html.parser')
cont = soup.select("div.covMainBoxContent a[href*=matchid=]")
href = urljoin(base, (a["href"] for a in cont))
# print([urljoin(base, a["href"]) for a in cont])
get_hltv_match_data(href)
offset += 50
def get_hltv_match_data(matchid_url):
source_code = requests.get(matchid_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for teamid in soup.findAll("div.covSmallHeadline a[href*=teamid=]"):
print teamid.string
hltv_match_list(5)
Errors:
File "C:/Users/mdupo/PycharmProjects/HLTVCrawler/Crawler.py", line 12, in hltv_match_list
href = urljoin(base, (a["href"] for a in cont))
File "C:\Python27\lib\urlparse.py", line 261, in urljoin
urlparse(url, bscheme, allow_fragments)
File "C:\Python27\lib\urlparse.py", line 143, in urlparse
tuple = urlsplit(url, scheme, allow_fragments)
File "C:\Python27\lib\urlparse.py", line 182, in urlsplit
i = url.find(':')
AttributeError: 'generator' object has no attribute 'find'
Process finished with exit code 1
I think I'm having trouble with the href = urljoin(base, (a["href"] for a in cont)) part as I'm trying to create a url list I can feed into get_hltv_match_datato then capture various items within that page. Am I going about this wrong?
Cheers
You need to join each href as per your commented code:
urls = [urljoin(base,a["href"]) for a in cont]
You are trying to join the base url to a generator i.e (a["href"] for a in cont) which makes no sense.
You should also be passing the url to requests or you are going to be requesting the same page over and over.
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

Resources