How can I change the scrapy download image name in pipelines?

from __future__ import unicode_literals
import sys
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
import os
reload(sys)
sys.setdefaultencoding('utf-8')
class TetePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        item['image'] = []
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Items contains no images')
        item['image_paths'] = image_paths
        for i in item['image_paths']:
            item['image'].append(item['image_titles'] + i[-8:])
        item['image_paths'] = item['image']
        return item
Scrapy version: 1.0
This is my code. It downloads the images, but the file names are the SHA1 hash of the image URL.
I want to give each image a custom name, in this example item['image_titles'] + i[-8:]. In the Scrapy shell, item['image_titles'] + i[-8:] produces the expected output, so where is the problem?

class TetePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        image_guid = request.url.split('/')[-1]
        image_name = item['image_titles'] + image_guid[-8:]
        return image_name
Override the file_path method and return the custom image_name. get_media_requests only schedules the download; by the time item_completed runs, the files have already been saved under the default name, so the name has to be decided in file_path.
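For the pipeline to take effect it also has to be enabled in settings.py. A minimal sketch, assuming the project module is called tete (the module path is a placeholder; adjust it to your own project layout):

ITEM_PIPELINES = {
    'tete.pipelines.TetePipeline': 1,
}
IMAGES_STORE = '/path/to/image/store'   # directory under which the file_path() results are saved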

Related

Possible to replace Scrapy's default lxml parser with Beautiful Soup's html5lib parser?

Question: Is there a way to integrate BeautifulSoup's html5lib parser into a Scrapy project, instead of Scrapy's default lxml parser?
Scrapy's parser fails for some elements of my scraped pages.
This only happens on about 2 out of every 20 pages.
As a fix, I've added BeautifulSoup's parser to the project (which works).
That said, I feel like I'm doubling the work with conditionals and multiple parsers... at a certain point, what's the reason for using Scrapy's parser? The code does work... it feels like a hack.
I'm no expert--is there a more elegant way to do this?
Much appreciation in advance
Update: Adding a middleware class to scrapy (from the python package scrapy-beautifulsoup) works like a charm. Apparently, lxml from Scrapy is not as robust as BeautifulSoup's lxml. I didn't have to resort to the html5lib parser--which is 30X+ slower.
from bs4 import BeautifulSoup


class BeautifulSoupMiddleware(object):
    def __init__(self, crawler):
        super(BeautifulSoupMiddleware, self).__init__()
        self.parser = crawler.settings.get('BEAUTIFULSOUP_PARSER', "html.parser")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_response(self, request, response, spider):
        """Overridden process_response would "pipe" response.body through BeautifulSoup."""
        return response.replace(body=str(BeautifulSoup(response.body, self.parser)))
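To activate a downloader middleware like this it also needs to be registered in settings.py. A minimal sketch, assuming the class lives in crawler/middlewares.py (the module path and the priority value are placeholders):

DOWNLOADER_MIDDLEWARES = {
    'crawler.middlewares.BeautifulSoupMiddleware': 543,
}
BEAUTIFULSOUP_PARSER = 'lxml'   # the setting read by the middleware's __init__ above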
Original:
import scrapy
from scrapy.item import Item, Field
from scrapy.loader.processors import TakeFirst, MapCompose
from scrapy import Selector
from scrapy.loader import ItemLoader
from w3lib.html import remove_tags
from bs4 import BeautifulSoup


class SimpleSpider(scrapy.Spider):
    name = 'SimpleSpider'
    allowed_domains = ['totally-above-board.com']
    start_urls = [
        'https://totally-above-board.com/nefarious-scrape-page.html'
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.spiders.simple_spider.Pipeline': 400
        }
    }

    def parse(self, response):
        yield from self.parse_company_info(response)
        yield from self.parse_reviews(response)

    def parse_company_info(self, response):
        print('parse_company_info')
        print('==================')
        loader = ItemLoader(CompanyItem(), response=response)
        loader.add_xpath('company_name',
                         '//h1[contains(@class,"sp-company-name")]//span//text()')
        yield loader.load_item()

    def parse_reviews(self, response):
        print('parse_reviews')
        print('=============')
        # Beautiful Soup
        selector = Selector(response)
        # On the Page (Total Reviews) # 49
        search = '//span[contains(@itemprop,"reviewCount")]//text()'
        review_count = selector.xpath(search).get()
        review_count = int(float(review_count))
        # Number of elements Scrapy's LXML Could find # 0
        search = '//div[@itemprop ="review"]'
        review_element_count = len(selector.xpath(search))
        # Use Scrapy or Beautiful Soup?
        if review_count > review_element_count:
            # Try Beautiful Soup
            soup = BeautifulSoup(response.text, "lxml")
            root = soup.findAll("div", {"itemprop": "review"})
            for review in root:
                loader = ItemLoader(ReviewItem(), selector=review)
                review_text = review.find("span", {"itemprop": "reviewBody"}).text
                loader.add_value('review_text', review_text)
                author = review.find("span", {"itemprop": "author"}).text
                loader.add_value('author', author)
                yield loader.load_item()
        else:
            # Try Scrapy
            review_list_xpath = '//div[@itemprop ="review"]'
            selector = Selector(response)
            for review in selector.xpath(review_list_xpath):
                loader = ItemLoader(ReviewItem(), selector=review)
                loader.add_xpath('review_text',
                                 './/span[@itemprop="reviewBody"]//text()')
                loader.add_xpath('author',
                                 './/span[@itemprop="author"]//text()')
                yield loader.load_item()
        yield from self.paginate_reviews(response)

    def paginate_reviews(self, response):
        print('paginate_reviews')
        print('================')
        # Try Scrapy
        selector = Selector(response)
        search = '''//span[contains(@class,"item-next")]
                    //a[@class="next"]/@href
                 '''
        next_reviews_link = selector.xpath(search).get()
        # Try Beautiful Soup
        if next_reviews_link is None:
            soup = BeautifulSoup(response.text, "lxml")
            try:
                next_reviews_link = soup.find("a", {"class": "next"})['href']
            except Exception as e:
                pass
        if next_reviews_link:
            yield response.follow(next_reviews_link, self.parse_reviews)
It’s a common feature request for Parsel, Scrapy’s library for XML/HTML scraping.
However, you don’t need to wait for such a feature to be implemented. You can fix the HTML code using BeautifulSoup, and use Parsel on the fixed HTML:
from bs4 import BeautifulSoup
# …
response = response.replace(body=str(BeautifulSoup(response.body, "html5lib")))
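As a rough illustration of where that line would go (the spider, URL, and selector here are hypothetical, not part of the answer), the replacement can be done at the top of a callback before anything is extracted:

from bs4 import BeautifulSoup
import scrapy


class FixedHtmlSpider(scrapy.Spider):
    name = 'fixed_html'
    start_urls = ['https://example.com/']

    def parse(self, response):
        # re-serialize the body through BeautifulSoup so Parsel works on the repaired HTML
        response = response.replace(body=str(BeautifulSoup(response.body, 'html5lib')))
        yield {'title': response.css('title::text').get()}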
You can get a charset error using @Gallaecio's answer if the original page was not utf-8 encoded, because the response is set to another encoding.
So you must first switch the encoding.
In addition, there may be a problem of character escaping.
For example, if the character < is encountered in the text of the HTML, it must be escaped as &lt;. Otherwise, "lxml" will delete it and the text near it, considering it an erroneous HTML tag.
"html5lib" escapes characters, but is slow.
response = response.replace(encoding='utf-8',
                            body=str(BeautifulSoup(response.body, 'html5lib')))
"html.parser" is faster, but from_encoding must also be specified (for example 'cp1251').
response = response.replace(encoding='utf-8',
                            body=str(BeautifulSoup(response.body, 'html.parser', from_encoding='cp1251')))

Scrapy works in shell but spider returns empty csv

I am learning Scrapy. Now I am just trying to scrape items, and when I call the spider:
planefinder]# scrapy crawl planefinder -o /User/spider/planefinder/pf.csv -t csv
it shows technical information and no scraped content (Crawled 0 pages ... etc.), and it returns an empty csv file.
The problem is that when I test the xpath in the scrapy shell it works:
>>> from scrapy.selector import Selector
>>> sel = Selector(response)
>>> flights = sel.xpath("//div[@class='col-md-12'][1]/div/div/table//tr")
>>> items = []
>>> for flt in flights:
...     item = flt.xpath("td[1]/a/@href").extract_first()
...     items.append(item)
...
>>> items
The following is my planeFinder.py code:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector, HtmlXPathSelector
from planefinder.items import arr_flt_Item, dep_flt_Item


class planefinder(CrawlSpider):
    name = 'planefinder'
    host = 'https://planefinder.net'
    start_url = ['https://planefinder.net/data/airport/PEK/']

    def parse(self, response):
        arr_flights = response.xpath("//div[@class='col-md-12'][1]/div/div/table//tr")
        dep_flights = response.xpath("//div[@class='col-md-12'][2]/div/div/table//tr")
        for flight in arr_flights:
            arr_item = arr_flt_Item()
            arr_flt_url = flight.xpath('td[1]/a/@href').extract_first()
            arr_item['arr_flt_No'] = flight.xpath('td[1]/a/text()').extract_first()
            arr_item['STA'] = flight.xpath('td[2]/text()').extract_first()
            arr_item['From'] = flight.xpath('td[3]/a/text()').extract_first()
            arr_item['ETA'] = flight.xpath('td[4]/text()').extract_first()
            yield arr_item
Before jumping to CrawlSpider, please check the docs for Spiders. Some of the issues I've found were:
Instead of host use allowed_domains
Instead of start_url use start_urls
It seems that the page needs to have some cookies set, or maybe it's using some kind of basic anti-bot protection, and you need to land somewhere else first.
Try this (I've also changed the structure a bit):
# -*- coding: utf-8 -*-
from scrapy import Field, Item, Request
from scrapy.spiders import CrawlSpider, Spider


class ArrivalFlightItem(Item):
    arr_flt_no = Field()
    arr_sta = Field()
    arr_from = Field()
    arr_eta = Field()


class PlaneFinder(Spider):
    name = 'planefinder'
    allowed_domains = ['planefinder.net']
    start_urls = ['https://planefinder.net/data/airports']

    def parse(self, response):
        yield Request('https://planefinder.net/data/airport/PEK', callback=self.parse_flight)

    def parse_flight(self, response):
        flights_xpath = ('//*[contains(@class, "departure-board") and '
                         './preceding-sibling::h2[contains(., "Arrivals")]]'
                         '//tr[not(./th) and not(./td[@class="spacer"])]')
        for flight in response.xpath(flights_xpath):
            arrival = ArrivalFlightItem()
            arr_flt_url = flight.xpath('td[1]/a/@href').extract_first()
            arrival['arr_flt_no'] = flight.xpath('td[1]/a/text()').extract_first()
            arrival['arr_sta'] = flight.xpath('td[2]/text()').extract_first()
            arrival['arr_from'] = flight.xpath('td[3]/a/text()').extract_first()
            arrival['arr_eta'] = flight.xpath('td[4]/text()').extract_first()
            yield arrival
The problem here is not understanding correctly which "Spider" to use, as Scrapy offers different ones.
The main one, and the one you should be using, is the plain Spider and not CrawlSpider, because CrawlSpider is meant for deeper, more intensive crawls of forums, blogs, etc.
Just change the type of spider to:
from scrapy import Spider

class PlaneFinder(Spider):
    ...
Check the value of ROBOTSTXT_OBEY in your settings.py file. By default it's set to True (but not when you run the shell). Set it to False if you want to ignore the robots.txt file.
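For example (a sketch of the relevant settings.py line):

# settings.py
ROBOTSTXT_OBEY = False   # let the spider request pages that robots.txt would otherwise block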

Not able to scrape more than 10 records using scrapy

I'm new to scrapy and python. I'm using scrapy for scraping the data.
The site uses AJAX for pagination, so I'm not able to get more than 10 records. I'm posting my code:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy import Request
from justdial.items import JustdialItem
import csv
from itertools import izip
import scrapy
import re


class JustdialSpider(Spider):
    name = "JustdialSpider"
    allowed_domains = ["justdial.com"]
    start_urls = [
        "http://www.justdial.com/Mumbai/Dentists/ct-385543",
    ]

    def start_requests(self):
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        for url in self.start_urls:
            yield Request(url, headers=headers)

    def parse(self, response):
        questions = Selector(response).xpath('//div[@class="col-sm-5 col-xs-8 store-details sp-detail paddingR0"]')
        for question in questions:
            item = JustdialItem()
            item['name'] = question.xpath(
                '//div[@class="col-sm-5 col-xs-8 store-details sp-detail paddingR0"]/h4/span/a/text()').extract()
            item['contact'] = question.xpath(
                '//div[@class="col-sm-5 col-xs-8 store-details sp-detail paddingR0"]/p[@class="contact-info"]/span/a/b/text()').extract()
            with open('some.csv', 'wb') as f:
                writer = csv.writer(f)
                writer.writerows(izip(item['name'], item['contact']))
            f.close()
            return item
        # if running the code above this, I'm able to get the 10 records of the page
        # This code is not working for getting more than 10 records; pagination uses AJAX
        url = 'http://www.justdial.com/functions/ajxsearch.php?national_search=0&act=pagination&city=Mumbai&search=Chemical+Dealers&where=&catid=944&psearch=&prid=&page=2&SID=&mntypgrp=0&toknbkt=&bookDate='
        next_page = int(re.findall('page=(\d+)', url)[0]) + 1
        next_url = re.sub('page=\d+', 'page={}'.format(next_page), url)
        print next_url

    def parse_ajaxurl(self, response):
        # e.g. http://www.justdial.com/Mumbai/Dentists/ct-385543
        my_headers = {'Referer': response.url}
        yield Request("ajax_request_url",
                      headers=my_headers,
                      callback=self.parse_ajax)
Please help me
Thanks.
Actually, if you disable javascript when viewing the page, you'll notice that the site offers traditional pagination instead of the "never ending" AJAX one.
Using this, you can simply find the url of the next page and continue:
def parse(self, response):
    questions = response.xpath('//div[contains(@class,"store-details")]')
    for question in questions:
        item = dict()
        item['name'] = question.xpath("h4/span/a/text()").extract_first()
        item['contact'] = question.xpath("p[@class='contact-info']//b/text()").extract_first()
        yield item
    # next page
    next_page = response.xpath("//a[@rel='next']/@href").extract_first()
    if next_page:
        yield Request(next_page)
I also fixed up your xpaths, but overall the only bit that changed is the three lines under the # next page comment.
As a side note, I've noticed you are saving to csv inside the spider, whereas you could use the built-in scrapy exporter instead, e.g.:
scrapy crawl myspider --output results.csv
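If you would rather keep the export configuration inside the project instead of on the command line, the Scrapy 1.x feed settings can go into settings.py (a sketch; the field names match the item keys used above):

FEED_FORMAT = 'csv'
FEED_URI = 'results.csv'
FEED_EXPORT_FIELDS = ['name', 'contact']   # optional: fixes the column order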

How do I convert an osx fileid to a filepath [duplicate]

I am essentially repeating a question that was asked (but not answered) in the comments of PyQt: Getting file name for file dropped in app.
What I'd like to be able to do, a la that post, is convert an output from a file drop event in pyqt that currently looks like this:
/.file/id=6571367.661326 into an actual file path (i.e. /.Documents/etc./etc./myProject/fileNeeded.extension)
so that I can make use of the file behind the attempted QDropEvent. How can I do this? Any thoughts?
EDIT:
As mentioned below in the comments, this appears to be a platform specific problem. I am running Mac OS X El Capitan (10.11.2)
I figured out the solution after translating Obj-C code found in https://bugreports.qt.io/browse/QTBUG-40449. Note that this solution is only necessary for Macs running OS X Yosemite or later AND not running PyQt5 (i.e. running v.4.8 in my case).
import objc
import CoreFoundation as CF


def getUrlFromLocalFileID(self, localFileID):
    localFileQString = QString(localFileID.toLocalFile())
    relCFStringRef = CF.CFStringCreateWithCString(
        CF.kCFAllocatorDefault,
        localFileQString.toUtf8(),
        CF.kCFStringEncodingUTF8
    )
    relCFURL = CF.CFURLCreateWithFileSystemPath(
        CF.kCFAllocatorDefault,
        relCFStringRef,
        CF.kCFURLPOSIXPathStyle,
        False  # is directory
    )
    absCFURL = CF.CFURLCreateFilePathURL(
        CF.kCFAllocatorDefault,
        relCFURL,
        objc.NULL
    )
    return QUrl(str(absCFURL[0])).toLocalFile()
To see this working in a drag and drop situation, see below:
import sys
import objc
import CoreFoundation as CF
from PyQt4.QtGui import *
from PyQt4.QtCore import *


class MyListWidget(QListWidget):
    def __init__(self, parent):
        super(MyListWidget, self).__init__(parent)
        self.setAcceptDrops(True)
        self.setDragDropMode(QAbstractItemView.InternalMove)

    def getUrlFromLocalFileID(self, localFileID):
        localFileQString = QString(localFileID.toLocalFile())
        relCFStringRef = CF.CFStringCreateWithCString(
            CF.kCFAllocatorDefault,
            localFileQString.toUtf8(),
            CF.kCFStringEncodingUTF8
        )
        relCFURL = CF.CFURLCreateWithFileSystemPath(
            CF.kCFAllocatorDefault,
            relCFStringRef,
            CF.kCFURLPOSIXPathStyle,
            False  # is directory
        )
        absCFURL = CF.CFURLCreateFilePathURL(
            CF.kCFAllocatorDefault,
            relCFURL,
            objc.NULL
        )
        return QUrl(str(absCFURL[0])).toLocalFile()

    def dragEnterEvent(self, event):
        if event.mimeData().hasUrls():
            event.acceptProposedAction()
        else:
            super(MyListWidget, self).dragEnterEvent(event)

    def dragMoveEvent(self, event):
        super(MyListWidget, self).dragMoveEvent(event)

    def dropEvent(self, event):
        if event.mimeData().hasUrls():
            event.setDropAction(Qt.CopyAction)
            event.accept()
            links = []
            for url in event.mimeData().urls():
                if QString(url.toLocalFile()).startsWith('/.file/id='):
                    url = self.getUrlFromLocalFileID(url)
                    links.append(url)
                else:
                    links.append(str(url.toLocalFile()))
            for link in links:
                self.addItem(link)
        else:
            super(MyListWidget, self).dropEvent(event)


class MyWindow(QWidget):
    def __init__(self):
        super(MyWindow, self).__init__()
        self.setGeometry(100, 100, 300, 400)
        self.setWindowTitle("Filenames")
        self.list = MyListWidget(self)
        layout = QVBoxLayout(self)
        layout.addWidget(self.list)
        self.setLayout(layout)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    app.setStyle("plastique")
    window = MyWindow()
    window.show()
    sys.exit(app.exec_())

sendOSCMsg is not defined on Kivy (Windows Shell)

I made another attempt with this, using Kivy 1.9.0:
from kivy.app import App
from kivy.uix.floatlayout import FloatLayout
from kivy.uix.boxlayout import BoxLayout
from kivy.uix.button import Button
from kivy.uix.gridlayout import GridLayout
from kivy.lang import Builder
from kivy.uix.widget import Widget
from simpleOSC import initOSCClient, initOSCServer, closeOSC, \
    setOSCHandler, sendOSCMsg


class OscShowcase(BoxLayout):
    pass

    def __init__(self, **kwargs):
        super(OscShowcase, self).__init__(**kwargs)
        #self.but_Osc = Button(text='Press to show Osc')
        #self.but_Osc.bind(on_release=self.send_Osc)
        #self.add_widget(self.but_Osc)

    def send_Osc(self, *l):
        pass
        #sendOSCMsg('/chaine_en_dur/', [2.0])


def sendOSCMsg(address='/print', data=[]):
    m = OSCMessage()
    m.setAddress(address)
    for d in data:
        m.append(d)
    basic_client.send(m)


class OscWidget(GridLayout):
    def __init__(self, **kwargs):
        super(OscWidget, self).__init__(**kwargs)


class TestOscApp(App):
    def build(self):
        return OscShowcase()


if __name__ == '__main__':
    host = '127.0.0.1'
    sport = 9000
    rport = 9001
    # osc
    initOSCClient(host, sport)
    initOSCServer(host, rport)
    TestOscApp().run()
.kv file
<OscShowcase>:
    BoxLayout:
        OscWidget:
        Button:
            text: 'OSC'
            pos: (700, 500)
            # on_release : sendOSCMsg('')
            # sendOSCMsg: '/chaine_en_dur/', [2.0]
            # on_release : self.but_Osc.bind()
            group: 'OscButton'
            on_press: sendOSCMsg('2')
I still get the error "NameError: name 'sendOSCMsg' is not defined" when I press the button. Can anybody help me understand why? I would like to send OSC messages out to Max MSP.
Kv lang has some scopes; you can read more about them in the Kivy documentation.
There are three keywords specific to Kv language:
app: always refers to the instance of your application.
root: refers to the base widget/template in the current rule
self: always refer to the current widget
You can run a method from TestOscApp with app.method_name() and from OscShowcase with root.method_name()
So, just update your kv to call sendOSCMsg from OscShowcase:
on_press: root.sendOSCMsg('2')
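For root.sendOSCMsg(...) to resolve, the method has to exist on OscShowcase itself. A minimal sketch, assuming simpleOSC is initialised as in the question; the method just delegates to the imported sendOSCMsg helper:

from simpleOSC import sendOSCMsg as simple_osc_send
from kivy.uix.boxlayout import BoxLayout


class OscShowcase(BoxLayout):
    def sendOSCMsg(self, value):
        # forward to simpleOSC; '/chaine_en_dur/' is the address used in the question
        simple_osc_send('/chaine_en_dur/', [float(value)])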
