I am scraping the Google Play store for app reviews, but I am only able to get the first 40 reviews. The problem is with the XHR path: upon reaching it, Scrapy throws this error:
Http status code not handled or not allowed
Code:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import BaseSpider
from scrapy.http import Request
class Product(scrapy.Item):
    """Item container produced by the spider, with two declared fields."""

    # Declared field; see the spider for which fields are actually filled.
    brand = scrapy.Field()
    # Declared field used to carry the scraped text value.
    title = scrapy.Field()
class aqaqspider(BaseSpider):
    """Collect the author name of every review shown for the Gaana app.

    The first response comes from the app-details URL in ``start_urls``.
    After each parsed page the spider requests the ``getreviews`` URL and
    feeds that response back into :meth:`parse`.
    """

    name = "gaana"
    allowed_domains = ["play.google.com"]
    start_urls = [
        "https://play.google.com/store/apps/details?id=com.gaana&hl=en",
    ]
    # Page counter; incremented once per parsed response.
    page = 1

    def parse(self, response):
        """Yield one ``Product`` per review block, then request more reviews.

        Args:
            response: the downloaded page (or XHR answer) to scan.

        Yields:
            ``Product`` items with ``title`` set to the review author's
            name, followed by one ``Request`` for the next batch.

        Raises:
            CloseSpider: if the response contains no review blocks.
        """
        # Attribute tests must be written with "@" in XPath.
        products = response.xpath('//div[@class="single-review"]')
        if not products:
            raise CloseSpider("No more products!")

        for product in products:
            names = product.xpath(
                './/.//div/div/span[@class="author-name"]/a/text()'
            ).extract()
            if not names:
                # A review block without an author link would otherwise
                # raise IndexError and abort the whole page; skip it.
                continue
            item = Product()
            # Only 'title' is populated; 'brand' is left unset.
            item['title'] = names[0].strip()
            yield item

        self.page += 1
        # NOTE(review): self.page is never sent with this request, so the
        # endpoint cannot tell which page is wanted - confirm the parameters
        # (and HTTP method) that the browser's XHR call actually uses.
        yield Request(
            url="https://play.google.com/store/getreviews?authuser=1",
            headers={
                "Referer": "https://play.google.com/store/apps/details?id=com.gaana&hl=en",
                "X-Requested-With": "XMLHttpRequest",
            },
            callback=self.parse,
            dont_filter=True,
        )
Please don't say that it is against their Terms of Use. I know that but I need to learn things and move on. I am not exploiting anything.
Related
I want the user to be able to click through from the list of all customer stories to read an individual story, but I am getting this error:
**TypeError at /stories/story_id**
**story_detail() missing 1 required positional argument: 'story_id'**
My Views.py:
def all_stories(request):
    """Render the story list page with every ``Stories`` record."""
    # NOTE(review): story_detail looks up a model named ``Story`` - confirm
    # which of the two names is the real model.
    return render(
        request,
        'stories/stories.html',
        {'stories': Stories.objects.all()},
    )
def story_detail(request, story_id):
    """Render the detail page for the story whose primary key is ``story_id``.

    The lookup goes through ``get_object_or_404``, so an unknown key ends in
    a 404 response instead of an unhandled exception.
    """
    story = get_object_or_404(Story, pk=story_id)
    return render(request, 'stories/story_detail.html', {'story': story})
My urls.py:
from django.urls import path
from . import views
# URL routes for the stories app.
urlpatterns = [
    path('', views.all_stories, name='stories'),
    # Capture the key from the URL and pass it to the view as ``story_id``.
    # A literal 'story_id' route captures nothing, which leaves the view
    # without its required positional argument.
    # NOTE(review): assumes integer primary keys - use '<str:story_id>/'
    # if the model's key is not an integer.
    path('<int:story_id>/', views.story_detail, name='story_detail'),
]
I'm not sure what I have done or can do to fix this, can somebody help?
I need to scrape a URL using Scrapy, and I can't scroll down the website to load all the elements.
I tried to search for the next-page information, but I couldn't find it.
My spider code is:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from appinformatica.items import appinformaticaItem
import w3lib.html
class appinformaticaSpider(CrawlSpider):
    """Scrape product pages linked from the appinformatica.com phone listing.

    Links are taken from the ``info-ficha`` blocks of the start page and
    each linked product page is handed to :meth:`parse_item`.
    """

    name = 'appinformatica'
    # Running count of scraped products; stored on each item as 'Position'.
    item_count = 0
    start_urls = ['https://www.appinformatica.com/telefonos/moviles/']

    # follow=False: links found on the product pages themselves are not
    # extracted again.
    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('//*[@class="info-ficha"]/div[1]/a'),
            ),
            callback='parse_item',
            follow=False,
        ),
    )

    @staticmethod
    def _clean(response, query):
        """Return the first node matching *query* with its tags removed.

        Falls back to an empty string when nothing matches.
        """
        return w3lib.html.remove_tags(response.xpath(query).get(default=''))

    def parse_item(self, response):
        """Build one ``appinformaticaItem`` from a product page.

        Args:
            response: the downloaded product page.

        Yields:
            The populated item.
        """
        self.item_count += 1
        # All detail fields live under the element with id "ficha-producto".
        ficha = '//*[@id="ficha-producto"]/div[2]'

        item = appinformaticaItem()
        item['Modelo'] = self._clean(response, "//h1")
        item['Position'] = self.item_count
        item['Precio'] = self._clean(response, ficha + '/div[1]/div/div[1]')
        item['PrecioTienda'] = self._clean(response, ficha + '/div[1]/div/div[2]')
        item['Stock'] = self._clean(response, ficha + '/div[3]/p[3]')
        item['Submodelo'] = self._clean(response, ficha + '/div[3]/p[2]/strong[2]')
        item['Url'] = w3lib.html.remove_tags(response.url)
        yield item
Can anyone help me?
Change allow to allow=r'/moviles/.*\.html', set follow=True, and add your allowed_domains. Then try this:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# from appinformatica.items import appinformaticaItem
import w3lib.html
class appinformaticaSpider(CrawlSpider):
    """Crawl appinformatica.com phone pages and scrape every product page.

    Every link whose URL matches ``/moviles/....html`` is passed to
    :meth:`parse_item`, and links on those pages are followed as well.
    """

    name = 'appinformatica'
    allowed_domains = ["appinformatica.com"]
    # Running count of scraped products; stored on each item as 'Position'.
    item_count = 0
    start_urls = ['https://www.appinformatica.com/telefonos/moviles/']

    # follow=True keeps extracting links from every page that was matched.
    rules = (
        Rule(
            LinkExtractor(allow=r'/moviles/.*\.html'),
            callback='parse_item',
            follow=True,
        ),
    )

    @staticmethod
    def _clean(response, query):
        """Return the first node matching *query* with its tags removed.

        Falls back to an empty string when nothing matches.
        """
        return w3lib.html.remove_tags(response.xpath(query).get(default=''))

    def parse_item(self, response):
        """Build a plain dict describing the product shown on the page.

        Args:
            response: the downloaded product page.

        Yields:
            A dict with the keys ``Modelo``, ``Position``, ``Precio``,
            ``PrecioTienda``, ``Stock``, ``Submodelo`` and ``Url``.
        """
        self.item_count += 1
        # All detail fields live under the element with id "ficha-producto".
        ficha = '//*[@id="ficha-producto"]/div[2]'

        item = {}
        item['Modelo'] = self._clean(response, "//h1")
        item['Position'] = self.item_count
        item['Precio'] = self._clean(response, ficha + '/div[1]/div/div[1]')
        item['PrecioTienda'] = self._clean(response, ficha + '/div[1]/div/div[2]')
        item['Stock'] = self._clean(response, ficha + '/div[3]/p[3]')
        item['Submodelo'] = self._clean(response, ficha + '/div[3]/p[2]/strong[2]')
        item['Url'] = w3lib.html.remove_tags(response.url)
        yield item
I created a form using the django-mysql EnumField, but this field does not appear on the HTML page when the form is rendered.
forms.py - see transaction_type_1, which does not appear on the HTML page.
from django import forms
from .models import tledger_account, tfirst_free_number
from django_mysql.models import EnumField
from datetime import datetime
class Transactions(forms.Form):
    """Entry form for a transaction made of two ledger lines.

    Each line has a sequence number, a Debit/Credit type, a ledger account
    and an amount. Default initial values (transaction number, date,
    sequence numbers and the first line's type) are filled in by
    ``__init__``.
    """

    # (stored value, label) pairs offered by both transaction-type fields.
    TRANSACTION_TYPE_CHOICES = [('Debit', 'Debit'), ('Credit', 'Credit')]

    transaction_no = forms.IntegerField(initial=1)
    description = forms.CharField(max_length=50)
    transaction_date = forms.DateField()

    sequence_1 = forms.IntegerField()
    # A form needs a form field here: a model EnumField declared on a Form
    # is not collected as a field and therefore never rendered.
    transaction_type_1 = forms.ChoiceField(choices=TRANSACTION_TYPE_CHOICES)
    ledger_account_1 = forms.ModelChoiceField(queryset=tledger_account.objects.filter(active='Yes'))
    amount_1 = forms.DecimalField(max_digits=14, decimal_places=2)

    sequence_2 = forms.IntegerField()
    transaction_type_2 = forms.ChoiceField(choices=TRANSACTION_TYPE_CHOICES, initial='Debit')
    ledger_account_2 = forms.ModelChoiceField(queryset=tledger_account.objects.filter(active='Yes'))
    amount_2 = forms.DecimalField(max_digits=14, decimal_places=2)

    def __init__(self, *args, **kwargs):
        """Build the form with computed default initial values.

        Any ``initial`` mapping passed by the caller is kept and takes
        precedence over the defaults computed here.
        """
        defaults = {
            # Row 1 of tfirst_free_number holds the number to propose.
            'transaction_no': tfirst_free_number.objects.get(id=1).ffn,
            # Passed as a callable so it is evaluated when the form is
            # rendered rather than when this dict is built.
            'transaction_date': datetime.today,
            'sequence_1': 1,
            'sequence_2': 2,
            # Must match a choice value exactly (case-sensitive).
            'transaction_type_1': 'Debit',
        }
        defaults.update(kwargs.get('initial') or {})
        kwargs['initial'] = defaults
        super(Transactions, self).__init__(*args, **kwargs)
You are trying to use a model field in a form, so the form does not know how to display it. You should declare a valid form field type (whose value would then be stored in the database), or use something like this:
https://pypi.org/project/django-enumfield/
I am trying to extract a list of all the URLs of the images from https://www.rawson.co.za/property/for-sale/cape-town.
However, all the images are available on a different page, rather than the main one.
I have been using xpath to retrieve other desired fields.
I am not quite sure how to retrieve all of the URLs in a list from those child pages. This is what I have tried:
class PropDataSpider(scrapy.Spider):
name = "rawson"
start_urls = ['https://www.rawson.co.za/property/for-sale/cape-town']
def parse(self, response):
propertes = response.xpath("//div[#class='card__main']")
for prop in propertes:
title = prop.xpath(
"./div[#class='card__body']/h3[#class='card__title']/a/text()").extract_first()
price = prop.xpath(
"./div[#class='card__body']/div[#class='card__footer card__footer--primary']/div[#class='card__price']/text()").extract_first()
description = prop.xpath(
"./div[#class='card__body']/div[#class='card__synopsis']/p/text()").extract_first()
bedrooms = prop.xpath(
"./div[#class='card__body']/div[#class='card__footer card__footer--primary']/div[#class='features features--inline']/ol[#class ='features__list']/li[#class ='features__item'][1]/div[#class='features__label']/text()").extract_first()
...
images = ['https://' + img for img in prop.xpath(
"main[#class='l-main']/section[#class='l-section']/div[#class='l-wrapper']/div[#class='l-section__main']/div[#class ='content-block content-block--flat']/div[#class ='gallery gallery--flat js-lightbox']/div[# class ='row row--flat']/div[#class ='col']/a[#class ='gallery__link js-lightbox-image']/img/#src")]
yield {'title': title, 'price':price, "description": description, 'bedrooms': bedrooms, 'bathrooms': bathrooms, 'garages': garages, 'images':images}
But this code retrieves 'None' for images, which makes sense; however, I am not sure how to go about it. If anyone has a suggestion, it would be really appreciated. Thank you in advance!
You need to use response.meta:
def parse(self, response):
    """Follow every property card on the listing page to its detail page.

    The fields that are only visible on the listing (title, price,
    description, bedrooms) are extracted here and passed along in the
    request's ``meta`` so ``parse_property`` can combine them with the
    data found on the detail page.

    Args:
        response: the downloaded listing page.

    Yields:
        One request per card that has a detail link.
    """
    cards = response.xpath("//div[@class='card__main']")
    for card in cards:
        property_url = card.xpath(
            "./div[@class='card__body']/h3[@class='card__title']/a/@href"
        ).extract_first()
        if property_url is None:
            # No detail link on this card, so there is nothing to request.
            continue

        title = card.xpath(
            "./div[@class='card__body']/h3[@class='card__title']/a/text()"
        ).extract_first()
        price = card.xpath(
            "./div[@class='card__body']"
            "/div[@class='card__footer card__footer--primary']"
            "/div[@class='card__price']/text()"
        ).extract_first()
        description = card.xpath(
            "./div[@class='card__body']/div[@class='card__synopsis']/p/text()"
        ).extract_first()
        bedrooms = card.xpath(
            "./div[@class='card__body']"
            "/div[@class='card__footer card__footer--primary']"
            "/div[@class='features features--inline']"
            "/ol[@class ='features__list']"
            "/li[@class ='features__item'][1]"
            "/div[@class='features__label']/text()"
        ).extract_first()

        # response.follow resolves relative hrefs against the page URL.
        yield response.follow(
            property_url,
            callback=self.parse_property,
            meta={
                'title': title,
                'price': price,
                'description': description,
                'bedrooms': bedrooms,
            },
        )
def parse_property(self, response):
    """Combine the listing fields from ``response.meta`` with the image links.

    Args:
        response: the downloaded detail page; its ``meta`` must contain
            ``title``, ``price``, ``description`` and ``bedrooms``.

    Yields:
        A dict with the listing fields plus ``images``, the ``href`` of
        every gallery link on the page.
    """
    meta = response.meta
    images = response.xpath('//a[contains(@class, "gallery__link ")]/@href').getall()
    yield {
        'title': meta["title"],
        'price': meta["price"],
        "description": meta["description"],
        'bedrooms': meta["bedrooms"],
        # These two are optional in meta: they come out as None unless the
        # requesting callback supplied them.
        'bathrooms': meta.get("bathrooms"),
        'garages': meta.get("garages"),
        'images': images,
    }
I have the following models:
class Recipe(models.Model):
....
class Ingredient(models.Model):
....
class RecipePosition(models.Model):
recipe = models.ForeignKey(Recipe,related_name='recipe_positions', on_delete=models.CASCADE)
ingredient = models.ForeignKey(Ingredient,related_name='ingredient_recipeposition',on_delete=models.PROTECT) ....
In my views.py I am trying to create an inline formset so that I can edit all the RecipePosition objects related to a particular Recipe:
def recipe_ingredients_formset_update(request, slug=None):
    """Edit all ``RecipePosition`` rows of one recipe through an inline formset.

    On a valid POST the formset is saved, a success message is queued and
    the client is redirected. In every other case the page is rendered
    with the (bound or unbound) formset.
    """
    recipes = Recipe.objects.prefetch_related('recipe_positions__ingredient')
    instance = get_object_or_404(recipes, slug=slug)
    RecipeIngredientsFormSet = inlineformset_factory(
        Recipe,
        RecipePosition,
        form=RecipePoistionForm,
        can_delete=True,
        extra=5,
    )

    submitted = request.method == "POST"
    if submitted:
        formset = RecipeIngredientsFormSet(request.POST, request.FILES, instance=instance)
    else:
        formset = RecipeIngredientsFormSet(instance=instance)
    helper = RecipePositionCreateFormSetHelper()

    if submitted and formset.is_valid():
        formset.save()
        messages.success(request, "Successfully Updated", extra_tags='alert')
        return HttpResponseRedirect('')

    return render(
        request,
        'recipe_recipositions_bulk_edit.html',
        {
            "instance": instance,
            "formset": formset,
            "helper": helper,
            "url": instance.get_absolute_url_recipe_update_inline_bulk_ingredients(),
        },
    )
I searched on the net, but I was not able to understand the cause. I am using Django Debug Toolbar.
With 56 RecipePosition items for a particular Recipe, the page took 36 seconds to load.