Which XPath to use while scraping Google Play store?

Which XPath to use while scraping Google Play store? - xpath

I am scraping the Google Play store for app reviews, but I am only able to get 40 reviews. The problem is with the XHR path: when Scrapy requests it, it throws this error:
Http status code not handled or not allowed
Code:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import BaseSpider
from scrapy.http import Request
class Product(scrapy.Item):
    """Item holding the data scraped for one review."""

    # Declared but not populated: the spider's assignment to it is commented out.
    brand = scrapy.Field()
    # Filled by the spider from the review's "author-name" link text.
    title = scrapy.Field()
class aqaqspider(BaseSpider):
    """Scrape reviewer names for the Gaana app from its Google Play page."""

    name = "gaana"
    allowed_domains = ["play.google.com"]
    start_urls = [
        "https://play.google.com/store/apps/details?id=com.gaana&hl=en",
    ]
    # Incremented after each parsed response; it is not sent with any request.
    page = 1

    def parse(self, response):
        """Yield one Product per review block, then request the review XHR.

        Raises:
            CloseSpider: when the response contains no review blocks.
        """
        # XPath selects attributes with the "@" prefix.
        products = response.xpath('//div[@class="single-review"]')
        if not products:
            raise CloseSpider("No more products!")

        for product in products:
            item = Product()
            #item['brand'] = product.xpath(".//span[contains(@class, 'qa-brandName')]/text()").extract()[0].strip()
            item['title'] = product.xpath(
                './/.//div/div/span[@class="author-name"]/a/text()'
            ).extract()[0].strip()
            yield item

        self.page += 1
        # NOTE(review): this is a plain GET that carries neither self.page nor
        # the app id. That is presumably why the endpoint answers with a status
        # Scrapy does not handle -- confirm what the endpoint expects.
        yield Request(
            url="https://play.google.com/store/getreviews?authuser=1",
            headers={
                "Referer": "https://play.google.com/store/apps/details?id=com.gaana&hl=en",
                "X-Requested-With": "XMLHttpRequest",
            },
            callback=self.parse,
            # The same URL is requested repeatedly, so bypass the duplicate filter.
            dont_filter=True,
        )
Please don't say that it is against their Terms of Use. I know that but I need to learn things and move on. I am not exploiting anything.

Related

TypeError at /stories/story_id story_detail() missing 1 required positional argument: 'story_id'

I want the user to be able to click through from the list of all customers' stories to read an individual story, but I am getting this error:
**TypeError at /stories/story_id
story_detail() missing 1 required positional argument: 'story_id'**
My Views.py:
def all_stories(request):
    """Render the page that lists every story."""
    # NOTE(review): story_detail() looks up `Story`, while this view queries
    # `Stories` -- confirm which model name is correct.
    context = {'stories': Stories.objects.all()}
    return render(request, 'stories/stories.html', context)
def story_detail(request, story_id):
    """Render one story, looked up by primary key.

    Responds with 404 (via get_object_or_404) when no story has that key.
    """
    # Named `story` so the local does not shadow this view function.
    story = get_object_or_404(Story, pk=story_id)
    return render(request, 'stories/story_detail.html', {'story': story})
My urls.py:
from django.urls import path
from . import views
urlpatterns = [
    path('', views.all_stories, name='stories'),
    # <int:story_id> captures the number in the URL and hands it to the view
    # as the story_id keyword argument. A literal 'story_id' route captures
    # nothing, so story_detail() is called without it and raises the
    # "missing 1 required positional argument" TypeError.
    # Assumes integer primary keys -- TODO confirm.
    path('<int:story_id>/', views.story_detail, name='story_detail'),
]
I'm not sure what I have done or can do to fix this, can somebody help?

how to scrape using scrapy with infinite loop without next page information

I need to scrape a URL using Scrapy, but I can't scroll down the website to load all the elements.
I tried to search for the next-page information, but I couldn't find it.
My spider code is:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from appinformatica.items import appinformaticaItem
import w3lib.html
class appinformaticaSpider (CrawlSpider):
    """Follow product links on the phone listing and scrape each product page."""

    name = 'appinformatica'
    # Running count of parsed product pages; stored on each item as 'Position'.
    item_count = 0
    start_urls = ['https://www.appinformatica.com/telefonos/moviles/']

    rules = {
        # Only links inside the "info-ficha" blocks are extracted, and links
        # found on the product pages themselves are not followed.
        Rule(
            LinkExtractor(allow=(), restrict_xpaths=('//*[@class="info-ficha"]/div[1]/a')),
            callback='parse_item',
            follow=False,
        )
    }

    def parse_item(self, response):
        """Yield one item with the tag-stripped fields of a product page."""
        item = appinformaticaItem()
        self.item_count += 1
        item['Modelo'] = w3lib.html.remove_tags(response.xpath("//h1").get(default=''))
        item['Position'] = self.item_count
        item['Precio'] = w3lib.html.remove_tags(
            response.xpath('//*[@id="ficha-producto"]/div[2]/div[1]/div/div[1]').get(default=''))
        item['PrecioTienda'] = w3lib.html.remove_tags(
            response.xpath('//*[@id="ficha-producto"]/div[2]/div[1]/div/div[2]').get(default=''))
        item['Stock'] = w3lib.html.remove_tags(
            response.xpath('//*[@id="ficha-producto"]/div[2]/div[3]/p[3]').get(default=''))
        item['Submodelo'] = w3lib.html.remove_tags(
            response.xpath('//*[@id="ficha-producto"]/div[2]/div[3]/p[2]/strong[2]').get(default=''))
        item['Url'] = w3lib.html.remove_tags(response.url)
        yield item
Can anyone help me?

Change allow to allow=(r'/moviles/.*\.html'), set follow=True, and add your allowed_domains. Then try this:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# from appinformatica.items import appinformaticaItem
import w3lib.html
class appinformaticaSpider (CrawlSpider):
    """Crawl every /moviles/ page of the site and scrape each one as a product."""

    name = 'appinformatica'
    allowed_domains = ["appinformatica.com"]
    # Running count of parsed pages; stored on each item as 'Position'.
    item_count = 0
    start_urls = ['https://www.appinformatica.com/telefonos/moviles/']

    # A tuple keeps the rules ordered (the original used a set literal).
    rules = (
        Rule(
            LinkExtractor(allow=(r'/moviles/.*\.html',)),
            callback='parse_item',
            # Keep following matching links found on the scraped pages too.
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict with the tag-stripped fields of one product page."""

        def text_of(query):
            # First match for the XPath with its markup removed; '' when absent.
            return w3lib.html.remove_tags(response.xpath(query).get(default=''))

        self.item_count += 1
        # Every product detail lives under this container.
        product = '//*[@id="ficha-producto"]/div[2]'
        yield {
            'Modelo': text_of("//h1"),
            'Position': self.item_count,
            'Precio': text_of(product + '/div[1]/div/div[1]'),
            'PrecioTienda': text_of(product + '/div[1]/div/div[2]'),
            'Stock': text_of(product + '/div[3]/p[3]'),
            'Submodelo': text_of(product + '/div[3]/p[2]/strong[2]'),
            'Url': w3lib.html.remove_tags(response.url),
        }

Django mysql EnumField does not appear on HTML page

I created a form using mysql EnumField but this field does not appear on the HTML page when it is created.
forms.py - see transaction_type_1, which does not appear on the HTML page.
from django import forms
from .models import tledger_account, tfirst_free_number
from django_mysql.models import EnumField
from datetime import datetime
class Transactions(forms.Form):
    """Form for entering one transaction made of two ledger lines."""

    # (stored value, label) pairs shared by both transaction-type fields.
    TRANSACTION_TYPES = [('Debit', 'Debit'), ('Credit', 'Credit')]

    transaction_no = forms.IntegerField(initial=1)
    description = forms.CharField(max_length=50)
    transaction_date = forms.DateField()

    sequence_1 = forms.IntegerField()
    # django_mysql's EnumField is a *model* field. A Form only collects and
    # renders form fields, so the choice must be declared as a ChoiceField.
    transaction_type_1 = forms.ChoiceField(choices=TRANSACTION_TYPES)
    ledger_account_1 = forms.ModelChoiceField(queryset=tledger_account.objects.filter(active='Yes'))
    amount_1 = forms.DecimalField(max_digits=14, decimal_places=2)

    sequence_2 = forms.IntegerField()
    # Form fields take `initial`, not the model-field `default`.
    transaction_type_2 = forms.ChoiceField(choices=TRANSACTION_TYPES, initial='Debit')
    ledger_account_2 = forms.ModelChoiceField(queryset=tledger_account.objects.filter(active='Yes'))
    amount_2 = forms.DecimalField(max_digits=14, decimal_places=2)

    def __init__(self, *args, **kwargs):
        """Build the form with computed initial values.

        Any ``initial`` keyword passed by the caller is replaced by the
        values computed here.
        """
        updated_initial = {
            # Read from the tfirst_free_number row with id=1.
            'transaction_no': tfirst_free_number.objects.get(id=1).ffn,
            # Passed as a callable, not called here.
            'transaction_date': datetime.today,
            'sequence_1': 1,
            'sequence_2': 2,
            # Must match a choice value exactly; 'debit' would select nothing.
            'transaction_type_1': 'Debit',
        }
        kwargs.update(initial=updated_initial)
        super().__init__(*args, **kwargs)

You are trying to use a model field inside a form, so the form does not know how to display it. You should declare a valid form field type, whose value is then stored in the database, or use something like this:
https://pypi.org/project/django-enumfield/

How to extract all images that are on the different page(child page) using xpath and scrapy

I am trying to extract a list of all the URLs of the images from https://www.rawson.co.za/property/for-sale/cape-town.
However, all the images are available on a different page, rather than the main one.
I have been using xpath to retrieve other desired fields.
I am not quite sure how to retrieve all of the URLs in a list from those child pages. This is what I have tried:
class PropDataSpider(scrapy.Spider):
    """Scrape the property cards on the Cape Town for-sale listing page."""

    name = "rawson"
    start_urls = ['https://www.rawson.co.za/property/for-sale/cape-town']

    def parse(self, response):
        """Yield one dict per property card found on the listing page."""
        propertes = response.xpath("//div[@class='card__main']")
        for prop in propertes:
            title = prop.xpath(
                "./div[@class='card__body']/h3[@class='card__title']/a/text()").extract_first()
            price = prop.xpath(
                "./div[@class='card__body']/div[@class='card__footer card__footer--primary']/div[@class='card__price']/text()").extract_first()
            description = prop.xpath(
                "./div[@class='card__body']/div[@class='card__synopsis']/p/text()").extract_first()
            bedrooms = prop.xpath(
                "./div[@class='card__body']/div[@class='card__footer card__footer--primary']/div[@class='features features--inline']/ol[@class ='features__list']/li[@class ='features__item'][1]/div[@class='features__label']/text()").extract_first()
            # Elided in the original post; presumably where `bathrooms` and
            # `garages` (used in the yield below) are extracted.
            ...
            # NOTE(review): the gallery markup is on each property's own page,
            # not on this listing page, so this XPath (evaluated relative to a
            # listing card) does not find the images.
            images = ['https://' + img for img in prop.xpath(
                "main[@class='l-main']/section[@class='l-section']/div[@class='l-wrapper']/div[@class='l-section__main']/div[@class ='content-block content-block--flat']/div[@class ='gallery gallery--flat js-lightbox']/div[@class ='row row--flat']/div[@class ='col']/a[@class ='gallery__link js-lightbox-image']/img/@src")]
            yield {'title': title, 'price':price, "description": description, 'bedrooms': bedrooms, 'bathrooms': bathrooms, 'garages': garages, 'images':images}
But this code retrieves 'None' for images, which makes sense; however, I am not sure how to go about it. If anyone has a suggestion, it would be really appreciated. Thank you in advance!

You need to use response.meta:
def parse(self, response):
    """Request each listed property's own page.

    The fields scraped from the listing card travel with the request in
    ``meta`` so that parse_property() can combine them with the images.
    """
    for prop in response.xpath("//div[@class='card__main']"):
        property_url = prop.xpath(
            "./div[@class='card__body']/h3[@class='card__title']/a/@href").extract_first()
        if not property_url:
            # A card without a link has no detail page to request.
            continue

        title = prop.xpath(
            "./div[@class='card__body']/h3[@class='card__title']/a/text()").extract_first()
        price = prop.xpath(
            "./div[@class='card__body']/div[@class='card__footer card__footer--primary']/div[@class='card__price']/text()").extract_first()
        description = prop.xpath(
            "./div[@class='card__body']/div[@class='card__synopsis']/p/text()").extract_first()
        bedrooms = prop.xpath(
            "./div[@class='card__body']/div[@class='card__footer card__footer--primary']/div[@class='features features--inline']/ol[@class ='features__list']/li[@class ='features__item'][1]/div[@class='features__label']/text()").extract_first()

        yield scrapy.Request(
            # The href may be relative; resolve it against the listing URL.
            url=response.urljoin(property_url),
            callback=self.parse_property,
            meta={
                'title': title,
                'price': price,
                'description': description,
                'bedrooms': bedrooms,
            },
        )
def parse_property(self, response):
    """Yield the final record for one property page.

    Combines the listing fields carried in ``response.meta`` with the
    gallery link URLs found on the property page.
    """
    meta = response.meta
    images = response.xpath('//a[contains(@class, "gallery__link ")]/@href').getall()
    yield {
        'title': meta["title"],
        'price': meta["price"],
        "description": meta["description"],
        'bedrooms': meta["bedrooms"],
        # Not extracted by parse(); read them from meta when a caller supplies
        # them, instead of referencing names that are undefined here.
        'bathrooms': meta.get("bathrooms"),
        'garages': meta.get("garages"),
        'images': images,
    }

django: inlineformset very slow

I have the following models:
class Recipe(models.Model):
....
class Ingredient(models.Model):
....
class RecipePosition(models.Model):
recipe = models.ForeignKey(Recipe,related_name='recipe_positions', on_delete=models.CASCADE)
ingredient = models.ForeignKey(Ingredient,related_name='ingredient_recipeposition',on_delete=models.PROTECT) ....
In my views.py I am trying to create an inline formset so that I can edit all the RecipePosition objects related to a particular Recipe:
def recipe_ingredients_formset_update(request, slug=None):
    """Edit all RecipePosition rows of one Recipe through an inline formset.

    On a valid POST the formset is saved, a success message is queued and
    the user is redirected. Otherwise the formset (unbound, or bound with
    its errors) is rendered. Responds with 404 when no Recipe has ``slug``.
    """
    instance = get_object_or_404(
        Recipe.objects.prefetch_related('recipe_positions__ingredient'), slug=slug)
    RecipeIngredientsFormSet = inlineformset_factory(
        Recipe, RecipePosition, form=RecipePoistionForm, can_delete=True, extra=5)

    # The inline formset queries RecipePosition itself instead of reusing the
    # prefetch on `instance`, so give it a queryset that joins the ingredient.
    # NOTE(review): if RecipePoistionForm renders `ingredient` as a
    # ModelChoiceField, every form in the set evaluates that field's queryset
    # on its own -- a likely source of the slowness; confirm in the debug
    # toolbar's SQL panel.
    positions = RecipePosition.objects.select_related('ingredient')
    helper = RecipePositionCreateFormSetHelper()

    if request.method == "POST":
        formset = RecipeIngredientsFormSet(
            request.POST, request.FILES, instance=instance, queryset=positions)
        if formset.is_valid():
            formset.save()
            messages.success(request, "Successfully Updated", extra_tags='alert')
            return HttpResponseRedirect('')
    else:
        formset = RecipeIngredientsFormSet(instance=instance, queryset=positions)

    context = {
        "instance": instance,
        "formset": formset,
        "helper": helper,
        "url": instance.get_absolute_url_recipe_update_inline_bulk_ingredients(),
    }
    return render(request, 'recipe_recipositions_bulk_edit.html', context)
I searched the net, but I was not able to understand the cause. I am using Django Debug Toolbar.
With 56 RecipePosition items for a particular Recipe, the page took 36 seconds to load.

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Which XPath to use while scraping Google Play store? - xpath

Related

TypeError at /stories/story_id story_detail() missing 1 required positional argument: 'story_id'

how to scrape using scrapy with infinite loop without next page information

Django mysql EnumField does not appear on HTML page

How to extract all images that are on the different page(child page) using xpath and scrapy

django: inlineformset very slow

Categories

Resources