Getting all reviews from a specific Windows Phone Marketplace Game - windows-phone-7

Does AppHub let us see reviews of our apps from all marketplaces at once? As I didn't find such a feature, I took some time to write code that prints them all to a file, so I won't waste time looking for them in every single language.
I'd appreciate any better solution. In the worst case, I'm glad to share the code with anyone who finds it useful.
It uses BeautifulSoup.
The only parameter is the id of the app, like this:
wp7reviews.py 62289160-6970-4674-85a0-aef3dbe3f93d
Here is the code
import sys
import getopt
from urllib2 import URLError
from urllib2 import HTTPError
import urllib2
from BeautifulSoup import BeautifulStoneSoup
opts, extraparams = getopt.getopt(sys.argv[1:], '')
# starts at the second element of argv since the first one is the script name
# extraparams are extra arguments passed after all options/keywords are assigned
# opts is a list containing the pair "option"/"value"
#print 'Opts:', opts
#print 'Extra parameters:', extraparams

try:
    appid = extraparams[0]
except IndexError:
    # "Awesome Linkit" appid as the default appid
    appid = "62289160-6970-4674-85a0-aef3dbe3f93d"

allreviewsFILE = open("allreviews.txt", "w")

def output(text):
    allreviewsFILE.write(text)
    #print text,

def outputln(text):
    allreviewsFILE.write(text + '\n')
    #print text

def geturl(lang):
    return "http://catalog.zune.net/v3.2/" + lang + "/apps/" + appid

try:
    request = urllib2.Request(geturl("en-us"))
    fd = urllib2.urlopen(request)
    content = fd.read()
    fd.close()

    soup = BeautifulStoneSoup(content)
    try:
        outputln("App title: " + soup.findAll("a:title")[0].string)
        outputln("")
    except:
        print "Failed to get App Title"

    langs = ["en-us", "en-gb", "de-de",
             "fr-fr", "es-es", "it-it",
             "en-au", "de-at", "fr-be",
             "fr-ca", "en-ca", "en-hk",
             "en-in", "en-ie", "es-mx",
             "en-nz", "en-sg", "de-ch",
             "fr-ch", "zh-hk", "zh-cn",
             "en-hk"]

    outputln("Here we got reviews from each marketplace")

    for lang in langs:
        request = urllib2.Request(geturl(lang) + "/reviews")
        fd = urllib2.urlopen(request)
        print "Fetching " + lang + "...",
        content = fd.read()
        fd.close()
        print "OK"

        soup = BeautifulStoneSoup(content)
        #print soup.prettify()
        contents = soup.findAll("a:content")
        ratings = soup.findAll("userrating")
        l = len(contents)
        if l > 0:
            outputln("----------- " + lang + " ---------------------------------------------------")
            outputln("")
            for i in range(0, l):
                output(ratings[i].string + "/10 - ")
                if len(contents[i].contents) > 0:
                    try:
                        outputln(contents[i].contents[0])
                    except:
                        outputln("*** Unknown chars ***")
                else:
                    outputln("Rating only")
            outputln("")
except HTTPError, e:
    print "Error during request!"
    print "Code:", e.code
except URLError, e:
    print "Invalid URL!"
    print "Message:", e.reason

allreviewsFILE.close()

There already is a site that gives you this information. Take a look at http://wp7reviews.tomverhoeff.com/

There is also a free WP7 app called AppTracker which allows you to track reviews from different regions, as well as translate them into your native language

Related

How can I export layered drawings from drawio to create "animated" slides in beamer?

When preparing lectures, or conference presentations with beamer, I usually use layered drawings. Then for graphics included in consecutive slides ("frames" in beamer), I simply use different sets of layers.
For graphics created in IPE, I have created a dedicated expallviews.lua script.
Unfortunately, for graphics created with diagrams.net locally run as drawio-desktop, no such automated export of various layers exists. The only way is to manually select the visible layers in GUI and then export consecutive drawings to a set of PDF files.
Is there a more convenient method to solve that problem?
The described problem has been reported in issues 405 and 737 in the drawio-desktop repository.
After reviewing those issues, I have found a method based on changing the visibility of layers automatically (instead of manually via the GUI) and exporting the resulting drawings to a set of PDF files. The proposed method is described in a comment to issue 405. It uses a simple Python script:
#!/usr/bin/python3
"""
This script modifies the visibility of layers in the XML
file with a diagram generated by drawio.
It works around the lack of an option to export
only selected layers from the CLI version of drawio.
Written by Wojciech M. Zabolotny 6.10.2022
(wzab01<at>gmail.com or wojciech.zabolotny<at>pw.edu.pl)
The code is published under LGPL V2 license
"""
from lxml import etree as let
import xml.etree.ElementTree as et
import xml.parsers.expat as pe
from io import StringIO
import os
import sys
import shutil
import zlib
import argparse

PARSER = argparse.ArgumentParser()
PARSER.add_argument("--layers", help="Selected layers, \"all\", comma separated list of integers or integer ranges like \"0-3,6,7\"", default="all")
PARSER.add_argument("--layer_prefix", help="Layer name prefix", default="Layer_")
PARSER.add_argument("--outfile", help="Output file", default="output.drawio")
PARSER.add_argument("--infile", help="Input file", default="input.drawio")
ARGS = PARSER.parse_args()

INFILENAME = ARGS.infile
OUTFILENAME = ARGS.outfile

# Find all elements with 'value' starting with the layer prefix.
# Return tuples with the element and the rest of 'value' after the prefix.
def find_layers(el_start):
    res = []
    for el in el_start:
        val = el.get('value')
        if val is not None:
            if val.find(ARGS.layer_prefix) == 0:
                # This is a layer element. Add it, and its name
                # after the prefix, to the list.
                res.append((el, val[len(ARGS.layer_prefix):]))
                continue
        # If it is not a layer element, scan its children
        res.extend(find_layers(el))
    return res

# Analyse the list of visible layers, and create the list
# of layers that should be visible. Customize this part
# if you want a more sophisticated method for selecting
# layers.
# Currently only "all", a comma separated list of integers,
# or ranges of integers are supported.
def build_visible_list(layers):
    if layers == "all":
        return layers
    res = []
    for lay in layers.split(','):
        # Is it a range?
        s = lay.find("-")
        if s > 0:
            # This is a range
            first = int(lay[:s])
            last = int(lay[(s + 1):])
            res.extend(range(first, last + 1))
        else:
            res.append(int(lay))
    return res

def is_visible(layer_tuple, visible_list):
    if visible_list == "all":
        return True
    if int(layer_tuple[1]) in visible_list:
        return True
    return False

try:
    EL_ROOT = et.fromstring(open(INFILENAME, "r").read())
except et.ParseError as perr:
    # Handle the parsing error
    ROW, COL = perr.position
    print(
        "Parsing error "
        + str(perr.code)
        + " ("
        + pe.ErrorString(perr.code)
        + ") in column "
        + str(COL)
        + " of line "
        + str(ROW)
        + " of the file "
        + INFILENAME
    )
    sys.exit(1)

visible_list = build_visible_list(ARGS.layers)
layers = find_layers(EL_ROOT)
for layer_tuple in layers:
    if is_visible(layer_tuple, visible_list):
        print("set " + layer_tuple[1] + " to visible")
        layer_tuple[0].attrib['visible'] = "1"
    else:
        print("set " + layer_tuple[1] + " to invisible")
        layer_tuple[0].attrib['visible'] = "0"

# Now write the modified file
t = et.ElementTree(EL_ROOT)
with open(OUTFILENAME, 'w') as f:
    t.write(f, encoding='unicode')
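For reference, here is a small, untested wrapper showing how the script can drive a batch export, one PDF per set of visible layers; the file names, layer selections, and the drawio CLI invocation (-x -f pdf -o) are assumptions to adapt to your setup:
# Hypothetical batch export: one PDF per layer selection, using the script above
# (assumed to be saved as select_layers.py) plus the drawio desktop CLI on PATH.
import subprocess

FRAMES = ["0", "0-1", "0-2", "0-3"]          # one layer selection per beamer frame

for i, layers in enumerate(FRAMES, start=1):
    tmp = "frame%02d.drawio" % i
    pdf = "frame%02d.pdf" % i
    subprocess.run(["python3", "select_layers.py",
                    "--infile", "drawing.drawio",
                    "--outfile", tmp,
                    "--layers", layers], check=True)
    # drawio CLI flags may differ between versions; check `drawio --help`
    subprocess.run(["drawio", "-x", "-f", "pdf", "-o", pdf, tmp], check=True)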
The maintained version of that script, together with a demonstration of its use is also available in my github repository.

Cloud web scraping with requests returns nothing

I'm trying to create a cog for my Discord bot that scrapes Indeed and returns info on job postings (position, company, location, etc). My bot is hosted on Heroku, which is where the issues start. I've tested my web scraper by itself and when implemented as a cog for my Discord bot locally. It works both times. However, when I tried to deploy it on Heroku, the cog stopped working.
I read that this was because cloud-hosting services have blacklists or something for web scraping apps and functions. So I tried to use rq as suggested in this post:
https://devcenter.heroku.com/articles/python-rq
I did all the steps, added an additional worker, a worker.py file, and installed the Redis To Go addon. However, when I try to use the following, I receive nothing back:
url = get_url(job_title, location)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
# soup.find() returns None
I'm sure I just implemented something wrong, but can someone help me please? The full code is below:
import discord
from discord.ext import commands
import random
import requests
import time
from bs4 import BeautifulSoup
from rq import Queue
from worker import conn

ret = []

def get_url(position, location):
    '''Generate url from position and location'''
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    position = position.replace(" ", "+")
    location = location.replace(" ", "+")
    url = template.format(position, location)
    return url

def get_jobs(job_title, location):
    '''Max returned number of jobs is 15 per page.'''
    global ret
    url = get_url(job_title, location)
    response = requests.get(url)
    print(f"Responses: {response}")
    ### This returns <Response [200]>
    soup = BeautifulSoup(response.text, "html.parser")

    job_names = []
    for job_name in soup.find_all("h2", class_="jobTitle"):
        job_names.append(job_name.get_text())
    ### Each one just returns an empty list []

    companies = []
    for company in soup.find_all("span", class_="companyName"):
        companies.append(company.get_text())

    locations = []
    for location in soup.find_all("div", class_="companyLocation"):
        locations.append(location.get_text())

    salaries = []
    for salary in soup.find_all("div", class_="attribute_snippet"):
        if salary.get_text().startswith("$"):
            salaries.append(salary.get_text())
        else:
            salaries.append("Unknown")

    links = []
    for link in soup.find_all("a", class_=lambda value: value and value.startswith("tapItem fs-unmask result"), href=True):
        link = link["href"]
        link = "https://indeed.com" + link
        links.append(link)

    ret = [job_names, companies, locations, salaries, links]
    print(ret)
    ### This returns [[], [], [], [], []]

class JobScraper(commands.Cog):
    def __init__(self, client):  # References whatever is passed through the client from discord
        self.client = client
        self.q = Queue(connection=conn)

    @commands.command(aliases=["job", "find_job", "find_jobs", "get_job", "get_jobs"])
    async def jobs(self, ctx, *, query):
        '''Scrapes Indeed.com for jobs and returns them.
        The input format should be "eve jobs [job title], [job location], [num returned]"
        e.g. eve jobs ai researcher, san francisco, 3'''
        key_terms = query.split(",")
        key_terms = [term.strip() for term in key_terms]
        if len(key_terms) == 3:
            num_jobs = int(key_terms[2])
        else:
            num_jobs = 15

        # ret = get_jobs(key_terms[0], key_terms[1])
        job = self.q.enqueue(get_jobs, key_terms[0], key_terms[1])
        await ctx.send("Here is what I found:")
        for i in range(num_jobs):
            await ctx.send("```" +
                           f"\nTitle: {ret[0][i]}" +
                           f"\nCompany: {ret[1][i]}" +
                           f"\nLocation: {ret[2][i]}" +
                           f"\nSalary: {ret[3][i]}" +
                           f"\nLink: {ret[4][i]}" +
                           "\n```")

def setup(client):
    client.add_cog(JobScraper(client))
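One note on the rq pattern from the Heroku article: q.enqueue() runs get_jobs in the separate worker process, so the module-level ret in the bot process is never filled in; the return value has to be read back from the job object. A minimal, untested sketch of that pattern, assuming get_jobs is changed to return its result list and lives in a module the worker can also import (the module name below is a placeholder):
import time
from rq import Queue
from worker import conn           # same worker.py as in the Heroku article
from jobs_module import get_jobs  # placeholder: wherever get_jobs is defined

q = Queue(connection=conn)
job = q.enqueue(get_jobs, "ai researcher", "san francisco")

# Poll until the worker has produced a result (inside the Discord cog,
# use `await asyncio.sleep(1)` instead of time.sleep to avoid blocking).
while not job.is_finished and not job.is_failed:
    time.sleep(1)

print(job.result)  # the list returned by get_jobs, or None if the job failed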

For Loop for scraping emails for multiple urls - BS

Below is the code for scraping emails for a single base url and I have been cracking my head on getting a simple "for loop" to do it for an array of urls or reading a list of urls (csv) into python. Can anyone modify the code so that it will do the job?
import requests
import re
from bs4 import BeautifulSoup

allLinks = []
mails = []
url = 'https://www.smu.edu.sg/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = [a.attrs.get('href') for a in soup.select('a[href]')]
allLinks = set(links)

def findMails(soup):
    for name in soup.find_all('a'):
        if name is not None:
            emailText = name.text
            match = bool(re.match(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', emailText))
            if '@' in emailText and match == True:
                emailText = emailText.replace(" ", '').replace('\r', '')
                emailText = emailText.replace('\n', '').replace('\t', '')
                if (len(mails) == 0) or (emailText not in mails):
                    print(emailText)
                    mails.append(emailText)

for link in allLinks:
    if link.startswith("http") or link.startswith("www"):
        r = requests.get(link)
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        findMails(soup)
    else:
        newurl = url + link
        r = requests.get(newurl)
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        findMails(soup)

mails = set(mails)
if len(mails) == 0:
    print("NO MAILS FOUND")

Possible to replace Scrapy's default lxml parser with Beautiful Soup's html5lib parser?

Question: Is there a way to integrate BeautifulSoup's html5lib parser into a scrapy project--instead of the scrapy's default lxml parser?
Scrapy's parser fails to find some elements on the pages I scrape.
This only happens every 2 out of 20 pages.
As a fix, I've added BeautifulSoup's parser to the project (which works).
That said, I feel like I'm doubling the work with conditionals and multiple parsers; at a certain point, what's the reason for using Scrapy's parser at all? The code does work, but it feels like a hack.
I'm no expert--is there a more elegant way to do this?
Much appreciation in advance
Update: Adding a middleware class to scrapy (from the python package scrapy-beautifulsoup) works like a charm. Apparently, lxml from Scrapy is not as robust as BeautifulSoup's lxml. I didn't have to resort to the html5lib parser--which is 30X+ slower.
class BeautifulSoupMiddleware(object):
    def __init__(self, crawler):
        super(BeautifulSoupMiddleware, self).__init__()
        self.parser = crawler.settings.get('BEAUTIFULSOUP_PARSER', "html.parser")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_response(self, request, response, spider):
        """Overridden process_response would "pipe" response.body through BeautifulSoup."""
        return response.replace(body=str(BeautifulSoup(response.body, self.parser)))
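For completeness, a downloader middleware like this only takes effect once it is registered in the project settings. A minimal sketch; the module path and priority number are placeholders to adapt to your project layout:
# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'crawler.middlewares.BeautifulSoupMiddleware': 543,   # placeholder path/priority
}
BEAUTIFULSOUP_PARSER = 'lxml'   # picked up by the middleware's __init__ above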
Original:
import scrapy
from scrapy.item import Item, Field
from scrapy.loader.processors import TakeFirst, MapCompose
from scrapy import Selector
from scrapy.loader import ItemLoader
from w3lib.html import remove_tags
from bs4 import BeautifulSoup

class SimpleSpider(scrapy.Spider):
    name = 'SimpleSpider'
    allowed_domains = ['totally-above-board.com']
    start_urls = [
        'https://totally-above-board.com/nefarious-scrape-page.html'
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.spiders.simple_spider.Pipeline': 400
        }
    }

    def parse(self, response):
        yield from self.parse_company_info(response)
        yield from self.parse_reviews(response)

    def parse_company_info(self, response):
        print('parse_company_info')
        print('==================')
        loader = ItemLoader(CompanyItem(), response=response)
        loader.add_xpath('company_name',
                         '//h1[contains(@class,"sp-company-name")]//span//text()')
        yield loader.load_item()

    def parse_reviews(self, response):
        print('parse_reviews')
        print('=============')

        # Beautiful Soup
        selector = Selector(response)

        # On the Page (Total Reviews)  # 49
        search = '//span[contains(@itemprop,"reviewCount")]//text()'
        review_count = selector.xpath(search).get()
        review_count = int(float(review_count))

        # Number of elements Scrapy's LXML could find  # 0
        search = '//div[@itemprop="review"]'
        review_element_count = len(selector.xpath(search))

        # Use Scrapy or Beautiful Soup?
        if review_count > review_element_count:
            # Try Beautiful Soup
            soup = BeautifulSoup(response.text, "lxml")
            root = soup.findAll("div", {"itemprop": "review"})
            for review in root:
                loader = ItemLoader(ReviewItem(), selector=review)
                review_text = review.find("span", {"itemprop": "reviewBody"}).text
                loader.add_value('review_text', review_text)
                author = review.find("span", {"itemprop": "author"}).text
                loader.add_value('author', author)
                yield loader.load_item()
        else:
            # Try Scrapy
            review_list_xpath = '//div[@itemprop="review"]'
            selector = Selector(response)
            for review in selector.xpath(review_list_xpath):
                loader = ItemLoader(ReviewItem(), selector=review)
                loader.add_xpath('review_text',
                                 './/span[@itemprop="reviewBody"]//text()')
                loader.add_xpath('author',
                                 './/span[@itemprop="author"]//text()')
                yield loader.load_item()

        yield from self.paginate_reviews(response)

    def paginate_reviews(self, response):
        print('paginate_reviews')
        print('================')

        # Try Scrapy
        selector = Selector(response)
        search = '''//span[contains(@class,"item-next")]
                    //a[@class="next"]/@href
                 '''
        next_reviews_link = selector.xpath(search).get()

        # Try Beautiful Soup
        if next_reviews_link is None:
            soup = BeautifulSoup(response.text, "lxml")
            try:
                next_reviews_link = soup.find("a", {"class": "next"})['href']
            except Exception as e:
                pass

        if next_reviews_link:
            yield response.follow(next_reviews_link, self.parse_reviews)
It’s a common feature request for Parsel, Scrapy’s library for XML/HTML scraping.
However, you don’t need to wait for such a feature to be implemented. You can fix the HTML code using BeautifulSoup, and use Parsel on the fixed HTML:
from bs4 import BeautifulSoup
# …
response = response.replace(body=str(BeautifulSoup(response.body, "html5lib")))
You can get a charset error with @Gallaecio's answer if the original page was not UTF-8 encoded, because the response has been assigned a different encoding.
So, you must first switch the encoding.
In addition, there may be a problem with character escaping.
For example, if the character < is encountered in the text of the HTML, it must be escaped as &lt;. Otherwise, "lxml" will delete it and the text near it, considering it an erroneous HTML tag.
"html5lib" escapes characters, but is slow.
response = response.replace(encoding='utf-8',
                            body=str(BeautifulSoup(response.body, 'html5lib')))
"html.parser" is faster, but from_encoding must also be specified (for example, 'cp1251').
response = response.replace(encoding='utf-8',
                            body=str(BeautifulSoup(response.body, 'html.parser', from_encoding='cp1251')))

Reading intended recipient from Undeliverable emails via Interop for Outlook

I've created an application, which is used to loop through the emails in an inbox and find all the undeliverable, mailbox full or delayed emails and generate a report.
The usual routine is to loop through all the emails in the inbox (up to a specified date).
If an email is undeliverable, I use a regex to find the address. This works 95% of the time, as this information is contained in the body of the Undelivered message (ReportItem).
My problem is that I have a few emails which return blank addresses to the report, making it nigh on impossible to clean them or easily report that we have a problem with someone's email.
I have found that the information in the Internet Headers has who the mail was intended for, but cannot find anything on if it is possible to use an interop or some other object to obtain this information.
If anyone else has come across this problem and knows of a work around I would be very grateful.
Cheers
I was looking to automate an Outlook mailbox to move all undelivered emails and store the email address of the recipient of each undeliverable message in a list, so that I can later check whether an entry of the list is present in an Excel column and then remove it from the Excel file. I hope this helps!
I've found a Python solution for this problem. The Python library used to connect to Outlook is win32com, so first we import all the libraries we will need:
import win32com.client
import re
import datetime as dt
from tqdm import tqdm
import time
import extract_msg
This is a good way to connect to a specific Outlook account:
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
accounts= win32com.client.Dispatch("Outlook.Application").Session.Accounts
Then create a loop that iterates through the whole of Outlook and gets to the specified mail account:
for account in accounts:
    inbox = outlook.Folders(account.DeliveryStore.DisplayName)
    if account.DeliveryStore.DisplayName == 'place_your_account_name_here':
        for folder in inbox.Folders:
Find the folder in Outlook you wish to check by folder name; so if you want to iterate through Inbox, type "Inbox" instead of "Folder_name":
            if folder.__str__() == "Folder_name":
                messages = folder.Items
                messages.Sort('[ReceivedTime]', True)
                if folder.Folders.Item('Undeliverable'):
                    undeliverable = folder.Folders.Item('Undeliverable')
                    list_of_undelivered_email_addresses = my_super_function(messages, undeliverable)
After we have reached the mail items and declared the undeliverable subfolder as "undeliverable", we specify the time period for which we want to run the function below:
def my_super_function(messages, undeliverable):
    list_of_undelivered_email_addresses = []
    last_n_days = dt.datetime.now() - dt.timedelta(days=25)
    messages = messages.Restrict("[ReceivedTime] >= '" + last_n_days.strftime('%m/%d/%Y %H:%M %p') + "'")
    rl = list()
I have found that most types of undeliverable emails present some sort of error, and below the error is the original version of the email I sent. Most of them (with very few exceptions) have a line that says:
To: "Some_email_address" ....
This is why I used this regular expression to read the whole line after my pattern (which is 'To: "'):
pattern = re.compile('To: ".*\n?',re.MULTILINE)
for counter, message in enumerate(messages):
It is very important that you save the email somewhere on your PC, because otherwise, as soon as you read its body, the email gets encrypted.
message.SaveAs("undeliverable_emails.msg")
f = r'specify_the_absolute_path_where_you_want_it_saved'
try:
    msg = extract_msg.Message(f)
    print(counter)
Search the saved msg body for the keyword Undeliverable:
if msg.body.find("undeliverable")!= -1 or msg.body.find("Undeliverable")!= -1 or msg.subject.find("Undeliverable")!= -1 or msg.subject.find("undeliverable")!= -1 or msg.body.find("wasn't found at")!= -1:
Save the actual email to a list, so you can move it to the undeliverables subfolder later
        rl.append(message)
        m = re.search(pattern, msg.body)
        m = m[0]
        mail_final = m.split('"')[1]
        list_of_undelivered_email_addresses.append(mail_final)
        list_of_undelivered_email_addresses = list(filter(None, list_of_undelivered_email_addresses))
    else:
        print('this email is not an undeliverable one')
except:
    pass
Move all mails in the list to the undeliverables folder:
if len(rl) == 0:
    pass
else:
    for m in tqdm(rl):
        m.Move(undeliverable)
return list_of_undelivered_email_addresses
Here is the full code:
import win32com.client
import re
import datetime as dt
from tqdm import tqdm  # tqdm gives you the progress bar
import time
import extract_msg

outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
accounts = win32com.client.Dispatch("Outlook.Application").Session.Accounts

def my_super_function(messages, undeliverable):
    list_of_undelivered_email_addresses = []
    last_n_days = dt.datetime.now() - dt.timedelta(days=25)
    messages = messages.Restrict("[ReceivedTime] >= '" + last_n_days.strftime('%m/%d/%Y %H:%M %p') + "'")
    rl = list()
    pattern = re.compile('To: ".*\n?', re.MULTILINE)
    for counter, message in enumerate(messages):
        message.SaveAs("undeliverable_emails.msg")
        f = r'some_absolute_path'
        try:
            msg = extract_msg.Message(f)
            print(counter)
            if msg.body.find("undeliverable") != -1 or msg.body.find("Undeliverable") != -1 or msg.subject.find("Undeliverable") != -1 or msg.subject.find("undeliverable") != -1 or msg.body.find("wasn't found at") != -1:
                rl.append(message)
                m = re.search(pattern, msg.body)
                m = m[0]
                mail_final = m.split('"')[1]
                list_of_undelivered_email_addresses.append(mail_final)
                list_of_undelivered_email_addresses = list(filter(None, list_of_undelivered_email_addresses))
            else:
                print('else')
        except:
            pass
    if len(rl) == 0:
        pass
    else:
        for m in tqdm(rl):
            m.Move(undeliverable)
    return list_of_undelivered_email_addresses

for account in accounts:
    inbox = outlook.Folders(account.DeliveryStore.DisplayName)
    if account.DeliveryStore.DisplayName == 'desired_email_address':
        for folder in inbox.Folders:
            if folder.__str__() == "Inbox":
                messages = folder.Items
                messages.Sort('[ReceivedTime]', True)
                if folder.Folders.Item('Undeliverable'):
                    undeliverable = folder.Folders.Item('Undeliverable')
                    list_of_undelivered_email_addresses = my_super_function(messages, undeliverable)
Looks like what I want isn't part of the ReportItem properties.
The possible options are Extended MAPI, CDO or Redemption:
http://www.tech-archive.net/Archive/Outlook/microsoft.public.outlook.program_vba/2004-11/0084.html
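As a follow-up: the intended recipient mentioned in the question can often be recovered from the transport (Internet) headers, which the Outlook object model exposes through PropertyAccessor, including on ReportItem objects. A minimal, untested win32com sketch along those lines, using the standard PR_TRANSPORT_MESSAGE_HEADERS property tag; the header regex is an assumption to adapt to your NDR format:
import re
import win32com.client

# MAPI property tag for the raw transport/Internet headers (PR_TRANSPORT_MESSAGE_HEADERS)
PR_TRANSPORT_MESSAGE_HEADERS = "http://schemas.microsoft.com/mapi/proptag/0x007D001E"

outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
inbox = outlook.GetDefaultFolder(6)          # 6 = olFolderInbox

for item in inbox.Items:
    if item.Class == 46:                     # 46 = olReport (ReportItem)
        try:
            headers = item.PropertyAccessor.GetProperty(PR_TRANSPORT_MESSAGE_HEADERS)
        except Exception:
            continue                         # some reports carry no headers
        # hypothetical extraction: the original recipient often appears in a To: header
        match = re.search(r'^To:\s*(.+)$', headers, re.MULTILINE)
        if match:
            print(match.group(1).strip())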
