Cloud web scraping with requests returns nothing

Cloud web scraping with requests returns nothing - heroku

I'm trying to create a cog for my Discord bot that scrapes Indeed and returns info on job postings (position, company, location, etc). My bot is hosted on Heroku, which is where the issues start. I've tested my web scraper by itself and when implemented as a cog for my Discord bot locally. It works both times. However, when I tried to deploy it on Heroku, the cog stopped working.
I read that this was because cloud-hosting services have blacklists or something for web scraping apps and functions. So I tried to use rq as suggested in this post:
https://devcenter.heroku.com/articles/python-rq
I did all the steps, added an additional worker, a worker.py file, and installed the Redis To Go addon. However, when I try to use the following, I receive nothing back:
url = get_url(job_title, location)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
# soup.find() returns None
I'm sure I just implemented something wrong, but can someone help me please? The full code is below:
import discord
from discord.ext import commands
import random
import requests
import time
from bs4 import BeautifulSoup
from rq import Queue
from worker import conn
ret = []
def get_url(position, location):
'''Generate url from position and location'''
template = 'https://www.indeed.com/jobs?q={}&l={}'
position = position.replace(" ", "+")
location = location.replace(" ", "+")
url = template.format(position, location)
return url
def get_jobs(job_title, location):
'''Max returned number of jobs is 15 per page.'''
global ret
url = get_url(job_title, location)
response = requests.get(url)
print(f"Responses: {response}")
### This returns <Response [200]>
soup = BeautifulSoup(response.text, "html.parser")
job_names = []
for job_name in soup.find_all("h2", class_="jobTitle"):
job_names.append(job_name.get_text())
### Each one just returns an empty list []
companies = []
for company in soup.find_all("span", class_="companyName"):
companies.append(company.get_text())
locations = []
for location in soup.find_all("div", class_="companyLocation"):
locations.append(location.get_text())
salaries = []
for salary in soup.find_all("div", class_="attribute_snippet"):
if salary.get_text().startswith("$"):
salaries.append(salary.get_text())
else:
salaries.append("Unknown")
links = []
for link in soup.find_all("a", class_=lambda value: value and value.startswith("tapItem fs-unmask result"), href=True):
link = link["href"]
link = "https://indeed.com" + link
links.append(link)
ret = [job_names, companies, locations, salaries, links]
print(ret)
### This returns [[], [], [], [], []]
class JobScraper(commands.Cog):
def __init__(self, client): # References whatever is passed through the client from discord
self.client = client
self.q = Queue(connection=conn)
#commands.command(aliases=["job", "find_job", "find_jobs", "get_job", "get_jobs"])
async def jobs(self, ctx, *, query):
'''Scrapes Indeed.com for jobs and returns them.
The input format should be "eve jobs [job title], [job location], [num returned]
e.g. eve jobs ai researcher, san francisco, 3'''
key_terms = query.split(",")
key_terms = [term.strip() for term in key_terms]
if len(key_terms) == 3:
num_jobs = int(key_terms[2])
else:
num_jobs = 15
# ret = get_jobs(key_terms[0], key_terms[1])
job = self.q.enqueue(get_jobs, key_terms[0], key_terms[1])
await ctx.send("Here is what I found:")
for i in range(num_jobs):
await ctx.send("```" +
f"\nTitle: {ret[0][i]}" +
f"\nCompany: {ret[1][i]}" +
f"\nLocation: {ret[2][i]}" +
f"\nSalary: {ret[3][i]}" +
f"\nLink: {ret[4][i]}" +
"\n```")
def setup(client):
client.add_cog(JobScraper(client))

Related

Giphy API not responding in discord.py

I made a discord bot that shows you gifs when you type a certain command but the problem is that it works fine in the first half but takes a long time to show the gifs when not used.
Basically it doesn't show the gifs instantly when not used.
Here's the code that I've written:
#client.command()
async def gif(ctx, *, q = 'dance'):
api_key = 'Some Key here'
api_instanc = giphy_client.DefaultApi()
try:
api_responce = api_instanc.gifs_search_get(api_key, q, limit = 7,rating = 'r')
lst = list(api_responce.data)
giff = random.choice(lst)
emb = discord.Embed(title = f"Requested by {ctx.author} " + q )
emb.set_image(url= f'https://media.giphy.com/media/{giff.id}/giphy.gif')
await ctx.channel.send(embed = emb)
except ApiException as e:
await ctx.channel.send("API EXCEPTION")
It doesn't show any errors but doesn't work after the long time.
Any re-write of the code with aiohttp will be appreciated because I am learning that.

I think the module you are using is not asynchronous which leads to blocking read more.
Default in the command is search = None you can use that with an if statement to check.
After that is the request for the api to get the image.
Here is the code edited to use aiohttp
# import aiohttp
# import random
#bot.command()
async def giphy(ctx, search: str = None):
api_key = ""
embed = discord.Embed(title=f"Requested by {ctx.author}")
async with aiohttp.ClientSession() as session:
# search
if search:
embed.description = search
async with session.get(f'http://api.giphy.com/v1/gifs/search?q={search}&api_key={api_key}&limit=10') as response:
data = await response.json()
gif_choice = random.randint(0, 9)
embed.set_image(url=data['data'][gif_choice]['images']['original']['url'])
# radnom
else:
async with session.get(f'http://api.giphy.com/v1/gifs/random?api_key={api_key}&limit=10') as response:
data = await response.json()
embed.set_image(url=data['data']['images']['original']['url'])
await ctx.send(embed=embed)

Discord.py Rewrite Giphy Cog Error: Unclosed Connector

I'm trying to build a Giphy cog for my discord.py bot and it's throwing the following error
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000001CA509CDA60>
Unclosed connector
connections: ['[(<aiohttp.client_proto.ResponseHandler object at 0x000001CA51897BE0>, 77055.671)]']
connector: <aiohttp.connector.TCPConnector object at 0x000001CA509CDA30>
Does anyone know what is causing this and how I would fix it?
Here is my code:
import os
import aiohttp
import random
import discord
from discord.ext import commands
from dotenv import load_dotenv
load_dotenv()
prefix = os.getenv("CLIENT_PREFIX")
giphy_api = os.getenv("GIPHY_API_KEY")
command_attrs = {'hidden': False}
class FunCog(commands.Cog, name='Fun Commands', command_attrs=command_attrs):
def __init__(self, client):
self.client = client
#commands.command(name='gif')
async def _gif(self, ctx, *, search, json=None):
embed = discord.Embed(colour=discord.Colour.blue())
session = aiohttp.ClientSession()
if search == '':
response = await session.get('https://api.giphy.com/v1/gifs/random?api_key=' + giphy_api)
data = json.loads(await response.text())
embed.set_image(url=data['data']['images']['original']['url'])
else:
search.replace(' ', '+')
response = await session.get(
'http://api.giphy.com/v1/gifs/search?q=' + search + '&api_key=' + giphy_api + '&limit=10')
data = json.loads(await response.text())
gif_choice = random.randint(0, 9)
embed.set_image(url=data['data'][gif_choice]['images']['original']['url'])
await session.close()
await ctx.send(embed=embed)
def setup(client):
client.add_cog(FunCog(client))
I omitted my Giphy API Key for security reasons.
I'm using discord.py rewrite and python 3.8.6if it helps.
Basically I want to be able to search a gif with it by tag and it will respond with a random gif from giphy for the specified tag.
--EDIT--
Moved the error logging to my Events.py file
Moved the API Keys to my .env file
Removed the tenor_api env since it won't be getting used and has nothing to do with this question
Updated the code a bit to resolve a few missing parameters.
---EDIT---
Thanks to Fixator10 for the response that fixed the issue I was having in this post.
Here is my working code if anyone wants to use it:
import os
import aiohttp
import random
import discord
import json
from discord.ext import commands
from dotenv import load_dotenv
load_dotenv()
prefix = os.getenv("CLIENT_PREFIX")
giphy_api = os.getenv("GIPHY_API_KEY")
command_attrs = {'hidden': False}
class FunCog(commands.Cog, name='Fun Commands', command_attrs=command_attrs):
def __init__(self, client):
self.client = client
self.session = aiohttp.ClientSession()
def cog_unload(self):
self.client.loop.create_task(self.session.close())
#commands.command(name='gif')
async def _gif(self, ctx, *, search):
session = self.session
embed = discord.Embed(colour=discord.Color.dark_gold())
if search == '':
response = await session.get('https://api.giphy.com/v1/gifs/random?api_key=' + giphy_api)
data = json.loads(await response.text())
embed.set_image(url=data['data']['images']['original']['url'])
else:
search.replace(' ', '+')
response = await session.get(
'http://api.giphy.com/v1/gifs/search?q=' + search + '&api_key=' + giphy_api + '&limit=10')
data = json.loads(await response.text())
gif_choice = random.randint(0, 9)
embed.set_image(url=data['data'][gif_choice]['images']['original']['url'])
await ctx.send(embed=embed)
def setup(client):
client.add_cog(FunCog(client))
It's probably not the best since it only uses a single tag but I'll probably improve that at another time xD

This is caused by your session behavior. You dont close session if search is empty. In two words: you should close session after usage.
The most simple solution - is to use context manager:
# https://docs.aiohttp.org/en/stable/#client-example
async with aiohttp.ClientSession() as session:
async with session.get('http://python.org') as response:
Otherwise, you can create a session for your Cog and close it on unload:
class MyCog(commands.Cog):
def __init__(client):
self.client = client
self.session = aiohttp.ClientSession()
# https://discordpy.readthedocs.io/en/stable/ext/commands/api.html#discord.ext.commands.Cog.cog_unload
def cog_unload(self):
# since close is coroutine,
# and we need to call it in sync func
# lets create asyncio task
self.client.loop.create_task(self.session.close())
# or, we can use detach:
# self.session.detach()
#commands.command()
async def mycmd(ctx):
async with self.session.get("https://example.com") as response:
await ctx.send(response.status)

Attempting to rewrite a Discord.py GIF Bot from an old outdated guide

Ok, so I'm attempting to rebuild a GIF bot in Discord.py.
It is based off of some code found on a website that gave a guide on making a discord.py gif bot.
The problem is, the code is old and severely outdated with over 37 errors on it's own.
How would I go about converting this to discord.py-rewrite and basically making it a gif searching bot that can pick up random gifs by tag from GIPHY?
Here is the code I currently have:
import discord.utils
import os
import giphy_client
from giphy_client.rest import ApiException
from pprint import pprint
from discord.ext import commands
import random
from dotenv import load_dotenv
load_dotenv()
API_KEY = os.getenv("GIPHY_API_KEY")
API_TOKEN = os.getenv("GIPHY_API_TOKEN")
api_instance = giphy_client.DefaultApi()
config = {
'api_key': API_KEY,
'token': API_TOKEN,
'limit': 1,
'rating': 'g'
}
try:
api_response = api_instance.gifs_trending_get(
config['token'], limit=config['limit'], rating=config['rating'])
pprint(api_response)
except ApiException as e:
print("Exception when calling DefaultApi->gifs_trending_get: %s\n" % e)
Client = commands.Bot(command_prefix=os.getenv("CLIENT_PREFIX"))
class DiscordClient(discord.Client):
#Client.event
async def on_ready(self):
print("Connected to Discord Client as")
print(f'{self.user.name}#{self.user.discriminator}')
print("-------")
#Client.command(name="gif")
async def search_gifs(query):
try:
giphy_token = API_TOKEN
response = api_instance.gifs_search_get(giphy_token, query, limit=3, rating='g')
lst = list(response.data)
gif = random.choices(lst)
return gif[0].url
except ApiException as e:
return "Exception when calling DefaultApi->gifs_search_get: %s\n" % e
async def on_ready(self):
print("Connected to Discord Client as")
print(Client.user.name)
print("-------")
#Client.command(name='gif')
async def search_gifs(query):
try:
response = api_instance.gifs_search_get(
API_TOKEN, query, limit=3, rating='g')
lst = list(response.data)
gif = random.choices(lst)
return gif[0].url
except ApiException as e:\
return "Exception when calling DefaultApi->gifs_search_get: %s\n" % e\
#Client.command(name='8ball')
async def magic_eight_ball(ctx):
response = [
'Without a doubt.',
'Outlook good.',
'Better not tell you now.',
'Cannot predict now.',
'My reply is no.',
'Outlook not so good.',
]
gif = await search_gifs('cheese')
await ctx.send(random.choice(response))
await ctx.send('Gif URL : ' + gif)
Client.run(os.getenv("CLIENT_TOKEN"))
Here is the website I got the code I'm building off of from:
https://www.maxongzb.com/serving-gifs-with-discord-bot-reading-time-12-mins/
Here is what I currently have for the requirements.txt file:
discord.py
giphy_client
python-dotenv
Any help would be much appreciated.
I host via Heroku due to my VPS taking a dump, so that's why I have the os.getenv("...") lines in there.
I've gotten it down to around 7 errors, 8 warnings and 8 weak warnings. But that's the furthest I was able to get to.

Python Telegram bot on Heroku can't access Google Spreadsheet

I am deplying a python program on Heroku. Everything works fine locally, but it doesn't seem to run on Heroku
The bot.py file contains two variables which are saved as config in Heroku: TOKEN and SPREADSHEET, these are the Token of the telegram bot and the ID of the google Spreadsheet I am trying to access.
When the code is running on Heroku, I get this:
The code seems to be running on the Heroku server, but when I try to send the command on the Telegram bot, it doesn't work
at=info method=POST path="/" host=fff-transparency-wg.herokuapp.com request_id=d883bfc2-24a3-4cb5-b638-36b310726780 fwd="91.108.6.81" dyno=web.1 connect=1ms service=5ms status=200 bytes=172 protocol=https
The program contains the files:
bot.py
client_secret.json
Procfile
requirements.txt
runtime.txt
BOT.PY
from telegram.ext import Updater, CommandHandler, CallbackQueryHandler
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
import logging
import os
import json
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from itertools import permutations
import re
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger("telegram.bot")
def start(update, context):
context.bot.send_message(
chat_id=update.effective_chat.id, text="I'm a bot, please talk to me!")
# def help(update, context):
def call(update, context):
"""
Save Message ID, Title, Date, Time, Duration, Description, Link, Group ID, Group name, Saved by Name, Username
Expected inputs (in order): Date, Time, Duration, Title, Description, Link
Send variables to Google Sheet
Send variables to Trello Board
Send variables to Google Calendar
OPTIONAL:
Notification System
"""
message_id = update.message.message_id
user = update.message.from_user
username = user['username']
full_name = "{} {}".format(user['first_name'], user['last_name'])
groupchat = update.message.chat
message_text = update.message.text
text = format_string(message_text, "/call")
if (text == -1):
groupchat.send_message(
text="Please make sure all arguments are inserted in the correct order and separated by semicolomns:\n\n- Date (dd/mm/yy) \n- Time (GMT) \n- Duration (hour:min) \n- Title \n- Description(optional) \n- Agenda Link(optional)")
call_date = text[0]
call_time = text[1]
call_duration = text[2]
call_title = text[3]
#if text[4]: call_description = text[4]
#if text[5]: call_agenda = text[5]
calls.append_row(
[message_id, call_title, call_date, call_time, call_duration])
# def group(update, context):
def str2date(string):
"Parse a string into a datetime object."
for fmt in dateformats():
try:
return datetime.strptime(string, fmt)
except ValueError:
pass
raise ValueError("'%s' is not a recognized date/time" % string)
def format_string(message, command):
message = re.sub(command, '', message)
message.strip()
message = message.split(';')
if not(len(message) >= 4):
return -1
if command == "/call":
message[0: 1] = [' '.join(message[0: 1])]
s = message[0].strip()
try:
message[0] = str2date(s)
except ValueError:
print("invalid date/time")
return message
def dateformats():
"Yield all combinations of valid date formats."
years = ("%Y",)
months = ("%b", "%B")
days = ("%d",)
times = ("%I%p", "%I:%M%p", "%H:%M", "")
for year in years:
for month in months:
for day in days:
for args in ((day, month), (month, day)):
date = " ".join(args)
for time in times:
for combo in permutations([year, date, time]):
yield " ".join(combo).strip()
def error(update, context):
logger.warning('Update "%s" caused error "%s"', update, context.error)
def main():
TOKEN = os.environ['TOKEN']
updater = Updater(token=TOKEN, use_context=True)
dp = updater.dispatcher
PORT = int(os.environ.get('PORT', '8443'))
updater.start_webhook(listen="0.0.0.0", port=PORT, url_path=TOKEN)
updater.bot.set_webhook("https://fff-transparency-wg.herokuapp.com/" + TOKEN)
updater.idle()
# use creds to create a client to interact with the Google Drive API
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive', 'https://www.googleapis.com/auth/drive.file', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name(
'client_secret.json', scope)
client = gspread.authorize(creds)
# Find a workbook by name and open the first sheet
SPREADSHEET = os.environ['SPREADSHEET']
spreadsheet = client.open_by_key(
SPREADSHEET)
groupchats = spreadsheet.get_worksheet(0)
calls = spreadsheet.get_worksheet(1)
# Commands
dp.add_handler(CommandHandler("start", start))
#dp.add_handler(CommandHandler("help", help))
dp.add_handler(CommandHandler("call", call))
#dp.add_handler(CommandHandler("group", group))
dp.add_error_handler(error)
if __name__ == '__main__':
main()
PROCFILE
web: python3 bot.py
worker: python3 bot.py
REQUIREMENTS.TXT
certifi==2019.11.28
cffi==1.14.0
chardet==3.0.4
cryptography==2.9
decorator==4.4.2
future==0.18.2
gspread==3.3.1
httplib2==0.17.1
idna==2.9
oauth2client==4.1.3
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.20
python-telegram-bot==12.5.1
requests==2.23.0
rsa==4.0
six==1.14.0
tornado==6.0.4
urllib3==1.25.8
RUNTIME.TXT
python-3.8.2

I had the same issue with my telegram bot.
I added the actual bot token string to the TOKEN variable and uploaded it to Heroku.
Then I used os.environ.get('TOKEN') command for the version that I pushed on GitHub.
I am still just a beginner in python, I don't know if this posses security issues but it removed the error code and allowed my bot to work

Getting all reviews from a specific Windows Phone Marketplace Game

Does AppHub let us see reviews of our apps from all marketplaces at once? As I didn't find any, o took some time writing some code to print them all in a file, so i won't waste my time looking for them in every single language.
I'd appreciate any better solution. In the worst case, I'm glad to share the code with anyone who finds it usefull.
It uses BeautifulSoup.
The only parametter is the id of the app, like this:
wp7reviews.py 62289160-6970-4674-85a0-aef3dbe3f93d
Here is the code
import sys
import getopt
from urllib2 import URLError
from urllib2 import HTTPError
import urllib2
from BeautifulSoup import BeautifulStoneSoup
opts, extraparams = getopt.getopt(sys.argv[1:], '')
# starts at the second element of argv since the first one is the script name
# extraparms are extra arguments passed after all option/keywords are assigned
# opts is a list containing the pair "option"/"value"
#print 'Opts:',opts
#print 'Extra parameters:',extraparams
try:
appid = extraparams[0]
except:
#Awsome Linkit appid as default appid
appid="62289160-6970-4674-85a0-aef3dbe3f93d"
allreviewsFILE = open("allreviews.txt", "w")
def output(text):
allreviewsFILE.write(text)
#print text,
def outputln(text):
allreviewsFILE.write(text+'\n')
#print text
def geturl(lang):
return "http://catalog.zune.net/v3.2/"+lang+"/apps/"+appid
try:
request = urllib2.Request(geturl("en-us"))
fd = urllib2.urlopen(request)
content = fd.read()
fd.close()
soup = BeautifulStoneSoup(content)
try:
outputln("App title: "+soup.findAll("a:title")[0].string)
outputln("");
except:
print "Failed to get App Title"
langs = ["en-us", "en-gb", "de-de",
"fr-fr", "es-es", "it-it",
"en-au", "de-at", "fr-be",
"fr-ca", "en-ca", "en-hk",
"en-in", "en-ie", "es-mx",
"en-nz", "en-sg", "de-ch",
"fr-ch", "zh-hk", "zh-cn",
"en-hk"]
outputln("Here we got reviews from each marketplace")
for lang in langs:
request = urllib2.Request(geturl(lang)+"/reviews")
fd = urllib2.urlopen(request)
print "Fetching "+lang+"...",
content = fd.read()
fd.close()
print "OK"
soup = BeautifulStoneSoup(content)
#print soup.prettify()
contents = soup.findAll("a:content")
ratings = soup.findAll("userrating")
l = len(contents)
if l > 0:
outputln("----------- "+lang+" ---------------------------------------------------")
outputln("")
for i in range(0, l):
output(ratings[i].string+"/10 - ")
if len(contents[i].contents) > 0:
try:
outputln(contents[i].contents[0])
except:
outputln("*** Unknown chars ***")
else:
outputln("Rating only")
outputln("")
except HTTPError, e:
print("Error during request!\n")
print("Cod.: ", e.code)
except URLError, e:
print("Invalid URL!\n")
print("Message: ", e.reason)

There already is a site that gives you this information. Take a look at http://wp7reviews.tomverhoeff.com/

There is also a free WP7 app called AppTracker which allows you to track reviews from different regions, as well as translate them into your native language

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Cloud web scraping with requests returns nothing - heroku

Related

Giphy API not responding in discord.py

Discord.py Rewrite Giphy Cog Error: Unclosed Connector

Attempting to rewrite a Discord.py GIF Bot from an old outdated guide

Python Telegram bot on Heroku can't access Google Spreadsheet

Getting all reviews from a specific Windows Phone Marketplace Game

Categories

Resources