Is my code ever reaching the proxy method?

I am playing around with proxies in requests. The idea is that if I set a proxy on a session, it should be used for every request made with that session. I have written the code, but I haven't been able to check whether the traffic really goes through the proxy, and I'm not sure this is the right place to paste it. What I have looks like this:
with open('proxies.json') as json_data_file:
    proxies = json.load(json_data_file)

def setProxy(proxy):
    # Build a session that routes both http and https traffic through the proxy
    s = requests.Session()
    proxies = {'http': 'http://' + proxy,
               'https': 'http://' + proxy}
    s.proxies.update(proxies)
    return s

def info(thread):
    global prod
    prod = int(thread) + 1
    runit(proxies)

def runit(proxies):
    # NB: url, log() and passwd() are not defined in this snippet
    try:
        if proxies != []:
            s = setProxy(random.choice(proxies))
            sleepy = time.sleep(.5)
        else:
            s = requests.Session()
            sleepy = time.sleep(1)
        r = s.get(url)
    except requests.exceptions.ProxyError:
        log(Fore.RED + "Proxy DEAD - rotating" + Fore.RESET)
        sleepy
        passwd(proxies)
    PostUrl = s.post('https://www.hellotest.com')
    print("Does it actually use the proxy or not?")

def main():
    i = 0
    jobs = []
    for i in range(10):
        p = multiprocessing.Process(target=info, args=(str(i),))
        jobs.append(p)
        time.sleep(.5)
        p.start()
    for p in jobs:
        p.join()
    sys.exit()
Is there a way to actually see whether the proxy is being used or not? This is my first time doing this, so please don't judge!
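One way to check is to request a page that echoes back the caller's IP address, once without the proxy and once through it. A minimal sketch, assuming the entries in proxies.json are "host:port" strings and using https://httpbin.org/ip purely as an example echo service:

import requests

proxy = "1.2.3.4:8080"  # placeholder - substitute one entry from proxies.json
s = requests.Session()
s.proxies.update({'http': 'http://' + proxy,
                  'https': 'http://' + proxy})

# httpbin.org/ip returns the IP address it was contacted from
print(requests.get('https://httpbin.org/ip').json())  # your real IP
print(s.get('https://httpbin.org/ip').json())          # should show the proxy's IP

If the two addresses differ, the session really is going through the proxy.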

Related

Cloud web scraping with requests returns nothing

I'm trying to create a cog for my Discord bot that scrapes Indeed and returns info on job postings (position, company, location, etc.). My bot is hosted on Heroku, which is where the issues start. I've tested my web scraper both on its own and implemented locally as a cog for my Discord bot, and it works in both cases. However, when I tried to deploy it on Heroku, the cog stopped working.
I read that this is because cloud-hosting services have blacklists or similar for web-scraping apps and functions. So I tried to use rq as suggested in this post:
https://devcenter.heroku.com/articles/python-rq
I did all the steps, added an additional worker, a worker.py file, and installed the Redis To Go addon. However, when I try to use the following, I receive nothing back:
url = get_url(job_title, location)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
# soup.find() returns None
I'm sure I just implemented something wrong, but can someone help me please? The full code is below:
import discord
from discord.ext import commands
import random
import requests
import time
from bs4 import BeautifulSoup
from rq import Queue
from worker import conn

ret = []

def get_url(position, location):
    '''Generate url from position and location'''
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    position = position.replace(" ", "+")
    location = location.replace(" ", "+")
    url = template.format(position, location)
    return url

def get_jobs(job_title, location):
    '''Max returned number of jobs is 15 per page.'''
    global ret
    url = get_url(job_title, location)
    response = requests.get(url)
    print(f"Responses: {response}")
    ### This returns <Response [200]>
    soup = BeautifulSoup(response.text, "html.parser")
    job_names = []
    for job_name in soup.find_all("h2", class_="jobTitle"):
        job_names.append(job_name.get_text())
    ### Each one just returns an empty list []
    companies = []
    for company in soup.find_all("span", class_="companyName"):
        companies.append(company.get_text())
    locations = []
    for location in soup.find_all("div", class_="companyLocation"):
        locations.append(location.get_text())
    salaries = []
    for salary in soup.find_all("div", class_="attribute_snippet"):
        if salary.get_text().startswith("$"):
            salaries.append(salary.get_text())
        else:
            salaries.append("Unknown")
    links = []
    for link in soup.find_all("a", class_=lambda value: value and value.startswith("tapItem fs-unmask result"), href=True):
        link = link["href"]
        link = "https://indeed.com" + link
        links.append(link)
    ret = [job_names, companies, locations, salaries, links]
    print(ret)
    ### This returns [[], [], [], [], []]

class JobScraper(commands.Cog):
    def __init__(self, client):  # References whatever is passed through the client from discord
        self.client = client
        self.q = Queue(connection=conn)

    @commands.command(aliases=["job", "find_job", "find_jobs", "get_job", "get_jobs"])
    async def jobs(self, ctx, *, query):
        '''Scrapes Indeed.com for jobs and returns them.
        The input format should be "eve jobs [job title], [job location], [num returned]"
        e.g. eve jobs ai researcher, san francisco, 3'''
        key_terms = query.split(",")
        key_terms = [term.strip() for term in key_terms]
        if len(key_terms) == 3:
            num_jobs = int(key_terms[2])
        else:
            num_jobs = 15
        # ret = get_jobs(key_terms[0], key_terms[1])
        job = self.q.enqueue(get_jobs, key_terms[0], key_terms[1])
        await ctx.send("Here is what I found:")
        for i in range(num_jobs):
            await ctx.send("```" +
                           f"\nTitle: {ret[0][i]}" +
                           f"\nCompany: {ret[1][i]}" +
                           f"\nLocation: {ret[2][i]}" +
                           f"\nSalary: {ret[3][i]}" +
                           f"\nLink: {ret[4][i]}" +
                           "\n```")

def setup(client):
    client.add_cog(JobScraper(client))
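One thing worth noting about the snippet above: self.q.enqueue(get_jobs, ...) runs get_jobs in the separate RQ worker process, so the global ret in the bot process never changes. A sketch of waiting for the worker's return value instead (this assumes get_jobs is changed to end with "return ret", uses the standard RQ job attributes, and picks an arbitrary 0.5 s polling interval):

import asyncio  # needed for the non-blocking sleep below

job = self.q.enqueue(get_jobs, key_terms[0], key_terms[1])
while not job.is_finished:
    if job.is_failed:
        await ctx.send("The scrape failed on the worker.")
        return
    await asyncio.sleep(0.5)   # poll without blocking the Discord event loop
results = job.result           # whatever get_jobs returned in the worker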

Translating performance run from autocannon to locust

I need to translate an autocannon performance test into Locust Python code and reach the same requests-per-second criterion (> 3000).
This is the autocannon command:
AUTOCANNON="taskset -c 8-15 /opt/autocannon-tests/node_modules/.bin/autocannon --amount 100000 --connections 30 --bailout 5 --json"
$AUTOCANNON $URL/applications -m PUT -H "Content-Type:application/json" -H "Authorization=$AUTHORIZATION_HEADER" -b '{"name":"test"}'
With autocannon I managed to reach more than 3000 requests per second.
I wrote the following Python code:
import re
import time

from locust import SequentialTaskSet, task
from locust.contrib.fasthttp import FastHttpUser

class _PerformanceTask(SequentialTaskSet):
    def __init__(self, *args, **kwargs):
        SequentialTaskSet.__init__(self, *args, **kwargs)
        self.username = 'admin'
        self.password = 'admin'
        self.token = None
        self.identifier = time.time()
        self.error = None
        self.as3_user_id = None
        self.non_admin_user_token = None
        self.as3_user_token = None
        self.system_id = None
        self.open_api_retrieve_count = 0
        self.declare_id = None
        self.network_id = None
        self.irule_app = None
        self.irule_network_id = None
        self.application_editor_user = None

    def on_start(self):
        self.login()

    def _log(self, fmt, *args):
        print('[%s] %s' % (self.identifier, fmt % args))

    def _request(self, method, path, non_admin_user_token=False, headers=None, **kwargs):
        self._log('[%s]%s', method, path)
        self._log('%s', repr(kwargs))
        if not headers:
            headers = {'Content-Type': 'application/json'}
        if self.token:
            headers['Authorization'] = 'Bearer %s' % self.token
        if non_admin_user_token:
            headers['Authorization'] = 'Bearer %s' % self.non_admin_user_token
        resp = self.client.request(method, path, headers=headers, **kwargs)
        self._log('resp status code: %s', resp.status_code)
        self._log('resp content: %s', resp.text)
        assert resp.status_code in (200, 201, 204, 202)
        if re.search(r'^[\[\{]', resp.text):
            return resp.json()
        return resp.text

    def login(self):
        self._log('login')
        resp = self._request(
            method='GET',
            path='/login',
            auth=(self.username, self.password),
        )
        self.token = resp['token']
        self._log('token is: %s', self.token)

    @task
    def run_performance(self):
        self._log('PUT request to $URL/applications with auth. header.')
        resp = self._request(
            method='PUT',
            path='/applications',
            json={
                "name": "test",
            }
        )
        self._log('response is: %s', resp)

class PerformanceTask(FastHttpUser):
    tasks = [_PerformanceTask]
Note: I am using FastHttpUser and have locust-plugins installed.
But I can't reach the same result.
These are the ways I run this performance.py script:
locust --locustfile performance.py --host https://localhost:5443/api/v1 --headless -u 30 -i 100000
and also distributed:
locust --locustfile performance.py --host https://localhost:5443/api/v1 --headless -u 30 -i 10000 --master --expect-workers=8
and start workers like
locust --locustfile performance.py --worker --master-host=127.0.0.1 -i 10000 &
Either way, I get a table of results and the speed is much lower no matter how I run it:
req/s failures/s
224.49 0.00
I hope you have ideas
I'm not familiar with autocannon, so I'm not entirely sure, but a quick look through the documentation suggests that --connections doesn't translate directly to Locust's --users/-u. It says it is "the number of concurrent connections to use." To get something similar to that, I believe you'd have to set up a FastHttpSession and specify the concurrency there. Something like:
fast_http_session = FastHttpSession(environment=env, base_url="https://localhost:5443/api/v1", user=None, concurrency=30)
You'll need to get the environment from Locust when it runs to pass it into there, and may or may not want to specify your actual user (which you can pass as self if you put this in your user class).
But that should give you the number of concurrent connections to use, and then you'd want to crank up the number of users you spawn. As the users make their calls through the session you created, they will reuse the 30 open connections; it will just be up to you to discover how many users you need to spawn to "saturate" the connections the way autocannon claims to, and/or how many the machine you run it on can handle.
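For illustration, wiring that into the existing user class might look roughly like this (a sketch only: the attribute name pooled_client is made up, and self.environment / self.host are the values Locust injects into every user at runtime):

from locust.contrib.fasthttp import FastHttpSession, FastHttpUser

class PerformanceTask(FastHttpUser):
    tasks = [_PerformanceTask]

    def on_start(self):
        # One shared session per user, capped at 30 concurrent connections,
        # mirroring autocannon's --connections 30.
        self.pooled_client = FastHttpSession(
            environment=self.environment,
            base_url=self.host,
            user=self,
            concurrency=30,
        )

Requests issued through that session (e.g. self.user.pooled_client.request(...) from the task set) would then share those 30 connections, and -u can be raised until they are saturated.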

Apache Ignite REST API

I am using Apache Ignite 2.8.0.
I have developed a small dashboard that is used to monitor the performance of Ignite.
Now my problem is finding the number of server nodes.
First I find the total number of nodes (in the node variable), then:
total_servers = 0
port = 8080
for j in range(0, node + 1):
    if persistence == True:
        url_cache = "http://localhost:" + str(port) + "/ignite?cmd=top&sessionToken=" + sessionToken
    else:
        url_cache = "http://localhost:" + str(port) + "/ignite?cmd=top"
    try:
        print(j)
        try:
            res = requests.get(url=url_cache)
            print(res.status_code)
            if res.status_code == 200:
                total_servers = total_servers + 1
        except:
            pass
    except:
        pass
    port = port + 1
But this takes a lot of time, which I don't want.
Is there any simple way of finding the number of servers running in Apache Ignite using a REST API HTTP request?
From REST you can run the SQL command SELECT * FROM SYS.NODES; to determine that:
~/Downloads/apache-ignite-2.8.1-bin% wget -q -O- http://localhost:8080/ignite\?cmd=qryfldexe\&pageSize\=10\&cacheName\=default\&qry=select\ \*\ from\ sys.nodes | jq .response.items
[
[
"3304155a-bc83-402f-a884-59d39f074d3a",
"0:0:0:0:0:0:0:1%lo,127.0.0.1,172.17.0.1,192.168.1.7:47500",
"2.8.1#20200521-sha1:86422096",
false,
false,
1,
"[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.17.0.1, 192.168.1.7]",
"[192.168.1.7, 172.17.0.1]",
true
]
]
(This assumes you have a cache named default for API purposes.)
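Since the dashboard already uses requests, the same REST call can be made from Python, roughly like this (a sketch; it reuses the cacheName=default assumption above and the response layout shown by the jq output):

import requests

params = {
    "cmd": "qryfldexe",
    "pageSize": 10,
    "cacheName": "default",
    "qry": "select * from sys.nodes",
}
resp = requests.get("http://localhost:8080/ignite", params=params).json()
nodes = resp["response"]["items"]
print("nodes in topology:", len(nodes))
# If your build exposes an IS_CLIENT column in SYS.NODES, adding
# "where is_client = false" to the query would count only server nodes.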
Finally I have found one answer; if it's wrong, please correct me:
http://localhost:8080/ignite?cmd=node&id=a427-a04631d64c98&attr=true
In the response, response["attributes"]["org.apache.ignite.cache.client"] is false, which means the node is a server rather than a client.
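In code, that check might look roughly like this (again a sketch: the node id is the placeholder from the URL above, and the attribute name and false value are the ones reported there):

import requests

r = requests.get("http://localhost:8080/ignite",
                 params={"cmd": "node", "id": "a427-a04631d64c98", "attr": "true"})
attrs = r.json()["response"]["attributes"]
# A node whose client attribute is false is a server node.
if attrs.get("org.apache.ignite.cache.client") in (False, "false"):
    print("server node")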

How do I avoid the loop argument

The following code is part of some automated tests that I have written in python 3.6:
connected = False

def aiohttp_server(loop):
    async def handler(msg, session):
        global connected
        if msg.type == sockjs.MSG_OPEN:
            connected = True
        if msg.type == sockjs.MSG_CLOSE:
            connected = False

    app = web.Application(loop=loop)
    sockjs.add_endpoint(app, handler)
    runner = web.AppRunner(app)
    return runner

def run_server(runner, loop):
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    asyncio.set_event_loop(loop)
    loop.run_until_complete(runner.setup())
    site = web.TCPSite(runner, 'localhost', 8080)
    loop.run_until_complete(site.start())
    loop.run_forever()

def start_server():
    loop = asyncio.new_event_loop()
    t = threading.Thread(target=run_server, args=(aiohttp_server(loop), loop), daemon=True)
    t.start()
    time.sleep(0.01)
Basically, calling start_server should start a simple web server with a sockjs endpoint named /sockjs.
I am not yet a master of Python's async keyword. There are two issues that I suspect are related:
Firstly, I am getting a deprecation warning on the app = web.Application(loop=loop) statement:
/home/peter/incubator/sockjs_client/tests/test_sockjs_client.py:25: DeprecationWarning: loop argument is deprecated
app = web.Application(loop=loop)
/home/peter/.local/lib/python3.6/site-packages/sockjs/route.py:54: DeprecationWarning: loop property is deprecated
manager = SessionManager(name, app, handler, app.loop)
And secondly, the tests fail occasionally. I believe that, depending on machine load, sometimes the server hasn't had enough time to start before the test code actually starts executing.
Basically, what I need is for the start_server function to initialise a web application with a websocket endpoint, and not return until the application is prepared to accept websocket connections.
Firstly, I am getting a deprecation warning on the app = web.Application(loop=loop) statement:
The recommended way to avoid passing around the loop everywhere is to switch to asyncio.run. Instead of managing the loop manually, let asyncio.run create (and close) the loop for you. If all your work is done in coroutines, you can access the loop with get_event_loop() or get_running_loop().
Basically, what I need is for the start_server function to initialise a web application with a websocket endpoint, and not return until the application is prepared to accept websocket connections.
You can pass a threading.Event to the thread that gets set when the site is set up, and wait for it in the main thread.
Here is an (untested) example that implements both suggestions:
connected = False

def aiohttp_server():
    async def handler(msg, session):
        global connected
        if msg.type == sockjs.MSG_OPEN:
            connected = True
        if msg.type == sockjs.MSG_CLOSE:
            connected = False

    app = web.Application()
    sockjs.add_endpoint(app, handler)
    return web.AppRunner(app)

async def run_server(ready):
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    runner = aiohttp_server()
    await runner.setup()
    site = web.TCPSite(runner, 'localhost', 8080)
    await site.start()
    ready.set()
    # emulates loop.run_forever()
    await asyncio.get_running_loop().create_future()

def start_server():
    ready = threading.Event()
    threading.Thread(target=asyncio.run, args=(run_server(ready),),
                     daemon=True).start()
    ready.wait()
Please upgrade sockjs to the newest version.
It doesn't require passing the loop anymore.

ZeroMQ Subscribers not receiving message from Publisher over an inproc: transport class

I am fairly new to pyzmq. I am trying to understand the inproc: transport class and have created this sample to play with.
It looks like the Publisher instance is publishing messages, but the Subscriber instances are not receiving any.
If I move the Subscriber instances into a separate process and change inproc: to the tcp: transport class, the example works.
Here is the code:
import threading
import time
import zmq

context = zmq.Context.instance()
address = 'inproc://test'

class Publisher(threading.Thread):
    def __init__(self):
        self.socket = context.socket(zmq.PUB)
        self.socket.bind(address)

    def run(self):
        while True:
            message = 'snapshot,current_time_%s' % str(time.time())
            print 'sending message %s' % message
            self.socket.send(message)
            time.sleep(1)

class Subscriber(object):
    def __init__(self, sub_name):
        self.name = sub_name
        self.socket = context.socket(zmq.SUB)
        self.socket.connect(address)

    def listen(self):
        while True:
            try:
                msg = self.socket.recv()
                a, b = msg.split(' ', 1)
                print 'Received message -> %s-%s-%s' % (self.name, a, b)
            except zmq.ZMQError as e:
                logger.exception(e)

if __name__ == '__main__':
    thread_a = []
    for i in range(0, 1):
        subs = Subscriber('subscriber_%s' % str(i))
        th = threading.Thread(target=subs.listen)
        thread_a.append(th)
        th.start()

    pub = Publisher()
    pub_th = threading.Thread(target=pub.run)
    pub_th.start()
There is nothing wrong, but:
ZeroMQ is a wonderful toolbox. It is full of smart, bright and self-adapting services under the hood that literally save our poor lives in many ways. Still, it is worth reading and obeying a few rules from the documentation.
The inproc transport class has one such rule: .bind() first, before any .connect()-s.
[ Page 38, Code Connected, Volume I ] ... inproc is an inter-thread signalling transport ... it is faster than tcp or ipc. This transport has a specific limitation compared to tcp and ipc: the server must issue a bind before any client issues a connect. This is something future versions of ØMQ may fix, but at present this defines how you use inproc sockets.
So, as an example:
if __name__ == '__main__':
    pub = Publisher()
    pub_th = threading.Thread(target=pub.run)
    pub_th.start()

    # give it a place to start before .connect()-s may take place
    # give it a time to start before .connect()-s may take place
    time.sleep(0.5)

    thread_a = []
    for i in range(0, 1):
        subs = Subscriber('subscriber_%s' % str(i))
        th = threading.Thread(target=subs.listen)
        thread_a.append(th)
        th.start()
