Here is pseudo code that I would like to parallelize, but don't know where to start
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client['myDB']
collection = db.myCollection
test_list = ['foo', 'bar']
result_list = list()
for el in test_list:
result_list.append(collection.distinct('attrib',{'version': el}))
I know how to create parallel loop with joblib, but I am not sure how to query MongoDB in parallel, should I create multiple clients or collections? Will the above code work if I simply re-write it with joblib without caring about MongoDB?
You can run requests in separate threads:
from multiprocessing.dummy import Pool as ThreadPool
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client['myDB']
collection = db.myCollection
thread_pool_size = 4
pool = ThreadPool(thread_pool_size)
def my_function(el):
return collection.distinct('attrib', {'version': el}))
test_list = ['foo', 'bar']
result_list = pool.map(my_function, test_list)
Related
Pretty new to asynch so here is my question and thank you in advance.
Hi All very simple question I might be thinking too much into.
I am trying to access this cassandra client outside of these defined listeners below that get registered to a sanic main app.
I need the session in order to use an update query which will execute Asynchronously. I can definetly connect and event query from the 'setup_cassandra_session_listener' method below. But having tough time figuring how to call this Cassandra session outside and isolate so i can access else where.
from aiocassandra import aiosession
from cassandra.cluster import Cluster
from sanic import Sanic
from config import CLUSTER_HOST, TABLE_NAME, CASSANDRA_KEY_SPACE, CASSANDRA_PORT, DATA_CENTER, DEBUG_LEVEL, LOGGER_FORMAT
log = logging.getLogger('sanic')
log.setLevel('INFO')
cassandra_cluster = None
def setup_cassandra_session_listener(app, loop):
global cassandra_cluster
cassandra_cluster = Cluster([CLUSTER_HOST], CASSANDRA_PORT, DATA_CENTER)
session = cassandra_cluster.connect(CASSANDRA_KEY_SPACE)
metadata = cassandra_cluster.metadata
app.session = cassandra_cluster.connect(CASSANDRA_KEY_SPACE)
log.info('Connected to cluster: ' + metadata.cluster_name)
aiosession(session)
app.cassandra = session
def teardown_cassandra_session_listener(app, loop):
global cassandra_cluster
cassandra_cluster.shutdown()
def register_cassandra(app: Sanic):
app.listener('before_server_start')(setup_cassandra_session_listener)
app.listener('after_server_stop')(teardown_cassandra_session_listener)
Here is a working example that should do what you need. It does not actually run Cassandra (since I have no experience doing that). But, in principle this should work with any database connection you need to manage across the lifespan of your running server.
from sanic import Sanic
from sanic.response import text
app = Sanic()
class DummyCluser:
def connect(self):
print("Connecting")
return "session"
def shutdown(self):
print("Shutting down")
def setup_cassandra_session_listener(app, loop):
# No global variables needed
app.cluster = DummyCluser()
app.session = app.cluster.connect()
def teardown_cassandra_session_listener(app, loop):
app.cluster.shutdown()
def register_cassandra(app: Sanic):
# Changed these listeners to be more friendly if running with and ASGI server
app.listener('after_server_start')(setup_cassandra_session_listener)
app.listener('before_server_stop')(teardown_cassandra_session_listener)
#app.get("/")
async def get(request):
return text(app.session)
if __name__ == "__main__":
register_cassandra(app)
app.run(debug=True)
The idea is that you attach to your app instance (as you did) and then are able to simply access that inside your routes with request.app.
I am trying to pull a large dataset from pyodbc. My code below works ok, but it is serial, hence slow. I want to make it able to initiate multiple IO calls asynchronously. I see many examples using asyncio - but cannot find anything i can use with fetchmany. I appreciate any suggestions! I attempted to pool using asyncio but couldn't make it work.
conn = pyodbc.connect('DSN=Denodo Interfaces')
cursor = conn.cursor()
strng = strng.replace('myWellName', well_name)
cursor.execute(strng)
cols = [column[0] for column in cursor.description]
mylist=[]
while True:
rows = cursor.fetchmany(10000)
if not rows:
break
df = pd.DataFrame([tuple(t) for t in rows], columns = cols)
mylist.append(df)
df = pd.concat(mylist, axis=0).reset_index(drop=True)
I am using Spark Streaming for creating a system to enrich incoming data from a cloudant database. Example -
Incoming Message: {"id" : 123}
Outgoing Message: {"id" : 123, "data": "xxxxxxxxxxxxxxxxxxx"}
My code for the driver class is as follows:
from Sample.Job import EnrichmentJob
from Sample.Job import FunctionJob
import pyspark
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from kafka import KafkaConsumer, KafkaProducer
import json
class SampleFramework():
def __init__(self):
pass
#staticmethod
def messageHandler(m):
return json.loads(m.message)
#staticmethod
def processData(rdd):
if (rdd.isEmpty()):
print("RDD is Empty")
return
# Expand
expanded_rdd = rdd.mapPartitions(EnrichmentJob.enrich)
# Score
scored_rdd = expanded_rdd.map(FunctionJob.function)
# Publish RDD
def run(self, ssc):
self.ssc = ssc
directKafkaStream = KafkaUtils.createDirectStream(self.ssc, QUEUENAME, \
{"metadata.broker.list": META,
"bootstrap.servers": SERVER}, \
messageHandler= SampleFramework.messageHandler)
directKafkaStream.foreachRDD(SampleFramework.processData)
ssc.start()
ssc.awaitTermination()
Code for the the Enrichment Job is as follows:
class EnrichmentJob:
cache = {}
#staticmethod
def enrich(data):
# Assume that Cloudant Connector using the available config
cloudantConnector = CloudantConnector(config, config["cloudant"]["host"]["req_db_name"])
final_data = []
for row in data:
id = row["id"]
if(id not in EnrichmentJob.cache.keys()):
data = cloudantConnector.getOne({"id": id})
row["data"] = data
EnrichmentJob.cache[id]=data
else:
data = EnrichmentJob.cache[id]
row["data"] = data
final_data.append(row)
cloudantConnector.close()
return final_data
My question is - Is there someway to maintain [1]"a global cache on the main memory that is accessible to all workers" or [2]"local caches on each of the workers such that they remain persisted in the foreachRDD setting"?
I have already explored the following -
Broadcast Variables - Here we go the [1] way. As I understand, they are meant to be read-only and immutable. I have checked out this reference but it cites an example of unpersisting/persisting the broadcasted variable. Is this a good practice?
Static Variables - Here we go the [2] way. The class that is being referred to ("Enricher" in this case) maintains a cache in the form of a static variable dictionary. But it turns out that the ForEachRDD function spawns a completely new process for each incoming RDD and this removes the previously initiated static variable. This is the one coded above.
I have two possible solutions right now -
Maintain an offline cache on the file system.
Do the entire computation of this enrichment task on my driver node. This would cause the entire data to end up on driver and be maintained there. The cache object will be sent to the enrichment job as an argument to the mapping function.
Here obviously the first one looks better than the second, but I wish to conclude that these two are the only ways around, before committing to them. Any pointers would be appreciated!
Is there someway to maintain [1]"a global cache on the main memory that is accessible to all workers"
No. There is no "main memory" which can be accessed by all workers. Each worker runs in a separate process and communicates with external world with sockets. Not to mention separation between different physical nodes in non-local mode.
There are some techniques that can be applied to achieve worker scoped cache with memory mapped data (using SQLite being the simplest one) but it takes some additional effort to implement the right way (avoid conflicts and such).
or [2]"local caches on each of the workers such that they remain persisted in the foreachRDD setting"?
You can use standard caching techniques with scope limited to the individual worker processes. Depending on the configuration (static vs. dynamic resource allocation, spark.python.worker.reuse) it may or may not be preserved between multiple tasks and batches.
Consider following, simplified, example:
map_param.py:
from pyspark import AccumulatorParam
from collections import Counter
class CounterParam(AccumulatorParam):
def zero(self, v: Counter) -> Counter:
return Counter()
def addInPlace(self, acc1: Counter, acc2: Counter) -> Counter:
acc1.update(acc2)
return acc1
my_utils.py:
from pyspark import Accumulator
from typing import Hashable
from collections import Counter
# Dummy cache. In production I would use functools.lru_cache
# but it is a bit more painful to show with accumulator
cached = {}
def f_cached(x: Hashable, counter: Accumulator) -> Hashable:
if cached.get(x) is None:
cached[x] = True
counter.add(Counter([x]))
return x
def f_uncached(x: Hashable, counter: Accumulator) -> Hashable:
counter.add(Counter([x]))
return x
main.py:
from pyspark.streaming import StreamingContext
from pyspark import SparkContext
from counter_param import CounterParam
import my_utils
from collections import Counter
def main():
sc = SparkContext("local[1]")
ssc = StreamingContext(sc, 5)
cnt_cached = sc.accumulator(Counter(), CounterParam())
cnt_uncached = sc.accumulator(Counter(), CounterParam())
stream = ssc.queueStream([
# Use single partition to show cache in work
sc.parallelize(data, 1) for data in
[[1, 2, 3], [1, 2, 5], [1, 3, 5]]
])
stream.foreachRDD(lambda rdd: rdd.foreach(
lambda x: my_utils.f_cached(x, cnt_cached)))
stream.foreachRDD(lambda rdd: rdd.foreach(
lambda x: my_utils.f_uncached(x, cnt_uncached)))
ssc.start()
ssc.awaitTerminationOrTimeout(15)
ssc.stop(stopGraceFully=True)
print("Counter cached {0}".format(cnt_cached.value))
print("Counter uncached {0}".format(cnt_uncached.value))
if __name__ == "__main__":
main()
Example run:
bin/spark-submit main.py
Counter cached Counter({1: 1, 2: 1, 3: 1, 5: 1})
Counter uncached Counter({1: 3, 2: 2, 3: 2, 5: 2})
As you can see we get expected results:
For "cached" objects accumulator is updated only once per unique key per worker process (partition).
For not-cached objects accumulator is update each time key occurs.
requests = [conn.request_spot_instances(price=0.0034, image_id='ami-6989a659', count=1,type='one-time', instance_type='m1.micro')]
I used the following code. But it is not working.
Use the following code to create instance from python command line.
import boto.ec2
conn = boto.ec2.connect_to_region(
"us-west-2",
aws_access_key_id="<aws access key>",
aws_secret_access_key="<aws secret key>",
)
conn = boto.ec2.connect_to_region("us-west-2")
conn.run_instances(
"<ami-image-id>",
key_name="myKey",
instance_type="t2.micro",
security_groups=["your-security-group-here"],
)
To create an EC2 instance using Python on AWS, you need to have "aws_access_key_id_value" and "aws_secret_access_key_value".
You can store such variables in config.properties and write your code in create-ec2-instance.py file
Create a config.properties and save the following code in it.
aws_access_key_id_value='YOUR-ACCESS-KEY-OF-THE-AWS-ACCOUNT'
aws_secret_access_key_value='YOUR-SECRETE-KEY-OF-THE-AWS-ACCOUNT'
region_name_value='region'
ImageId_value = 'ami-id'
MinCount_value = 1
MaxCount_value = 1
InstanceType_value = 't2.micro'
KeyName_value = 'name-of-ssh-key'
Create create-ec2-instance.py and save the following code in it.
import boto3
def getVarFromFile(filename):
import imp
f = open(filename)
global data
data = imp.load_source('data', '', f)
f.close()
getVarFromFile('config.properties')
ec2 = boto3.resource(
'ec2',
aws_access_key_id=data.aws_access_key_id_value,
aws_secret_access_key=data.aws_secret_access_key_value,
region_name=data.region_name_value
)
instance = ec2.create_instances(
ImageId = data.ImageId_value,
MinCount = data.MinCount_value,
MaxCount = data.MaxCount_value,
InstanceType = data.InstanceType_value,
KeyName = data.KeyName_value)
print (instance[0].id)
Use the following command to execute the python code.
python create-ec2-instance.py
The following code fetches parameter from request and respond from couchbase db as per the value of the parameter.
couchbase = Couchbase("ubuntumartini03:8091", "thebucket", "")
bucket = couchbase["thebucket"]
class MH(tornado.web.RequestHandler):
def get(self):
key = self.get_argument("pub_id", strip=True)
result = json.loads(bucket.get(key)[2])
self.write(result['metaTag'])
if __name__=="__main__":
app = tornado.web.Application(handlers=[(r"/", MH)])
app.listen(8888,"")
tornado.ioloop.IOLoop.instance().start()
Problem: For the given hardware, we can make 10k/sec calls to Couchbase from Tornado machine. But when we are making a call from client to Tornado machine, we are only able to make 350 calls/sec.
Surely the bottleneck here is Tornado. How to optimize it to be able to make atleast 7k calls/sec?
Edit your code like this :
from tornado.ioloop import IOLoop
couchbase = Couchbase("ubuntumartini03:8091", "thebucket", "")
bucket = couchbase["thebucket"]
class MH(tornado.web.RequestHandler):
async def get(self):
key = self.get_argument("pub_id", strip=True)
result = await IOLoop.current().run_in_executor(None,bucket.get,*(key))
self.write(result[2]['metaTag'])
if __name__=="__main__":
app = tornado.web.Application(handlers=[(r"/", MH)])
app.listen(8888,"")
tornado.ioloop.IOLoop.instance().start()
What client do you use, is it a synchronous or an ansynchronous client? If this is a synchronous client, it can't not make full use of the tornado ioloop reactor.