How to pass a flowfile attribute into a Python class in NiFi? - apache-nifi

Goal: Add the filename attribute value as a field in a CSV using ExecuteScript in Python, by reading the flowfile attribute.
Problem: How do I pass the flowfile into the callback so I can read the attribute and include it in the output stream write?
The sample code below fails to read the filename attribute.
class PyStreamCallback(StreamCallback):
    def __init__(self, flowFile):
        self.ff = flowFile
        pass
    def process(self, inputStream, outputStream):
        text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
        list_index = 0
        textArr = []
        newText = ''
        for t in text.splitlines():
            list_index += 1
            t = t + '|' + str(list_index) + '|"' + t + '"|' + self.ff.getAttribute('filename')
            textArr.append(t)
        newText = '\n'.join(textArr)
        outputStream.write(bytearray(newText.encode('utf-8')))

flowFile = session.get()
if (flowFile != None):
    flowFile = session.write(flowFile, PyStreamCallback())
    session.transfer(flowFile, REL_SUCCESS)

Declare a global variable to hold the filename attribute value. Sample code snippet:
import sys
import os
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback
from org.apache.nifi.processors.script import ExecuteScript
from org.python.core.util.FileUtil import wrap
from io import StringIO

global file_name

# Define a subclass of StreamCallback for use in session.write()
class PyStreamCallback(StreamCallback):
    def __init__(self):
        pass
    def process(self, inputStream, outputStream):
        with wrap(inputStream) as f:
            lines = f.readlines()
        # Append the filename attribute value to every line
        lines = [line.strip() + '|' + file_name + '\n' for line in lines]
        with wrap(outputStream, 'w') as filehandle:
            filehandle.writelines("%s" % line for line in lines)
# end class

flowFile = session.get()
if (flowFile != None):
    try:
        file_name = flowFile.getAttribute('filename')
        flowFile = session.write(flowFile, PyStreamCallback())
        session.transfer(flowFile, ExecuteScript.REL_SUCCESS)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        excp = str(exc_type) + str(fname) + str(exc_tb.tb_lineno)
        attrMap = {'exception': str(excp)}
        flowFile = session.putAllAttributes(flowFile, attrMap)
        session.transfer(flowFile, ExecuteScript.REL_FAILURE)
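For completeness, the approach the question itself attempts — passing the flowfile into the callback's constructor instead of using a global — also works. A minimal sketch under the same ExecuteScript/Jython assumptions as above (helper names like out_lines are illustrative only):

from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback

class PyStreamCallback(StreamCallback):
    def __init__(self, flowFile):
        # Keep a reference to the flowfile so process() can read its attributes
        self.ff = flowFile
    def process(self, inputStream, outputStream):
        text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
        out_lines = []
        for index, line in enumerate(text.splitlines(), start=1):
            out_lines.append(line + '|' + str(index) + '|' + self.ff.getAttribute('filename'))
        outputStream.write(bytearray('\n'.join(out_lines).encode('utf-8')))

flowFile = session.get()
if flowFile is not None:
    # Pass the flowfile to the constructor so the callback can call getAttribute()
    flowFile = session.write(flowFile, PyStreamCallback(flowFile))
    session.transfer(flowFile, REL_SUCCESS)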

Related

Why does Google AutoML Sample Python Code Not Run?

The sample Google AutoML prediction Python code raises an error on execution. The recommended invocation is "python predict.py YOUR_LOCAL_IMAGE_FILE YOUR_PROJECT_ID YOUR_MODEL_ID". The error is:
File "predict.py", line 25
print get_prediction(content, project_id, model_id)
^
SyntaxError: invalid syntax
(Thanks in advance)
Google sample code
import sys
from google.cloud import automl_v1beta1
from google.cloud.automl_v1beta1.proto import service_pb2

def get_prediction(content, project_id, model_id):
    prediction_client = automl_v1beta1.PredictionServiceClient()
    name = 'projects/{}/locations/us-central1/models/{}'.format(project_id, model_id)
    payload = {'image': {'image_bytes': content }}
    params = {}
    request = prediction_client.predict(name, payload, params)
    return request  # waits till request is returned

if __name__ == '__main__':
    file_path = sys.argv[1]
    project_id = sys.argv[2]
    model_id = sys.argv[3]
    with open(file_path, 'rb') as ff:
        content = ff.read()
    print get_prediction(content, project_id, model_id)
The last line of the code, the print, is the problem: it should not be indented inside the with block, and since print is a function in Python 3 it needs parentheses. Change
print get_prediction(content, project_id, model_id)
to
print(get_prediction(content, project_id, model_id))
The full corrected code:
import sys
from google.cloud import automl_v1beta1
from google.cloud.automl_v1beta1.proto import service_pb2

# 'content' is base-64-encoded image data.
def get_prediction(content, project_id, model_id):
    prediction_client = automl_v1beta1.PredictionServiceClient()
    name = 'projects/{}/locations/us-central1/models/{}'.format(project_id, model_id)
    payload = {'image': {'image_bytes': content }}
    params = {}
    request = prediction_client.predict(name, payload, params)
    return request  # waits till request is returned

if __name__ == '__main__':
    file_path = sys.argv[1]
    project_id = sys.argv[2]
    model_id = sys.argv[3]
    with open(file_path, 'rb') as ff:
        content = ff.read()
    print(get_prediction(content, project_id, model_id))

Unable to fetch value from JSR223 Sampler

I have a JSR223 Sampler in JMeter with the following code:
import com.jayway.jsonpath.JsonPath
import org.apache.commons.lang3.RandomUtils
import org.apache.jmeter.samplers.SampleResult

def options = JsonPath.read(prev.getResponseDataAsString(), '$.options')
if (options.size() == "1" || options.size() == "2") {
    def randomOption = options.get(0)
    def code = randomOption.get("code")
    vars.put('code1', code)
    def values = randomOption.get('values')
    def randomValue = values.get(RandomUtils.nextInt(0, values.size()))
    def value = randomValue.get('value')
    vars.put('valueF', value)
    def options2 = JsonPath.read(prev.getResponseDataAsString(), '$.options')
    def randomOption2 = options2.get(1)
    def code2 = randomOption2.get("code")
    vars.put('code2', code2)
    def values2 = randomOption2.get('values')
    def randomValue2 = values2.get(RandomUtils.nextInt(0, values.size()))
    def value2 = randomValue2.get('value')
    vars.put('valueF2', value2)
}
else {
    vars.put('no loop', 'Not enterd into loop')
}
vars.put('counts', new groovy.json.JsonSlurper().parse(prev.getResponseData()).options.size() as String)
def size = com.jayway.jsonpath.JsonPath.read(prev.getResponseDataAsString(), '$.options_available')
if (size == []) {
    vars.put('size', 'NonConfigurable')
}
else {
    vars.put('size', 'Configurable')
}
I am unable to get the values of code1 and valueF, code2 and valueF2 outside of the Sampler. Any help is appreciated!
Try amending this line:
def value = randomValue.get('value')
to
def value = randomValue.get('value') as String
Similarly for the code:
def code = randomOption.get("code") as String
Get used to looking into the jmeter.log file when you face an issue with JMeter; in the majority of cases it contains enough troubleshooting information.
If you need further assistance on the topic, update the question with the full response you're trying to parse. In the meantime, see:
Groovy: Parsing and Producing JSON
JayWay JSonPath
Groovy is the New Black
To get the size of a JSONArray you should use length(); change your code to:
if(options.length() == 1 || options.length() == 2) {

Python using multiple decorators on a method and logging the method name inside each decorator

Assume I have two decorator functions in a file log.py
import time
import logging

def timeit(logger, level = 'DEBUG'):
    def timeit_decorator(method):
        def timeit_wrapper(*args, **kw):
            ts = time.time()
            result = method(*args, **kw)
            te = time.time()
            logger.log(logging.getLevelName(level), '%2.4f sec' % (te - ts),
                       extra = dict(filename = method.__code__.co_filename, funcName = method.__code__.co_name))
            return result
        return timeit_wrapper
    return timeit_decorator
And I have a file test.py with one function that uses both decorators, like this:
@timeit(logger = LOGGER)
@logargs(logger = LOGGER)
def test(arg1 = 'something'):
    pass
When I run test.py, one of the decorators prints module, func & lineno as [test.py:7 - test()],
and the other one prints [log.py:6 - timeit_wrapper()].
How do I make both decorators print the actual method, module & lineno, which is [test.py:7 - test()]?
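One direction that may help, sketched under the assumption that each decorator applies functools.wraps to its wrapper: the outer decorator can then unwrap through __wrapped__ and report the original function's code object instead of the inner wrapper's. The extra field names (wrapped_filename, wrapped_funcName, wrapped_lineno) are hypothetical and would need a matching log format string.

import functools
import inspect
import logging
import time

def timeit(logger, level = 'DEBUG'):
    def timeit_decorator(method):
        @functools.wraps(method)              # preserve the wrapped function's metadata
        def timeit_wrapper(*args, **kw):
            ts = time.time()
            result = method(*args, **kw)
            te = time.time()
            # Unwrap through any inner decorators (they must also use functools.wraps)
            # so the log points at the original function, not a wrapper.
            original = inspect.unwrap(method)
            logger.log(logging.getLevelName(level), '%2.4f sec' % (te - ts),
                       extra = dict(wrapped_filename = original.__code__.co_filename,
                                    wrapped_funcName = original.__code__.co_name,
                                    wrapped_lineno = original.__code__.co_firstlineno))
            return result
        return timeit_wrapper
    return timeit_decorator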

Filter tweets in tweepy.StreamListener on_data method

I understand from many articles on Stack Overflow that the filter method in the tweepy.streaming.Stream class uses a logical OR for the track and locations arguments,
so the code below will return tweets that either come from location=USA or contain one of the tracked keywords.
streamObj = tweepy.streaming.Stream(oauthObject,
                                    EchoStreamListener(api=apiInstance,
                                                       dump_json=args.json,
                                                       numtweets=args.numtweets))
keyWordList = ['panthers', 'falcon']
GEOBOX_USA = [-125, 25.1, -60.5, 49.1]
streamObj.filter(locations=GEOBOX_USA, track=keyWordList, languages=['en'])
This solution (How to add a location filter to tweepy module) that checks keywords in the on_status method works great, but if I need to store the entire JSON payload I think I have to use on_data instead.
So I changed on_data (as shown in the code below), but I get an error:
File "/Library/Python/2.7/site-packages/tweepy/streaming.py", line 294, in _run
raise exception
KeyError: 'text'
# -*- coding: utf-8 -*-
from types import *
import sys
import tweepy
import json
import argparse
import io

class EchoStreamListener(tweepy.StreamListener):
    def __init__(self, api, dump_json=False, numtweets=0):
        self.api = api
        self.dump_json = dump_json
        self.count = 0
        self.limit = int(numtweets)
        super(tweepy.StreamListener, self).__init__()

    # def on_status(self, status):
    #     if any(keyWord in status.text.lower() for keyWord in keyWordList):
    #         print status.text
    #
    #         self.count += 1
    #         return False if self.count == self.limit else True
    #     else:
    #         return True  # Don't kill the stream

    def on_data(self, tweet):
        tweet_data = json.loads(tweet)  # This allows the JSON data to be used as a normal dictionary
        if any(keyWord in tweet_data['text'] for keyWord in keyWordList):
            if self.dump_json:
                print json.dumps(tweet_data)
                saveFile.write(unicode(tweet) + "\n")
            self.count += 1
            return False if self.count == self.limit else True
        else:
            print tweet_data['created_at','name','text'].encode("utf-8").rstrip()

    def on_error(self, status_code):
        print >> sys.stderr, 'Encountered error with status code:', status_code
        return True

def get_parser():
    parser = argparse.ArgumentParser(add_help=True)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '-j', '--json',
        action='store_true',
        help='dump each tweet as a json string'
    )
    group.add_argument(
        '-t', '--text',
        dest='json',
        action='store_false',
        help='dump each tweet\'s text'
    )
    parser.add_argument(
        '-n', '--numtweets',
        metavar='numtweets',
        help='set number of tweets to retrieve'
    )
    return parser

if __name__ == '__main__':
    oauthObject = tweepy.OAuthHandler(myconsumer_key, myconsumer_secret)
    oauthObject.set_access_token(myaccess_key, myaccess_secret)
    apiInstance = tweepy.API(oauthObject)

    parser = get_parser()
    args = parser.parse_args()

    streamObj = tweepy.streaming.Stream(oauthObject,
                                        EchoStreamListener(api=apiInstance,
                                                           dump_json=args.json,
                                                           numtweets=args.numtweets))
    keyWordList = ['panthers', 'falcon']
    GEOBOX_USA = [-125, 25.1, -60.5, 49.1]

    saveFile = io.open('/Users/deepaktanna/raw_tweets.json', 'w', encoding='utf-8')
    streamObj.filter(locations=GEOBOX_USA, languages=['en'])
    saveFile.close()
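For reference, a common cause of this KeyError is that the streaming API also delivers messages with no 'text' field (limit notices, delete notices), so a guard in on_data avoids the crash. A minimal sketch, assuming the same old-style tweepy.StreamListener API and keyword list as the question (GuardedListener is an illustrative name):

import json
import tweepy

keyWordList = ['panthers', 'falcon']

class GuardedListener(tweepy.StreamListener):
    def on_data(self, raw):
        data = json.loads(raw)
        # Limit/delete notices carry no 'text' key; skip them instead of raising KeyError.
        if 'text' not in data:
            return True  # keep the stream alive
        if any(keyWord in data['text'].lower() for keyWord in keyWordList):
            print(json.dumps(data))  # or write the raw JSON to a file here
        return True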

Threading blocking GUI

I have a two-way panel parsing into a ListCtrl, and my GUI is still blocked by this thread:
import csv
import re
import threading
from StringIO import StringIO  # Python 2; use io.StringIO on Python 3

import wx
from wx.lib.pubsub import Publisher  # legacy pubsub API

class MyThread(threading.Thread):
    def __init__(self, DirQ, new_dbfQ, RemoveQ):
        threading.Thread.__init__(self)
        self.DirQ = DirQ
        self.new_dbfQ = new_dbfQ
        self.RemoveQ = RemoveQ

    def run(self):
        """ worker """
        self.OpenDir = self.DirQ.get()
        self.new_dbf = self.new_dbfQ.get()
        self.RegRemove = self.RemoveQ.get()
        with open(str(self.OpenDir), 'r') as infile:
            reader = csv.reader(infile)
            data = StringIO()
            writer = csv.writer(data)
            for line in csv.reader(self.new_dbf.splitlines()):
                row = line
                row_edit = re.sub(self.RegRemove, '', row[1])
                writer.writerow([row[0], row_edit])
            msg = data.getvalue()
        wx.CallAfter(Publisher().sendMessage, "update", msg)
I have a button that toggles it:
def checkBtnClick3(self, event):
    self.DirQ.put(self.OpenDir.GetValue())
    self.new_dbfQ.put(self.new_dbf)
    self.RemoveQ.put(self.RegRemove.GetValue())
    t = MyThread(self.DirQ, self.new_dbfQ, self.RemoveQ)
    t.setDaemon(True)
    t.start()
Do I need to add some kind of idle function on my frame class to free up the GUI?
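For context, a minimal sketch of the receiving side that the wx.CallAfter/Publisher pattern above implies: the frame subscribes to the "update" topic and updates its widgets on the GUI thread. MyFrame and output_ctrl are hypothetical names; the legacy wx.lib.pubsub Publisher API from the question is assumed.

import wx
from wx.lib.pubsub import Publisher  # legacy pubsub API, matching the question

class MyFrame(wx.Frame):  # hypothetical frame holding the panels
    def __init__(self, *args, **kwargs):
        wx.Frame.__init__(self, *args, **kwargs)
        self.output_ctrl = wx.TextCtrl(self, style=wx.TE_MULTILINE)
        # Listen for the "update" messages posted by the worker thread.
        Publisher().subscribe(self.on_update, "update")

    def on_update(self, msg):
        # Runs on the GUI thread because the worker used wx.CallAfter.
        self.output_ctrl.SetValue(msg.data)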
