How to replace carriage return (CR) and not line feed (LF) in NiFi - apache-nifi

I have a text file to ingest, but some of the text fields of this file come with a CR character in them. The official line break is LF, so I figured I could replace all the CRs and normalize the files.
But I've been trying to do this for a week now with no good results.
What I've tried so far is the ReplaceText processor with several configurations: line-by-line, entire text; I already tried "\r", "\\r", and "[\r]", but nothing seemed to work.
Do you have any suggestions or experience to share?

You can achieve the same with an ExecuteScript processor, where you can implement your own cleaning logic with fewer limitations. Refer to the Python snippet below to get started:
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback
from org.apache.nifi.processors.script import ExecuteScript
from org.python.core.util.FileUtil import wrap
from io import StringIO
import re

# Define a subclass of StreamCallback for use in session.write()
class PyStreamCallback(StreamCallback):
    def __init__(self):
        pass

    def process(self, inputStream, outputStream):
        with wrap(inputStream) as f:
            lines = f.readlines()
        outer_new_value_list = []
        is_header_row = True
        for row in lines:
            # Pass the header row through untouched
            if is_header_row:
                is_header_row = False
                outer_new_value_list.append(row)
                continue
            char_list = list(row.strip())
            for position, char in enumerate(char_list):
                # put your custom cleaning logic here
                if char == '\\' or char == '^' or char == '"' or char == '~':
                    replace_char = '\\' + char
                    char_list[position] = replace_char
            new_data_line = ''.join([str(elem) for elem in char_list])
            outer_new_value_list.append(new_data_line + '\r\n')
        with wrap(outputStream, 'w') as filehandle:
            filehandle.writelines("%s" % line for line in outer_new_value_list)
# end class

flowFile = session.get()
if flowFile is not None:
    flowFile = session.write(flowFile, PyStreamCallback())
    session.transfer(flowFile, ExecuteScript.REL_SUCCESS)
# implicit return at the end
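For the original CR problem specifically, the cleaning step can be much simpler: read the whole content as text and normalize every line ending to LF. Here is a minimal sketch of an alternative process() body under that assumption (IOUtils and StandardCharsets are already imported above):

def process(self, inputStream, outputStream):
    # Read the entire flowfile content as a single string
    text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
    # Replace CRLF pairs first, then any remaining lone CR, with LF
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    outputStream.write(bytearray(text.encode('utf-8')))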

After a week of attempts, I was able to replace the carriage returns in my file with a simple ReplaceText processor. Here is a screenshot of the configuration for you to try if needed.
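Since the screenshot is not reproduced here, a ReplaceText configuration along these lines is the usual way to do it (the values below are an assumption, not a copy of the original settings):

Search Value: \r
Replacement Value: (empty, or \n if each CR should become a line break)
Replacement Strategy: Regex Replace
Evaluation Mode: Entire text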

Related

How can we print the output of a code in a GUI, not in the console?

I have written a piece of code of around 100 lines which prints some output of around 20 lines. How can I print this output in a GUI?
I just wrote this implementation for a project of mine; it's in Python 2.7, but it should be easy to adapt to Python 3.6.
#!/usr/lib/python2.7/
# -*- coding: utf-8 -*-
from Tkinter import *
import ttk, collections

class GUI():
    def __init__(self):
        self.window = Tk()

    def draw(self):
        self.root = Frame(self.window, padx=15, pady=15, width=800, height=200)
        self.root.grid(column=0, row=0)
        self.drawConsole()

    def drawConsole(self):
        self.consoleFrame = Frame(self.root, padx=15)
        self.consoleFrame.grid(column=0, row=4, sticky="EW", pady=10)
        self.logTest = Text(self.consoleFrame, height=15, state='disabled', wrap='word',
                            background='black', foreground='yellow')
        self.logTest.grid(column=0, row=0, sticky="EW")
        self.scrollbar = Scrollbar(self.consoleFrame, orient=VERTICAL, command=self.logTest.yview)
        self.scrollbar.grid(column=1, row=0, sticky=(N, S))
        self.logTest['yscrollcommand'] = self.scrollbar.set

    def writeToLog(self, msg):
        # index() returns a string like '24.0', so convert before comparing
        numlines = int(self.logTest.index('end - 1 line').split('.')[0])
        self.logTest['state'] = 'normal'
        if numlines == 24:
            self.logTest.delete(1.0, 2.0)
        if self.logTest.index('end-1c') != '1.0':
            self.logTest.insert('end', '\n')
        self.logTest.insert('end', msg)
        self.logTest.see(END)
        self.logTest['state'] = 'disabled'

if __name__ == "__main__":
    gui = GUI()
    gui.draw()
    gui.writeToLog("Hello World")
    # Start the event loop last so the message above is visible right away
    gui.window.mainloop()
I am a Python 3.x guy, but when it comes to tkinter you can set labels with variables instead of using print(). So to get output onto the GUI, you want to set labels with variables. That would look something like this:
from tkinter import *

window = Tk()
variable = StringVar()
data_to_console = "Hello World"  # any string you would otherwise print
variable.set(data_to_console)  # You can use a variable with a string here or any string
label = Label(window, textvariable=variable)
label.grid(row=0, column=0)  # row/column were placeholders (x, y) in the original
window.mainloop()
So you take the strings that would be printed to the console and use .set() to put them into a StringVar that tkinter can use. The labels then display in the GUI the data that would otherwise be printed. Hope this helps!
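To keep appending output the way a console does, the same StringVar can be updated from anywhere in the program, e.g. a button callback. Here is a small sketch (log_to_gui is a hypothetical helper, not part of tkinter):

def log_to_gui(msg):
    # Append to what is already displayed so earlier output stays visible
    variable.set(variable.get() + "\n" + msg)

log_to_gui("processing finished")  # example message, assumed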

Unknown SyntaxError cause when declaring a list in PyQt

I got this weird syntax error at the end of my newly declared list, which contains only strings.
I changed the double quotation marks (") into single quotes ('), as commented two lines above the data variable.
I also checked for tabs and spaces.
I'm running this in Spyder.
from PyQt4 import QtGui, QtCore, uic
import sys

if __name__ == '__main__':
    app = QtGui.QApplication(sys.argv)
    app.setStyle("cleanlooks")

    # DATA
    data = ["one","two","three","four","five"]

    listView = QtGui.QListView()
    listView.show()

    model = QtGui.QStringListModel(data)
    listView.setModel(model)

    combobox = QtGui.QComboBox()
    combobox.setModel(model)
    combobox.show()

    listView2 = QtGui.QListView()
    listView2.show()
    listView2.setModel(model)

    sys.exit(app.exec_())
runfile('C:/Users/abdull/.spyder/temp.py', wdir='C:/Users/abdull/.spyder')
  File "C:/Users/abdull/.spyder/temp.py", line 10
    data = ["one","two","three","four","five"]
                                              ^
SyntaxError: invalid syntax

How to use ExecuteScript (with python as a script engine) for an exercise to add numbers? [Novice user trying to learn NiFi]

I am relatively new to NiFi and am not sure how to do the following correctly. I would like to use ExecuteScript processor (script engine: python) to do the following (only in python please):
1) There is a CSV file containing the following information (the first row is the header):
first,second,third
1,4,9
7,5,2
3,8,7
2) I would like to find the sum of individual rows and generate a final file with a modified header. The final file should look like this:
first,second,third,total
1,4,9,14
7,5,2,14
3,8,7,18
For the python script, I wrote:
def summation(first,second,third):
    numbers = first + second + third
    return numbers

flowFile = session.get()
if (flowFile != None):
    flowFile = session.write(flowFile, summation())
But it does not work and I am not sure how to fix it. Can anyone help me understand how to approach this problem?
The NiFi flow:
Thank you
Your script is not doing what you would like it to do. There are a couple of approaches to this problem:
Operate on the whole flowfile at once with a script that iterates over the rows in the CSV content
Treat the rows in the CSV content as a "record" and operate on each record with a script that handles a single line
I will provide changes to your script to handle the entire flowfile content at once; you can read more about the Record* processors here, here, and here.
Here is a script which performs the action you expect. Note the differences to see where I changed things (this script could certainly be made more efficient and concise; it is verbose to demonstrate what is happening, and I am not a Python expert).
import json
from java.io import BufferedReader, InputStreamReader
from org.apache.nifi.processor.io import StreamCallback

# This PyStreamCallback class is what the processor will use to ingest and output the flowfile content
class PyStreamCallback(StreamCallback):
    def __init__(self):
        pass

    def process(self, inputStream, outputStream):
        try:
            # Get the provided inputStream into a format where you can read lines
            reader = BufferedReader(InputStreamReader(inputStream))
            # Set a marker for the first line to be the header
            isHeader = True
            try:
                # A holding variable for the lines
                lines = []
                # Loop indefinitely
                while True:
                    # Get the next line
                    line = reader.readLine()
                    # If there is no more content, break out of the loop
                    if line is None:
                        break
                    # If this is the first line, add the new column
                    if isHeader:
                        header = line + ",total"
                        # Write the header line and the new column
                        lines.append(header)
                        # Set the header flag to false now that it has been processed
                        isHeader = False
                    else:
                        # Split the line (a string) into individual elements by the ',' delimiter
                        elements = self.extract_elements(line)
                        # Get the sum (this method is unnecessary but shows where your "summation" method would go)
                        sum = self.summation(elements)
                        # Write the output of this line
                        newLine = ",".join([line, str(sum)])
                        lines.append(newLine)
                # Now out of the loop, write the output to the outputStream
                output = "\n".join([str(l) for l in lines])
                outputStream.write(bytearray(output.encode('utf-8')))
            finally:
                if reader is not None:
                    reader.close()
        except Exception as e:
            log.warn("Exception in Reader")
            log.warn('-' * 60)
            log.warn(str(e))
            log.warn('-' * 60)
            # Re-raise so the flowfile can be routed to failure below
            raise e
    def extract_elements(self, line):
        # This splits the line on the ',' delimiter, converts each element to an integer, and puts them in a list
        return [int(x) for x in line.split(',')]

    # This method replaces your "summation" method and can accept any number of inputs, not just 3
    def summation(self, list):
        # This returns the sum of all items in the list
        return sum(list)
flowFile = session.get()
if flowFile is not None:
    try:
        flowFile = session.write(flowFile, PyStreamCallback())
        session.transfer(flowFile, REL_SUCCESS)
    except Exception:
        # The callback re-raised on error, so route the flowfile to failure here
        session.transfer(flowFile, REL_FAILURE)
Result from my flow (using your input in a GenerateFlowFile processor):
2018-07-20 13:54:06,772 INFO [Timer-Driven Process Thread-5] o.a.n.processors.standard.LogAttribute LogAttribute[id=b87f0c01-0164-1000-920e-799647cb9b48] logging for flow file StandardFlowFileRecord[uuid=de888571-2947-4ae1-b646-09e61c85538b,claim=StandardContentClaim [resourceClaim=StandardResourceClaim[id=1532106928567-1, container=default, section=1], offset=2499, length=51],offset=0,name=470063203212609,size=51]
--------------------------------------------------
Standard FlowFile Attributes
Key: 'entryDate'
Value: 'Fri Jul 20 13:54:06 EDT 2018'
Key: 'lineageStartDate'
Value: 'Fri Jul 20 13:54:06 EDT 2018'
Key: 'fileSize'
Value: '51'
FlowFile Attribute Map Content
Key: 'filename'
Value: '470063203212609'
Key: 'path'
Value: './'
Key: 'uuid'
Value: 'de888571-2947-4ae1-b646-09e61c85538b'
--------------------------------------------------
first,second,third,total
1,4,9,14
7,5,2,14
3,8,7,18

Exporting serial messages to a text file using pyserial

I'm new to Python and I've written a simple script that reads lines from a serial port and then writes those lines to a text file. No errors occurred, but the serial messages did not appear in the text file.
the code:
import serial

ser = serial.Serial('COM32', baudrate=115200, parity=serial.PARITY_NONE, stopbits=serial.STOPBITS_ONE,
                    bytesize=serial.EIGHTBITS, xonxoff=1)
text = open("temptext1.txt", "a+")

while 1:
    read_line = ser.readline()
    print read_line
    text.write(read_line)
Thanks for the help; I seriously don't have a clue how to debug this.
Try the code below.
import serial
import io

def getSerialLogs(comport, fileName='SerialLog.txt', baudrate=115200):
    ser = serial.Serial(comport, baudrate, xonxoff=True, timeout=1)
    sio = io.TextIOWrapper(io.BufferedRWPair(ser, ser, 1), encoding="utf-8")
    with open(fileName, 'a') as f:
        while ser.isOpen():
            datastring = sio.readline()
            f.write(datastring)
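A minimal usage sketch (the port name and file name are taken from the question and may differ on your machine):

getSerialLogs('COM32', fileName='temptext1.txt')

Note that writing through a with open(...) block also guarantees the file is flushed and closed, which is the most likely reason the original code produced an empty file.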

Script working in Python 2 but not in Python 3 (hashlib)

I worked today on a simple script to checksum files with all the algorithms available in hashlib (md5, sha1, ...). I wrote it and debugged it with Python 2, but when I decided to port it to Python 3 it just won't work. The funny thing is that it works for small files, but not for big files. I thought there was a problem with the way I was buffering the file, but the error message makes me think it is something related to the way I am doing the hexdigest (I think). Here is a copy of my entire script, so feel free to copy it, use it, and help me figure out what the problem is. The error I get when checksumming a 250 MB file is
"'utf-8' codec can't decode byte 0xf3 in position 10: invalid continuation byte"
I googled it, but can't find anything that fixes it. Also, if you see better ways to optimize it, please let me know. My main goal is to make it work 100% in Python 3. Thanks
#!/usr/local/bin/python33
import hashlib
import argparse

def hashFile(algorithm="md5", filepaths=[], blockSize=4096):
    algorithmType = getattr(hashlib, algorithm.lower())()  # Default: hashlib.md5()
    # Open file and extract data in chunks
    for path in filepaths:
        try:
            with open(path) as f:
                while True:
                    dataChunk = f.read(blockSize)
                    if not dataChunk:
                        break
                    algorithmType.update(dataChunk.encode())
                yield algorithmType.hexdigest()
        except Exception as e:
            print(e)

def main():
    # DEFINE ARGUMENTS
    parser = argparse.ArgumentParser()
    parser.add_argument('filepaths', nargs="+", help='Specifies the path of the file(s) to hash')
    parser.add_argument('-a', '--algorithm', action='store', dest='algorithm', default="md5",
                        help='Specifies what algorithm to use ("md5", "sha1", "sha224", "sha384", "sha512")')
    arguments = parser.parse_args()
    algo = arguments.algorithm
    if algo.lower() in ("md5", "sha1", "sha224", "sha384", "sha512"):
        for hashValue in hashFile(algo, arguments.filepaths):
            print(hashValue)
    else:
        print("Algorithm {0} is not available in this script".format(algo))

if __name__ == "__main__":
    main()
Here is the code that works in Python 2; I will just put it here in case you want to use it without having to modify the one above.
#!/usr/bin/python
import hashlib
import argparse

def hashFile(algorithm="md5", filepaths=[], blockSize=4096):
    '''
    Hashes a file. In order to reduce the amount of memory used by the script, it hashes the file in chunks
    instead of putting the whole file in memory
    '''
    algorithmType = hashlib.new(algorithm)  # getattr(hashlib, algorithm.lower())() #Default: hashlib.md5()
    # Open file and extract data in chunks
    for path in filepaths:
        try:
            with open(path, mode='rb') as f:
                while True:
                    dataChunk = f.read(blockSize)
                    if not dataChunk:
                        break
                    algorithmType.update(dataChunk)
                yield algorithmType.hexdigest()
        except Exception as e:
            print e

def main():
    # DEFINE ARGUMENTS
    parser = argparse.ArgumentParser()
    parser.add_argument('filepaths', nargs="+", help='Specifies the path of the file(s) to hash')
    parser.add_argument('-a', '--algorithm', action='store', dest='algorithm', default="md5",
                        help='Specifies what algorithm to use ("md5", "sha1", "sha224", "sha384", "sha512")')
    arguments = parser.parse_args()
    # Call generator function to yield hash value
    algo = arguments.algorithm
    if algo.lower() in ("md5", "sha1", "sha224", "sha384", "sha512"):
        for hashValue in hashFile(algo, arguments.filepaths):
            print hashValue
    else:
        print "Algorithm {0} is not available in this script".format(algo)

if __name__ == "__main__":
    main()
I haven't tried it in Python 3, but I get the same error in Python 2.7.5 for binary files (the only difference is that mine is with the ascii codec). Instead of encoding the data chunks, open the file directly in binary mode:
with open(path, 'rb') as f:
    while True:
        dataChunk = f.read(blockSize)
        if not dataChunk:
            break
        algorithmType.update(dataChunk)
    yield algorithmType.hexdigest()
Apart from that, I'd use the method hashlib.new instead of getattr, and hashlib.algorithms_available to check if the argument is valid.
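For instance, here is a sketch of that approach (the path and algorithm name are placeholders):

import hashlib

algo = "sha256"
if algo.lower() in hashlib.algorithms_available:
    h = hashlib.new(algo)  # create the hash object by name
    with open("somefile.bin", "rb") as f:  # placeholder path; binary mode avoids the codec error
        for chunk in iter(lambda: f.read(4096), b""):
            h.update(chunk)
    print(h.hexdigest())
else:
    print("Algorithm {0} is not available".format(algo))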
