How to execute a Python script in NiFi and put the resulting output into a file? - apache-nifi

I have a Python script that connects to SAP HANA, runs a query, and I want to load the results into a .csv file with Apache NiFi.
My Python script:
import codecs
import csv

import pyhdb

# SAP HANA returns text in CESU-8; register the codec that ships with pyhdb
codecs.register(lambda s: (
    pyhdb.cesu8.CESU8_CODEC_INFO
    if s in {'cesu-8', 'cesu_8'}
    else None
))

connection = pyhdb.connect(
    host="192.168.xx.xx",
    port=30215,
    user="user",
    password="pass"
)
cursor = connection.cursor()
# cursor.execute("SELECT TOP 100 * FROM \"SAPABAP1\".\"BKPF\";")
cursor.execute("""
    select distinct
        cepc.PRCTR  as suc_sucursal_id,
        cepct.KTEXT as suc_sucursal_desc,
        csks.BUKRS  as soc_sociedad_id
    from "SAPABAP1"."CEPC" cepc
    left join "SAPABAP1"."CEPCT" cepct on cepc.PRCTR = cepct.PRCTR
    left join "SAPABAP1"."CSKS"  csks  on csks.KOSTL = cepc.PRCTR
    where csks.BUKRS in ('1000','1002','2000','3000','3001','4000','5000','5001','7000','8000')
      and cepct.KTEXT not like '%TEST%';
""")
x = cursor.fetchall()

print("Sucursales:")
print(x)
print(type(x))

# Write the result rows into the CSV file, one line per row
f = open("sucursales3.csv", "a", newline="")
writer = csv.writer(f)
writer.writerows(x)
f.close()
But I don't know how to set the output in this script and configure NiFi to put it into a file.
I have this flow, but it doesn't work:
'Execute Stream Command' configuration:
I don't know why ExecuteStreamCommand is not executing the Python script.
Could somebody help me with this flow?
I am very new to NiFi!
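For what it's worth, ExecuteStreamCommand takes whatever the command writes to stdout and makes it the content of the outgoing FlowFile, so one common pattern is to have the script print only the CSV rows to stdout (no extra debug prints) and let a downstream PutFile processor write the file. A minimal sketch of such a script, reusing the pyhdb connection and query above (paths and credentials are placeholders):

# hana_to_stdout.py - emit the query result as CSV on stdout for ExecuteStreamCommand
import codecs
import csv
import sys

import pyhdb

codecs.register(lambda s: pyhdb.cesu8.CESU8_CODEC_INFO if s in {'cesu-8', 'cesu_8'} else None)

connection = pyhdb.connect(host="192.168.xx.xx", port=30215, user="user", password="pass")
cursor = connection.cursor()
cursor.execute("select ...")   # same query as above

# Only CSV goes to stdout; NiFi turns this stream into the FlowFile content
writer = csv.writer(sys.stdout)
writer.writerows(cursor.fetchall())

In the processor configuration, Command Path would typically point at the Python interpreter and Command Arguments at the script path, with the output stream relationship routed to PutFile.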

Related

Liquidsoap request.dynamic

I would like to create a dynamic playlist. I am using an external bash script to generate multiple paths to audio files, and I am using request.dynamic for that, but it seems like it only reads the first line of my bash script's output. Can anyone please help me? Any help or suggestion would be appreciated. Thanks.
Here is my Liquidsoap script:
set ("log.file.path","/home/admin/radio.log")
def my_request_function() =
result =
list.hd(default="", get_process_lines("sh testScript.sh"))
request.create(result, persistent=true)
end
m = request.dynamic(my_request_function)
m = audio_to_stereo(m)
radio = m
clock.assign_new(id="/stream",[output.icecast(%vorbis(samplerate=44100, channels=2, quality=0.3),format="audio/ogg", fallible=true, host = "ip address", port = 8080 , password="password", mount = "/test1",radio)])
Here is my bash script:
#!/bin/bash
now="$(date +'%Y-%m-%d')"
cd Playlist
cd Musique
cat $now*
and here is its output:
./Audio/147/n.mp3
./Audio/150/test.mp3
./Audio/308/eee.mp3

NiFi: how to write local file information on all the nodes?

NiFi version: 1.5
Environment: 3 node HDF
NiFi flow:
ListFTP(primary) -> FetchFTP -> PutFile -> ExecuteScript
I get the file from the source location and save it in the destination location. After that, I use Python code (ExecuteScript) to track the files that were written successfully in the local file system (/var/log/nifi/status/status.csv).
Below is my Python code:
from org.apache.nifi.processors.script import ExecuteScript
import datetime

flowFile = session.get()
if flowFile is not None:
    fn = flowFile.getAttribute('filename')
    timestamp = datetime.datetime.now()
    # Append one "timestamp,filename,SUCCESS" line to the local status file
    fd = open('/var/log/nifi/status/status.csv', 'a')
    res = "\n" + str(timestamp) + "," + fn + ",SUCCESS"
    fd.write(res)
    fd.close()
    session.transfer(flowFile, ExecuteScript.REL_SUCCESS)
Execution = All nodes
The file information is written to the local path (status.csv), but only on one instance.
I want the file (status.csv) to be updated on all the nodes.
TIA.

Is it possible to read pdf/audio/video files (unstructured data) using Apache Spark?

Is it possible to read pdf/audio/video files (unstructured data) using Apache Spark?
For example, I have thousands of PDF invoices and I want to read data from those and perform some analytics on them. What steps must I take to process unstructured data?
Yes, it is. Use sparkContext.binaryFiles to load the files in binary format and then use map to convert each value into some other format, for example by parsing the binary content with Apache Tika or Apache POI.
Pseudocode:
val rawFile = sparkContext.binaryFiles(...)   // path to the input files
val ready = rawFile.map { /* parse here with another framework */ }
What is important: the parsing must be done with another framework, as mentioned previously in my answer. The map function receives the file content, which you can open as an InputStream.
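For reference, a minimal runnable PySpark sketch of the same pattern; the input path is a placeholder and the parse step is a stub where a real job would call Apache Tika, Apache POI, or similar:

from pyspark import SparkContext

sc = SparkContext(appName="binary-files-demo")

# binaryFiles yields one (path, raw bytes of the file) record per file
raw = sc.binaryFiles("hdfs:///data/invoices/*.pdf")   # placeholder input path

def parse(path_and_content):
    path, content = path_and_content
    # placeholder: hand `content` to Tika/POI here and return structured fields
    return (path, len(content))

print(raw.map(parse).take(5))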
We had a scenario where we needed to use a custom decryption algorithm on the input files. We didn't want to rewrite that code in Scala or Python. Python-Spark code follows:
from pyspark import SparkContext, SparkConf, AccumulatorParam
from pyspark.sql import HiveContext
import socket
import subprocess

def decryptUncompressAndParseFile(filePathAndContents):
    '''each line of the file becomes an RDD record'''
    global acc_errCount, acc_errLog
    proc = subprocess.Popen(['custom_decrypt_program', '--decrypt'],
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (unzippedData, err) = proc.communicate(input=filePathAndContents[1])
    if len(err) > 0:  # problem reading the file
        acc_errCount.add(1)
        acc_errLog.add('Error: ' + str(err) + ' in file: ' + filePathAndContents[0] +
                       ', on host: ' + socket.gethostname() + ' return code: ' + str(proc.returncode))
        return []  # this is okay with flatMap
    records = list()
    iterLines = iter(unzippedData.splitlines())
    for line in iterLines:
        # sys.stderr.write('Line: ' + str(line) + '\n')
        values = [x.strip() for x in line.split('|')]
        ...
        records.append( (... extract data as appropriate from values into this tuple ...) )
    return records

class StringAccumulator(AccumulatorParam):
    ''' custom accumulator to hold strings '''
    def zero(self, initValue=""):
        return initValue
    def addInPlace(self, str1, str2):
        return str1.strip() + '\n' + str2.strip()

def main():
    ...
    global acc_errCount, acc_errLog
    acc_errCount = sc.accumulator(0)
    acc_errLog = sc.accumulator('', StringAccumulator())
    binaryFileTup = sc.binaryFiles(args.inputDir)
    # use flatMap instead of map, to handle corrupt files
    linesRdd = binaryFileTup.flatMap(decryptUncompressAndParseFile, True)
    df = sqlContext.createDataFrame(linesRdd, ourSchema())
    df.registerTempTable("dataTable")
    ...
The custom string accumulator was very useful in identifying corrupt input files.

How to export data from Spark SQL to CSV

This command works with HiveQL:
insert overwrite directory '/data/home.csv' select * from testtable;
But with Spark SQL I'm getting an error with an org.apache.spark.sql.hive.HiveQl stack trace:
java.lang.RuntimeException: Unsupported language features in query:
insert overwrite directory '/data/home.csv' select * from testtable
Please guide me on writing an export-to-CSV feature in Spark SQL.
You can use the statement below to write the contents of a DataFrame in CSV format:
df.write.csv("/data/home/csv")
If you need to write the whole DataFrame into a single CSV file, then use
df.coalesce(1).write.csv("/data/home/sample.csv")
For Spark 1.x, you can use spark-csv to write the results into CSV files.
The Scala snippet below would help:
import org.apache.spark.sql.hive.HiveContext
// sc - existing spark context
val sqlContext = new HiveContext(sc)
val df = sqlContext.sql("SELECT * FROM testtable")
df.write.format("com.databricks.spark.csv").save("/data/home/csv")
To write the contents into a single file
import org.apache.spark.sql.hive.HiveContext
// sc - existing spark context
val sqlContext = new HiveContext(sc)
val df = sqlContext.sql("SELECT * FROM testtable")
df.coalesce(1).write.format("com.databricks.spark.csv").save("/data/home/sample.csv")
Since Spark 2.x, spark-csv is integrated as a native data source. Therefore, the necessary statement simplifies to (Windows):
df.write
.option("header", "true")
.csv("file:///C:/out.csv")
or (UNIX):
df.write
.option("header", "true")
.csv("/var/out.csv")
Notice: as the comments say, it creates a directory by that name with the partition files in it, not a single standard CSV file. This, however, is most likely what you want, since otherwise you would either crash your driver (out of RAM) or you would have to be working in a non-distributed environment.
The answer above with spark-csv is correct, but there is an issue: the library creates several files based on the DataFrame's partitioning, and that is usually not what we need. So you can combine all partitions into one:
df.coalesce(1).
write.
format("com.databricks.spark.csv").
option("header", "true").
save("myfile.csv")
and rename the library's output file (named "part-00000") to the desired filename.
This blog post provides more details: https://fullstackml.com/2015/12/21/how-to-export-data-frame-from-apache-spark/
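If you take the coalesce(1) route, the single part file can then be renamed from the driver once the job has finished. A minimal Python sketch, assuming the job wrote to a local directory named myfile.csv as above (the target filename is illustrative):

import glob
import shutil

# Spark writes the coalesced output as a single part-00000* file inside the
# "myfile.csv" directory; copy it out under the filename we actually want.
part_file = glob.glob("myfile.csv/part-00000*")[0]
shutil.copy(part_file, "myfile_single.csv")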
The simplest way is to map over the DataFrame's RDD and use mkString:
df.rdd.map(x=>x.mkString(","))
As of Spark 1.5 (or even before that)
df.map(r=>r.mkString(",")) would do the same
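The same idea works from PySpark; the mapped lines can then be written out with saveAsTextFile (the output path is illustrative, and note that this does no CSV quoting or escaping):

# Join each Row's fields with commas and write the lines as plain text
lines = df.rdd.map(lambda row: ",".join(str(c) for c in row))
lines.saveAsTextFile("/data/home/csv_out")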
If you want CSV escaping, you can use Apache Commons Lang for that. E.g., here's the code we're using:
import org.apache.commons.lang.StringEscapeUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}

def DfToTextFile(path: String,
                 df: DataFrame,
                 delimiter: String = ",",
                 csvEscape: Boolean = true,
                 partitions: Int = 1,
                 compress: Boolean = true,
                 header: Option[String] = None,
                 maxColumnLength: Option[Int] = None) = {

  def trimColumnLength(c: String) = {
    val col = maxColumnLength match {
      case None => c
      case Some(len: Int) => c.take(len)
    }
    if (csvEscape) StringEscapeUtils.escapeCsv(col) else col
  }

  def rowToString(r: Row) = {
    val st = r.mkString("~-~").replaceAll("[\\p{C}|\\uFFFD]", "") // remove control characters
    st.split("~-~").map(trimColumnLength).mkString(delimiter)
  }

  def addHeader(r: RDD[String]) = {
    val rdd = for (h <- header;
                   if partitions == 1; // headers only supported for single partitions
                   tmpRdd = sc.parallelize(Array(h))) yield tmpRdd.union(r).coalesce(1)
    rdd.getOrElse(r)
  }

  val rdd = df.map(rowToString).repartition(partitions)
  val headerRdd = addHeader(rdd)

  if (compress)
    headerRdd.saveAsTextFile(path, classOf[GzipCodec])
  else
    headerRdd.saveAsTextFile(path)
}
With the help of spark-csv we can write to a CSV file.
val dfsql = sqlContext.sql("select * from tablename")
dfsql.write.format("com.databricks.spark.csv").option("header", "true").save("output.csv")
The error message suggests this is not a supported feature in the query language. But you can save a DataFrame in any format as usual through the RDD interface (df.rdd.saveAsTextFile). Or you can check out https://github.com/databricks/spark-csv.
To load a CSV into a DataFrame:
val p = spark.read.format("csv").options(Map("header" -> "true", "delimiter" -> "^")).load("filename.csv")

Prevent Oracle SQL Developer from truncating CLOBs on export

I want to export a query result that contains large CLOBs to a CSV file. However, once exported, the CLOB fields in the CSV are truncated after around 4K characters (i.e. they prematurely end with "…"). How can I prevent Oracle SQL Developer from truncating CLOBs on export?
You could bypass Oracle SQL Developer for the export, e.g. you could use a Python script to take care of the export so that the CLOBs won't get truncated:
from __future__ import print_function
from __future__ import division
import time
import cx_Oracle

def get_cursor():
    '''
    Get a cursor to the database
    '''
    # http://stackoverflow.com/questions/24149138/cx-oracle-doesnt-connect-when-using-sid-instead-of-service-name-on-connection-s
    # http://www.oracle.com/technetwork/articles/dsl/prez-python-queries-101587.html
    ip = ''        # E.g. '127.0.0.1'
    port = ''      # E.g. '3306'
    sid = ''
    dsnStr = cx_Oracle.makedsn(ip, port, sid)
    username = ''  # E.g. 'FRANCK'
    password = ''  # E.g. '123456'
    db = cx_Oracle.connect(user=username, password=password, dsn=dsnStr)
    cursor = db.cursor()
    return cursor

def read_sql(filename):
    '''
    Read an SQL file and return it as a string
    '''
    file = open(filename, 'r')
    return ' '.join(file.readlines()).replace(';', '')

def execute_sql_file(filename, cursor, verbose=False, display_query=False):
    '''
    Execute an SQL file and return the results
    '''
    sql = read_sql(filename)
    if display_query: print(sql)
    start = time.time()
    if verbose: print('SQL query started... ', end='')
    cursor.execute(sql)
    if verbose:
        end = time.time()
        print('SQL query done. (took {0} seconds)'.format(end - start))
    return cursor

def main():
    '''
    This is the main function
    '''
    # Demo:
    cursor = get_cursor()
    sql_filename = 'your_query.sql'    # Write your query there
    cursor = execute_sql_file(sql_filename, cursor, True)
    result_filename = 'result.csv'     # Will export your query result there
    result_file = open(result_filename, 'w')
    delimiter = ','
    for row in cursor:
        for count, column in enumerate(row):
            if count > 0: result_file.write(delimiter)
            result_file.write(str(column))
        result_file.write('\n')
    result_file.close()

if __name__ == "__main__":
    main()
    # cProfile.run('main()')  # if you want to do some profiling
I'm using SQL Developer Version 4.1.3.20 and have the same issue. The only thing that worked for me was selecting XML as the export format. Doing this, I was able to export a ~135,000 character JSON string with no truncation.
The second problem, however, is that immediately after exporting, I attempted to import the data, and SQL Developer said it could not open the file due to error "null".
