PySpark Parse Kafka csv delimited data to columns using from_csv - windows

I am new to Structured Streaming with Kafka. I am trying to convert comma-delimited data from Kafka into a DataFrame in PySpark using a schema and from_csv.
kafkaDataSchema = StructType([
    StructField("sid", StringType()), StructField("timestamp", LongType()),
    StructField("sensor", StringType()), StructField("value", StringType()),
])
kafkaStream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", self.config.get('kafka-config', 'bootstrap-servers')) \
    .option("subscribe", self.config.get('kafka-config', 'topic-list-input')) \
    .option("startingOffsets", self.config.get('kafka-config', 'startingOffsets')) \
    .load() \
    .selectExpr("CAST(value AS STRING)")
formattedStream = kafkaStream.select(from_csv(kafkaStream.value, kafkaDataSchema))
I am getting the below error:
Traceback (most recent call last):
File "main.py", line 43, in <module>
formattedStream = KafkaSource.readData(spark)
File "src.zip/src/main/sources/KafkaSource.py", line 31, in readData
File "src.zip/src/main/sources/KafkaSource.py", line 36, in formatKafkaData
File "/spark-3.1.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/sql/functions.py", line 4082, in from_csv
TypeError: schema argument should be a column or string
How can I solve the issue?
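The traceback points at the schema argument: in this version of PySpark, from_csv appears to expect a DDL-formatted string or a Column rather than a StructType. A minimal sketch of one possible fix, assuming the same four fields as above (with BIGINT standing in for LongType):

from pyspark.sql.functions import from_csv, col

# Schema expressed as a DDL-formatted string instead of a StructType
kafkaDataSchemaDDL = "sid STRING, timestamp BIGINT, sensor STRING, value STRING"

formattedStream = kafkaStream.select(
    from_csv(col("value"), kafkaDataSchemaDDL).alias("data")
).select("data.*")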

Related

Parquet 2.4+ and PyArrow 10.0.1 - Attempting to switch pyarrow column from string to datetime

I attempted to follow the advice of Converting string timestamp to datetime using pyarrow, however my formatting does not seem to be accepted by pyarrow.
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
                  'query_start_time': ["2022-12-30T19:02:40.466",
                                       "2022-12-30T19:02:40.466",
                                       "2022-12-30T19:02:40.466",
                                       "2022-12-30T19:02:40.466",
                                       "2022-12-30T19:02:40.466",
                                       "2022-12-30T19:02:40.466"]})
pc.strptime(table.column("query_start_time"), format='%Y-%m-%dT%H:%M:%S.%f', unit='ms')
writer = pq.ParquetWriter('example.parquet', table.schema)
writer.write_table(table)
writer.close()
I've attempted removing the T, and adding a Z at the end of the formatter and the string.. it seems instead I need to ..?
Traceback (most recent call last):
File "/home/emcp/Dev/temp_pyarrow/main.py", line 16, in <module>
pc.strptime(table.column("query_start_time"), format='%Y-%m-%d %H:%M:%S.%f', unit='ms')
File "/home/emcp/Dev/temp_pyarrow/venv/lib/python3.10/site-packages/pyarrow/compute.py", line 255, in wrapper
return func.call(args, options, memory_pool)
File "pyarrow/_compute.pyx", line 355, in pyarrow._compute.Function.call
File "pyarrow/error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Failed to parse string: '2022-12-30T19:02:40.466' as a scalar of type timestamp[ms]
Do I need to manually convert the datetime value into an integer column and THEN change the column again?
EDIT: When I strip the .%f and change the unit to unit='s' it seems to not error. I am looking into that now.
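A minimal sketch of an alternative, assuming the goal is to write the parsed timestamps rather than the original strings: since the values are ISO 8601, the column can usually be cast to a timestamp type directly, and the result has to be put back into the table (the snippet above never uses the strptime result before writing):

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq

table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
                  'query_start_time': ["2022-12-30T19:02:40.466"] * 6})

# ISO 8601 strings can typically be cast straight to timestamp[ms],
# preserving the millisecond part that %f was meant to capture
parsed = pc.cast(table.column("query_start_time"), pa.timestamp('ms'))

# Replace the string column with the parsed one before writing
table = table.set_column(
    table.schema.get_field_index("query_start_time"),
    "query_start_time",
    parsed)

pq.write_table(table, 'example.parquet')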

Pyspark - error when save data into Hive table "unresolved operator 'InsertIntoTable HiveTableRelation'"

I use the following:
pyspark library, version 2.3.1
python, version 2.7.1
hadoop, version 2.7.3
hive, version 1.2.1000.2.6.5.30-1
spark version 2
My Hive table looks like the following:
CREATE TABLE IF NOT EXISTS my_database.my_table
(
    division STRING COMMENT 'Sample column'
)
I want to save data into Hive using PySpark. I use the following code:
spark_session = SparkSession.builder.getOrCreate()
hive_context = HiveContext(spark_session.sparkContext)
hive_table_schema = hive_context.table("my_database.my_table").schema
df_to_save = spark_session.createDataFrame([["a"],["b"],["c"]], schema=hive_table_schema)
df_to_save.write.mode("append").insertInto("my_database.my_table")
But the following error occurs:
Traceback (most recent call last):
File "/home/my_user/mantis service_quality_check__global/scripts/row_counts_preprocess.py", line 147, in <module> df_to_save.write.mode("append").insertInto(hive_table_row_counts_str)
File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 716, in insertInto
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/utils.py", line 69, in deco
pyspark.sql.utils.AnalysisException: u"unresolved operator 'InsertIntoTable HiveTableRelation `my_database`.`my_table`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [division#14], false, false;;\n'InsertIntoTable HiveTableRelation `my_database`.`my_table`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [division#14], false, false\n+- LogicalRDD [division#2], false\n"
Please, is there someone who can help with this? I have been stuck on this for a few days.
I found the issue. The SparkSession has to support Hive: the method enableHiveSupport() has to be called when the Spark session is created.
The creation of the Spark session then looks like the following:
spark_session = SparkSession.builder.enableHiveSupport().getOrCreate()
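A minimal end-to-end sketch of the corrected flow, reusing the table and DataFrame from the question:

from pyspark.sql import SparkSession

# Hive support has to be enabled when the session is built
spark_session = SparkSession.builder.enableHiveSupport().getOrCreate()

hive_table_schema = spark_session.table("my_database.my_table").schema
df_to_save = spark_session.createDataFrame([["a"], ["b"], ["c"]], schema=hive_table_schema)
df_to_save.write.mode("append").insertInto("my_database.my_table")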

Error while restoring Couchbase database using cbrestore command in mac

I am getting the below error while restoring the Couchbase database to my local Mac from a server.
Traceback (most recent call last):
File "/Applications/Couchbase Server.app/Contents/Resources/couchbase-core/lib/python/cbrestore", line
12, in <module>
pump_transfer.exit_handler(pump_transfer.Restore().main(sys.argv))
File "/Applications/Couchbase Server.app/Contents/Resources/couchbase-core/lib/python/pump_transfer.py", line 80, in main
rv = pumpStation.run()
File "/Applications/Couchbase Server.app/Contents/Resources/couchbase-core/lib/python/pump.py", line 136, in run
rv = self.transfer_bucket_msgs(source_bucket, source_map, sink_map)
File "/Applications/Couchbase Server.app/Contents/Resources/couchbase-core/lib/python/pump.py", line 233, in transfer_bucket_msgs
source_map)
File "/Applications/Couchbase Server.app/Contents/Resources/couchbase-core/lib/python/pump_bfd.py", line 546, in total_msgs
rv, db, ver = connect_db(x, opts, CBB_VERSION)
ValueError: need more than 2 values to unpack
Any help is appreciated.
Thanks,
Emraan

Spark structured streaming query exception

This is my streaming code
from pyspark import SparkConf
from pyspark.sql.types import StructType

session = get_session(SparkConf())
lookup = '/Users/vahagn/stream'
userSchema = StructType().add("auction_id", "string").add("dma", "string")
auctions = session.readStream.schema(userSchema).json("/Users/vahagn/stream/")
inputDF = auctions.groupBy("auction_id").count()
print inputDF.isStreaming
inputDF.printSchema()
inputDF.writeStream.outputMode("update").format("console").start().awaitTermination()
After reading the first file I'm getting an error which doesn't explain anything.
Any ideas?
Traceback (most recent call last):
File "/Users/vahagn/hydra/spark/structured_streaming.py", line 257, in <module>
inputDF.writeStream.outputMode("update").format("console").start().awaitTermination()
File "/Users/vahagn/Downloads/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/streaming.py", line 106, in awaitTermination
File "/Users/vahagn/Downloads/spark-2.3.0-bin-hadoop2.7/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
File "/Users/vahagn/Downloads/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/utils.py", line 75, in deco
pyspark.sql.utils.StreamingQueryException: u'null\n=== Streaming Query ===\nIdentifier: [id = 2f4b442a-38f9-41f1-a3d4-52e0a48427c0, runId = b843f25f-4132-4d52-ae64-f3be5e85a3d9]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {FileStreamSource[file:/Users/vahagn/stream]: {"logOffset":0}}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nAggregate [auction_id#0], [auction_id#0, count(1) AS count#7L]\n+- StreamingExecutionRelation FileStreamSource[file:/Users/vahagn/stream], [auction_id#0, dma#1]\n'
I've solved the problem by downgrading from Java 9 to Java 8.

pymysql, Lost connection to MySQL server during query

I get "Lost connection to MySQL server during query". How can I fix this? Preferably by fixing it in my program.
import pymysql

connection = pymysql.connect(host='***',
                             user='***',
                             password='***',
                             db='***',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor
                             )
with connection.cursor() as cursor:
    sql = "SELECT MAX(group_id) FROM topic_duplicate_check"
    cursor.execute(sql)  # Exception
    r = cursor.fetchone()
    max_gid = None
    try:
        max_gid = r['MAX(group_id)']
    except:
        pass
    print(max_gid)
C:\ProgramData\Anaconda3\python.exe F:/group_topics/main.py
Traceback (most recent call last):
File "F:/group_topics/main.py", line 41, in
cursor.execute(sql)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymysql\cursors.py", line 166, in execute
result = self._query(query)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymysql\cursors.py", line 322, in _query
conn.query(q)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymysql\connections.py", line 856, in query
self._affected_rows = self._read_query_result(unbuffered=unbuffered)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymysql\connections.py", line 1057, in _read_query_result
result.read()
File "C:\ProgramData\Anaconda3\lib\site-packages\pymysql\connections.py", line 1340, in read
first_packet = self.connection._read_packet()
File "C:\ProgramData\Anaconda3\lib\site-packages\pymysql\connections.py", line 987, in _read_packet
packet_header = self._read_bytes(4)
File "C:\ProgramData\Anaconda3\lib\site-packages\pymysql\connections.py", line 1033, in _read_bytes
CR.CR_SERVER_LOST, "Lost connection to MySQL server during query")
pymysql.err.OperationalError: (2013, 'Lost connection to MySQL server during query')
Process finished with exit code 1
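A minimal sketch of one common mitigation, assuming the server is simply dropping the connection (for example after an idle timeout): ping the connection with reconnect enabled before the query and retry once on OperationalError. The table and column names are taken from the question; the connect_timeout value is an arbitrary illustration:

import pymysql

def fetch_max_gid(connection):
    # Re-establish the connection if the server has already dropped it
    connection.ping(reconnect=True)
    with connection.cursor() as cursor:
        cursor.execute("SELECT MAX(group_id) AS max_gid FROM topic_duplicate_check")
        row = cursor.fetchone()
        return row['max_gid'] if row else None

connection = pymysql.connect(host='***', user='***', password='***',
                             db='***', charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor,
                             connect_timeout=10)
try:
    print(fetch_max_gid(connection))
except pymysql.err.OperationalError:
    # Retry once in case the query raced with a server-side disconnect
    print(fetch_max_gid(connection))

If the query itself is long-running, raising wait_timeout and max_allowed_packet on the server side is the other usual direction to look.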
