ImportError: No module named sparkdl.image.imageIO - image

I'm doing image classification using Spark.
I have already added the sparkdl JAR (I added the path of the JAR in conf/spark-defaults.conf).
ImportError: No module named sparkdl.image.imageIO
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:144)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:87)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 161, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 91, in read_udfs
_, udf = read_single_udf(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 78, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 54, in read_command
command = serializer._read_with_length(file)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
return self.loads(obj)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 454, in loads
return pickle.loads(obj)
ImportError: No module named sparkdl.image.imageIO
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
from sparkdl import readImages
Reading the images works fine, but when I create a model and fit it on the image DataFrame, I get:
No module named sparkdl.image.imageIO
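In case it matters, my understanding is that the JAR in spark-defaults.conf only puts the Scala/Java side of sparkdl on the classpath, and the Python workers also need the sparkdl Python package itself. A minimal sketch of what I mean by shipping it explicitly (the archive and image paths are placeholders, not my actual paths):
from pyspark import SparkContext

# Sketch only: make the sparkdl Python package available on the executors as well,
# so the workers can unpickle sparkdl objects when the model is fit.
sc = SparkContext.getOrCreate()
sc.addPyFile("/path/to/sparkdl.zip")  # placeholder: a zip/egg containing the sparkdl package

from sparkdl import readImages
image_df = readImages("/path/to/images")  # placeholder path; reading already works for me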

Related

RequestError(400, 'illegal_argument_exception', '[f756ea2593ee][172.18.0.4:9300][indices:data/write/update[s]]')

I am working with pyspark and elasticsearch (py library), and while updating one of the documents in ES I am getting the following error.
2021-09-08 06:31:49 ERROR JobScheduler:91 - Error running job streaming job 1631082700000 ms.1
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 161, in <lambda>
func = lambda t, rdd: old_func(rdd)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 124, in RDDfromKafkaStream
posttoES(row)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 100, in posttoES
es.update(index="anonprofile", id = jsonid, body=query)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/utils.py", line 168, in _wrapped
return func(*args, params=params, headers=headers, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/__init__.py", line 1903, in update
"POST", path, params=params, headers=headers, body=body
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 458, in perform_request
raise e
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 426, in perform_request
timeout=timeout,
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/http_urllib3.py", line 277, in perform_request
self._raise_error(response.status, raw_data)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/base.py", line 331, in _raise_error
status_code, error_message, additional_info
elasticsearch.exceptions.RequestError: RequestError(400, 'illegal_argument_exception', '[f756ea2593ee][172.18.0.4:9300][indices:data/write/update[s]]')
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 172, in <module>
ssc.awaitTermination()
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/context.py", line 192, in awaitTermination
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o31.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 161, in <lambda>
func = lambda t, rdd: old_func(rdd)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 124, in RDDfromKafkaStream
posttoES(row)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 100, in posttoES
es.update(index="anonprofile", id = jsonid, body=query)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/utils.py", line 168, in _wrapped
return func(*args, params=params, headers=headers, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/__init__.py", line 1903, in update
"POST", path, params=params, headers=headers, body=body
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 458, in perform_request
raise e
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 426, in perform_request
timeout=timeout,
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/http_urllib3.py", line 277, in perform_request
self._raise_error(response.status, raw_data)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/base.py", line 331, in _raise_error
status_code, error_message, additional_info
elasticsearch.exceptions.RequestError: RequestError(400, 'illegal_argument_exception', '[f756ea2593ee][172.18.0.4:9300][indices:data/write/update[s]]')
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
The code snippet is:
row = json.loads(row)
count = row["Count"]
row.pop("Count")
jsonid = hashlib.sha224(json.dumps(row).encode('ascii', 'ignore')).hexdigest()
if es.exists(index="anonprofile", id=jsonid):
    q = {
        "script": {
            "source": "ctx._source.Count+={}".format(count),
            "lang": "painless"
        }
    }
    es.update(index="anonprofile", id=jsonid, body=q)
else:
    row["Count"] = count
    es.index(index="anonprofile", id=jsonid, body={'doc' : row})
The first document goes to the else block and works as expected, but when execution enters the if block, the update call returns an error.
Looking around on the internet, I tried changing the query, but nothing seems to work.
P.S. For learning purposes I tried to run the same task without the pyspark integration, and that seems to work fine. The code for that is here
The problem is that you index the information inside a doc field, which gets mapped as a property (since the row variable is a dict of values), so you end up trying to update _source.Count when the stored field is actually _source.doc.Count.
The body argument with a doc field is only useful for update, for example together with an upsert or a script for the case where the document does not exist.
So, for example:
row["Count"] = count
body = {
"script": {
"source": "ctx._source.Count+={}".format(count),
"lang": "painless"
}
"upsert": row
}
es.update(index="anonprofile", id=jsonid, body=body)
instead of your if exists...
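To make that concrete, a minimal sketch of the whole update path with the upsert (the index and field names come from the question; everything else is illustrative):
import json
import hashlib

def post_to_es(es, row):
    # Sketch only: same hashing and Count handling as in the question, but with a
    # single scripted upsert instead of the exists()/update()/index() branches.
    row = json.loads(row)
    count = row.pop("Count")
    jsonid = hashlib.sha224(json.dumps(row).encode('ascii', 'ignore')).hexdigest()

    row["Count"] = count
    body = {
        "script": {
            "source": "ctx._source.Count+={}".format(count),
            "lang": "painless"
        },
        "upsert": row  # indexed as-is when the document does not exist yet
    }
    es.update(index="anonprofile", id=jsonid, body=body)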

install pyspark on an EC2 instance (Amazon Linux)

Hi, I tried to install pyspark on an EC2 instance (standard Amazon Linux image). I installed Anaconda Python 3.6 and used "pip install pyspark" to install Spark, and that worked just fine. But when I try to start pyspark with the command "pyspark", I get the following error message. What could have gone wrong? Thanks!
Python 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 18:10:19)
[GCC 7.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
18/03/03 05:47:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
18/03/03 05:47:13 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext may be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:236)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:214)
java.lang.Thread.run(Thread.java:748)
Traceback (most recent call last):
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/python/pyspark/shell.py", line 45, in <module>
spark = SparkSession.builder\
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/sql/session.py", line 169, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/context.py", line 334, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/context.py", line 118, in __init__
conf, jsc, profiler_cls)
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/context.py", line 180, in _do_init
self._jsc = jsc or self._initialize_context(self._conf._jconf)
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/context.py", line 273, in _initialize_context
return self._jvm.JavaSparkContext(jconf)
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1401, in __call__
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.ExceptionInInitializerError
at org.apache.spark.SparkConf.validateSettings(SparkConf.scala:546)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:373)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:236)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.UnknownHostException: 10-236-108-194: 10-236-108-194: Temporary failure in name resolution
at java.net.InetAddress.getLocalHost(InetAddress.java:1505)
at org.apache.spark.util.Utils$.findLocalInetAddress(Utils.scala:891)
at org.apache.spark.util.Utils$.org$apache$spark$util$Utils$$localIpAddress$lzycompute(Utils.scala:884)
at org.apache.spark.util.Utils$.org$apache$spark$util$Utils$$localIpAddress(Utils.scala:884)
at org.apache.spark.util.Utils$$anonfun$localHostName$1.apply(Utils.scala:941)
at org.apache.spark.util.Utils$$anonfun$localHostName$1.apply(Utils.scala:941)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.util.Utils$.localHostName(Utils.scala:941)
at org.apache.spark.internal.config.package$.<init>(package.scala:204)
at org.apache.spark.internal.config.package$.<clinit>(package.scala)
... 14 more
Caused by: java.net.UnknownHostException: 10-236-108-194: Temporary failure in name resolution
at java.net.Inet6AddressImpl.lookupAllHostAddr(Native Method)
at java.net.InetAddress$2.lookupAllHostAddr(InetAddress.java:928)
at java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1323)
at java.net.InetAddress.getLocalHost(InetAddress.java:1500)
... 23 more
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/python/pyspark/shell.py", line 54, in <module>
spark = SparkSession.builder.getOrCreate()
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/sql/session.py", line 169, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/context.py", line 334, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/context.py", line 118, in __init__
conf, jsc, profiler_cls)
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/context.py", line 180, in _do_init
self._jsc = jsc or self._initialize_context(self._conf._jconf)
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/context.py", line 273, in _initialize_context
return self._jvm.JavaSparkContext(jconf)
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1401, in __call__
File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NoClassDefFoundError: Could not initialize class org.apache.spark.internal.config.package$
at org.apache.spark.SparkConf.validateSettings(SparkConf.scala:546)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:373)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:236)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Just use a Docker container: https://github.com/jupyter/docker-stacks/tree/master/pyspark-notebook. Why go through the hassle of configuring it yourself?
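If you would rather keep the pip-installed Spark: the traceback fails with java.net.UnknownHostException on the instance's private hostname, so one workaround worth trying (a sketch, assuming name resolution is the only problem) is to give Spark an explicit local IP before creating the context:
import os
from pyspark.sql import SparkSession

# Sketch of a possible workaround for the UnknownHostException seen above:
# point Spark at an explicit local address so it does not have to resolve the hostname.
os.environ["SPARK_LOCAL_IP"] = "127.0.0.1"  # or the instance's private IP address

spark = SparkSession.builder.master("local[*]").appName("smoke-test").getOrCreate()
print(spark.range(10).count())
spark.stop()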

Trying to connect to a Phoenix table with pyspark and getting the following error

My command for reading the Phoenix table:
sql_sc.read.format("org.apache.phoenix.spark").option("table", tablename).option("zkUrl", "10.0.11.21:2181").load()
error:
Traceback (most recent call last):
File "/bdaas/exe/healthcare/hl7visualization.py", line 42, in
hl7 = phoenix_sparkdata(spark_app='hl7-app',spark_master='local',table_name='hl7table_v2_3')
File "/bdaas/exe/healthcare/hl7visualization.py", line 19, in init
self.dataframe = self.phoenix_getdataframe(table_name)
File "/bdaas/exe/healthcare/hl7visualization.py", line 41, in phoenix_getdataframe
df = self.sql_sc.read.format("org.apache.phoenix.spark").option("table", tablename).option("zkUrl", "10.0.11.21:2181").load()
File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 139, in load
File "/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in call
File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 45, in deco
File "/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o43.load.
: java.lang.NoSuchMethodError: com.fasterxml.jackson.module.scala.deser.BigDecimalDeserializer$.handledType()Ljava/lang/Class;
at com.fasterxml.jackson.module.scala.deser.NumberDeserializers$.<init>(ScalaNumberDeserializersModule.scala:49)
at com.fasterxml.jackson.module.scala.deser.NumberDeserializers$.<clinit>(ScalaNumberDeserializersModule.scala)
at com.fasterxml.jackson.module.scala.deser.ScalaNumberDeserializersModule$class.$init$(ScalaNumberDeserializersModule.scala:61)
at com.fasterxml.jackson.module.scala.DefaultScalaModule.<init>(DefaultScalaModule.scala:19)
at com.fasterxml.jackson.module.scala.DefaultScalaModule$.<init>(DefaultScalaModule.scala:35)
at com.fasterxml.jackson.module.scala.DefaultScalaModule$.<clinit>(DefaultScalaModule.scala)
at org.apache.spark.rdd.RDDOperationScope$.<init>(RDDOperationScope.scala:81)
at org.apache.spark.rdd.RDDOperationScope$.<clinit>(RDDOperationScope.scala)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:714)
at org.apache.spark.SparkContext.newAPIHadoopRDD(SparkContext.scala:1152)
at org.apache.phoenix.spark.PhoenixRDD.<init>(PhoenixRDD.scala:46)
at org.apache.phoenix.spark.PhoenixRelation.schema(PhoenixRelation.scala:50)
at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:37)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
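For reference, the code that builds sql_sc is not shown in the post; a self-contained sketch of the same read (the SQLContext construction is an assumption, while the app name, table, and zkUrl come from the traceback) looks like:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Assumed context setup; only the .read call appears in the original post.
conf = SparkConf().setAppName("hl7-app").setMaster("local")
sc = SparkContext(conf=conf)
sql_sc = SQLContext(sc)

df = (sql_sc.read.format("org.apache.phoenix.spark")
      .option("table", "hl7table_v2_3")
      .option("zkUrl", "10.0.11.21:2181")
      .load())
df.printSchema()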

Spark unzip from S3

I'm using Spark (pyspark) to read in data and have run into a snag, as some of my data is now in .gz format.
%pyspark
data = sc.textFile("s3://mybucket.file.gz")
data.first()
Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-5205743886772607083.py", line 267, in <module>
raise Exception(traceback.format_exc())
Exception: Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-5205743886772607083.py", line 260, in <module>
exec(code)
File "<stdin>", line 1, in <module>
File "/usr/lib/spark/python/pyspark/rdd.py", line 1041, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/usr/lib/spark/python/pyspark/rdd.py", line 1032, in sum
return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
File "/usr/lib/spark/python/pyspark/rdd.py", line 906, in fold
vals = self.mapPartitions(func).collect()
File "/usr/lib/spark/python/pyspark/rdd.py", line 809, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 31.0 failed 4 times, most recent failure: Lost task 0.3 in stage 31.0 (TID 270, ip-172-16-238-231.us-west-1.compute.internal, executor 17): java.io.IOException: incorrect header check
at org.apache.hadoop.io.compress.zlib.ZlibDecompressor.inflateBytesDirect(Native Method)
Any thoughts on how to read this in and uncompress?
Strangely enough, all I had to do was remove the ".gz" from the end of the path, and it worked.
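In other words (a sketch; the bucket and key below are illustrative): with a ".gz" suffix Hadoop selects the gzip codec, and the "incorrect header check" suggests the bytes were not actually gzip; without the suffix the object is simply read as plain text.
# Sketch of the fix described above: same read, but without the ".gz" suffix,
# so Hadoop does not try to run the gzip codec over the object.
data = sc.textFile("s3://mybucket/file")
print(data.first())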

Running a Windows Batch File through Piping in Apache Spark

I have a requirement in which I have to run a Windows batch file using Apache Spark on multiple nodes of the Spark cluster.
So is it possible to do this using the piping concept of Apache Spark?
I have previously run a shell script using piping in Spark on an Ubuntu machine. The code below, which does the same thing, runs fine:
data = ["hi","hello","how","are","you"]
distScript = "/home/aawasthi/echo.sh"
distScriptName = "echo.sh"
sc.addFile(distScript)
RDDdata = sc.parallelize(data)
print RDDdata.pipe(SparkFiles.get(distScriptName)).collect()
I tried to adapt the same code to run a Windows batch file on a Windows machine with Spark (1.6, prebuilt for Hadoop 2.6) installed, but it gives me an error on the sc.addFile step. The code is below:
batchFile = "D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv"
batchFileName = "runOpenCv"
sc.addFile(batchFile)
The error thrown by Spark is below:
Py4JJavaError Traceback (most recent call last)
<ipython-input-11-9e13c265cbae> in <module>()
----> 1 sc.addFile(batchFile)
Py4JJavaError: An error occurred while calling o160.addFile.
: java.io.FileNotFoundException: Added file D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv does not exist.
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1364)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1340)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Although the batch file exists at the given location.
UPDATE:
Added the .bat extension to batchFile and batchFileName, and file:/// at the start of the file path. The modified code is:
from pyspark import SparkFiles
from pyspark import SparkContext
sc
batchFile = "file:///D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv.bat"
batchFileName = "runOpenCv.bat"
sc.addFile(batchFile)
RDDdata = sc.parallelize(["hi","hello"])
print SparkFiles.get("runOpenCv.bat")
print RDDdata.pipe(SparkFiles.get(batchFileName)).collect()
Now it doesn't give an error in the addFile step, and print SparkFiles.get("runOpenCv.bat") prints the path
C:\Users\abhilash.awasthi\AppData\Local\Temp\spark-c0f383b1-8365-4840-bd0f-e7eb46cc6794\userFiles-69051066-f18c-45dc-9610-59cbde0d77fe\runOpenCv.bat
So the file is added, but the last step of the code throws the error below:
Py4JJavaError Traceback (most recent call last)
<ipython-input-6-bf2b8aea3ef0> in <module>()
----> 1 print RDDdata.pipe(SparkFiles.get(batchFileName)).collect()
D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.pyc in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
D:\spark-1.6.2-bin-hadoop2.6\python\lib\py4j-0.9-src.zip\py4j\java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\sql\utils.pyc in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
D:\spark-1.6.2-bin-hadoop2.6\python\lib\py4j-0.9-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 111, in main
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 106, in process
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 317, in func
return f(iterator)
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 715, in func
shlex.split(command), env=env, stdin=PIPE, stdout=PIPE)
File "C:\Anaconda2\lib\subprocess.py", line 710, in __init__
errread, errwrite)
File "C:\Anaconda2\lib\subprocess.py", line 958, in _execute_child
startupinfo)
WindowsError: [Error 2] The system cannot find the file specified
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 111, in main
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 106, in process
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 317, in func
return f(iterator)
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 715, in func
shlex.split(command), env=env, stdin=PIPE, stdout=PIPE)
File "C:\Anaconda2\lib\subprocess.py", line 710, in __init__
errread, errwrite)
File "C:\Anaconda2\lib\subprocess.py", line 958, in _execute_child
startupinfo)
WindowsError: [Error 2] The system cannot find the file specified
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
Please escape /
batchFile = "D://spark-1.6.2-bin-hadoop2.6//data//OpenCV//runOpenCv"
Also, as AA suggested above, it may need a .cmd or .bat extension.
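Putting the update and this suggestion together, here is a minimal sketch of the adapted snippet (same paths as in the question); whether pipe() can then launch the .bat still depends on how the resolved worker-side path survives the shlex.split/subprocess call visible in the trace:
from pyspark import SparkContext, SparkFiles

# Sketch combining the update above with the answer's suggestion: keep the .bat
# extension and hand addFile an explicit file:/// URI.
sc = SparkContext.getOrCreate()
batchFile = "file:///D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv.bat"
sc.addFile(batchFile)

RDDdata = sc.parallelize(["hi", "hello"])
# pipe() feeds each element to the script's stdin and collects the stdout lines;
# SparkFiles.get resolves the local copy of the file on each worker.
print(RDDdata.pipe(SparkFiles.get("runOpenCv.bat")).collect())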

Resources