RequestError(400, 'illegal_argument_exception', '[f756ea2593ee][172.18.0.4:9300][indices:data/write/update[s]]') - elasticsearch

I am working with PySpark and Elasticsearch (the Python client library), and while updating one of the documents in ES I get the following error.
2021-09-08 06:31:49 ERROR JobScheduler:91 - Error running job streaming job 1631082700000 ms.1
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 161, in <lambda>
func = lambda t, rdd: old_func(rdd)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 124, in RDDfromKafkaStream
posttoES(row)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 100, in posttoES
es.update(index="anonprofile", id = jsonid, body=query)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/utils.py", line 168, in _wrapped
return func(*args, params=params, headers=headers, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/__init__.py", line 1903, in update
"POST", path, params=params, headers=headers, body=body
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 458, in perform_request
raise e
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 426, in perform_request
timeout=timeout,
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/http_urllib3.py", line 277, in perform_request
self._raise_error(response.status, raw_data)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/base.py", line 331, in _raise_error
status_code, error_message, additional_info
elasticsearch.exceptions.RequestError: RequestError(400, 'illegal_argument_exception', '[f756ea2593ee][172.18.0.4:9300][indices:data/write/update[s]]')
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 172, in <module>
ssc.awaitTermination()
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/context.py", line 192, in awaitTermination
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o31.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 161, in <lambda>
func = lambda t, rdd: old_func(rdd)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 124, in RDDfromKafkaStream
posttoES(row)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 100, in posttoES
es.update(index="anonprofile", id = jsonid, body=query)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/utils.py", line 168, in _wrapped
return func(*args, params=params, headers=headers, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/__init__.py", line 1903, in update
"POST", path, params=params, headers=headers, body=body
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 458, in perform_request
raise e
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 426, in perform_request
timeout=timeout,
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/http_urllib3.py", line 277, in perform_request
self._raise_error(response.status, raw_data)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/base.py", line 331, in _raise_error
status_code, error_message, additional_info
elasticsearch.exceptions.RequestError: RequestError(400, 'illegal_argument_exception', '[f756ea2593ee][172.18.0.4:9300][indices:data/write/update[s]]')
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
The code snippet is:
row = json.loads(row)
count = row["Count"]
row.pop("Count")
jsonid = hashlib.sha224(json.dumps(row).encode('ascii', 'ignore')).hexdigest()
if es.exists(index="anonprofile", id=jsonid):
    q = {
        "script": {
            "source": "ctx._source.Count+={}".format(count),
            "lang": "painless"
        }
    }
    es.update(index="anonprofile", id=jsonid, body=q)
else:
    row["Count"] = count
    es.index(index="anonprofile", id=jsonid, body={'doc': row})
The first document goes to the else block and works as expected, but when a document enters the if block the update call returns this error.
Looking it up on the internet, I tried changing the query, but nothing seems to be working.
P.S. For learning purposes I tried to execute the same task without the PySpark integration, and that seems to be working fine. The code for that is here

The problem is that you insert the data inside a doc field, which gets mapped as an object property (since the row variable is a dict of values), so your script tries to update _source.Count while the value actually lives at _source.doc.Count.
The doc field in the body argument is only useful for an update, for example together with an upsert or a script for the case where the document does not exist.
So, for example:
row["Count"] = count
body = {
    "script": {
        "source": "ctx._source.Count+={}".format(count),
        "lang": "painless"
    },
    "upsert": row
}
es.update(index="anonprofile", id=jsonid, body=body)
instead of your if es.exists(...) / else branches.
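Putting it together, a minimal sketch of the whole helper rewritten around this upsert pattern (assuming the es client and the anonprofile index from the question; passing count through script params instead of format() is a small variation on my example, not something from the original post):

import hashlib
import json

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes the same connection settings as in the question


def posttoES(row):
    row = json.loads(row)
    count = row.pop("Count")
    # Hash the document without Count so the id stays stable, as in the question.
    jsonid = hashlib.sha224(json.dumps(row).encode('ascii', 'ignore')).hexdigest()
    row["Count"] = count
    body = {
        "script": {
            "source": "ctx._source.Count += params.count",
            "lang": "painless",
            "params": {"count": count},
        },
        # If the document does not exist yet, index row as-is instead of running the script.
        "upsert": row,
    }
    es.update(index="anonprofile", id=jsonid, body=body)

With the upsert in the body, the separate es.exists() round trip is no longer needed: Elasticsearch runs the script when the document exists and indexes the upsert document when it doesn't.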

Related

ImportError: No module named sparkdl.image.imageIO

I'm doing image classification using Spark.
I have already imported the sparkdl jar (I added the path of the jar in conf/spark-defaults.conf).
ImportError: No module named sparkdl.image.imageIO
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:144)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:87)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 161, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 91, in read_udfs
_, udf = read_single_udf(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 78, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 54, in read_command
command = serializer._read_with_length(file)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
return self.loads(obj)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 454, in loads
return pickle.loads(obj)
ImportError: No module named sparkdl.image.imageIO
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
from sparkdl import readImages
When I read the images it works fine, but when I create the model and fit it on the image DataFrame, it gives
No module named sparkdl.image.imageIO
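One thing worth checking: adding the jar to the JVM classpath in spark-defaults only makes the Scala side visible; the sparkdl Python module also has to end up on the PYTHONPATH. A hedged sketch of the usual route via --packages, which ships the bundled Python code along with the jar (the package coordinates and the image directory below are assumptions, not from the question):

# Hypothetical launch command; pick the spark-deep-learning build that matches
# your Spark/Scala version:
#   pyspark --packages databricks:spark-deep-learning:<version>
#
# Inside that session the Python module then resolves normally:
from sparkdl import readImages

# Hypothetical HDFS path, only to show the call shape.
image_df = readImages("hdfs:///tmp/images")
image_df.show()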

Trying to connect to a Phoenix table with pyspark and getting the following error

My command for reading the Phoenix table:
sql_sc.read.format("org.apache.phoenix.spark").option("table", tablename).option("zkUrl", "10.0.11.21:2181").load()
Error:
Traceback (most recent call last):
File "/bdaas/exe/healthcare/hl7visualization.py", line 42, in <module>
hl7 = phoenix_sparkdata(spark_app='hl7-app',spark_master='local',table_name='hl7table_v2_3')
File "/bdaas/exe/healthcare/hl7visualization.py", line 19, in __init__
self.dataframe = self.phoenix_getdataframe(table_name)
File "/bdaas/exe/healthcare/hl7visualization.py", line 41, in phoenix_getdataframe
df = self.sql_sc.read.format("org.apache.phoenix.spark").option("table", tablename).option("zkUrl", "10.0.11.21:2181").load()
File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 139, in load
File "/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
File "/usr/hdp/2.4.2.0-258/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 45, in deco
File "/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o43.load.
: java.lang.NoSuchMethodError: com.fasterxml.jackson.module.scala.deser.BigDecimalDeserializer$.handledType()Ljava/lang/Class;
at com.fasterxml.jackson.module.scala.deser.NumberDeserializers$.<init>(ScalaNumberDeserializersModule.scala:49)
at com.fasterxml.jackson.module.scala.deser.NumberDeserializers$.<clinit>(ScalaNumberDeserializersModule.scala)
at com.fasterxml.jackson.module.scala.deser.ScalaNumberDeserializersModule$class.$init$(ScalaNumberDeserializersModule.scala:61)
at com.fasterxml.jackson.module.scala.DefaultScalaModule.<init>(DefaultScalaModule.scala:19)
at com.fasterxml.jackson.module.scala.DefaultScalaModule$.<init>(DefaultScalaModule.scala:35)
at com.fasterxml.jackson.module.scala.DefaultScalaModule$.<clinit>(DefaultScalaModule.scala)
at org.apache.spark.rdd.RDDOperationScope$.<init>(RDDOperationScope.scala:81)
at org.apache.spark.rdd.RDDOperationScope$.<clinit>(RDDOperationScope.scala)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:714)
at org.apache.spark.SparkContext.newAPIHadoopRDD(SparkContext.scala:1152)
at org.apache.phoenix.spark.PhoenixRDD.<init>(PhoenixRDD.scala:46)
at org.apache.phoenix.spark.PhoenixRelation.schema(PhoenixRelation.scala:50)
at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:37)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
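For reference, a minimal sketch of how the phoenix-spark read is normally wired up on a Spark 1.6-era stack (the jar path below is an assumption; the NoSuchMethodError above usually points at a Jackson version clash between the Phoenix client jar and Spark rather than at the read call itself):

# Hypothetical launch, with the Phoenix client jar on the driver and executor classpath:
#   pyspark --jars /usr/hdp/current/phoenix-client/phoenix-client.jar
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="hl7-app")
sql_sc = SQLContext(sc)

df = (sql_sc.read
      .format("org.apache.phoenix.spark")
      .option("table", "hl7table_v2_3")
      .option("zkUrl", "10.0.11.21:2181")
      .load())
df.printSchema()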

Spark unzip from S3

I'm using Spark (pyspark) to read in data and have run into a snag, as some of my data is now in .gz format.
%pyspark
data = sc.textFile("s3://mybucket.file.gz")
data.first()
Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-5205743886772607083.py", line 267, in <module>
raise Exception(traceback.format_exc())
Exception: Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-5205743886772607083.py", line 260, in <module>
exec(code)
File "<stdin>", line 1, in <module>
File "/usr/lib/spark/python/pyspark/rdd.py", line 1041, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/usr/lib/spark/python/pyspark/rdd.py", line 1032, in sum
return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
File "/usr/lib/spark/python/pyspark/rdd.py", line 906, in fold
vals = self.mapPartitions(func).collect()
File "/usr/lib/spark/python/pyspark/rdd.py", line 809, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 31.0 failed 4 times, most recent failure: Lost task 0.3 in stage 31.0 (TID 270, ip-172-16-238-231.us-west-1.compute.internal, executor 17): java.io.IOException: incorrect header check
at org.apache.hadoop.io.compress.zlib.ZlibDecompressor.inflateBytesDirect(Native Method)
Any thoughts on how to read this in and uncompress?
Strangely enough, all I had to do was remove the "gz" off the end and it worked.
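For what it's worth, Hadoop's text input picks the decompression codec from the file extension, so an object that really contains gzip data and whose key ends in .gz is read transparently; an "incorrect header check" usually means the bytes under that name aren't actually gzip, which fits the fix above. A small sketch with placeholder paths:

from pyspark import SparkContext

sc = SparkContext(appName="read-gz-from-s3")

# Placeholder bucket/key. If the object really is gzip-compressed, the .gz suffix
# is enough for textFile to decompress it on read.
compressed = sc.textFile("s3://my-bucket/data/events.json.gz")
print(compressed.first())

# Globs also work and can mix plain and compressed files in one RDD.
mixed = sc.textFile("s3://my-bucket/data/*")
print(mixed.count())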

Running a Windows Batch File through Piping in Apache Spark

I have a requirement in which I have to run a Windows batch file using Apache Spark on multiple nodes of the Spark cluster.
So is it possible to do this using the piping concept of Apache Spark?
I have previously run a shell script using piping in Spark on an Ubuntu machine. The code below, which does the same thing, runs fine:
data = ["hi","hello","how","are","you"]
distScript = "/home/aawasthi/echo.sh"
distScriptName = "echo.sh"
sc.addFile(distScript)
RDDdata = sc.parallelize(data)
print RDDdata.pipe(SparkFiles.get(distScriptName)).collect()
I tried to adapt the same code to run a Windows batch file on a Windows machine with Spark (1.6, prebuilt for Hadoop 2.6) installed, but it gives me an error at the sc.addFile step. The code is below:
batchFile = "D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv"
batchFileName = "runOpenCv"
sc.addFile(batchFile)
The error thrown by Spark is below:
Py4JJavaError Traceback (most recent call last)
<ipython-input-11-9e13c265cbae> in <module>()
----> 1 sc.addFile(batchFile)
Py4JJavaError: An error occurred while calling o160.addFile.
: java.io.FileNotFoundException: Added file D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv does not exist.
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1364)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1340)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
The batch file does exist at the given location.
UPDATE:
I added .bat as the extension in batchFile and batchFileName, and file:/// at the start of the file path. The modified code is:
from pyspark import SparkFiles
from pyspark import SparkContext
sc
batchFile = "file:///D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv.bat"
batchFileName = "runOpenCv.bat"
sc.addFile(batchFile)
RDDdata = sc.parallelize(["hi","hello"])
print SparkFiles.get("runOpenCv.bat")
print RDDdata.pipe(SparkFiles.get(batchFileName)).collect()
Now it doesn't give an error in the addFile step, and print SparkFiles.get("runOpenCv.bat") prints the path
C:\Users\abhilash.awasthi\AppData\Local\Temp\spark-c0f383b1-8365-4840-bd0f-e7eb46cc6794\userFiles-69051066-f18c-45dc-9610-59cbde0d77fe\runOpenCv.bat
So the file is added. But the last step of the code throws the error below:
Py4JJavaError Traceback (most recent call last)
<ipython-input-6-bf2b8aea3ef0> in <module>()
----> 1 print RDDdata.pipe(SparkFiles.get(batchFileName)).collect()
D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.pyc in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
D:\spark-1.6.2-bin-hadoop2.6\python\lib\py4j-0.9-src.zip\py4j\java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\sql\utils.pyc in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
D:\spark-1.6.2-bin-hadoop2.6\python\lib\py4j-0.9-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 111, in main
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 106, in process
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 317, in func
return f(iterator)
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 715, in func
shlex.split(command), env=env, stdin=PIPE, stdout=PIPE)
File "C:\Anaconda2\lib\subprocess.py", line 710, in __init__
errread, errwrite)
File "C:\Anaconda2\lib\subprocess.py", line 958, in _execute_child
startupinfo)
WindowsError: [Error 2] The system cannot find the file specified
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 111, in main
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 106, in process
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 317, in func
return f(iterator)
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 715, in func
shlex.split(command), env=env, stdin=PIPE, stdout=PIPE)
File "C:\Anaconda2\lib\subprocess.py", line 710, in __init__
errread, errwrite)
File "C:\Anaconda2\lib\subprocess.py", line 958, in _execute_child
startupinfo)
WindowsError: [Error 2] The system cannot find the file specified
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
Please escape the / characters:
batchFile = "D://spark-1.6.2-bin-hadoop2.6//data//OpenCV//runOpenCv"
Also, as AA suggested above, it may need a .cmd or .bat extension.
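A small sketch that simply puts both suggestions together (the doubled separators from this answer plus the explicit .bat extension mentioned earlier); the path is the asker's, and whether this clears the WindowsError still depends on the batch file being reachable on every worker:

from pyspark import SparkContext, SparkFiles

sc = SparkContext(appName="pipe-batch-file")

# Doubled separators and an explicit .bat extension, as suggested above.
batchFile = "D://spark-1.6.2-bin-hadoop2.6//data//OpenCV//runOpenCv.bat"
batchFileName = "runOpenCv.bat"

sc.addFile(batchFile)

RDDdata = sc.parallelize(["hi", "hello"])
# SparkFiles.get resolves to the local copy of the file shipped to each worker.
print(RDDdata.pipe(SparkFiles.get(batchFileName)).collect())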

jupyter notebook does not work on mac

I have Mac OS X 10.11.5 and installed Jupyter Notebook via Anaconda. When I type jupyter notebook, it opens the "Home" tab in Safari but gives me an error in the command window:
[W 20:48:53.368 NotebookApp] /Users/hoosier/cv.py doesn't exist
However, there is a file /Users/hoosier/cv.py on my computer. When I try to start a Python 2 notebook, the kernel starts but eventually dies. Here is what I get in the command window:
[I 20:52:33.947 NotebookApp] Creating new notebook in
[I 20:52:34.331 NotebookApp] Kernel started: 7f5cc96e-8163-48f7-a59b-ce13a6839608
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[I 20:52:37.333 NotebookApp] KernelRestarter: restarting kernel (1/5)
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[I 20:52:40.339 NotebookApp] KernelRestarter: restarting kernel (2/5)
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[I 20:52:43.344 NotebookApp] KernelRestarter: restarting kernel (3/5)
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[W 20:52:44.352 NotebookApp] Timeout waiting for kernel_info reply from 7f5cc96e-8163-48f7-a59b-ce13a6839608
[I 20:52:46.351 NotebookApp] KernelRestarter: restarting kernel (4/5)
WARNING:root:kernel 7f5cc96e-8163-48f7-a59b-ce13a6839608 restarted
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[W 20:52:49.361 NotebookApp] KernelRestarter: restart failed
[W 20:52:49.361 NotebookApp] Kernel 7f5cc96e-8163-48f7-a59b-ce13a6839608 died, removing from map.
ERROR:root:kernel 7f5cc96e-8163-48f7-a59b-ce13a6839608 restarted failed!
[W 20:52:49.372 NotebookApp] Kernel deleted before session
[W 20:52:49.373 NotebookApp] 410 DELETE /api/sessions/cce1452d-221b-47b6-9bc6-52440b639281 (::1) 1.50ms referer=http://localhost:8888/notebooks/Untitled32.ipynb?kernel_name=python2
[W 20:53:32.867 NotebookApp] /Users/hoosier/cv.py doesn't exist
[I 20:54:35.237 NotebookApp] Saving file at /Untitled32.ipynb
