I am working with PySpark and Elasticsearch (the Python client library), and while updating one of the documents in ES I get the following error.
2021-09-08 06:31:49 ERROR JobScheduler:91 - Error running job streaming job 1631082700000 ms.1
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 161, in <lambda>
func = lambda t, rdd: old_func(rdd)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 124, in RDDfromKafkaStream
posttoES(row)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 100, in posttoES
es.update(index="anonprofile", id = jsonid, body=query)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/utils.py", line 168, in _wrapped
return func(*args, params=params, headers=headers, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/__init__.py", line 1903, in update
"POST", path, params=params, headers=headers, body=body
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 458, in perform_request
raise e
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 426, in perform_request
timeout=timeout,
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/http_urllib3.py", line 277, in perform_request
self._raise_error(response.status, raw_data)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/base.py", line 331, in _raise_error
status_code, error_message, additional_info
elasticsearch.exceptions.RequestError: RequestError(400, 'illegal_argument_exception', '[f756ea2593ee][172.18.0.4:9300][indices:data/write/update[s]]')
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 172, in <module>
ssc.awaitTermination()
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/context.py", line 192, in awaitTermination
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o31.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 161, in <lambda>
func = lambda t, rdd: old_func(rdd)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 124, in RDDfromKafkaStream
posttoES(row)
File "/usr/bin/spark-2.4.0-bin-hadoop2.7/spark_scripts/main.py", line 100, in posttoES
es.update(index="anonprofile", id = jsonid, body=query)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/utils.py", line 168, in _wrapped
return func(*args, params=params, headers=headers, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/client/__init__.py", line 1903, in update
"POST", path, params=params, headers=headers, body=body
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 458, in perform_request
raise e
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/transport.py", line 426, in perform_request
timeout=timeout,
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/http_urllib3.py", line 277, in perform_request
self._raise_error(response.status, raw_data)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/connection/base.py", line 331, in _raise_error
status_code, error_message, additional_info
elasticsearch.exceptions.RequestError: RequestError(400, 'illegal_argument_exception', '[f756ea2593ee][172.18.0.4:9300][indices:data/write/update[s]]')
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
The code snippet is:
row = json.loads(row)
count = row["Count"]
row.pop("Count")
jsonid = hashlib.sha224(json.dumps(row).encode('ascii', 'ignore')).hexdigest()

if es.exists(index="anonprofile", id=jsonid):
    q = {
        "script": {
            "source": "ctx._source.Count+={}".format(count),
            "lang": "painless"
        }
    }
    es.update(index="anonprofile", id=jsonid, body=q)
else:
    row["Count"] = count
    es.index(index="anonprofile", id=jsonid, body={'doc': row})
The first document goes to the else block and is indexed as expected, but when the if block runs for an existing document, the update call returns the error above.
Based on what I found online I tried changing the query, but nothing seems to work.
P.S. For learning purposes I tried to execute the same task without the PySpark integration, and that seems to work fine. The code for that is here
The problem is that you insert the data inside a doc field, which gets mapped as an object property (since row is a dict of values), so your script tries to update _source.Count while the value actually lives at _source.doc.Count.
A doc field in the body is only useful for the update API, for example together with an upsert, or with a script for when the document does not exist.
So, for example:
row["Count"] = count
body = {
    "script": {
        "source": "ctx._source.Count+={}".format(count),
        "lang": "painless"
    },
    "upsert": row
}
es.update(index="anonprofile", id=jsonid, body=body)
instead of your if es.exists(...) / else branching.
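As a side note, a params-based variant of the same update (just a sketch, reusing the index and client from the question) avoids recompiling the Painless script for every distinct count value, and the upsert still creates the document when it is missing, so the separate es.exists/es.index branch is no longer needed:
row["Count"] = count
body = {
    "script": {
        # params.count is substituted at execution time, so the script source stays constant
        "source": "ctx._source.Count += params.count",
        "lang": "painless",
        "params": {"count": count}
    },
    "upsert": row
}
es.update(index="anonprofile", id=jsonid, body=body)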
I followed the examples:
from kubernetes import client, config
config.load_kube_config()
v1 = client.CoreV1Api()
print("Listing pods with their IPs:")
ret = v1.list_pod_for_all_namespaces(watch=False)
for i in ret.items:
    print("%s\t%s\t%s" % (i.status.pod_ip, i.metadata.namespace, i.metadata.name))
But I always get this error:
2018-08-28 23:03:48,818 WARNING Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),)': /api/v1/pods?watch=False
... (some retry logs omitted)
Traceback (most recent call last):
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/contrib/pyopenssl.py", line 444, in wrap_socket
cnx.do_handshake()
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/OpenSSL/SSL.py", line 1907, in do_handshake
self._raise_ssl_error(self._ssl, result)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/OpenSSL/SSL.py", line 1639, in _raise_ssl_error
_raise_current_error()
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/OpenSSL/_util.py", line 54, in exception_from_error_queue
raise exception_type(errors)
OpenSSL.SSL.Error: [('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/connectionpool.py", line 600, in urlopen
chunked=chunked)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/connectionpool.py", line 343, in _make_request
self._validate_conn(conn)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/connectionpool.py", line 849, in _validate_conn
conn.connect()
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/connection.py", line 356, in connect
ssl_context=context)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/util/ssl_.py", line 372, in ssl_wrap_socket
return context.wrap_socket(sock)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/contrib/pyopenssl.py", line 450, in wrap_socket
raise ssl.SSLError('bad handshake: %r' % e)
ssl.SSLError: ("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/kubernetes/client/apis/core_v1_api.py", line 13608, in list_pod_for_all_namespaces
(data) = self.list_pod_for_all_namespaces_with_http_info(**kwargs)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/kubernetes/client/apis/core_v1_api.py", line 13705, in list_pod_for_all_namespaces_with_http_info
collection_formats=collection_formats)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/kubernetes/client/api_client.py", line 321, in call_api
_return_http_data_only, collection_formats, _preload_content, _request_timeout)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/kubernetes/client/api_client.py", line 155, in __call_api
_request_timeout=_request_timeout)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/kubernetes/client/api_client.py", line 342, in request
headers=headers)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/kubernetes/client/rest.py", line 231, in GET
query_params=query_params)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/kubernetes/client/rest.py", line 205, in request
headers=headers)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/request.py", line 68, in request
**urlopen_kw)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/request.py", line 89, in request_encode_url
return self.urlopen(method, url, **extra_kw)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/poolmanager.py", line 322, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/connectionpool.py", line 667, in urlopen
**response_kw)
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "/Users/maxpeng/.pyenv/versions/3.5.0/lib/python3.5/site-packages/urllib3/util/retry.py", line 398, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='xx.xx.xx.xx', port=xxxx): Max retries exceeded with url: /api/v1/pods?watch=False (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))
Mac OSX 10.13.6
kubectl version
Client Version: version.Info{Major:"1", Minor:"11",
GitVersion:"v1.11.2",
GitCommit:"bb9ffb1654d4a729bb4cec18ff088eacc153c239",
GitTreeState:"clean", BuildDate:"2018-08-08T16:31:10Z",
GoVersion:"go1.10.3", Compiler:"gc", Platform:"darwin/amd64"}
Server Version: version.Info{Major:"1", Minor:"10",
GitVersion:"v1.10.4",
GitCommit:"5ca598b4ba5abb89bb773071ce452e33fb66339d",
GitTreeState:"clean", BuildDate:"2018-06-06T08:00:59Z",
GoVersion:"go1.9.3", Compiler:"gc", Platform:"linux/amd64"}
pip list
| Package    | Version   |
|------------|-----------|
| certifi    | 2018.8.24 |
| kubernetes | 7.0.0     |
... (other dependencies omitted)
P.S. I tried the solution here: http://www.cdotson.com/2017/01/sslerror-with-python-3-6-x-on-macos-sierra/, but without luck.
This appears to be an issue with the client script not being able to find, accept, or verify the certificate. Setting the location of the certificate directly from the script is likely the most viable fix:
from kubernetes import client
from kubernetes.client import Configuration, ApiClient

config = Configuration()
# bearer token used to authenticate against the API server
config.api_key = {'authorization': 'Bearer <api_key>'}
config.host = 'https://my-kubernetes-cluster'
# CA bundle used to verify the API server certificate
config.ssl_ca_cert = "/path/to/ca_chain.crt"

api_client = ApiClient(configuration=config)
v1 = client.CoreV1Api(api_client)
v1.list_pod_for_all_namespaces(watch=False)
You could also set this up directly in .kube/config:
apiVersion: v1
clusters:
- cluster:
    api-version: v1
    certificate-authority: /path/to/ca_chain.crt
    server: "https://my-kubernetes-cluster"
...
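With the certificate-authority entry in place, the original snippet should then work unchanged, since load_kube_config() picks the CA up from the kubeconfig (a minimal sketch, assuming the default ~/.kube/config location):
from kubernetes import client, config

# load_kube_config() reads certificate-authority / certificate-authority-data
# from the kubeconfig, so no extra SSL settings are needed in code
config.load_kube_config()
v1 = client.CoreV1Api()
for i in v1.list_pod_for_all_namespaces(watch=False).items:
    print("%s\t%s\t%s" % (i.status.pod_ip, i.metadata.namespace, i.metadata.name))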
The last option is to skip verification (not recommended, because it defeats the purpose of SSL):
v1 = client.CoreV1Api()
v1.api_client.configuration.verify_ssl = False
More Information:
• https://github.com/kubernetes-client/python/issues/521
• How to specify a CA bundle in kubernetes python client
I'm doing image classification using Spark.
I have already imported the sparkdl jar (added the path of the jar in conf/spark.default), but I get the following error:
ImportError: No module named sparkdl.image.imageIO
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:144)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:87)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 161, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 91, in read_udfs
_, udf = read_single_udf(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 78, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 54, in read_command
command = serializer._read_with_length(file)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
return self.loads(obj)
File "/home/ubuntu/install/spark-2.1.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 454, in loads
return pickle.loads(obj)
ImportError: No module named sparkdl.image.imageIO
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
from sparkdl import readImages
Reading images works fine, but when I create a model and fit it on the image DataFrame, it fails with
No module named sparkdl.image.imageIO
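One thing worth checking (an assumption, since the post does not show the full setup): adding the jar to the driver classpath does not by itself make the sparkdl Python package importable inside the executors' Python workers; the Python side of the library has to be shipped to the executors too, along these lines:
# hypothetical path; point this at the sparkdl Python sources packaged as a zip or egg
sc.addPyFile("/path/to/sparkdl.zip")

from sparkdl import readImages  # should now resolve on the executors as well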
I have a requirement to run a Windows batch file with Apache Spark on multiple nodes of the Spark cluster.
So is it possible to do this using the piping concept of Apache Spark?
I have previously run a shell script using piping in Spark on an Ubuntu machine. The code below, which does the same thing, runs fine:
data = ["hi","hello","how","are","you"]
distScript = "/home/aawasthi/echo.sh"
distScriptName = "echo.sh"
sc.addFile(distScript)
RDDdata = sc.parallelize(data)
print RDDdata.pipe(SparkFiles.get(distScriptName)).collect()
I tried to adapt the same code to run a Windows batch file on a Windows machine with Spark (1.6 prebuilt for Hadoop 2.6) installed, but it gives me an error at the sc.addFile step. The code is below:
batchFile = "D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv"
batchFileName = "runOpenCv"
sc.addFile(batchFile)
The error thrown by Spark is below:
Py4JJavaError Traceback (most recent call last)
<ipython-input-11-9e13c265cbae> in <module>()
----> 1 sc.addFile(batchFile)
Py4JJavaError: An error occurred while calling o160.addFile.
: java.io.FileNotFoundException: Added file D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv does not exist.
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1364)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1340)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
The batch file does exist at the given location, though.
UPDATE:
I added .bat as the extension in batchFile and batchFileName, and prefixed the file path with file:///. The modified code is:
from pyspark import SparkFiles
from pyspark import SparkContext
sc
batchFile = "file:///D:/spark-1.6.2-bin-hadoop2.6/data/OpenCV/runOpenCv.bat"
batchFileName = "runOpenCv.bat"
sc.addFile(batchFile)
RDDdata = sc.parallelize(["hi","hello"])
print SparkFiles.get("runOpenCv.bat")
print RDDdata.pipe(SparkFiles.get(batchFileName)).collect()
Now it doesn't give an error at the addFile step, and print SparkFiles.get("runOpenCv.bat") prints the path
C:\Users\abhilash.awasthi\AppData\Local\Temp\spark-c0f383b1-8365-4840-bd0f-e7eb46cc6794\userFiles-69051066-f18c-45dc-9610-59cbde0d77fe\runOpenCv.bat
So the file is added. But the last step of the code throws the error below:
Py4JJavaError Traceback (most recent call last)
<ipython-input-6-bf2b8aea3ef0> in <module>()
----> 1 print RDDdata.pipe(SparkFiles.get(batchFileName)).collect()
D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.pyc in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
D:\spark-1.6.2-bin-hadoop2.6\python\lib\py4j-0.9-src.zip\py4j\java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\sql\utils.pyc in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
D:\spark-1.6.2-bin-hadoop2.6\python\lib\py4j-0.9-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 111, in main
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 106, in process
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 317, in func
return f(iterator)
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 715, in func
shlex.split(command), env=env, stdin=PIPE, stdout=PIPE)
File "C:\Anaconda2\lib\subprocess.py", line 710, in __init__
errread, errwrite)
File "C:\Anaconda2\lib\subprocess.py", line 958, in _execute_child
startupinfo)
WindowsError: [Error 2] The system cannot find the file specified
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 111, in main
File "D:\spark-1.6.2-bin-hadoop2.6\python\lib\pyspark.zip\pyspark\worker.py", line 106, in process
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 317, in func
return f(iterator)
File "D:\spark-1.6.2-bin-hadoop2.6\python\pyspark\rdd.py", line 715, in func
shlex.split(command), env=env, stdin=PIPE, stdout=PIPE)
File "C:\Anaconda2\lib\subprocess.py", line 710, in __init__
errread, errwrite)
File "C:\Anaconda2\lib\subprocess.py", line 958, in _execute_child
startupinfo)
WindowsError: [Error 2] The system cannot find the file specified
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
Please escape the / characters:
batchFile = "D://spark-1.6.2-bin-hadoop2.6//data//OpenCV//runOpenCv"
Also, as AA suggested above, it may need a .cmd or .bat extension.
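Putting both suggestions together, the assignment from the question would become (a sketch, assuming runOpenCv.bat is the actual file name on disk):
batchFile = "D://spark-1.6.2-bin-hadoop2.6//data//OpenCV//runOpenCv.bat"
batchFileName = "runOpenCv.bat"
sc.addFile(batchFile)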
I have Mac OS X 10.11.5 and installed Jupyter Notebook via Anaconda. When I type jupyter notebook, it opens the "Home" tab in Safari but gives me an error in the command window:
[W 20:48:53.368 NotebookApp] /Users/hoosier/cv.py doesn't exist
However, there is a file /Users/hoosier/cv.py on my computer. When I try to start a Python 2 notebook, the kernel tries to start but eventually dies. Here is what I get in the command window:
[I 20:52:33.947 NotebookApp] Creating new notebook in
[I 20:52:34.331 NotebookApp] Kernel started: 7f5cc96e-8163-48f7-a59b-ce13a6839608
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[I 20:52:37.333 NotebookApp] KernelRestarter: restarting kernel (1/5)
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[I 20:52:40.339 NotebookApp] KernelRestarter: restarting kernel (2/5)
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[I 20:52:43.344 NotebookApp] KernelRestarter: restarting kernel (3/5)
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[W 20:52:44.352 NotebookApp] Timeout waiting for kernel_info reply from 7f5cc96e-8163-48f7-a59b-ce13a6839608
[I 20:52:46.351 NotebookApp] KernelRestarter: restarting kernel (4/5)
WARNING:root:kernel 7f5cc96e-8163-48f7-a59b-ce13a6839608 restarted
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Library/Python/2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 591, in launch_instance
app.initialize(argv)
File "<decorator-gen-121>", line 2, in initialize
File "/Library/Python/2.7/site-packages/traitlets/config/application.py", line 75, in catch_config_error
return method(app, *args, **kwargs)
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 390, in initialize
self.init_extensions()
File "/Library/Python/2.7/site-packages/ipykernel/kernelapp.py", line 362, in init_extensions
extension_man.load_extension('ipywidgets')
File "/Library/Python/2.7/site-packages/IPython/core/extensions.py", line 89, in load_extension
__import__(module_str)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/__init__.py", line 22, in <module>
from .widgets import *
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/__init__.py", line 1, in <module>
from .widget import Widget, CallbackDispatcher, register, widget_serialization, handle_version_comm_opened
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 117, in <module>
class Widget(LoggingConfigurable):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipywidgets/widgets/widget.py", line 154, in Widget
in which to find _model_name. If empty, look in the global registry.""").tag(sync=True)
AttributeError: 'Unicode' object has no attribute 'tag'
[W 20:52:49.361 NotebookApp] KernelRestarter: restart failed
[W 20:52:49.361 NotebookApp] Kernel 7f5cc96e-8163-48f7-a59b-ce13a6839608 died, removing from map.
ERROR:root:kernel 7f5cc96e-8163-48f7-a59b-ce13a6839608 restarted failed!
[W 20:52:49.372 NotebookApp] Kernel deleted before session
[W 20:52:49.373 NotebookApp] 410 DELETE /api/sessions/cce1452d-221b-47b6-9bc6-52440b639281 (::1) 1.50ms referer=http://localhost:8888/notebooks/Untitled32.ipynb?kernel_name=python2
[W 20:53:32.867 NotebookApp] /Users/hoosier/cv.py doesn't exist
[I 20:54:35.237 NotebookApp] Saving file at /Untitled32.ipynb