When I use an RDD function on EC2 (bin/pyspark), this error occurs.
The data was loaded with
data = sc.textFile("/root/lab/irisdata")
or
data = sc.textFile("s3n://" + bucket.name + "/" + file_key.name)
15/08/27 09:16:58 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
15/08/27 09:16:58 WARN snappy.LoadSnappy: Snappy native library not loaded
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/spark/python/pyspark/rdd.py", line 1247, in take
totalParts = self.getNumPartitions()
File "/root/spark/python/pyspark/rdd.py", line 355, in getNumPartitions
return self._jrdd.partitions().size()
File "/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o23.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://ec2-54-65-212-78.ap-northeast-1.compute.amazonaws.com:9000/root/lab/irisdata
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:197)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:208)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:207)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:65)
at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:47)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
Please let me know how I can fix it.
P.S. On my local computer, the same code runs with no error :(
The line
Input path does not exist: hdfs://ec2-54-65-212-78.ap-northeast-1.compute.amazonaws.com:9000/root/lab/irisdata
indicates that the file you're attempting to read into an RDD is not where Spark is looking for it. On the EC2 cluster, a path without a scheme is resolved against the default filesystem, which is HDFS, not the local filesystem; that is also why the same code works on your local machine, where the default is the local filesystem.
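A minimal sketch of both fixes, assuming the file sits on the driver's local disk (note that reading with an explicit file:// scheme on a cluster requires the file to exist at the same path on every worker node):
# Option 1: force the local filesystem with an explicit scheme
data = sc.textFile("file:///root/lab/irisdata")
# Option 2: copy the file into HDFS first (the location of the hadoop
# binary depends on your EC2 setup), e.g.
#   bin/hadoop fs -put /root/lab/irisdata /root/lab/irisdata
# and then read it with the default (HDFS) resolution
data = sc.textFile("/root/lab/irisdata")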
I am trying to learn PySpark. I have installed Python 3.6.5 on my Windows 10 machine.
I am using Spark version 2.3.
I downloaded the zip file from Git, and I have a WordCount.py file.
When I try to run the following command in cmd, from the directory where I copied WordCount.py:
spark-submit WordCount.py
I get the error below.
18/10/14 15:24:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
18/10/14 15:24:43 ERROR SparkContext: Error initializing SparkContext.
java.io.FileNotFoundException: File file:/E:/notes/Hadoop/spark/course%20projects/python-spark-tutorial-master/rdd/WordCount.py does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:611)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:824)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:601)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:421)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1528)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1498)
at org.apache.spark.SparkContext$$anonfun$13.apply(SparkContext.scala:461)
at org.apache.spark.SparkContext$$anonfun$13.apply(SparkContext.scala:461)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:461)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Traceback (most recent call last):
File "E:/notes/Hadoop/spark/course projects/python-spark-tutorial-master/rdd/WordCount.py", line 5, in <module>
sc = SparkContext(conf = conf)
File "E:\notes\Hadoop\spark\spark_installation\python\lib\pyspark.zip\pyspark\context.py", line 118, in __init__
File "E:\notes\Hadoop\spark\spark_installation\python\lib\pyspark.zip\pyspark\context.py", line 180, in _do_init
File "E:\notes\Hadoop\spark\spark_installation\python\lib\pyspark.zip\pyspark\context.py", line 270, in _initialize_context
File "E:\notes\Hadoop\spark\spark_installation\python\lib\py4j-0.10.6-src.zip\py4j\java_gateway.py", line 1428, in __call__
File "E:\notes\Hadoop\spark\spark_installation\python\lib\py4j-0.10.6-src.zip\py4j\protocol.py", line 320, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.io.FileNotFoundException: File file:/E:/notes/Hadoop/spark/course%20projects/python-spark-tutorial-master/rdd/WordCount.py does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:611)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:824)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:601)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:421)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1528)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1498)
at org.apache.spark.SparkContext$$anonfun$13.apply(SparkContext.scala:461)
at org.apache.spark.SparkContext$$anonfun$13.apply(SparkContext.scala:461)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:461)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
There is a space in the name of the "course projects" directory; notice that it shows up URL-encoded as %20 in the path the FileNotFoundException complains about.
Try moving your project to a directory whose path contains no spaces.
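For example (a hypothetical rename; adjust the paths to your own layout):
move "E:\notes\Hadoop\spark\course projects" "E:\notes\Hadoop\spark\course_projects"
cd E:\notes\Hadoop\spark\course_projects\python-spark-tutorial-master\rdd
spark-submit WordCount.py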
Following is the output of my job. In standalone mode it runs well, but in pseudo-distributed mode it throws the following error every time. I have tried a lot but have not yet found a solution. I need a quick fix for this problem.
I would be highly obliged for any help.
rab#rab-VirtualBox:~/hadoop/hadoop-1.2.1$ bin/hadoop jar hadoop-examples-*.jar grep input output 'dfs[a-z.]+'
17/03/17 22:13:12 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/03/17 22:13:12 WARN snappy.LoadSnappy: Snappy native library not loaded
17/03/17 22:13:12 INFO mapred.JobClient: Cleaning up the staging area hdfs://localhost:9000/tmp/hadoop-rab/mapred/staging/rab/.staging/job_201703172201_0004
17/03/17 22:13:12 ERROR security.UserGroupInformation: PriviledgedActionException as:rab cause:org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://localhost:9000/user/rab/input
org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://localhost:9000/user/rab/input
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:197)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:208)
at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:1081)
at org.apache.hadoop.mapred.JobClient.writeSplits(JobClient.java:1073)
at org.apache.hadoop.mapred.JobClient.access$700(JobClient.java:179)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:983)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:936)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:936)
at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:910)
at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1353)
at org.apache.hadoop.examples.Grep.run(Grep.java:69)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
at org.apache.hadoop.examples.Grep.main(Grep.java:93)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.ProgramDriver$ProgramDescription.invoke(ProgramDriver.java:68)
at org.apache.hadoop.util.ProgramDriver.driver(ProgramDriver.java:139)
at org.apache.hadoop.examples.ExampleDriver.main(ExampleDriver.java:64)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.main(RunJar.java:160)
The log output shows the error:
"org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://localhost:9000/user/rab/input"
Do you have a directory called "input" under /user/rab on the HDFS filesystem? The error indicates that you don't! It also shows that the job is looking at HDFS and not the local filesystem. You can check with the following command:
"hdfs dfs -ls /user/rab"
The full command that you used, "bin/hadoop jar hadoop-examples-*.jar grep input output 'dfs[a-z.]+'", has the form "hadoop jar jarfilename classname hdfsinputdirectory hdfsoutputdirectory", where classname is the name of the class in the jar file that you want to run; the relative paths "input" and "output" are resolved against your HDFS home directory, /user/rab.
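A minimal sketch of creating and populating the input directory before re-running the job (using Hadoop's bundled conf/*.xml files as sample input is just an assumption; substitute your own data):
bin/hadoop fs -mkdir /user/rab/input
bin/hadoop fs -put conf/*.xml /user/rab/input
bin/hadoop jar hadoop-examples-*.jar grep input output 'dfs[a-z.]+'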
The error I am getting is:
16/02/10 11:21:50 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
16/02/10 11:21:53 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://localhost:9000/tmp/outku already exists
at org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:146)
at org.apache.hadoop.mapreduce.JobSubmitter.checkSpecs(JobSubmitter.java:266)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:139)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1290)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1287)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1287)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1308)
at mr.WordCount.main(WordCount.java:87)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Please help me: how can I resolve this error, and what is causing it?
Every time Hadoop runs a MapReduce job, it creates a folder as its output and will not overwrite an existing one.
If you want to run the same job again, you can either specify a different output folder name or delete the previous one.
FileAlreadyExistsException: Output directory hdfs://localhost:9000/tmp/outku means the output directory already exists.
Always specify a fresh output directory name at run time; Hadoop will create the directory automatically for you, so you need not worry about creating it yourself.
You have to either delete the existing output directory before re-running the MapReduce job, or change the output directory name.
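For example, you can delete the stale output directory from the shell before re-running the job (the path below is taken from your error message):
hdfs dfs -rm -r /tmp/outku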
First, I start the Thrift server in Spark with sbin/start-thriftserver.sh,
and the daemon is started:
hadoop 13015 1 99 13:52 pts/1 00:00:09 /usr/lib/jvm/jre-1.7.0-openjdk.x86_64/bin/java -cp /home/hadoop/spark/lib/hive-jdbc-0.13.0.jar:/home/hadoop/spark-1.4.1-bin-hadoop2.6/sbin/../conf/:/home/hadoop/spark-1.4.1-bin-hadoop2.6/lib/spark-assembly-1.4.1-hadoop2.6.0.jar:/home/hadoop/spark-1.4.1-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar:/home/hadoop/spark-1.4.1-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar:/home/hadoop/spark-1.4.1-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar
-Xms512m -Xmx512m -XX:MaxPermSize=256m org.apache.spark.deploy.SparkSubmit --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 spark-internal
After that, I start bin/pyspark.
My Hive version is 0.13.1, Spark version is 1.4.1, and Hadoop version is 2.7.
My Spark classpath is below:
SPARK_CLASSPATH= /home/account/spark/lib/hive-jdbc-0.13.0.jar:
/home/account/spark/lib/hive-exec-0.13.0.jar:
/home/account/spark/lib/hive-metastore-0.13.0.jar:
/home/account/spark/lib/hive-service-0.13.0.jar:
/home/account/spark/lib/libfb303-0.9.0.jar:
/home/account/spark/lib/log4j-1.2.16.jar
In pyspark (the Python shell), I wrote this code:
>>> df = sqlContext.load(source="jdbc", driver="org.apache.hive.jdbc.HiveDriver", url="jdbc:hive2://IP:10000/default", dbtable="default.test")
But it does not work; I get the error below. How can I resolve it?
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/dev/user/ja/spark/python/pyspark/sql/context.py", line 458, in load
return self.read.load(path, source, schema, **options)
File "/home/dev/user/ja/spark/python/pyspark/sql/readwriter.py", line 112, in load
return self._df(self._jreader.load())
File "/home/dev/user/ja/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/home/dev/user/ja/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o29.load.
: java.sql.SQLException: Method not supported
at org.apache.hive.jdbc.HiveResultSetMetaData.isSigned(HiveResultSetMetaData.java:141)
at org.apache.spark.sql.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:132)
at org.apache.spark.sql.jdbc.JDBCRelation.<init>(JDBCRelation.scala:128)
at org.apache.spark.sql.jdbc.DefaultSource.createRelation(JDBCRelation.scala:113)
at org.apache.spark.sql.sources.ResolvedDataSource$.apply(ddl.scala:269)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:114)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:601)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:722)
I think the HiveResultSetMetaData.isSigned method is not supported by the Hive JDBC driver, but I don't know how to resolve this error. Please help.
Thank you.
It's uncertain, but I will answer my own question.
I think it is caused by the version. When I execute the command below on Spark 1.4.1, I get the "Method not supported" error, but the same command works on Spark 1.3.1:
>>> df = sqlContext.load(source="jdbc", driver="org.apache.hive.jdbc.HiveDriver", url="jdbc:hive2://IP:10000/default", dbtable="default.test")
So I think the problem is the version, though that is only my guess.
This page may help you:
http://docs.hortonworks.com/HDPDocuments/HDP1/HDP-1.2.4/ds_Hive/jdbc-hs2.html
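A possible workaround (a sketch I have not verified against this exact setup): skip the Hive JDBC driver entirely and read the table through a HiveContext, which Spark 1.4.1 supports when built with Hive:
>>> from pyspark.sql import HiveContext
>>> sqlContext = HiveContext(sc)
>>> df = sqlContext.sql("SELECT * FROM default.test")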
I use Hadoop distribution 1.1.2. When I try to run the example wordcount routine, I get the following error.
Input command:
'D:/Files/hadoop-1.1.2/hadoop-1.1.2/bin/hadoop' jar 'D:/Files/hadoop-1.1.2/hadoop-1.1.2/hadoop-examples-1.1.2.jar' wordcount input output
The result:
13/07/03 11:02:42 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
13/07/03 11:02:42 ERROR security.UserGroupInformation: PriviledgedActionException as:PC cause:java.io.IOException: Failed to set permissions of path: \tmp\hadoop-PC\mapred\staging\PC119237705.staging to 0700
java.io.IOException: Failed to set permissions of path: \tmp\hadoop-PC\mapred\staging\PC119237705.staging to 0700
at org.apache.hadoop.fs.FileUtil.checkReturnValue(FileUtil.java:689)
at org.apache.hadoop.fs.FileUtil.setPermission(FileUtil.java:662)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:509)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:344)
at org.apache.hadoop.fs.FilterFileSystem.mkdirs(FilterFileSystem.java:189)
at org.apache.hadoop.mapreduce.JobSubmissionFiles.getStagingDir(JobSubmissionFiles.java:116)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:918)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:912)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1149)
at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:912)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:500)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:530)
at org.apache.hadoop.examples.WordCount.main(WordCount.java:67)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.ProgramDriver$ProgramDescription.invoke(ProgramDriver.java:68)
at org.apache.hadoop.util.ProgramDriver.driver(ProgramDriver.java:139)
at org.apache.hadoop.examples.ExampleDriver.main(ExampleDriver.java:64)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.main(RunJar.java:156)
I am having trouble locating the particular cause of this error. Please help.
Looks like you have hit a known Hadoop-on-Windows issue: the stack trace shows FileUtil failing to set POSIX-style permissions on the local staging directory. You might find the patch for that issue helpful. But before that, you might want to try changing the directory permissions to 755 and re-running the job.
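For example, from a Cygwin shell (a sketch; how the \tmp\hadoop-PC staging path maps into your shell depends on your installation):
chmod -R 755 /tmp/hadoop-PC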