getting Error while importing data from mongodb to hdfs - hadoop

I am getting errors while importing data from mongodb to hdfs.
I am using:
Ambari Sandbox [Hortonworks] Hadoop 2.7
MongoDB version 3.0
These are the jar files I am including:
mongo-java-driver-2.11.4.jar
mongo-hadoop-core-1.3.0.jar
Here is the code I am using:
package com.mongo.test;
import java.io.*;
import org.apache.commons.logging.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.mapreduce.*;
import org.bson.*;
import com.mongodb.MongoClient;
import com.mongodb.hadoop.*;
import com.mongodb.hadoop.util.*;
public class ImportFromMongoToHdfs {
private static final Log log =
LogFactory.getLog(ImportFromMongoToHdfs.class);
public static class ReadEmpDataFromMongo extends Mapper<Object,
BSONObject, Text, Text>{
public void map(Object key, BSONObject value, Context context) throws
IOException, InterruptedException{
System.out.println("Key: " + key);
System.out.println("Value: " + value);
String md5 = value.get("md5").toString();
String name = value.get("name").toString();
String dev = value.get("dev").toString();
String salary = value.get("salary").toString();
String location = value.get("location").toString();
String output = "\t" + name + "\t" + dev + "\t" + salary + "\t" +
location;
context.write( new Text(md5), new Text(output));
}
}
public static void main(String[] args)throws Exception {
final Configuration conf = new Configuration();
MongoConfigUtil.setInputURI(conf,"mongodb://10.25.3.196:27017/admin.emp")
;
MongoConfigUtil.setCreateInputSplits(conf, false);
System.out.println("Configuration: " + conf);
final Job job = new Job(conf, "ReadWeblogsFromMongo");
Path out = new Path("/mongodb3");
FileOutputFormat.setOutputPath(job, out);
job.setJarByClass(ImportFromMongoToHdfs.class);
job.setMapperClass(ReadEmpDataFromMongo.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(com.mongodb.hadoop.MongoInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setNumReduceTasks(0);
System.exit(job.waitForCompletion(true) ? 0 : 1 );
}
}
This is the error I am getting back:
[root@sandbox ~]# hadoop jar /mongoinput/mongdbconnect.jar com.mongo.test.ImportFromMongoToHdfs
WARNING: Use "yarn jar" to launch YARN applications.
Configuration: Configuration: core-default.xml, core-site.xml
15/09/09 09:22:51 INFO impl.TimelineClientImpl: Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
15/09/09 09:22:53 INFO client.RMProxy: Connecting to ResourceManager at sandbox.hortonworks.com/10.25.3.209:8050
15/09/09 09:22:53 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
15/09/09 09:22:54 INFO splitter.SingleMongoSplitter: SingleMongoSplitter calculating splits for mongodb://10.25.3.196:27017/admin.emp
15/09/09 09:22:54 INFO mapreduce.JobSubmitter: number of splits:1
15/09/09 09:22:55 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1441784509780_0003
15/09/09 09:22:55 INFO impl.YarnClientImpl: Submitted application application_1441784509780_0003
15/09/09 09:22:55 INFO mapreduce.Job: The url to track the job: http://sandbox.hortonworks.com:8088/proxy/application_1441784509780_0003/
15/09/09 09:22:55 INFO mapreduce.Job: Running job: job_1441784509780_0003
15/09/09 09:23:05 INFO mapreduce.Job: Job job_1441784509780_0003 running in uber mode : false
15/09/09 09:23:05 INFO mapreduce.Job: map 0% reduce 0%
15/09/09 09:23:12 INFO mapreduce.Job: Task Id : attempt_1441784509780_0003_m_000000_0, Status : FAILED
Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.mapreduce.task.JobContextImpl.getInputFormatClass(JobContextImpl.java:174)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:749)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 8 more
15/09/09 09:23:18 INFO mapreduce.Job: Task Id : attempt_1441784509780_0003_m_000000_1, Status : FAILED
Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.mapreduce.task.JobContextImpl.getInputFormatClass(JobContextImpl.java:174)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:749)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 8 more
15/09/09 09:23:24 INFO mapreduce.Job: Task Id : attempt_1441784509780_0003_m_000000_2, Status : FAILED
Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.mapreduce.task.JobContextImpl.getInputFormatClass(JobContextImpl.java:174)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:749)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 8 more
15/09/09 09:23:32 INFO mapreduce.Job: map 100% reduce 0%
15/09/09 09:23:32 INFO mapreduce.Job: Job job_1441784509780_0003 failed with state FAILED due to: Task failed task_1441784509780_0003_m_000000
Job failed as tasks failed. failedMaps:1 failedReduces:0
15/09/09 09:23:32 INFO mapreduce.Job: Counters: 9
Job Counters
Failed map tasks=4
Launched map tasks=4
Other local map tasks=3
Rack-local map tasks=1
Total time spent by all maps in occupied slots (ms)=16996
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=16996
Total vcore-seconds taken by all map tasks=16996
Total megabyte-seconds taken by all map tasks=4249000
[root@sandbox ~]#
Does anyone know what is wrong?

Make sure the mongo-hadoop jar is on the Hadoop classpath, then restart Hadoop.
The error java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat should then be resolved.

You are getting the ClassNotFoundException because your job is unable to reach the jar "mongo-hadoop-core*.jar". You have to make "mongo-hadoop-core*.jar" available to your code.
There are several ways you can resolve this error:
Create a fat jar for your program. A fat jar contains all the necessary dependent jars. You can easily create a fat jar if you are using an IDE.
Use the "-libjars" argument when submitting your YARN job.
Copy the mongo jars to a location on the HADOOP_CLASSPATH.

I have just resolved a problem like this. In fact, this is a run-time error. Setting HADOOP_CLASSPATH to point to the necessary external jar files was not enough, because I think that at run time Hadoop looks for jar files in the folder in which Hadoop is installed. I realized that we need to copy all the necessary external jar files into the Hadoop installation folder. So:
First, you need to check HADOOP_CLASSPATH by typing :
- hadoop classpath
Then copy the necessary external jar files into one of the HADOOP_CLASSPATH directories. For example, I copied mongo-hadoop-1.5.1.jar and some other jar files to the folder /usr/local/hadoop/share/hadoop/mapreduce.
Then it works for me!

Related

Unable to Configure Number of Reducers In WordCount Job in hadoop

I'm using a single-node cluster (Hadoop 2.7.0) on my Linux machine.
My code for the WordCount job runs fine with 1 reducer.
But it does not work if I increase the number of reducers.
It is showing the following error:
15/05/25 21:15:10 INFO util.NativeCodeLoader: Loaded the native-hadoop library
15/05/25 21:15:10 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
15/05/25 21:15:10 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
15/05/25 21:15:10 WARN snappy.LoadSnappy: Snappy native library is available
15/05/25 21:15:10 INFO snappy.LoadSnappy: Snappy native library loaded
15/05/25 21:15:10 INFO mapred.FileInputFormat: Total input paths to process : 1
15/05/25 21:15:10 INFO mapred.JobClient: Running job: job_local_0001
15/05/25 21:15:11 INFO util.ProcessTree: setsid exited with exit code 0
15/05/25 21:15:11 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin#5f1fd699
15/05/25 21:15:11 INFO mapred.MapTask: numReduceTasks: 1
15/05/25 21:15:11 INFO mapred.MapTask: io.sort.mb = 100
15/05/25 21:15:11 INFO mapred.MapTask: data buffer = 79691776/99614720
15/05/25 21:15:11 INFO mapred.MapTask: record buffer = 262144/327680
15/05/25 21:15:11 WARN mapred.LocalJobRunner: job_local_0001
java.io.IOException: Illegal partition for am (1)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1073)
at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:592)
at WordMapper.map(WordMapper.java:24)
at WordMapper.map(WordMapper.java:1)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)
My getPartition Method Looks like this:
/**
 * Assigns a key to a reducer according to the length of its text: length 1
 * maps to bucket 0, length 2 to bucket 1, length 3 to bucket 2 and every other
 * length to bucket 3.
 *
 * <p>The bucket is folded into the range {@code [0, numRedTasks)} so the result
 * is always a legal partition. Returning a fixed index such as 1 when only one
 * reduce task is running makes the framework fail with
 * "Illegal partition for ... (1)".
 *
 * @param key         the map output key
 * @param value       the map output value (not used for partitioning)
 * @param numRedTasks the number of reduce tasks actually running
 * @return a partition index in {@code [0, numRedTasks)}
 */
public int getPartition(Text key, IntWritable value, int numRedTasks) {
    // With at most one reducer there is only one legal partition.
    if (numRedTasks <= 1) {
        return 0;
    }
    int length = key.toString().length();
    int bucket = (length >= 1 && length <= 3) ? length - 1 : 3;
    // Wrap around when fewer than four reducers are available.
    return bucket % numRedTasks;
}
Run Method in WordCount.class File:
// Require at least two arguments: input[0] is used as the input path and
// input[1] as the output path below.
if(input.length < 2)
{
System.out.println("Please provide valid input");
return -1;
}
else
{
// NOTE(review): JobConf is created without a class or jar; this looks like the
// cause of the "No job jar file set" warning in the log — confirm and consider JobConf(Class).
JobConf config = new JobConf();
FileInputFormat.setInputPaths(config, new Path(input[0]));
FileOutputFormat.setOutputPath(config, new Path(input[1]));
config.setMapperClass(WordMapper.class);
config.setReducerClass(WordReducer.class);
// Request four reduce tasks and route keys with the custom partitioner.
config.setNumReduceTasks(4);
config.setPartitionerClass(MyPartitioner.class);
config.setMapOutputKeyClass(Text.class);
config.setMapOutputValueClass(IntWritable.class);
config.setOutputKeyClass(Text.class);
config.setOutputValueClass(IntWritable.class);
// Submit the job through the old-API JobClient.
JobClient.runJob(config);
}
return 0;
}
My Mapper and Reducer Code is fine because Wordcount Job with 1 reducer is running fine.
Any One able to figure it out?
This may be because Pig fails in this operation when default_parallel is set too high.
Thanks,
Shailesh.
You need to use ToolRunner in your driver class and invoke it from your main method. You can also use a combiner as part of the workflow. Below is the driver class code: as you can see, along with the mapper and reducer calls, there is a combiner call as well. The exit code in the main method is obtained with "int exitCode = ToolRunner.run(new Configuration(), new WordCountWithCombiner(), args);", which invokes ToolRunner at run time, so you can specify the number of reducers or mappers you would like to use with the "-D" option when running the word-count program. A sample command line would look like "-D mapred.reduce.tasks=2 input output".
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
public class WordCountWithCombiner extends Configured
implements Tool{
#Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
Job job = new Job(conf, "MyJob");
job.setJarByClass(WordCount.class);
job.setJobName("Word Count With Combiners");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(WordCountMapper.class);
job.setCombinerClass(WordCountReducer.class);
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new WordCountWithCombiner(), args);
System.exit(exitCode);
}
}

DistributedCache - third party jar not found

I'm trying to get a hold of DistributedCache. I'm using Apache Hadoop 1.2.1 on two nodes.
I referred to the Cloudera post which is simply extended in the other posts that explain how to use third-party jars using -libjars
Note:
In my jar, I haven't included any jar libs. - neither Hadoop core nor commons lang.
The code :
public class WordCounter extends Configured implements Tool {
#Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
// Job job = new Job(getConf(), args[0]);
Job job = new Job(super.getConf(), args[0]);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setJarByClass(WordCounter.class);
FileInputFormat.setInputPaths(job, new Path(args[1]));
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
int jobState = job.waitForCompletion(true) ? 0 : 1;
return jobState;
}
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
if (args == null || args.length < 3) {
System.out.println("The below three arguments are expected");
System.out
.println("<job name> <hdfs path of the input file> <hdfs path of the output file>");
return;
}
WordCounter wordCounter = new WordCounter();
// System.exit(ToolRunner.run(wordCounter, args));
System.exit(ToolRunner.run(new Configuration(), wordCounter, args));
}
}
The Mapper class is naive; it only attempts to use StringUtils from Apache Commons (and NOT the Hadoop one):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
* #author 298790
*
*/
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static IntWritable one = new IntWritable(1);
#Override
protected void map(
LongWritable key,
Text value,
org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
StringTokenizer strTokenizer = new StringTokenizer(value.toString());
Text token = new Text();
while (strTokenizer.hasMoreTokens()) {
token.set(strTokenizer.nextToken());
context.write(token, one);
}
System.out.println("Converting " + value + " to upper case "
+ StringUtils.upperCase(value.toString()));
}
}
The commands that I use :
bigdata@slave3:~$ export HADOOP_CLASSPATH=dumphere/lib/commons-lang3-3.1.jar
bigdata@slave3:~$
bigdata@slave3:~$ echo $HADOOP_CLASSPATH
dumphere/lib/commons-lang3-3.1.jar
bigdata@slave3:~$
bigdata@slave3:~$ echo $LIBJARS
dumphere/lib/commons-lang3-3.1.jar
bigdata@slave3:~$ hadoop jar dumphere/code/jars/hdp_3rdparty.jar com.hadoop.basics.WordCounter "WordCount" "/input/dumphere/Childhood_days.txt" "/output/dumphere/wc" -libjars ${LIBJARS}
The exception I get :
Warning: $HADOOP_HOME is deprecated.
14/08/13 21:56:05 INFO input.FileInputFormat: Total input paths to process : 1
14/08/13 21:56:05 INFO util.NativeCodeLoader: Loaded the native-hadoop library
14/08/13 21:56:05 WARN snappy.LoadSnappy: Snappy native library not loaded
14/08/13 21:56:05 INFO mapred.JobClient: Running job: job_201408111719_0190
14/08/13 21:56:06 INFO mapred.JobClient: map 0% reduce 0%
14/08/13 21:56:37 INFO mapred.JobClient: Task Id : attempt_201408111719_0190_m_000000_0, Status : FAILED
Error: java.lang.ClassNotFoundException: org.apache.commons.lang3.StringUtils
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at com.hadoop.basics.WCMapper.map(WCMapper.java:40)
at com.hadoop.basics.WCMapper.map(WCMapper.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
14/08/13 21:56:42 INFO mapred.JobClient: Task Id : attempt_201408111719_0190_m_000000_1, Status : FAILED
Error: java.lang.ClassNotFoundException: org.apache.commons.lang3.StringUtils
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:423)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:356)
at com.hadoop.basics.WCMapper.map(WCMapper.java:40)
at com.hadoop.basics.WCMapper.map(WCMapper.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
The Cloudera post mentions :
The jar will be placed in distributed cache and will be made available to all of the job’s task attempts. More specifically, you will find the JAR in one of the ${mapred.local.dir}/taskTracker/archive/${user.name}/distcache/… subdirectories on local nodes.
But on that path, I'm not able to find the commons-lang3-3.1.jar
What am I missing?

Error on map reduce example of Hadoop 2.2.0

I am new to hadoop and after installing Hadoop 2.2.0 I tried to follow example http://www.srccodes.com/p/article/45/run-hadoop-wordcount-mapreduce-example-windows to try a simple map reduce job.
However whenever I try to do the map reduce job over the txt file I created, I keep getting failures with this message
c:\hadoop>bin\yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.2.0.ja
r wordcount /input output
14/03/26 14:20:48 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0
:8032
14/03/26 14:20:50 INFO input.FileInputFormat: Total input paths to process : 1
14/03/26 14:20:51 INFO mapreduce.JobSubmitter: number of splits:1
14/03/26 14:20:51 INFO Configuration.deprecation: user.name is deprecated. Inste
ad, use mapreduce.job.user.name
14/03/26 14:20:51 INFO Configuration.deprecation: mapred.jar is deprecated. Inst
ead, use mapreduce.job.jar
14/03/26 14:20:51 INFO Configuration.deprecation: mapred.output.value.class is d
eprecated. Instead, use mapreduce.job.output.value.class
14/03/26 14:20:51 INFO Configuration.deprecation: mapreduce.combine.class is dep
recated. Instead, use mapreduce.job.combine.class
14/03/26 14:20:51 INFO Configuration.deprecation: mapreduce.map.class is depreca
ted. Instead, use mapreduce.job.map.class
14/03/26 14:20:51 INFO Configuration.deprecation: mapred.job.name is deprecated.
Instead, use mapreduce.job.name
14/03/26 14:20:51 INFO Configuration.deprecation: mapreduce.reduce.class is depr
ecated. Instead, use mapreduce.job.reduce.class
14/03/26 14:20:51 INFO Configuration.deprecation: mapred.input.dir is deprecated
. Instead, use mapreduce.input.fileinputformat.inputdir
14/03/26 14:20:51 INFO Configuration.deprecation: mapred.output.dir is deprecate
d. Instead, use mapreduce.output.fileoutputformat.outputdir
14/03/26 14:20:51 INFO Configuration.deprecation: mapred.map.tasks is deprecated
. Instead, use mapreduce.job.maps
14/03/26 14:20:51 INFO Configuration.deprecation: mapred.output.key.class is dep
recated. Instead, use mapreduce.job.output.key.class
14/03/26 14:20:51 INFO Configuration.deprecation: mapred.working.dir is deprecat
ed. Instead, use mapreduce.job.working.dir
14/03/26 14:20:51 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_13
95833928952_0004
14/03/26 14:20:52 INFO impl.YarnClientImpl: Submitted application application_13
95833928952_0004 to ResourceManager at /0.0.0.0:8032
14/03/26 14:20:52 INFO mapreduce.Job: The url to track the job: http://GoncaloPe
reira:8088/proxy/application_1395833928952_0004/
14/03/26 14:20:52 INFO mapreduce.Job: Running job: job_1395833928952_0004
14/03/26 14:21:08 INFO mapreduce.Job: Job job_1395833928952_0004 running in uber
mode : false
14/03/26 14:21:08 INFO mapreduce.Job: map 0% reduce 0%
14/03/26 14:21:20 INFO mapreduce.Job: Task Id : attempt_1395833928952_0004_m_000
000_0, Status : FAILED
Error: java.lang.ClassCastException: org.apache.hadoop.mapreduce.lib.input.FileS
plit cannot be cast to org.apache.hadoop.mapred.InputSplit
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:402)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:162)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInforma
tion.java:1491)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:157)
14/03/26 14:21:33 INFO mapreduce.Job: Task Id : attempt_1395833928952_0004_m_000
000_1, Status : FAILED
Error: java.lang.ClassCastException: org.apache.hadoop.mapreduce.lib.input.FileS
plit cannot be cast to org.apache.hadoop.mapred.InputSplit
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:402)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:162)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInforma
tion.java:1491)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:157)
14/03/26 14:21:48 INFO mapreduce.Job: Task Id : attempt_1395833928952_0004_m_000
000_2, Status : FAILED
Error: java.lang.ClassCastException: org.apache.hadoop.mapreduce.lib.input.FileS
plit cannot be cast to org.apache.hadoop.mapred.InputSplit
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:402)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:162)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInforma
tion.java:1491)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:157)
14/03/26 14:22:04 INFO mapreduce.Job: map 100% reduce 100%
14/03/26 14:22:10 INFO mapreduce.Job: Job job_1395833928952_0004 failed with sta
te FAILED due to: Task failed task_1395833928952_0004_m_000000
Job failed as tasks failed. failedMaps:1 failedReduces:0
14/03/26 14:22:10 INFO mapreduce.Job: Counters: 6
Job Counters
Failed map tasks=4
Launched map tasks=4
Other local map tasks=3
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=48786
Total time spent by all reduces in occupied slots (ms)=0
Since I followed everything with no issues step by step I have no idea why this might be, does anyone know?
Edit: I tried adopting 2.3.0 and the same issue happens with the example jar provided, and also with the code below that I tried to compile. I have no idea what the issue is.
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class teste {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setJarByClass(teste.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
I had the same issue (java.lang.ClassCastException) and was able to solve it by running Hadoop with admin privileges. The problem seems to be the creation of symbolic links which by default is not possible for non-admin Windows users. Open a console as administrator and then proceed as described in the example from your link.
The link you provided uses "input" as the input parameter, NOT "/input". Try this syntax:
C:\hadoop>bin\yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.2.0.jar wordcount input output
If this doesn't work, then see this link and modify the mapper class.

Avro Map Reduce - AvroInputFormat not found error

This is what i have understood so far reading from varied sources on the internet.
Avro mapred and Avro are not part of CDH4 (Cloudera Distribution) and i have to set it up manually using HADOOP_CLASSPATH=avro.jar:avro-mapred.jar
I have done that and when i run my job on my pseudo cluster it throws the following exception:
13/12/27 00:47:40 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/12/27 00:47:40 INFO mapred.FileInputFormat: Total input paths to process : 1
13/12/27 00:47:41 INFO mapred.JobClient: Running job: job_201312221245_0017
13/12/27 00:47:42 INFO mapred.JobClient: map 0% reduce 0%
13/12/27 00:47:57 INFO mapred.JobClient: Task Id : attempt_201312221245_0017_m_000000_0, Status : FAILED
java.lang.RuntimeException: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.avro.mapred.AvroInputFormat not found
I'm running the job as follows:
hadoop jar build/libs/hadoop-boilerplate-1.0.jar CustomerMapReduce transactions/input transactions/output1 -libjars /path/to/libs/avro-1.7.4.jar,/path/to/libs/avro-mapred-1.7.4.jar
You should implement Tool and use getConf() for job configuration.
public class SomeClass extends Configured implements Tool {
#Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
...
}
}

hadoop not running in the multinode cluster

I have a jar file "Tsp.jar" that I made myself. This same jar file executes well in a single-node cluster setup of Hadoop. However, when I run it on a cluster comprising 2 machines, a laptop and a desktop, it gives me an exception when the map function reaches 50%. Here is the output:
`hadoop@psycho-O:/usr/local/hadoop$ bin/hadoop jar Tsp.jar clust-Tsp_ip1 clust_Tsp_op4
11/04/27 16:13:06 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
11/04/27 16:13:06 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
11/04/27 16:13:06 INFO mapred.FileInputFormat: Total input paths to process : 1
11/04/27 16:13:06 INFO mapred.JobClient: Running job: job_201104271608_0001
11/04/27 16:13:07 INFO mapred.JobClient: map 0% reduce 0%
11/04/27 16:13:17 INFO mapred.JobClient: map 50% reduce 0%
11/04/27 16:13:20 INFO mapred.JobClient: Task Id : attempt_201104271608_0001_m_000001_0, Status : FAILED
java.lang.RuntimeException: java.lang.RuntimeException: java.lang.ClassNotFoundException: Tsp$TspReducer
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:841)
at org.apache.hadoop.mapred.JobConf.getCombinerClass(JobConf.java:853)
at org.apache.hadoop.mapred.Task$CombinerRunner.create(Task.java:1100)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:812)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:350)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:307)
at org.apache.hadoop.mapred.Child.main(Child.java:170)
Caused by: java.lang.RuntimeException: java.lang.ClassNotFoundException: Tsp$TspReducer
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:809)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:833)
... 6 more
Caused by: java.lang.ClassNotFoundException: Tsp$TspReducer
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:762)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:807)
... 7 more
11/04/27 16:13:20 WARN mapred.JobClient: Error reading task outputemil-desktop
11/04/27 16:13:20 WARN mapred.JobClient: Error reading task outputemil-desktop
^Z
[1]+ Stopped bin/hadoop jar Tsp.jar clust-Tsp_ip1 clust_Tsp_op4
hadoop@psycho-O:~$ jps
4937 Jps
3976 RunJar
`
Also, the cluster worked fine executing the wordcount example, so I guess the problem is with the Tsp.jar file.
1) Is it necessary to have a jar file to run on a cluster?
2) Here I tried to run a jar file that I made on the cluster, but it still gives a warning that the jar file is not found. Why is that?
3) What should be taken care of when running a jar file? What must it contain other than the program I wrote? My jar file contains Tsp.class, Tsp$TspReducer.class and Tsp$TspMapper.class. The terminal says it can't find Tsp$TspReducer even though it is already in the jar file.
Thank you
EDIT
// Driver plus nested mapper and reducer for the "Tsp" job, written against the
// old JobConf / JobClient API. Parts of the mapper are pseudo-code from the
// original post and do not compile as shown.
public class Tsp {
// Configures the job and submits it with JobClient.runJob.
// args[0] is added as the input path and args[1] is set as the output path.
public static void main(String[] args) throws IOException {
// Passing Tsp.class to the JobConf constructor.
// NOTE(review): presumably this is meant to let Hadoop locate the job jar, yet the
// log still reports "No job jar file set" — verify how the jar was built and launched.
JobConf conf = new JobConf(Tsp.class);
conf.setJobName("Tsp");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(TspMapper.class);
// TspReducer is registered both as the combiner and as the reducer.
conf.setCombinerClass(TspReducer.class);
conf.setReducerClass(TspReducer.class);
FileInputFormat.addInputPath(conf,new Path(args[0]));
FileOutputFormat.setOutputPath(conf,new Path(args[1]));
JobClient.runJob(conf);
}
// Mapper from (LongWritable, Text) input to (Text, Text) output.
public static class TspMapper extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
// NOTE(review): placeholder from the post; "function" is not valid Java.
function findCost() {
}
// Collects (string1, string2) pairs inside a loop; the loop bounds and the
// adjacency-matrix computation are elided pseudo-code in the original.
public void map(LongWritable key,Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
find adjacency matrix from the input;
for(int i = 0; ...) {
.....
output.collect(new Text(string1), new Text(string2));
}
}
}
// Reducer (also used as the combiner) over (Text, Text) pairs.
public static class TspReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
// Not referenced anywhere in the visible code.
Text t1 = new Text();
// Emits the key with only the first value from the iterator; any further
// values for the key are not read.
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
String a;
a = values.next().toString();
output.collect(key,new Text(a));
}
}
}
You currently have
conf.setJobName("Tsp");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(TspMapper.class);
conf.setCombinerClass(TspReducer.class);
conf.setReducerClass(TspReducer.class);
and, as the error states ("No job jar file set"), you are not setting a jar.
You will need to do something similar to
conf.setJarByClass(Tsp.class);
From what I'm seeing, that should resolve the error seen here.
11/04/27 16:13:06 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
Do what they say, when setting up your job, set the jar where the class is contained. Hadoop copies the jar into the DistributedCache (a filesystem on every node) and uses the classes out of it.
I had the exact same issue. Here is how I solved the problem(imagine your map reduce class is called A). After creating the job call:
job.setJarByClass(A.class);

Resources