Mapper not invoked while using multipleInputFormat - hadoop

I have a driver class which uses the MultipleInputs class to invoke different mappers at runtime.
However, when I use MultipleInputs.addInputPath(job, fStatus.getPath(), TextInputFormat.class, CreatePureDeltaMapperOne.class) in the first for loop, my first mapper (CreatePureDeltaMapperOne) is not getting invoked. When I comment out the block of code that invokes MultipleInputs in the first for loop and call it from outside instead, the mapper class is invoked. Please help me find the issue.
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/***
* Creates the pure delta file by matching the history records present in HDFS
* @author Debajit
*
*/
public class CreatePureDeltaDriver {
/**
* @param args
* @throws URISyntaxException
*/
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
String historyFileInputPath="";
String deltaFileDirectoryPath="";
String pureDeltaFileOutPath="";
Configuration config= new Configuration();
Job job = new Job(config, "Pure Delta File Creation");
job.setJarByClass(CreatePureDeltaDriver.class);
Path historyDirPath= new Path(historyFileInputPath);
FileSystem fs = FileSystem.get(config);
FileStatus[] statusHistory = fs.listStatus(historyDirPath);
for (FileStatus fStatus : statusHistory) {
String historyFileName=fStatus.getPath().getName();
if(historyFileName.contains("part-r")){
MultipleInputs.addInputPath(job, fStatus.getPath(), TextInputFormat.class,CreatePureDeltaMapperOne.class);
}
}
Path deltaDirPath= new Path(deltaFileDirectoryPath);
FileStatus[] statusDelta = fs.listStatus(deltaDirPath);
for (FileStatus fStatus : statusDelta) {
String deltaFileName=fStatus.getPath().getName();
if(deltaFileName.startsWith("part-r")){
MultipleInputs.addInputPath(job, fStatus.getPath(), TextInputFormat.class, CreatePureDeltaMapperTwo.class);
}
}
job.setMapperClass(CreatePureDeltaMapperOne.class);
job.setMapperClass(CreatePureDeltaMapperTwo.class);
job.setReducerClass(CreatePureDeltaReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path hisInPath = new Path(historyFileInputPath);
Path outPath = new Path(pureDeltaFileOutPath);
//MultipleInputs.addInputPath(job, hisInPath, TextInputFormat.class, CreatePureDeltaMapperOne.class);
//MultipleInputs.addInputPath(job, delPath, TextInputFormat.class, CreatePureDeltaMapperTwo.class);
FileOutputFormat.setOutputPath(job, outPath);
System.out.println(job.waitForCompletion(true));
}
}
MY MAPPER CLASS
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
public class CreatePureDeltaMapperOne extends Mapper<LongWritable, Text, Text, Text> {
private Text outKey = new Text();
private Text outValue = new Text();
int counter=0;
private String delimiter="";
private int primaryKeyIndicator =0;
private Integer numMapNodes = null;
public void setup(Context context) throws IOException{
System.out.println("SETUP--- Mapper 1");
Configuration config = context.getConfiguration();
Properties properties = new Properties();
String propertyDirectory = config.get("propertyDirectory");
String propertyFileName =config.get("propertyFileName");
Path propertyDirPath= new Path(propertyDirectory);
FileSystem fs = FileSystem.get(config);
FileStatus[] status = fs.listStatus(propertyDirPath);
for (FileStatus fStatus : status) {
String propFileName=fStatus.getPath().getName().trim();
if(propFileName.equals(propertyFileName)){
properties.load(new InputStreamReader(fs.open(fStatus.getPath())));
this.setNumMapNodes(Integer.parseInt(properties.getProperty("num.of.nodes").trim()));
this.setDelimiter(properties.getProperty("file.delimiter.type").trim());
this.setPrimaryKeyIndicator(Integer.parseInt(properties.getProperty("file.primary.key.index.specifier").trim()));
}
}
}
public void map(LongWritable key, Text val, Context context) throws IOException, InterruptedException{
String valueString = val.toString().trim();
String[] tokens = valueString.split(this.getDelimiter());
String temp=tokens[this.getPrimaryKeyIndicator()].toString();
System.out.println(" MAPPER 1 invoked");
this.setOutKey(new Text(tokens[this.getPrimaryKeyIndicator()].toString().trim()));//Account number
this.setOutValue(new Text("h"+valueString.trim()));
context.write(outKey,outValue );
}
}

Do not use these two lines in your code:
job.setMapperClass(CreatePureDeltaMapperOne.class);
job.setMapperClass(CreatePureDeltaMapperTwo.class);
You are already passing the corresponding mapper class to MultipleInputs.addInputPath() in each loop. MultipleInputs registers a DelegatingMapper (and a DelegatingInputFormat) for the job, and a later call to job.setMapperClass() overrides that registration, so every split ends up going to the last mapper you set, which is why CreatePureDeltaMapperOne is never invoked. Drop the job.setInputFormatClass(TextInputFormat.class) call for the same reason.
Hope it helps.
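To make that concrete, here is a rough sketch of how the relevant part of the driver could look after the fix (class names and paths are the ones from the question; this is not the complete driver):
// Register each input path with its own mapper. MultipleInputs installs a
// DelegatingMapper/DelegatingInputFormat for the job, so no further
// setMapperClass()/setInputFormatClass() calls are needed (or wanted).
for (FileStatus fStatus : fs.listStatus(new Path(historyFileInputPath))) {
    if (fStatus.getPath().getName().contains("part-r")) {
        MultipleInputs.addInputPath(job, fStatus.getPath(),
                TextInputFormat.class, CreatePureDeltaMapperOne.class);
    }
}
for (FileStatus fStatus : fs.listStatus(new Path(deltaFileDirectoryPath))) {
    if (fStatus.getPath().getName().startsWith("part-r")) {
        MultipleInputs.addInputPath(job, fStatus.getPath(),
                TextInputFormat.class, CreatePureDeltaMapperTwo.class);
    }
}
job.setReducerClass(CreatePureDeltaReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(pureDeltaFileOutPath));
System.out.println(job.waitForCompletion(true));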

Related

Mapper output key descending order

I'm new to Hadoop. So far, I am trying to implement a custom WritableComparator to sort my map output keys, which are DoubleWritable, in descending order. Below is my comparator class:
class DecreasingComparator extends WritableComparator{
protected DecreasingComparator(){
super(DoubleWritable.class,true);
}
@SuppressWarnings("rawtypes")
@Override
public int compare(WritableComparable w1,WritableComparable w2){
DoubleWritable key1 = (DoubleWritable) w1;
DoubleWritable key2 = (DoubleWritable) w2;
return -1 * key1.compareTo(key2);
}
}
The output I'm getting is this (output screenshot not included here):
By rights, it should be sorted in descending order by key. Why is it still ordered by the value, which is IntWritable?
It can't be an issue with the Mapper class since it just outputs the key and values. I'm not sure how to go about this. The Mapper class is as below:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SecondaryMapper extends Mapper<LongWritable,Text,DoubleWritable,IntWritable>{
@Override
public void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException{
String line = value.toString();
DoubleWritable ratio = new DoubleWritable(Double.parseDouble(line.split("\\s")[0]));
IntWritable id = new IntWritable(Integer.parseInt(line.split("\\s")[1]));
context.write(ratio,id);
}
}
Below is my driver class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
//second job import
public class CommentViewRatio {
class DecreasingComparator extends WritableComparator{
protected DecreasingComparator(){
super(DoubleWritable.class,true);
}
@SuppressWarnings("rawtypes")
@Override
public int compare(WritableComparable w1,WritableComparable w2){
DoubleWritable key1 = (DoubleWritable) w1;
DoubleWritable key2 = (DoubleWritable) w2;
return -1 * key1.compareTo(key2);
}
}
public static void main(String[] args) throws Exception{
if(args.length != 2){
System.out.printf("Usage: WordCount <input dir> <output dir>\n");
System.exit(-1);
}
Configuration conf1 = new Configuration();
Job job = new Job(conf1);
job.setJarByClass(CommentViewRatio.class);
job.setJobName("Average");
Path temp = new Path("temp");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, temp);
job.setMapperClass(CommentViewMapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(DoubleWritable.class);
// job.setPartitionerClass(CommentViewPartitioner.class);
job.setReducerClass(CommentViewReducer.class);
// job.setNumReduceTasks(4);
job.setOutputKeyClass(DoubleWritable.class);
job.setOutputValueClass(LongWritable.class);
boolean success = job.waitForCompletion(true);
if(success) {
Configuration conf2 = new Configuration();
Job job2 = new Job(conf2);
job2.setJarByClass(CommentViewRatio.class);
FileInputFormat.addInputPath(job2, new Path("temp/part-r-00000"));
FileOutputFormat.setOutputPath(job2, new Path(args[1]));
job2.setMapperClass(SecondaryMapper.class);
job2.setMapOutputKeyClass(DoubleWritable.class);
job2.setMapOutputValueClass(IntWritable.class);
// job2.setPartitionerClass(CommentViewPartitioner.class);
job2.setSortComparatorClass(DecreasingComparator.class);
job2.setNumReduceTasks(0);
boolean success2 = job2.waitForCompletion(true);
temp.getFileSystem(conf1).delete(temp);
System.exit(success2 ? 0 : 1);
}
}
}
Any help would be much appreciated, thank you for reading.
Apparently, I needed to add a reducer class for my output to be sorted if I want to use a custom sort comparator.
The new driver looks like this:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
//second job import
public class CommentViewRatio {
public static class DecreasingComparator extends WritableComparator{
protected DecreasingComparator(){
super(DoubleWritable.class,true);
}
@SuppressWarnings("rawtypes")
@Override
public int compare(WritableComparable w1,WritableComparable w2){
DoubleWritable key1 = (DoubleWritable) w1;
DoubleWritable key2 = (DoubleWritable) w2;
return -1 * key1.compareTo(key2);
}
}
public static void main(String[] args) throws Exception{
if(args.length != 2){
System.out.printf("Usage: WordCount <input dir> <output dir>\n");
System.exit(-1);
}
Configuration conf1 = new Configuration();
Job job = new Job(conf1);
job.setJarByClass(CommentViewRatio.class);
job.setJobName("Average");
Path temp = new Path("temp");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, temp);
job.setMapperClass(CommentViewMapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(DoubleWritable.class);
// job.setPartitionerClass(CommentViewPartitioner.class);
job.setReducerClass(CommentViewReducer.class);
// job.setNumReduceTasks(4);
job.setOutputKeyClass(DoubleWritable.class);
job.setOutputValueClass(LongWritable.class);
boolean success = job.waitForCompletion(true);
if(success) {
Configuration conf2 = new Configuration();
Job job2 = new Job(conf2);
job2.setJarByClass(CommentViewRatio.class);
FileInputFormat.addInputPath(job2, new Path("temp/part-r-00000"));
FileOutputFormat.setOutputPath(job2, new Path(args[1]));
job2.setMapperClass(SecondaryMapper.class);
job2.setMapOutputKeyClass(DoubleWritable.class);
job2.setMapOutputValueClass(IntWritable.class);
//job2.setPartitionerClass(CommentViewPartitioner.class);
job2.setSortComparatorClass(DecreasingComparator.class);
job2.setReducerClass(SecondaryReducer.class);
job2.setOutputKeyClass(DoubleWritable.class);
job2.setOutputValueClass(IntWritable.class);
//job2.setNumReduceTasks(4);
boolean success2 = job2.waitForCompletion(true);
temp.getFileSystem(conf1).delete(temp);
System.exit(success2 ? 0 : 1);
}
}
}
All you need to do is add a reducer for the output to be sorted. This is consistent with the statement that WritableComparator sorts the map output, not the reducer output: the sort comparator is applied during the shuffle/sort phase that feeds the reducers, so with setNumReduceTasks(0) there is no sort at all and the map output is written out as-is. Adding a reducer (even one that just passes every pair through) brings the sort phase back, and the comparator then takes effect.
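For completeness, a minimal pass-through reducer is all that is needed; the driver above references SecondaryReducer but the post never shows it, so the following is only a sketch of what it might look like:
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
// Emits every key/value pair unchanged; its only purpose is to force the
// shuffle/sort phase, which is where the DecreasingComparator is applied.
public class SecondaryReducer
        extends Reducer<DoubleWritable, IntWritable, DoubleWritable, IntWritable> {
    @Override
    public void reduce(DoubleWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        for (IntWritable value : values) {
            context.write(key, value);
        }
    }
}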

expected org.apache.hadoop.hive.ql.io.orc.OrcStruct, received org.apache.hadoop.hive.ql.io.orc.OrcSerde$OrcSerdeRow

When I read an ORC file and write data back to an ORC file, I get the following error:
expected org.apache.hadoop.hive.ql.io.orc.OrcStruct,
received org.apache.hadoop.hive.ql.io.orc.OrcSerde$OrcSerdeRow
Is the map output value class not right?
This is my program:
package com.baifendian.basicPlatform.hive.ql.io.orc;
import java.io.IOException;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcNewOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class OrcInput {
final static String inputSchema = "struct<c1:string,c2:string>";
final static String outputSchema = "struct<c1:string,c2:string>";
static StructObjectInspector inputOI;
static SettableStructObjectInspector outputOI;
static OrcSerde orcsd;
public static class OrcReaderMap extends
Mapper<NullWritable, OrcStruct, NullWritable, Writable> {
public void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
TypeInfo tfin = TypeInfoUtils
.getTypeInfoFromTypeString(inputSchema);
TypeInfo tfout = TypeInfoUtils
.getTypeInfoFromTypeString(outputSchema);
inputOI = (StructObjectInspector) OrcStruct
.createObjectInspector(tfin);
outputOI = (SettableStructObjectInspector) OrcStruct
.createObjectInspector(tfout);
orcsd = new OrcSerde();
List<? extends StructField> fldlst = outputOI
.getAllStructFieldRefs();
StringBuffer sbCols = new StringBuffer();
StringBuffer sbTyps = new StringBuffer();
for (StructField sf : fldlst) {
if (sbCols.length() > 0) {
sbCols.append(",");
}
sbCols.append(sf.getFieldName());
if (sbTyps.length() > 0) {
sbTyps.append(",");
}
sbTyps.append(sf.getFieldObjectInspector().getTypeName());
}
Properties props = new Properties();
props.put(IOConstants.COLUMNS, sbCols.toString());
props.put(IOConstants.COLUMNS_TYPES, sbTyps.toString());
orcsd.initialize(context.getConfiguration(), props);
}
public void map(NullWritable meaningless, OrcStruct orc, Context context)
throws IOException, InterruptedException {
List<Object> ilst = inputOI.getStructFieldsDataAsList(orc);
Text f1 = (Text) ilst.get(0);
Text f2 = (Text) ilst.get(1);
// output orc format
OrcStruct objOut = (OrcStruct) outputOI.create();
List<? extends StructField> flst = outputOI.getAllStructFieldRefs();
outputOI.setStructFieldData(objOut, flst.get(0), f1);
outputOI.setStructFieldData(objOut, flst.get(1), f2);
context.write(NullWritable.get(), orcsd.serialize(objOut, outputOI));
}
}
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = new Job(conf, "OrcReader");
job.setJarByClass(OrcInput.class);
job.setInputFormatClass(OrcNewInputFormat.class);
job.setOutputFormatClass(OrcNewOutputFormat.class);
FileInputFormat.addInputPath(job, new Path("/warehouse/bae_xinhua_test.db/orcinput"));
FileOutputFormat.setOutputPath(job, new Path("/warehouse/bae_xinhua_test.db/orcoutput"));
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(OrcStruct.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(OrcStruct.class);
job.setMapperClass(OrcInput.OrcReaderMap.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Please add job.setNumReduceTasks(0);
Your map writes the result of orcsd.serialize(), which is an OrcSerde$OrcSerdeRow, but the job declares OrcStruct as the map output value class. When there is a reduce phase, the framework checks every collected map output value against that declared class before spilling it for the shuffle, which is exactly where the "expected OrcStruct, received OrcSerdeRow" error comes from. With zero reduce tasks, the map output goes straight to OrcNewOutputFormat, which writes the serialized row without complaint.
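A sketch of the corresponding change in main() (only the lines around job submission are shown):
job.setMapperClass(OrcInput.OrcReaderMap.class);
// Make the job map-only: the OrcSerdeRow returned by orcsd.serialize() is then
// handed straight to OrcNewOutputFormat instead of being collected for a
// shuffle, where it would be checked against the declared OrcStruct value class.
job.setNumReduceTasks(0);
System.exit(job.waitForCompletion(true) ? 0 : 1);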

Hadoop, MapReduce Custom Java Counters Exception in thread "main" java.lang.IllegalStateException: Job in state DEFINE instead of RUNNING

Error is:
Exception in thread "main" java.lang.IllegalStateException: Job in state DEFINE instead of RUNNING
at org.apache.hadoop.mapreduce.Job.ensureState(Job.java:294)
at org.apache.hadoop.mapreduce.Job.getCounters(Job.java:762)
at com.aamend.hadoop.MapReduce.CountryIncomeConf.main(CountryIncomeConf.java:41)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
The error shows that the problem lies in the line:
Counter counter =
job.getCounters().findCounter(COUNTERS.MISSING_FIELDS_RECORD_COUNT);
Also, I do have an enum with the name COUNTERS.
Mapper :
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
public class CountryIncomeMapper extends Mapper<Object, Text, Text, DoubleWritable> {
private Logger logger = Logger.getLogger("FilterMapper");
private final int incomeIndex = 54;
private final int countryIndex = 0;
private final int lenIndex = 58;
String seperator = ",";
public void map(Object key, Text line, Context context) throws IOException,
InterruptedException {
if (line == null) {
logger.info("null found.");
context.getCounter(COUNTERS.ERROR_COUNT).increment(1);
return;
}
if (line.toString().contains(
"Adjusted net national income per capita (current US$)")) {
String[] recordSplits = line.toString().split(seperator);
logger.info("The data has been splitted.");
if (recordSplits.length == lenIndex) {
String countryName = recordSplits[countryIndex];
try {
double income = Double.parseDouble(recordSplits[incomeIndex]);
context.write(new Text(countryName), new DoubleWritable(income));
} catch (NumberFormatException nfe) {
logger.info("The value of income is in wrong format." + countryName);
context.getCounter(COUNTERS.MISSING_FIELDS_RECORD_COUNT).increment(1);
return;
}
}
}
}
}
Driver Class :
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class CountryIncomeConf {
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Path inputPath = new Path(args[0]);
Path outputDir = new Path(args[1]);
// Create configuration
Configuration conf = new Configuration(true);
// Create job
Job job = new Job(conf, "CountryIncomeConf");
job.setJarByClass(CountryIncomeConf.class);
Counter counter =
job.getCounters().findCounter(COUNTERS.MISSING_FIELDS_RECORD_COUNT);
System.out.println("Error Counter = " + counter.getValue());
// Setup MapReduce
job.setMapperClass(CountryIncomeMapper.class);
job.setNumReduceTasks(1);
// Specify key / value
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
// Input
FileInputFormat.addInputPath(job, inputPath);
job.setInputFormatClass(TextInputFormat.class);
// Output
FileOutputFormat.setOutputPath(job, outputDir);
job.setOutputFormatClass(TextOutputFormat.class);
// Delete output if exists
FileSystem hdfs = FileSystem.get(conf);
if (hdfs.exists(outputDir))
hdfs.delete(outputDir, true);
// Execute job
int code = job.waitForCompletion(true) ? 0 : 1;
System.exit(code);
}
}
Looks like you're trying to read the counter before you have submitted the job. job.getCounters() can only be called once the job is running or has finished, which is what the IllegalStateException ("Job in state DEFINE instead of RUNNING") is telling you; move that call to after waitForCompletion().
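For example, the tail of the driver could be reordered like this (same calls as in the question, just moved after the job has run):
// Run the job first; counters only exist once the job has executed.
int code = job.waitForCompletion(true) ? 0 : 1;
// Now the counter can be read safely.
Counter counter = job.getCounters()
        .findCounter(COUNTERS.MISSING_FIELDS_RECORD_COUNT);
System.out.println("Error Counter = " + counter.getValue());
System.exit(code);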
I had the same error during a Sqoop export.
The error was generated because the HDFS directory was empty.
Once I populated the directory (corresponding to a Hive table), the Sqoop export ran without problems.

Hadoop setJarByClass not working

My WordCount example is the following structure:
public class WordCount extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {}
public static void main(String[] args) throws Exception {
BasicConfigurator.configure();
Logger.getRootLogger().setLevel(Level.WARN);
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
#Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
if (fs.exists(new Path(args[1]))) {
fs.delete(new Path(args[1]), true);
}
Job job = Job.getInstance(conf, "wordcount");
long startTime = System.currentTimeMillis();
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJarByClass(WordCount.class);
// job.setJar(WordCount.class.getSimpleName());
job.waitForCompletion(true);
System.out.println("Job Finished in "
+ (System.currentTimeMillis() - startTime) / 1000.0
+ " seconds");
return 0;
}
}
The job.setJarByClass() call is not working, and I get a "No job jar file set" message. Also, job.getJar() after this call returns null. Does anyone know what the problem is here?
I also tried job.setJarByClass(this.getClass()), job.setJar("WordCount") and job.setJar(WordCount.class.getSimpleName()). The first one has no effect and job.getJar() still returns null; the second and third both give me FileNotFoundException: File WordCount does not exist. Then I tried job.setJar("src/wordcount/WordCount.java") and job.setJar("bin/wordcount/WordCount.class"); both succeed within Eclipse (without the warning message), but still fail with FileNotFoundException when executed as a standalone jar file on the command line. I guess the problem relates to the classpath setup, if not to unresolved dependencies.
I think you should add the appropriate jar files.
In your case, the jar that provides org.apache.hadoop.mapreduce.Job must be on your project's build path.
I imported the following classes and interfaces
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
and the project worked fine.
Just check after importing all the classes mentioned above. If there is any problem, leave me a comment.
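Also worth noting: setJarByClass() can only locate a jar that actually contains the given class on the classpath, so when the job is launched from Eclipse against unpackaged .class files there is no jar to find, hence the "No job jar file set" warning. One workaround is to build the jar first and point the job at it explicitly; this is only a sketch, and the jar path below is a placeholder:
// Sketch, assuming the project has already been packaged into a jar.
// Job.setJar() expects the path of the jar file itself, not a class name
// or a .java/.class file.
Job job = Job.getInstance(conf, "wordcount");
job.setJar("/path/to/wordcount.jar");   // placeholder path to the built jar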
Please use this Java code for word counting. It takes two arguments: the first is the input path, the second is the output path for the result. Also add all the jar files from the mapreduce and common folders in your Hadoop directory.
package org.samples.mapreduce.training;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCount {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("mapred.job.tracker", "hdfs://localhost:50001");
conf.set("fs.default.name", "hdfs://localhost:50000");
Job job = new Job(conf, "wordcount");
job.setJarByClass(WordCount.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Or, if you want a more advanced version, use this code with three arguments; the third one is a file of patterns you don't want counted, for example:
package org.samples.mapreduce.training;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;
public class WordCountV2 {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
static enum CountersEnum { INPUT_WORDS }
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private boolean caseSensitive;
private Set<String> patternsToSkip = new HashSet<String>();
private Configuration conf;
private BufferedReader fis;
@Override
public void setup(Context context) throws IOException,
InterruptedException {
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
if (conf.getBoolean("wordcount.skip.patterns", true)) {
URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
for (URI patternsURI : patternsURIs) {
Path patternsPath = new Path(patternsURI.getPath());
String patternsFileName = patternsPath.getName().toString();
parseSkipFile(patternsFileName);
}
}
}
private void parseSkipFile(String fileName) {
try {
fis = new BufferedReader(new FileReader(fileName));
String pattern = null;
while ((pattern = fis.readLine()) != null) {
patternsToSkip.add(pattern);
}
} catch (IOException ioe) {
System.err.println("Caught exception while parsing the cached file '"
+ StringUtils.stringifyException(ioe));
}
}
@Override
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = (caseSensitive) ?
value.toString() : value.toString().toLowerCase();
for (String pattern : patternsToSkip) {
line = line.replaceAll(pattern, "");
}
StringTokenizer itr = new StringTokenizer(line);
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
Counter counter = context.getCounter(CountersEnum.class.getName(),
CountersEnum.INPUT_WORDS.toString());
counter.increment(1);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
if (remainingArgs.length != 2 && remainingArgs.length != 4) {
System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCountV2.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
List<String> otherArgs = new ArrayList<String>();
for (int i=0; i < remainingArgs.length; ++i) {
if ("-skip".equals(remainingArgs[i])) {
job.addCacheFile(new Path(remainingArgs[++i]).toUri());
job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
} else {
otherArgs.add(remainingArgs[i]);
}
}
FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

WholeFileRecordReader cannot be cast to org.apache.hadoop.mapred.RecordReader

I wanted to create a new datatype in Hadoop, but I get the following error from my custom input format class. Here is my code:
error - WholeFileRecordReader cannot be cast to org.apache.hadoop.mapred.RecordReader
code -
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TaskAttemptContext;
public class wholeFileInputFormat extends FileInputFormat<Text, apriori>{
public RecordReader<Text, apriori> getRecordReader(
InputSplit input, JobConf job, Reporter reporter)
throws IOException {
reporter.setStatus(input.toString());
return (RecordReader<Text, apriori>) new WholeFileRecordReader(job, (FileSplit) input);
}
}
My custom Record Reader is as follows
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
class WholeFileRecordReader extends RecordReader<Text, apriori> {
private FileSplit fileSplit;
private Configuration conf;
private InputStream in;
private Text key = new Text("");
private apriori value = new apriori();
private boolean processed = false;
public void initialize( JobConf job, FileSplit split)
throws IOException {
this.fileSplit = split;
this.conf = job;
final Path file = fileSplit.getPath();
String StringPath = new String(fileSplit.getPath().toString());
String StringPath2 = new String();
StringPath2 = StringPath.substring(5);
System.out.println(StringPath2);
in = new FileInputStream(StringPath2);
FileSystem fs = file.getFileSystem(conf);
in = fs.open(file);
}
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!processed) {
byte[] contents = new byte[(int) fileSplit.getLength()];
Path file = fileSplit.getPath();
key.set(file.getName());
try {
IOUtils.readFully(in, contents, 0, contents.length);
value.set(contents, 0, contents.length);
} finally {
IOUtils.closeStream(in);
}
processed = true;
return true;
}
return false;
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public apriori getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException {
return processed ? 1.0f : 0.0f;
}
@Override
public void close() throws IOException {
// Do nothing
}
@Override
public void initialize(InputSplit arg0, TaskAttemptContext arg1)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
}
}
The WholeFileRecordReader class is a subclass of org.apache.hadoop.mapreduce.RecordReader. That class cannot be cast to org.apache.hadoop.mapred.RecordReader. Can you try using the same API in both classes?
As per the rules of the Java programming language, only classes or interfaces (collectively known as types) from the same type hierarchy can be cast or converted into each other. If you try to cast two objects that don't share the same type hierarchy, i.e. there is no parent/child relationship between them, you will get a compile-time error or a ClassCastException at run time.
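A tiny self-contained illustration of that rule (Animal and Car are made-up classes, unrelated to Hadoop):
class Animal {}
class Car {}

public class CastDemo {
    public static void main(String[] args) {
        Animal a = new Animal();
        // Car c1 = (Car) a;   // does not compile: inconvertible types
        Object o = a;
        Car c2 = (Car) o;      // compiles, but throws ClassCastException at run time
    }
}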
There was a package mismatch, which is why this error occurred.
The org.apache.hadoop.mapred packages belong to MRv1 (MapReduce version 1).
The org.apache.hadoop.mapreduce packages belong to MRv2 (MapReduce version 2).
In your code you combined both MRv1 and MRv2:
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
Take the imports consistently from either org.apache.hadoop.mapred (MRv1) or org.apache.hadoop.mapreduce (MRv2), not from a mixture of both.
Hope this helps.
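If you settle on the new (org.apache.hadoop.mapreduce) API, the input format could look roughly like the sketch below (renamed to WholeFileInputFormat following Java conventions). It assumes that WholeFileRecordReader's initialize(InputSplit, TaskAttemptContext) is actually implemented (the version in the question leaves it as an empty stub) and that the apriori writable stays as shown:
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class WholeFileInputFormat extends FileInputFormat<Text, apriori> {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Each file is consumed as a single record, so never split it.
        return false;
    }

    @Override
    public RecordReader<Text, apriori> createRecordReader(
            InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // The framework calls initialize() on the returned reader itself.
        return new WholeFileRecordReader();
    }
}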
