java.lang.RuntimeException: java.lang.NoSuchMethodException: Hadoop mapreduce - hadoop

I am getting a java.lang.NoSuchMethodException. Please help me with this; my code is below.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Placeholder extends Configured implements Tool
{
private static Path[] filepath;
private static String filename;
private static String[] calendartablecolumn;
private static BufferedReader br;
private static String placeholder;
private static String output;
private static HashMap<String,String> map = new HashMap<String,String>();
public int run(String[] args) throws Exception {
String placeholdername = args[3];
String stratposition = args[4];
String length = args[5];
String category = args[6];
String calendarid = args[7];
Configuration conf = new Configuration();
conf.set("placeholdername", placeholdername);
conf.set("stratposition", stratposition);
conf.set("length", length);
conf.set("category", category);
conf.set("calendarid", calendarid);
Job job = Job.getInstance(conf, "Placeholder");
DistributedCache.addCacheFile(new Path(args[2]).toUri(), getConf());
job.setJarByClass(Placeholder.class);
job.setMapperClass(PlaceholderMapper.class);
job.setNumReduceTasks(0);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new Placeholder(), args);
System.exit(res);
}
public class PlaceholderMapper extends Mapper<LongWritable,Text,Text,Text>
{
protected void setup(Context context) throws IOException,InterruptedException
{
try {
filepath = DistributedCache.getLocalCacheFiles(getConf());
filename = filepath[0].getName().toString();
br = new BufferedReader(new FileReader(filepath[0].toString()));
String line = null;
while ((line = br.readLine()) != null)
{
calendartablecolumn = line.split("\\,");
}
} catch (Exception e) {
}
}
public void map(LongWritable keys, Text value, Context context) throws IOException, InterruptedException
{
String str_placeholdername = context.getConfiguration().get("placeholdername");
String str_stratposition = context.getConfiguration().get("stratposition");
int int_stratposition = Integer.parseInt(str_stratposition);
String str_length = context.getConfiguration().get("length");
int int_length = Integer.parseInt(str_length);
String str_category = context.getConfiguration().get("category");
String str_calendarid = context.getConfiguration().get("calendarid");
int int_calendarid = Integer.parseInt(str_calendarid);
if (str_category != null)
{
if (str_category.equalsIgnoreCase("filename"))
{
placeholder = filename.substring((int_stratposition-1),(int_length-1));
}else if (str_category.equalsIgnoreCase("startdate"))
{
map.put(calendartablecolumn[0].trim(),calendartablecolumn[3].trim());
placeholder = map.get(int_calendarid);
}else if (str_category.equalsIgnoreCase("enddate"))
{
map.put(calendartablecolumn[0].trim(),calendartablecolumn[4].trim());
placeholder = map.get(int_calendarid);
}else if (str_category.equalsIgnoreCase("currentdate"))
{
DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
Date date = new Date();
String currentdate = (dateFormat.format(date)).toString();
placeholder = currentdate.substring((int_stratposition-1),(int_length-1));
}else if (str_category.equalsIgnoreCase("isdailydata"))
{
}
output = value+","+placeholder;
}else
{
String line = value.toString();
output = line.substring((int_stratposition-1),(int_length-1));
}
context.write(null,new Text (output));
}
}
}
This is the error that I am getting:
Error: java.lang.RuntimeException: java.lang.NoSuchMethodException:
com.nielsen.GRFE.processor.mapreduce.Placeholder$PlaceholderMapper.<init>()
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:131)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:745)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158) Caused by: java.lang.NoSuchMethodException:
com.nielsen.GRFE.processor.mapreduce.Placeholder$PlaceholderMapper.<init>()
at java.lang.Class.getConstructor0(Class.java:2849)
at java.lang.Class.getDeclaredConstructor(Class.java:2053)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:125)

I think the problem is that your PlaceholderMapper class is not static; try making it public static. As a non-static inner class it has no no-argument constructor that Hadoop can invoke through reflection, which is exactly why ReflectionUtils.newInstance fails with NoSuchMethodException for PlaceholderMapper.<init>().
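For reference, a minimal sketch of the declaration change (only the declaration changes; the setup() and map() bodies stay exactly as in the question):
public static class PlaceholderMapper extends Mapper<LongWritable, Text, Text, Text>
{
    // static nested class: Hadoop can now instantiate it with a plain
    // no-argument constructor, without an enclosing Placeholder instance
    // ... setup() and map() unchanged ...
}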

Related

How to insert MapReduce output into HBase using the same program

I wrote a program which takes a PDF as input and produces the text output as a whole. I want to load this text into HBase using the same program; is there any way to do it? Any help will be appreciated.
//Driver Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class PdfInputDriver {
public static void main(String[] args) throws IOException,InterruptedException, ClassNotFoundException
{
Configuration conf = new Configuration();
GenericOptionsParser parser = new GenericOptionsParser(conf, args);
args = parser.getRemainingArgs();
@SuppressWarnings("deprecation")
Job job = new Job(conf, "Pdftext");
job.setJarByClass(PdfInputDriver.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
job.setInputFormatClass(PdfInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
System.out.println(job.waitForCompletion(true));
}
}
//InputFormatClass
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class PdfInputFormat extends FileInputFormat<Object, Object> {
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public RecordReader createRecordReader(
InputSplit split, TaskAttemptContext context) throws IOException,
InterruptedException {
return new PdfRecordReader();
}
}
//PDF Record Reader class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
public class PdfRecordReader extends RecordReader<Object, Object> {
private String[] lines = null;
private LongWritable key = null;
private Text value = null;
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
final Path file = split.getPath();
/*
* The below code contains the logic for opening the file and seek to
* the start of the split. Here we are applying the Pdf Parsing logic
*/
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
PDDocument pdf = null;
String parsedText = null;
PDFTextStripper stripper;
pdf = PDDocument.load(fileIn);
stripper = new PDFTextStripper();
parsedText = stripper.getText(pdf);
//String delims = "[ ]";
this.lines = parsedText.split("\n"); // split the extracted text into lines
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (key == null) {
key = new LongWritable();
key.set(1);
value = new Text();
value.set(lines[0]);
} else
{
int temp = (int) key.get();
if (temp < (lines.length - 1)) {
int count = (int) key.get();
value = new Text();
value.set(lines[count]);
count = count + 1;
key = new LongWritable(count);
} else {
return false;
}
}
if (key == null || value == null) {
return false;
} else {
return true;
}
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return 0;
}
@Override
public void close() throws IOException {
}
}
//Mapper Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>
{
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(value, key);
}
}
//Reducer Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Object, Object, Object, Object> {
protected void reduce(Text key, Iterable<LongWritable> values,
Context context) throws IOException, InterruptedException {
context.write(key, new WordCountReducer());
}
}
I think you are packaging it as a jar. Just use the part-r-00000 file that your MapReduce job generates as output, and create the table in the HBase shell:
create 'table'
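If you want to write into HBase from the same MapReduce program instead of loading the part-r-00000 file afterwards, one possible approach (not what this answer suggests) is to emit Put objects through HBase's TableOutputFormat. A minimal sketch, assuming the HBase client/mapreduce jars are on the classpath, HBase 1.x or later, and an existing table 'pdftext' with column family 'cf'; the table, family, and class names are placeholders:
import java.io.IOException;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
public class HBaseLineReducer extends TableReducer<Text, LongWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        if (key.getLength() == 0) {
            return; // HBase row keys must not be empty
        }
        // row key = the extracted text line, stored again under cf:line
        Put put = new Put(Bytes.toBytes(key.toString()));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("line"), Bytes.toBytes(key.toString()));
        context.write(NullWritable.get(), put);
    }
}
Then in the driver, instead of setting WordCountReducer and TextOutputFormat:
TableMapReduceUtil.initTableReducerJob("pdftext", HBaseLineReducer.class, job);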

expected org.apache.hadoop.hive.ql.io.orc.OrcStruct, received org.apache.hadoop.hive.ql.io.orc.OrcSerde$OrcSerdeRow

When I read an ORC file and write data to an ORC file, I get the following error:
expected org.apache.hadoop.hive.ql.io.orc.OrcStruct,
received org.apache.hadoop.hive.ql.io.orc.OrcSerde$OrcSerdeRow
Is the MapOutputValueClass not right?
This is my program:
package com.baifendian.basicPlatform.hive.ql.io.orc;
import java.io.IOException;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcNewOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class OrcInput {
final static String inputSchema = "struct<c1:string,c2:string>";
final static String outputSchema = "struct<c1:string,c2:string>";
static StructObjectInspector inputOI;
static SettableStructObjectInspector outputOI;
static OrcSerde orcsd;
public static class OrcReaderMap extends
Mapper<NullWritable, OrcStruct, NullWritable, Writable> {
public void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
TypeInfo tfin = TypeInfoUtils
.getTypeInfoFromTypeString(inputSchema);
TypeInfo tfout = TypeInfoUtils
.getTypeInfoFromTypeString(outputSchema);
inputOI = (StructObjectInspector) OrcStruct
.createObjectInspector(tfin);
outputOI = (SettableStructObjectInspector) OrcStruct
.createObjectInspector(tfout);
orcsd = new OrcSerde();
List<? extends StructField> fldlst = outputOI
.getAllStructFieldRefs();
StringBuffer sbCols = new StringBuffer();
StringBuffer sbTyps = new StringBuffer();
for (StructField sf : fldlst) {
if (sbCols.length() > 0) {
sbCols.append(",");
}
sbCols.append(sf.getFieldName());
if (sbTyps.length() > 0) {
sbTyps.append(",");
}
sbTyps.append(sf.getFieldObjectInspector().getTypeName());
}
Properties props = new Properties();
props.put(IOConstants.COLUMNS, sbCols.toString());
props.put(IOConstants.COLUMNS_TYPES, sbTyps.toString());
orcsd.initialize(context.getConfiguration(), props);
}
public void map(NullWritable meaningless, OrcStruct orc, Context context)
throws IOException, InterruptedException {
List<Object> ilst = inputOI.getStructFieldsDataAsList(orc);
Text f1 = (Text) ilst.get(0);
Text f2 = (Text) ilst.get(1);
// output orc format
OrcStruct objOut = (OrcStruct) outputOI.create();
List<? extends StructField> flst = outputOI.getAllStructFieldRefs();
outputOI.setStructFieldData(objOut, flst.get(0), f1);
outputOI.setStructFieldData(objOut, flst.get(1), f2);
context.write(NullWritable.get(), orcsd.serialize(objOut, outputOI));
}
}
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = new Job(conf, "OrcReader");
job.setJarByClass(OrcInput.class);
job.setInputFormatClass(OrcNewInputFormat.class);
job.setOutputFormatClass(OrcNewOutputFormat.class);
FileInputFormat.addInputPath(job, new Path("/warehouse/bae_xinhua_test.db/orcinput"));
FileOutputFormat.setOutputPath(job, new Path("/warehouse/bae_xinhua_test.db/orcoutput"));
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(OrcStruct.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(OrcStruct.class);
job.setMapperClass(OrcInput.OrcReaderMap.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Please add job.setNumReduceTasks(0). You never set a reducer class, so the default reducer runs and the map output has to go through the shuffle, where the framework checks every value against job.setMapOutputValueClass(OrcStruct.class); what orcsd.serialize() actually returns is an OrcSerde$OrcSerdeRow, hence the mismatch. With a map-only job the serialized rows are handed straight to OrcNewOutputFormat.
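In the driver from the question, the extra line goes right after the Job is created, for example:
Job job = new Job(conf, "OrcReader");
job.setJarByClass(OrcInput.class);
job.setNumReduceTasks(0); // map-only job: serialized rows go straight to OrcNewOutputFormat
job.setInputFormatClass(OrcNewInputFormat.class);
job.setOutputFormatClass(OrcNewOutputFormat.class);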

passing arguments to record reader in mapreduce hadoop

This is my code for passing various arguments:
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class Docsparser {
private static String Delimiter;
public static class DocsInputFormat extends FileInputFormat<Text, Text> {
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
return new DocsLineRecordReader();
}
}
public static class DocsLineRecordReader extends RecordReader<Text, Text> {
private Text key = new Text();
private Text value = new Text();
private int currentword = 0;
private String fileline;
private File file = null;
private String line;
private HWPFDocument document;
private WordExtractor extractor = null;
private String[] filedata;
StringBuilder sb = new StringBuilder();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit fileSplit = (FileSplit) split;
final Path file = fileSplit.getPath();
Configuration conf = context.getConfiguration();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream filein = fs.open(fileSplit.getPath());
String Delim = conf.get("Delim");
if (filein != null)
{
HWPFDocument document = new HWPFDocument(filein);
extractor = new WordExtractor(document);
fileline = extractor.getText();
filedata = fileline.split(Delim);
}
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException
{
if (key == null) {
key = new Text();
}
if (value == null) {
value = new Text();
}
if(currentword < filedata.length)
{
for ( currentword=0;currentword < filedata.length; currentword++)
{
sb.append(filedata[currentword] +",");
line = sb.toString();
}
key.set(line);
value.set("");
return true;
}
else
{
key = null;
value = null;
return false;
}
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return (100.0f / filedata.length * currentword) / 100.0f;
}
@Override
public void close() throws IOException {
}
}
public static class Map extends Mapper<Text, Text, Text, Text>{
public void map(Text key, Text value, Context context) throws IOException, InterruptedException
{
context.write(key,value);
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "Docsparser");
job.setJarByClass(Docsparser.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
Delimiter = args[2].toString();
conf.set("Delim",Delimiter);
job.setInputFormatClass(DocsInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Exception details:
15/09/28 03:50:04 INFO mapreduce.Job: Task Id :
attempt_1443193152998_2319_m_000000_2, Status : FAILED Error: java.lang.NullPointerException
at java.lang.String.split(String.java:2272)
at java.lang.String.split(String.java:2355)
at com.nielsen.grfe.Docsparser$DocsLineRecordReader.initialize(Docsparser.java:66)
at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.initialize(MapTask.java:548)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:786)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
All the configuration variables have to be set before initializing the Job class. Move
Delimiter = args[2].toString();
conf.set("Delim",Delimiter);
before
Job job = new Job(conf, "Docsparser");
The NullPointerException occurs in the split method of the fileline string. I suspect that you haven't set the "Delim" configuration value and, thus, your variable Delim is null.
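Concretely, the beginning of main() would look like this (the same statements as in the question, only reordered):
Configuration conf = new Configuration();
Delimiter = args[2].toString();
conf.set("Delim", Delimiter); // must be set before the Job copies the Configuration
Job job = new Job(conf, "Docsparser");
job.setJarByClass(Docsparser.class);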

Hadoop setJarByClass not working

My WordCount example has the following structure:
public class WordCount extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {}
public static void main(String[] args) throws Exception {
BasicConfigurator.configure();
Logger.getRootLogger().setLevel(Level.WARN);
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
if (fs.exists(new Path(args[1]))) {
fs.delete(new Path(args[1]), true);
}
Job job = Job.getInstance(conf, "wordcount");
long startTime = System.currentTimeMillis();
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJarByClass(WordCount.class);
// job.setJar(WordCount.class.getSimpleName());
job.waitForCompletion(true);
System.out.println("Job Finished in "
+ (System.currentTimeMillis() - startTime) / 1000.0
+ " seconds");
return 0;
}
}
The job.setJarByClass() call is not working, and I get a "No job jar file set" message. Also, job.getJar() after this call returns null. Does anyone know what the problem is here?
I also tried job.setJarByClass(this.getClass()), job.setJar("WordCount"), and job.setJar(WordCount.class.getSimpleName()). The first one has no effect and job.getJar() still returns null; the second and third both give me FileNotFoundException: File WordCount does not exist. Then I tried job.setJar("src/wordcount/WordCount.java") and job.setJar("bin/wordcount/WordCount.class"); both succeed within Eclipse (without the warning message), but still fail with FileNotFoundException when executed as a standalone jar file on the command line. I guess the problem may relate to the classpath setup, if not unresolved dependencies.
I think you should add the appropriate jar files.
In your case you must have the jar that contains org.apache.hadoop.mapreduce.Job (hadoop-mapreduce-client-core in Hadoop 2.x) on your project's classpath.
I imported the following classes and interfaces:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
And your project should work fine.
Just check after importing all of the classes mentioned above. If there is any problem, leave me a comment.
Please use this Java code for word counting, with two arguments: the first is the input file, the second is the result directory. Also add all the jar files from the mapreduce and common folders in the Hadoop directory.
package org.samples.mapreduce.training;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCount {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("mapred.job.tracker", "hdfs://localhost:50001");
conf.set("fs.default.name", "hdfs://localhost:50000");
Job job = new Job(conf, "wordcount");
job.setJarByClass(WordCount.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Or, if you want the advanced version, use this code with three arguments; the third one is a file of patterns that you do not want counted, for example:
package org.samples.mapreduce.training;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;
public class WordCountV2 {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
static enum CountersEnum { INPUT_WORDS }
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private boolean caseSensitive;
private Set<String> patternsToSkip = new HashSet<String>();
private Configuration conf;
private BufferedReader fis;
@Override
public void setup(Context context) throws IOException,
InterruptedException {
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
if (conf.getBoolean("wordcount.skip.patterns", true)) {
URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
for (URI patternsURI : patternsURIs) {
Path patternsPath = new Path(patternsURI.getPath());
String patternsFileName = patternsPath.getName().toString();
parseSkipFile(patternsFileName);
}
}
}
private void parseSkipFile(String fileName) {
try {
fis = new BufferedReader(new FileReader(fileName));
String pattern = null;
while ((pattern = fis.readLine()) != null) {
patternsToSkip.add(pattern);
}
} catch (IOException ioe) {
System.err.println("Caught exception while parsing the cached file '"
+ StringUtils.stringifyException(ioe));
}
}
@Override
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = (caseSensitive) ?
value.toString() : value.toString().toLowerCase();
for (String pattern : patternsToSkip) {
line = line.replaceAll(pattern, "");
}
StringTokenizer itr = new StringTokenizer(line);
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
Counter counter = context.getCounter(CountersEnum.class.getName(),
CountersEnum.INPUT_WORDS.toString());
counter.increment(1);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
if ((remainingArgs.length != 2) && (remainingArgs.length != 4)) {
System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCountV2.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
List<String> otherArgs = new ArrayList<String>();
for (int i=0; i < remainingArgs.length; ++i) {
if ("-skip".equals(remainingArgs[i])) {
job.addCacheFile(new Path(remainingArgs[++i]).toUri());
job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
} else {
otherArgs.add(remainingArgs[i]);
}
}
FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

MapReduce Old API - Passing Command Line Argument to Map

I am coding a MapReduce job for finding the occurrences of a search string (passed as a command-line argument) in an input file stored in HDFS, using the old API.
Below is my Driver class:
public class StringSearchDriver
{
public static void main(String[] args) throws IOException
{
JobConf jc = new JobConf(StringSearchDriver.class);
jc.set("SearchWord", args[2]);
jc.setJobName("String Search");
FileInputFormat.addInputPath(jc, new Path(args[0]));
FileOutputFormat.setOutputPath(jc, new Path(args[1]));
jc.setMapperClass(StringSearchMap.class);
jc.setReducerClass(StringSearchReduce.class);
jc.setOutputKeyClass(Text.class);
jc.setOutputValueClass(IntWritable.class);
JobClient.runJob(jc);
}
}
Below is my Mapper class:
public class StringSearchMap extends MapReduceBase implements
Mapper<LongWritable, Text, Text, IntWritable>
{
String searchWord;
public void configure(JobConf jc)
{
searchWord = jc.get("SearchWord");
}
@Override
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> out, Reporter reporter)
throws IOException
{
String[] input = value.toString().split("");
for(String word:input)
{
if (word.equalsIgnoreCase(searchWord))
out.collect(new Text(word), new IntWritable(1));
}
}
}
On running the job (the command-line string passed is "hi"), I am getting the error below:
14/09/21 22:35:41 INFO mapred.JobClient: Task Id : attempt_201409212134_0005_m_000001_2, Status : FAILED
java.lang.ClassCastException: interface javax.xml.soap.Text
at java.lang.Class.asSubclass(Class.java:3129)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:795)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:964)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:422)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:366)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:416)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Please suggest.
You auto-imported the wrong class.
Instead of import org.apache.hadoop.io.Text, you imported javax.xml.soap.Text.
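In other words, fix the import at the top of the mapper source file:
// wrong (auto-completed by the IDE):
// import javax.xml.soap.Text;
// correct:
import org.apache.hadoop.io.Text;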
You can find a sample wrong import in this blog.
One point: it is better to adopt the new API.
EDIT
I used the new API:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @author Unmesha sreeveni
* @date 23 Sep 2014
*/
public class StringSearchDriver extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String line = value.toString();
String searchString = conf.get("word");
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if(token.equals(searchString)){
word.set(token);
context.write(word, one);
}
}
}
}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int res = ToolRunner.run(conf, new StringSearchDriver(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
if (args.length != 3) {
System.out
.printf("Usage: Search String <input dir> <output dir> <search word> \n");
System.exit(-1);
}
String source = args[0];
String dest = args[1];
String searchword = args[2];
Configuration conf = new Configuration();
conf.set("word", searchword);
Job job = new Job(conf, "Search String");
job.setJarByClass(StringSearchDriver.class);
FileSystem fs = FileSystem.get(conf);
Path in =new Path(source);
Path out =new Path(dest);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
boolean sucess = job.waitForCompletion(true);
return (sucess ? 0 : 1);
}
}
This works.
For Text, the required Hadoop package is org.apache.hadoop.io.
Check your packages:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
