I wrote a program which takes a PDF as input and produces the whole text as output. I want to load this text into HBase using the same program. Is there any way to do it? Any help will be appreciated.
//Driver Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class PdfInputDriver {
public static void main(String[] args) throws IOException,InterruptedException, ClassNotFoundException
{
Configuration conf = new Configuration();
GenericOptionsParser parser = new GenericOptionsParser(conf, args);
args = parser.getRemainingArgs();
@SuppressWarnings("deprecation")
Job job = new Job(conf, "Pdftext");
job.setJarByClass(PdfInputDriver.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
job.setInputFormatClass(PdfInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
System.out.println(job.waitForCompletion(true));
}
}
//InputFormatClass
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class PdfInputFormat extends FileInputFormat<Object, Object> {
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public RecordReader createRecordReader(
InputSplit split, TaskAttemptContext context) throws IOException,
InterruptedException {
return new PdfRecordReader();
}
}
//PDF Record Reader class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
public class PdfRecordReader extends RecordReader<Object, Object> {
private String[] lines = null;
private LongWritable key = null;
private Text value = null;
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
final Path file = split.getPath();
/*
* The below code contains the logic for opening the file and seek to
* the start of the split. Here we are applying the Pdf Parsing logic
*/
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
PDDocument pdf = null;
String parsedText = null;
PDFTextStripper stripper;
pdf = PDDocument.load(fileIn);
stripper = new PDFTextStripper();
parsedText = stripper.getText(pdf);
//String delims = "[ ]";
this.lines = parsedText.split("\n");
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (key == null) {
key = new LongWritable();
key.set(1);
value = new Text();
value.set(lines[0]);
} else
{
int temp = (int) key.get();
if (temp < (lines.length - 1)) {
int count = (int) key.get();
value = new Text();
value.set(lines[count]);
count = count + 1;
key = new LongWritable(count);
} else {
return false;
}
}
if (key == null || value == null) {
return false;
} else {
return true;
}
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return 0;
}
@Override
public void close() throws IOException {
}
}
//Mapper Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>
{
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(value, key);
}
}
//Reducer Class
package com.tcs;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
protected void reduce(Text key, Iterable<LongWritable> values,
Context context) throws IOException, InterruptedException {
// Pass each (line, line number) pair straight through to the output
for (LongWritable value : values) {
context.write(key, value);
}
}
}
I think you are building a jar of it. Just use the part-r-00000 file that your MapReduce job generated as its output, and create the target table in the HBase shell first:
create 'table'
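To load that output into HBase from the same program, here is a minimal sketch. It assumes the HBase 1.x client API and a table named 'pdftext' with a column family 'cf' created beforehand (create 'pdftext', 'cf' in the HBase shell); the class name, table name, and column family are illustrative, not from the original answer.
//HBase loader (sketch)
package com.tcs;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
public class HBaseTextLoader {
public static void main(String[] args) throws Exception {
// args[0] is the MapReduce output file, e.g. <output dir>/part-r-00000
Configuration conf = HBaseConfiguration.create();
try (Connection connection = ConnectionFactory.createConnection(conf);
Table table = connection.getTable(TableName.valueOf("pdftext"))) {
FileSystem fs = FileSystem.get(conf);
try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(args[0]))))) {
String line;
long rowId = 0;
while ((line = reader.readLine()) != null) {
// one HBase row per output line, stored under cf:line
Put put = new Put(Bytes.toBytes("row" + rowId++));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("line"), Bytes.toBytes(line));
table.put(put);
}
}
}
}
}
Alternatively, the job itself could write straight to HBase with TableOutputFormat instead of TextOutputFormat, but loading the finished part-r-00000 as above leaves the existing MapReduce job untouched.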
The MapReduce code here produces an empty output file. The code and the input are given below.
package temperature;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class TemperatureMapper extends Mapper<Text, Text, Text, IntWritable> {
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
if (isValueValid(value.toString())) {
Text key2 = new Text(getStateFromValue(value.toString()));
IntWritable value2 = new IntWritable(getTemperatureFrom(value.toString()));
context.write(key2, value2);
}
}
private boolean isValueValid(final String value) {
// We expect that the value is a String in the form of : State, Temperature. E.g. MP,77
Pattern p = Pattern.compile("\\S\\S\\,\\d+");
Matcher m = p.matcher(value);
return m.matches();
}
private String getStateFromValue(final String value) {
final String[] subvalues = value.split("\\,");
return subvalues[0];
}
private int getTemperatureFrom(final String value) {
final String[] subvalues = value.split("\\,");
return Integer.parseInt(subvalues[1]);
}
}
public class TemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(final Text key, final Iterable<IntWritable> values, final Context context) throws IOException, InterruptedException {
int sumOfTemperatures = 0;
int nbValues = 0;
int average=0;
for (IntWritable temperature : values) {
sumOfTemperatures += temperature.get();
nbValues++;
}
average = sumOfTemperatures / nbValues;
context.write(key, new IntWritable(average));
}
}
public class average {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
/*if (otherArgs.length != 2) {
System.err.println("Usage: Main <in> <out>");
System.exit(-1);
}*/
Job job = new Job(conf, "Calculate average Temperature");
job.setInputFormatClass(KeyValueTextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
job.setJarByClass(average.class);
job.setMapperClass(TemperatureMapper.class);
job.setReducerClass(TemperatureReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : -1);
}
}
The code works fine for the input:
Ujjain MP,77
Bhopal MP,76
Indore MP,72
Raipur CG,72
Durg CG,75
Raigarth CG,70
Kendujhar OR,69
Bhubaneswar OR,71
Puri OR,76
But not for some random input like:
hello VI,6
bye RE,2
Instead, it produces an empty output file.
Modify your regular expression to the following to support that kind of input:
Pattern p = Pattern.compile("[a-zA-Z]*\\s*[a-zA-Z]{2},\\d+$");
Also, you will need to split again to get the state:
String[] subvalues = value.split("\\,")[0].split(" ");
return subvalues[subvalues.length - 1];
I hope it helps. On my side, I had to change the map input key type to LongWritable; I am not sure why it is not complaining on your side, probably a different API version:
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
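Putting the two fragments together, here is a sketch of the adjusted helper methods (the class name is only for illustration; the rest of TemperatureMapper stays as it is):
package temperature;
import java.util.regex.Pattern;
public class TemperatureMapperHelpers {
// accepts both "MP,77" and "hello VI,6" style values
private static final Pattern VALID = Pattern.compile("[a-zA-Z]*\\s*[a-zA-Z]{2},\\d+$");
static boolean isValueValid(final String value) {
return VALID.matcher(value).matches();
}
static String getStateFromValue(final String value) {
// drop the temperature, then take the last space-separated token as the state
final String[] subvalues = value.split("\\,")[0].split(" ");
return subvalues[subvalues.length - 1];
}
}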
This code counts words and is supposed to skip two given words (in & of) from a file.
Please help me understand why it is not skipping these words.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
class skipwc_mapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer t = new StringTokenizer(line);
Text word = null;
while (t.hasMoreTokens()) {
word = new Text(t.nextToken());
context.write(word, new IntWritable(1));
}
}
}
class skipwc_reducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int tot = 0;
if (key.toString() != "in" && key.toString() != "of") {
while (values.iterator().hasNext()) {
tot += values.iterator().next().get();
}
context.write(key, new IntWritable(tot));
}
}
}
public static class skipwc_runner {
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(skipwc_runner.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(skipwc_mapper.class);
job.setReducerClass(skipwc_reducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
}
Use the equals method to compare Strings, like:
if (!"in".equals(key.toString()) && !"of".equals(key.toString()))
Also, it would be beneficial to skip of/in in the mapper rather than the reducer, as it is more efficient to remove the data before the sort and shuffle phase, so you avoid additional I/O. A sketch of that approach is below.
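A minimal sketch of that mapper-side filtering, reusing the skipwc_mapper class from the question (this is an assumed rewrite, not the answerer's exact code):
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
class skipwc_mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final Set<String> SKIP = new HashSet<String>(Arrays.asList("in", "of"));
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer t = new StringTokenizer(value.toString());
while (t.hasMoreTokens()) {
String token = t.nextToken();
// drop "in" and "of" here, before the sort and shuffle
if (!SKIP.contains(token)) {
context.write(new Text(token), new IntWritable(1));
}
}
}
}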
When I read an ORC file and write data to an ORC file, I get the following error:
expected org.apache.hadoop.hive.ql.io.orc.OrcStruct,
received org.apache.hadoop.hive.ql.io.orc.OrcSerde$OrcSerdeRow
Is the map output value class not right?
This is my program:
package com.baifendian.basicPlatform.hive.ql.io.orc;
import java.io.IOException;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcNewOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class OrcInput {
final static String inputSchema = "struct<c1:string,c2:string>";
final static String outputSchema = "struct<c1:string,c2:string>";
static StructObjectInspector inputOI;
static SettableStructObjectInspector outputOI;
static OrcSerde orcsd;
public static class OrcReaderMap extends
Mapper<NullWritable, OrcStruct, NullWritable, Writable> {
public void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
TypeInfo tfin = TypeInfoUtils
.getTypeInfoFromTypeString(inputSchema);
TypeInfo tfout = TypeInfoUtils
.getTypeInfoFromTypeString(outputSchema);
inputOI = (StructObjectInspector) OrcStruct
.createObjectInspector(tfin);
outputOI = (SettableStructObjectInspector) OrcStruct
.createObjectInspector(tfout);
orcsd = new OrcSerde();
List<? extends StructField> fldlst = outputOI
.getAllStructFieldRefs();
StringBuffer sbCols = new StringBuffer();
StringBuffer sbTyps = new StringBuffer();
for (StructField sf : fldlst) {
if (sbCols.length() > 0) {
sbCols.append(",");
}
sbCols.append(sf.getFieldName());
if (sbTyps.length() > 0) {
sbTyps.append(",");
}
sbTyps.append(sf.getFieldObjectInspector().getTypeName());
}
Properties props = new Properties();
props.put(IOConstants.COLUMNS, sbCols.toString());
props.put(IOConstants.COLUMNS_TYPES, sbTyps.toString());
orcsd.initialize(context.getConfiguration(), props);
}
public void map(NullWritable meaningless, OrcStruct orc, Context context)
throws IOException, InterruptedException {
List<Object> ilst = inputOI.getStructFieldsDataAsList(orc);
Text f1 = (Text) ilst.get(0);
Text f2 = (Text) ilst.get(1);
// output orc format
OrcStruct objOut = (OrcStruct) outputOI.create();
List<? extends StructField> flst = outputOI.getAllStructFieldRefs();
outputOI.setStructFieldData(objOut, flst.get(0), f1);
outputOI.setStructFieldData(objOut, flst.get(1), f2);
context.write(NullWritable.get(), orcsd.serialize(objOut, outputOI));
}
}
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = new Job(conf, "OrcReader");
job.setJarByClass(OrcInput.class);
job.setInputFormatClass(OrcNewInputFormat.class);
job.setOutputFormatClass(OrcNewOutputFormat.class);
FileInputFormat.addInputPath(job, new Path("/warehouse/bae_xinhua_test.db/orcinput"));
FileOutputFormat.setOutputPath(job, new Path("/warehouse/bae_xinhua_test.db/orcoutput"));
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(OrcStruct.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(OrcStruct.class);
job.setMapperClass(OrcInput.OrcReaderMap.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Please add job.setNumReduceTasks(0). With zero reduce tasks, the map output is handed straight to OrcNewOutputFormat, so the OrcSerdeRow returned by orcsd.serialize() never has to pass through the shuffle, where it would be checked against the OrcStruct map output value class.
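For reference, a sketch of where the call goes in the driver above (the placement is an assumption, not part of the original answer):
job.setMapperClass(OrcInput.OrcReaderMap.class);
job.setNumReduceTasks(0); // map-only job: output goes directly to OrcNewOutputFormat
System.exit(job.waitForCompletion(true) ? 0 : 1);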
I am coding a MapReduce job for finding the occurrences of a search string (passed through a command-line argument) in an input file stored in HDFS, using the old API.
Below is my Driver class -
public class StringSearchDriver
{
public static void main(String[] args) throws IOException
{
JobConf jc = new JobConf(StringSearchDriver.class);
jc.set("SearchWord", args[2]);
jc.setJobName("String Search");
FileInputFormat.addInputPath(jc, new Path(args[0]));
FileOutputFormat.setOutputPath(jc, new Path(args[1]));
jc.setMapperClass(StringSearchMap.class);
jc.setReducerClass(StringSearchReduce.class);
jc.setOutputKeyClass(Text.class);
jc.setOutputValueClass(IntWritable.class);
JobClient.runJob(jc);
}
}
Below is my Mapper Class -
public class StringSearchMap extends MapReduceBase implements
Mapper<LongWritable, Text, Text, IntWritable>
{
String searchWord;
public void configure(JobConf jc)
{
searchWord = jc.get("SearchWord");
}
@Override
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> out, Reporter reporter)
throws IOException
{
String[] input = value.toString().split(" ");
for(String word:input)
{
if (word.equalsIgnoreCase(searchWord))
out.collect(new Text(word), new IntWritable(1));
}
}
}
On running the job (the command-line string passed is "hi"), I am getting the error below:
14/09/21 22:35:41 INFO mapred.JobClient: Task Id : attempt_201409212134_0005_m_000001_2, Status : FAILED
java.lang.ClassCastException: interface javax.xml.soap.Text
at java.lang.Class.asSubclass(Class.java:3129)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:795)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:964)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:422)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:366)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:416)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Please suggest.
You auto-imported the wrong class.
Instead of org.apache.hadoop.io.Text, you imported javax.xml.soap.Text.
You can find a sample of this wrong import in this blog.
One more point: it is better to adopt the new API.
EDIT
I used the new API:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @author Unmesha sreeveni
* @Date 23 sep 2014
*/
public class StringSearchDriver extends Configured implements Tool {
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String line = value.toString();
String searchString = conf.get("word");
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if(token.equals(searchString)){
word.set(token);
context.write(word, one);
}
}
}
}
public static class Reduce extends
Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int res = ToolRunner.run(conf, new StringSearchDriver(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
if (args.length != 3) {
System.out
.printf("Usage: Search String <input dir> <output dir> <search word> \n");
System.exit(-1);
}
String source = args[0];
String dest = args[1];
String searchword = args[2];
Configuration conf = new Configuration();
conf.set("word", searchword);
Job job = new Job(conf, "Search String");
job.setJarByClass(StringSearchDriver.class);
FileSystem fs = FileSystem.get(conf);
Path in =new Path(source);
Path out =new Path(dest);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
boolean sucess = job.waitForCompletion(true);
return (sucess ? 0 : 1);
}
}
This works.
For Text, the required Hadoop package is org.apache.hadoop.io.
Check your packages:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
I have written a custom input format and configured it in the job, but the input format is not getting invoked. I have added some System.out.println statements to be printed while running the code, but none of them are printing. Even when I comment out the custom input format in the driver class, the output remains the same. What am I missing?
DriverClass
public class TestDriver {
public static void main(String args[]) throws IOException, InterruptedException, ClassNotFoundException{
Configuration conf = new Configuration();
Job job = new Job(conf,"Custom Format");
job.setMapperClass(CustomInputFormatmapper.class);
job.setReducerClass(CustomInputFormatReducer.class);
job.setInputFormatClass(CustomInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(LongWritable.class);
job.getConfiguration().set("fs.file.impl", "com.learn.WinLocalFileSystem");
String inputPath="In\\VISA_Details.csv";
Path inPath=new Path(inputPath);
String outputPath = "C:\\Users\\Desktop\\Hadoop learning\\output\\run1";
Path outPath=new Path(outputPath);
FileInputFormat.setInputPaths(job, inPath );
FileOutputFormat.setOutputPath(job, outPath);
System.out.println(job.waitForCompletion(true));
}
}
CUSTOM INPUTFORMAT
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class CustomInputFormat extends TextInputFormat{
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
{
System.out.println(" ------------ INSIDE createRecordReader()--------------");
return new CustomRecordReader();
}
}
CUSTOM RECORDREADER
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.LineReader;
public class CustomRecordReader extends RecordReader {
private CompressionCodecFactory compressionCodecs;
private final int NLINESTOPROCESS = 3;
private long start;
private long pos;
private long end;
private LineReader in;
private int maxLineLength;
private LongWritable key;
private Text value;
@Override
public void close() throws IOException {
// TODO Auto-generated method stub
}
@Override
public Object getCurrentKey() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@Override
public Object getCurrentValue() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@Override
public float getProgress() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return 0;
}
@Override
public void initialize(InputSplit inputsplit,TaskAttemptContext taskattemptcontext)
throws IOException, InterruptedException {
System.out.println(" ---------- INSIDE INITILISE: THIS IS NOT PRINTING----------");
FileSplit split = (FileSplit)inputsplit;
Configuration job = taskattemptcontext.getConfiguration();
maxLineLength = job.getInt("mapred.linerecordreader.maxlength", 2147483647);
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
compressionCodecs = new CompressionCodecFactory(job);
CompressionCodec codec = compressionCodecs.getCodec(file);
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
boolean skipFirstLine = false;
if(codec != null)
{
in = new LineReader(codec.createInputStream(fileIn), job);
end = 9223372036854775807L;
} else
{
if(start != 0L)
{
skipFirstLine = true;
start--;
fileIn.seek(start);
}
in = new LineReader(fileIn, job);
}
if(skipFirstLine)
start += in.readLine(new Text(), 0, (int)Math.min(2147483647L, end - start));
pos = start;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
System.out.println(" ---------- INSIDE nextKeyValue()------------");
if(key==null){
key = new LongWritable();
}
if(value==null){
value = new Text();
}
key.set(pos);
value.clear();
final Text newLine = new Text("\n");
Text newVal = new Text();
int newSize = 0;
for(int i =0;i<NLINESTOPROCESS;i++){
Text v = new Text();
while(pos<end){
newSize = in.readLine(v, maxLineLength,Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),maxLineLength));
value.append(v.getBytes(),0, v.getLength());
value.append(newLine.getBytes(),0, newLine.getLength());
if (newSize == 0) {
break;
}
pos += newSize;
if (newSize < maxLineLength) {
break;
}
}
}
return false;
}
}
MAPPER CLASS
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class CustomInputFormatmapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
public void map(LongWritable key, Text val, Context context)throws IOException, InterruptedException{
String value = val.toString();
String[] totalRows = value.split("\n");
int count =totalRows.length;
context.write(new LongWritable(Long.valueOf(count)), new LongWritable(1L));
}
}
REDUCER CLASS
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class CustomInputFormatReducer extends Reducer<LongWritable, LongWritable, LongWritable, LongWritable> {
public void reduce(LongWritable key, Iterable<LongWritable> val, Context context) throws IOException, InterruptedException{
System.out.println(" --------REDUCER--------");
long count =0;
for(LongWritable vals: val){
count++;
}
context.write(key, new LongWritable(count));
}
}
I am answering my own question, as this will help others get past the problem I faced. There was a problem with the packages I was importing.
These are the mistakes I made:
CUSTOMINPUTFORMAT CLASS
1) Missed the @Override annotation
2) Imported org.apache.hadoop.mapred.InputSplit instead of org.apache.hadoop.mapreduce.InputSplit
CUSTOMRECORDREADER
1) Imports were done from org.apache.hadoop.mapred.* and not from org.apache.hadoop.mapreduce.*;
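For reference, a sketch of the input format with those import fixes applied (assumed, not the poster's final file); CustomRecordReader is assumed to be changed to extend the new-API RecordReader<LongWritable, Text> as well:
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext; // new API, not org.apache.hadoop.mapred
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class CustomInputFormat extends TextInputFormat {
@Override // now actually overrides TextInputFormat.createRecordReader
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
System.out.println(" ------------ INSIDE createRecordReader()--------------");
return new CustomRecordReader();
}
}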