Reading images from HDFS using MapReduce - Hadoop

Please help me with this code. I am trying to read images from HDFS using WholeFileInputFormat with WholeFileRecordReader. There are no compile-time errors, but the code fails at runtime.
The output says: cannot create the instance of the given class WholeFileInputFormat.
I have written this code according to the comments on How to read multiple image files as input from hdfs in map-reduce?
Please help me with this code. It contains 3 classes. How do I debug it? Or is there another way?
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.imageio.ImageIO;
import net.semanticmetadata.lire.imageanalysis.AutoColorCorrelogram;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class map2 extends Configured implements Tool {
public static class MapClass extends MapReduceBase
implements Mapper<NullWritable, BytesWritable, Text, Text> {
private Text input_image = new Text();
private Text input_vector = new Text();
@Override
public void map(NullWritable key,BytesWritable value,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
System.out.println("CorrelogramIndex Method:");
String featureString;
int MAXIMUM_DISTANCE = 16;
AutoColorCorrelogram.Mode mode = AutoColorCorrelogram.Mode.FullNeighbourhood;
byte[] identifier=value.getBytes();
BufferedImage bimg = ImageIO.read(new ByteArrayInputStream(identifier));
AutoColorCorrelogram vd = new AutoColorCorrelogram(MAXIMUM_DISTANCE, mode);
vd.extract(bimg);
featureString = vd.getStringRepresentation();
double[] bytearray = vd.getDoubleHistogram();
System.out.println("image: " + identifier + " " + featureString);
System.out.println(" ------------- ");
input_image.set(identifier);
input_vector.set(featureString);
output.collect(input_image, input_vector);
}
}
public static class Reduce extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
@Override
public void reduce(Text key, Iterator<Text> values,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
String out_vector = "";
while (values.hasNext()) {
out_vector += (values.next().toString());
}
output.collect(key, new Text(out_vector));
}
}
static int printUsage() {
System.out.println("map2 [-m <maps>] [-r <reduces>] <input> <output>");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
@Override
public int run(String[] args) throws Exception {
JobConf conf = new JobConf(getConf(), map2.class);
conf.setJobName("image_mapreduce");
conf.setInputFormat(WholeFileInputFormat.class);
conf.setOutputFormat(NullOutputFormat.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(MapClass.class);
conf.setReducerClass(Reduce.class);
List<String> other_args = new ArrayList<>();
for (int i = 0; i < args.length; ++i) {
try {
switch (args[i]) {
case "-m":
conf.setNumMapTasks(Integer.parseInt(args[++i]));
break;
case "-r":
conf.setNumReduceTasks(Integer.parseInt(args[++i]));
break;
default:
other_args.add(args[i]);
break;
}
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of " + args[i]);
return printUsage();
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from "
+ args[i - 1]);
return printUsage();
}
}
// Make sure there are exactly 2 parameters left.
if (other_args.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: "
+ other_args.size() + " instead of 2.");
return printUsage();
}
FileInputFormat.setInputPaths(conf, other_args.get(0));
FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new map2(), args);
System.exit(res);
}
}
-----------------------------------------------------------------------------------
//WholeFileInputFormat
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.*;
public class WholeFileInputFormat<NullWritable, BytesWritable>
extends FileInputFormat<NullWritable, BytesWritable> {
// @Override
protected boolean isSplitable(JobContext context, Path file) {
return false;
}
//@Override
public WholeFileRecordReader createRecordReader(
InputSplit split, TaskAttemptContext context) throws IOException,
InterruptedException {
WholeFileRecordReader reader = new WholeFileRecordReader();
reader.initialize(split, context);
return reader;
}
@Override
public RecordReader<NullWritable, BytesWritable> getRecordReader(InputSplit split,
JobConf job, Reporter reporter)
throws IOException;
}
-------------------------------------------------------------------------------
//WholeFileRecordReader
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.TaskAttemptContext;
class WholeFileRecordReader implements RecordReader<NullWritable, BytesWritable> { //recordreader
private FileSplit fileSplit;
private Configuration conf;
private BytesWritable value = new BytesWritable();
private boolean processed = false;
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
this.fileSplit = (FileSplit) split;
this.conf = context.getJobConf();
}
@Override
public boolean next(NullWritable k, BytesWritable v) throws IOException {
if (!processed) {
byte[] contents = new byte[(int) fileSplit.getLength()];
Path file = fileSplit.getPath();
org.apache.hadoop.fs.FileSystem fs = file.getFileSystem(conf);
FSDataInputStream in = null;
try {
in = fs.open(file);
IOUtils.readFully(in, contents, 0, contents.length);
value.set(contents, 0, contents.length);
} finally {
IOUtils.closeStream(in);
}
processed = true;
return true;
}
return false;
}
@Override
public NullWritable createKey() {
return NullWritable.get();
}
@Override
public BytesWritable createValue() {
return value;
}
@Override
public long getPos() throws IOException {
throw new UnsupportedOperationException("Not supported yet.");
}
@Override
public void close() throws IOException {
throw new UnsupportedOperationException("Not supported yet.");
}
@Override
public float getProgress() throws IOException {
throw new UnsupportedOperationException("Not supported yet.");
}
}

WholeFileInputFormat is defined as abstract (getRecordReader is declared without a body), so Hadoop cannot create an instance of it.
Either make it concrete by implementing getRecordReader, or subclass it with a concrete implementation.
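For reference, a concrete old-API (org.apache.hadoop.mapred) version might look like the sketch below. It is only a starting point, not a drop-in replacement: it assumes WholeFileRecordReader is given the split and JobConf through a constructor instead of an initialize() call, so the record reader above would need a matching constructor.
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
// Concrete old-API input format: no generic parameters of its own and no
// unimplemented methods, so ReflectionUtils can instantiate it for the JobConf.
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
    @Override
    protected boolean isSplitable(FileSystem fs, Path filename) {
        return false; // read each image file as a single record
    }
    @Override
    public RecordReader<NullWritable, BytesWritable> getRecordReader(
            InputSplit split, JobConf job, Reporter reporter) throws IOException {
        // Assumes a (FileSplit, JobConf) constructor on the record reader.
        return new WholeFileRecordReader((FileSplit) split, job);
    }
}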

Related

Map reduce to duplicate the output files in order to put in different tables in hive

I have a MASTER table and two other tables in Hive.
The Master table contains:
MsgId, NbOfTxs, InitgPty, PmtInf, DbtrAcct
Sub Master Table 1:
MsgId, NbOfTxs, DbtrAcct
Sub Master Table 2:
MsgId, NbOfTxs, InitgPty
The data are in XML format. I have written MR code to parse it. I would like to create different part-r files so that the output goes into the Hive tables directly.
How can I put or load the output files directly into the corresponding Hive tables using MapReduce, or is there a better way to put these files into Hive tables?
My code is below:
package xmlcsvMR;
import javax.xml.stream.XMLStreamConstants;//XMLInputFactory;
import java.io.*;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import javax.xml.stream.*;
public class XmlParser11
{
public static class XmlInputFormat1 extends TextInputFormat {
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new XmlRecordReader();
}
/**
* XMLRecordReader class to read through a given xml document to output
* xml blocks as records as specified by the start tag and end tag
*
*/
public static class XmlRecordReader extends
RecordReader<LongWritable, Text> {
private byte[] startTag;
private byte[] endTag;
private long start;
private long end;
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
FileSplit fileSplit = (FileSplit) split;
// open the file and seek to the start of the split
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(fileSplit.getPath());
fsin.seek(start);
}
@Override
public boolean nextKeyValue() throws IOException,
InterruptedException {
if (fsin.getPos() < end) {
if (readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0,
buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length)
return true;
} else
i = 0;
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end)
return false;
}
}
}
}
public static class Map extends Mapper<LongWritable, Text,
Text, Text> {
@Override
protected void map(LongWritable key, Text value,
Mapper.Context context)
throws
IOException, InterruptedException {
String document = value.toString();
System.out.println("'" + document + "'");
try {
XMLStreamReader reader =
XMLInputFactory.newInstance().createXMLStreamReader(new
ByteArrayInputStream(document.getBytes()));
String propertyName = "";
String propertyValue = "";
String currentElement = "";
while (reader.hasNext()) {
int code = reader.next();
switch (code) {
case XMLStreamConstants.START_ELEMENT: //START_ELEMENT:
currentElement = reader.getLocalName();
break;
case XMLStreamConstants.CHARACTERS: //CHARACTERS:
if (currentElement.equalsIgnoreCase("MsgId")) {
propertyName += reader.getText();
//System.out.println(propertyName);
} else if (currentElement.equalsIgnoreCase("NbOfTxs")) {
propertyValue += reader.getText();
//System.out.println(propertyValue);
}
break;
}
}
reader.close();
context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
}
catch(Exception e){
throw new IOException(e);
}
}
}
public static class Reduce
extends Reducer<Text, Text, Text, Text> {
private Text outputKey = new Text();
public void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
for (Text value : values) {
outputKey.set(constructPropertyXml(key, value));
context.write(outputKey, null);
}
}
public static String constructPropertyXml(Text name, Text value) {
StringBuilder sb = new StringBuilder();
sb.append("MsgID ").append(name)
.append(" NbOfTxs ").append(value);
return sb.toString();
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
conf.set("xmlinput.start", "<Event>");
conf.set("xmlinput.end", "</Event>");
conf.set("mapred.textoutputformat.separatorText", ",");
Job job = new Job(conf);
job.setJarByClass(XmlParser11.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(XmlParser11.Map.class);
job.setReducerClass(XmlParser11.Reduce.class);
job.setInputFormatClass(XmlInputFormat1.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Try using MultipleOutputs. You can write to different files using this option, and hence make different copies of the output to be loaded into Hive.
A very good example using Hadoop 1.0.2 is here.
Below is the example taken from the javadocs:
Usage in Reducer:
<K, V> String generateFileName(K k, V v) {
return k.toString() + "_" + v.toString();
}
public class MOReduce extends
Reducer<WritableComparable, Writable,WritableComparable, Writable> {
private MultipleOutputs mos;
public void setup(Context context) {
...
mos = new MultipleOutputs(context);
}
public void reduce(WritableComparable key, Iterable<Writable> values,
Context context)
throws IOException {
...
mos.write("text", key, new Text("Hello"));
mos.write("seq", new LongWritable(1), new Text("Bye"), "seq_a");
mos.write("seq", new LongWritable(2), new Text("Chau"), "seq_b");
mos.write(key, new Text("value"), generateFileName(key, new Text("value")));
...
}
public void cleanup(Context context) throws IOException {
mos.close();
...
}
}
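For the named outputs "text" and "seq" in that example to resolve (and to get separate part files per Hive table), the driver also has to declare them with addNamedOutput. A minimal sketch, assuming LongWritable keys and Text values as in the writes above; the class name NamedOutputsConfig is just for illustration, adjust the key/value classes to your real types:
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class NamedOutputsConfig {
    // Call this from the driver after creating the Job.
    public static void addNamedOutputs(Job job) {
        // Matches mos.write("text", ...) in the reducer example.
        MultipleOutputs.addNamedOutput(job, "text", TextOutputFormat.class,
                LongWritable.class, Text.class);
        // Matches mos.write("seq", ...) in the reducer example.
        MultipleOutputs.addNamedOutput(job, "seq", SequenceFileOutputFormat.class,
                LongWritable.class, Text.class);
    }
}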

XML Parsing through Hadoop

I have a huge dump of wiki data which I need to analyze. These dumps are basically XML files. How do I parse the XML files using Hadoop MapReduce?
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class XmlDriver
{
public static class XmlInputFormat1 extends TextInputFormat {
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new XmlRecordReader();
}
/**
* XMLRecordReader class to read through a given xml document to output
* xml blocks as records as specified by the start tag and end tag
*
*/
public static class XmlRecordReader extends
RecordReader<LongWritable, Text> {
private byte[] startTag;
private byte[] endTag;
private long start;
private long end;
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
FileSplit fileSplit = (FileSplit) split;
// open the file and seek to the start of the split
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(fileSplit.getPath());
fsin.seek(start);
}
@Override
public boolean nextKeyValue() throws IOException,
InterruptedException {
if (fsin.getPos() < end) {
if (readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0,
buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length)
return true;
} else
i = 0;
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end)
return false;
}
}
}
}
public static class Map extends Mapper<LongWritable, Text,
Text, Text> {
@Override
protected void map(LongWritable key, Text value,
Mapper.Context context)
throws
IOException, InterruptedException {
String document = value.toString();
System.out.println("'" + document + "'");
try {
XMLStreamReader reader =
XMLInputFactory.newInstance().createXMLStreamReader(new
ByteArrayInputStream(document.getBytes()));
String propertyName = "";
String propertyValue = "";
String currentElement = "";
while (reader.hasNext()) {
int code = reader.next();
switch (code) {
case XMLStreamConstants.START_ELEMENT: //START_ELEMENT:
currentElement = reader.getLocalName();
break;
case XMLStreamConstants.CHARACTERS: //CHARACTERS:
if (currentElement.equalsIgnoreCase("name")) {
propertyName += reader.getText();
System.out.println("propertName"+propertyName);
} else if (currentElement.equalsIgnoreCase("value")) {
propertyValue += reader.getText();
System.out.println("propertyValue"+propertyValue);
}
break;
}
}
reader.close();
context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
}
catch(Exception e){
throw new IOException(e);
}
}
}
public static class Reduce
extends Reducer<Text, Text, Text, Text> {
@Override
protected void setup(
Context context)
throws IOException, InterruptedException {
context.write(new Text("<configuration>"), null);
}
@Override
protected void cleanup(
Context context)
throws IOException, InterruptedException {
context.write(new Text("</configuration>"), null);
}
private Text outputKey = new Text();
public void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
for (Text value : values) {
outputKey.set(constructPropertyXml(key, value));
context.write(outputKey, null);
}
}
public static String constructPropertyXml(Text name, Text value) {
StringBuilder sb = new StringBuilder();
sb.append("<property><name>").append(name)
.append("</name><value>").append(value)
.append("</value></property>");
return sb.toString();
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
conf.set("xmlinput.start", "<property>");
conf.set("xmlinput.end", "</property>");
Job job = new Job(conf);
job.setJarByClass(XmlDriver.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(XmlDriver.Map.class);
job.setReducerClass(XmlDriver.Reduce.class);
job.setInputFormatClass(XmlInputFormat1.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
argument:- /home/test.xml /home/out
Xml File(test.xml):
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs</name>
<value>2</value>
</property>
</configuration>
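For reference, with that input a single-reducer run should produce a part file along these lines (a sketch; the two properties come out in key-sorted order, wrapped by the setup/cleanup writes):
<configuration>
<property><name>dfs</name><value>2</value></property>
<property><name>dfs.replication</name><value>1</value></property>
</configuration>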

Hadoop CustomInputFormat not getting invoked

I have written a custom input format and configured it in the job, but the input format is not getting invoked. I have added some System.out.println statements to print while the code runs, but none of them print. Even when I comment out the custom input format in the driver class, the output remains the same. What am I missing?
DriverClass
public class TestDriver {
public static void main(String args[]) throws IOException, InterruptedException, ClassNotFoundException{
Configuration conf = new Configuration();
Job job = new Job(conf,"Custom Format");
job.setMapperClass(CustomInputFormatmapper.class);
job.setReducerClass(CustomInputFormatReducer.class);
job.setInputFormatClass(CustomInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(LongWritable.class);
job.getConfiguration().set("fs.file.impl", "com.learn.WinLocalFileSystem");
String inputPath="In\\VISA_Details.csv";
Path inPath=new Path(inputPath);
String outputPath = "C:\\Users\\Desktop\\Hadoop learning\\output\\run1";
Path outPath=new Path(outputPath);
FileInputFormat.setInputPaths(job, inPath );
FileOutputFormat.setOutputPath(job, outPath);
System.out.println(job.waitForCompletion(true));
}
}
CUSTOM INPUTFORMAT
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class CustomInputFormat extends TextInputFormat{
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
{
System.out.println(" ------------ INSIDE createRecordReader()--------------");
return new CustomRecordReader();
}
}
CUSTOM RECORDREADER
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.LineReader;
public class CustomRecordReader extends RecordReader {
private CompressionCodecFactory compressionCodecs;
private final int NLINESTOPROCESS = 3;
private long start;
private long pos;
private long end;
private LineReader in;
private int maxLineLength;
private LongWritable key;
private Text value;
@Override
public void close() throws IOException {
// TODO Auto-generated method stub
}
@Override
public Object getCurrentKey() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@Override
public Object getCurrentValue() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@Override
public float getProgress() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return 0;
}
@Override
public void initialize(InputSplit inputsplit,TaskAttemptContext taskattemptcontext)
throws IOException, InterruptedException {
System.out.println(" ---------- INSIDE INITILISE: THIS IS NOT PRINTING----------");
FileSplit split = (FileSplit)inputsplit;
Configuration job = taskattemptcontext.getConfiguration();
maxLineLength = job.getInt("mapred.linerecordreader.maxlength", 2147483647);
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
compressionCodecs = new CompressionCodecFactory(job);
CompressionCodec codec = compressionCodecs.getCodec(file);
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
boolean skipFirstLine = false;
if(codec != null)
{
in = new LineReader(codec.createInputStream(fileIn), job);
end = 9223372036854775807L;
} else
{
if(start != 0L)
{
skipFirstLine = true;
start--;
fileIn.seek(start);
}
in = new LineReader(fileIn, job);
}
if(skipFirstLine)
start += in.readLine(new Text(), 0, (int)Math.min(2147483647L, end - start));
pos = start;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
System.out.println(" ---------- INSIDE nextKeyValue()------------");
if(key==null){
key = new LongWritable();
}
if(value==null){
value = new Text();
}
key.set(pos);
value.clear();
final Text newLine = new Text("\n");
Text newVal = new Text();
int newSize = 0;
for(int i =0;i<NLINESTOPROCESS;i++){
Text v = new Text();
while(pos<end){
newSize = in.readLine(v, maxLineLength,Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),maxLineLength));
value.append(v.getBytes(),0, v.getLength());
value.append(newLine.getBytes(),0, newLine.getLength());
if (newSize == 0) {
break;
}
pos += newSize;
if (newSize < maxLineLength) {
break;
}
}
}
return false;
}
}
MAPPER CLASS
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class CustomInputFormatmapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
public void map(LongWritable key, Text val, Context context)throws IOException, InterruptedException{
String value = val.toString();
String[] totalRows = value.split("\n");
int count =totalRows.length;
context.write(new LongWritable(Long.valueOf(count)), new LongWritable(1L));
}
}
REDUCER CLASS
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class CustomInputFormatReducer extends Reducer<LongWritable, LongWritable, LongWritable, LongWritable> {
public void reduce(LongWritable key, Iterable<LongWritable> val, Context context) throws IOException, InterruptedException{
System.out.println(" --------REDUCER--------");
long count =0;
for(LongWritable vals: val){
count++;
}
context.write(key, new LongWritable(count));
}
}
I am answering my own question, as this will help others get past the problem I faced. There was a problem with the packages I was importing.
The mistakes I made:
CUSTOMINPUTFORMAT CLASS
1) Missed the @Override annotation
2) Imported org.apache.hadoop.mapred.InputSplit instead of org.apache.hadoop.mapreduce.InputSplit
CUSTOMRECORDREADER
1) Imports were done from org.apache.hadoop.mapred.* and not from org.apache.hadoop.mapreduce.*
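For comparison, the corrected input format with the new-API imports and the @Override annotation could look like this sketch (CustomRecordReader would likewise need to move to the org.apache.hadoop.mapreduce imports and ideally extend RecordReader<LongWritable, Text>):
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class CustomInputFormat extends TextInputFormat {
    // With the mapreduce (new API) parameter types this genuinely overrides
    // TextInputFormat.createRecordReader, so the job now invokes it.
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) {
        System.out.println(" ------------ INSIDE createRecordReader()--------------");
        return new CustomRecordReader();
    }
}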

Blank / Empty Values of Key and Value retrieved in the Mapper class

I have written MapReduce code to run on a CDH4 cluster. My requirement was to read the complete file as the value and the file name as the key. For that I wrote custom InputFormat and RecordReader classes.
Custom input format class: FullFileInputFormat.java
import java.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import FullFileRecordReader;
public class FullFileInputFormat extends FileInputFormat<Text, Text> {
@Override
public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf jobConf, Reporter reporter) throws IOException {
reporter.setStatus(split.toString());
return new FullFileRecordReader((FileSplit) split, jobConf);
}
}
And the custom RecordReader class: FullFileRecordReader.java
import java.io.BufferedReader;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
public class FullFileRecordReader implements RecordReader<Text, Text> {
private BufferedReader in;
private boolean processed = false;
private int processedBytes = 0;
private FileSplit fileSplit;
private JobConf conf;
public FullFileRecordReader(FileSplit fileSplit, JobConf conf) {
this.fileSplit = fileSplit;
this.conf = conf;
}
@Override
public void close() throws IOException {
if (in != null) {
in.close();
}
}
@Override
public Text createKey() {
return new Text("");
}
@Override
public Text createValue() {
return new Text("");
}
@Override
public long getPos() throws IOException {
return processedBytes;
}
@Override
public boolean next(Text key, Text value) throws IOException {
Path filePath = fileSplit.getPath();
if (!processed) {
key = new Text(filePath.getName());
value = new Text("");
FileSystem fs = filePath.getFileSystem(conf);
FSDataInputStream fileIn = fs.open(filePath);
byte[] b = new byte[1024];
int numBytes = 0;
while ((numBytes = fileIn.read(b)) > 0) {
value.append(b, 0, numBytes);
processedBytes += numBytes;
}
processed = true;
return true;
}
return false;
}
@Override
public float getProgress() throws IOException {
return 0;
}
}
Whenever I print the key and value in the RecordReader class, I get their values, but when I print the same in the Mapper class, I see blank values for them. I am unable to understand why the Mapper class does not get any data for the keys and values.
Currently I have only a Map job and no reduce job. The code is:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import FullFileInputFormat;
public class Source {
public static class Map extends MapReduceBase implements Mapper<Text, Text, Text, Text> {
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws java.io.IOException {
System.out.println("Processing " + key.toString());
System.out.println("Value: " + value.toString());
}
}
public static void main(String[] args) throws Exception {
JobConf job = new JobConf(Source.class);
job.setJobName("Source");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setJarByClass(Source.class);
job.setInputFormat(FullFileInputFormat.class);
job.setMapperClass(Map.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
JobClient.runJob(job);
}
}
You're creating new instances in your next method - Hadoop re-uses objects, so you are expected to populate the ones passed in. It should be as simple as amending it as follows:
@Override
public boolean next(Text key, Text value) throws IOException {
Path filePath = fileSplit.getPath();
if (!processed) {
// key = new Text(filePath.getName());
key.set(filePath.getName());
// value = new Text("");
value.clear();
}
I would also recommend pre-sizing the value Text to avoid 'growing' pains of its underlying byte array. Text has a private method called setCapacity, so you unfortunately can't call it - but if you used a BytesWritable to buffer the file input, you could call setCapacity inside your next method, passing the fileSplit length (note this may still be wrong if your file is compressed, as the file size is the compressed size).
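A minimal sketch of that suggestion, reusing the fileSplit/conf/processed fields from FullFileRecordReader above and switching the value type to BytesWritable (which also means changing the generics on the InputFormat and the Mapper, and adding org.apache.hadoop.io.BytesWritable and org.apache.hadoop.io.IOUtils imports):
public boolean next(Text key, BytesWritable value) throws IOException {
    if (processed) {
        return false;
    }
    Path filePath = fileSplit.getPath();
    key.set(filePath.getName());                 // populate the re-used key
    byte[] contents = new byte[(int) fileSplit.getLength()];
    value.setCapacity(contents.length);          // pre-size; may be too small if the file is compressed
    FileSystem fs = filePath.getFileSystem(conf);
    FSDataInputStream in = null;
    try {
        in = fs.open(filePath);
        IOUtils.readFully(in, contents, 0, contents.length);
        value.set(contents, 0, contents.length); // populate the re-used value
    } finally {
        IOUtils.closeStream(in);
    }
    processed = true;
    return true;
}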

Hadoop: Help needed while implementing a custom FileInputFormat class

I'm trying to implement a Map/Reduce job with Hadoop for a college assignment, but at the moment I'm totally stuck while implementing a custom FileInputFormat class to get the whole content of a file into my mapper.
I took the example out of "Hadoop: The Definitive Guide" without any changes. I can compile my source code, but when I run it, it throws this exception (at the moment I use Hadoop 1.0.2 on Debian 5.0):
Exception in thread "main" java.lang.RuntimeException: java.lang.NoSuchMethodException: org.myorg.ExampleFileInputFormat$WholeFileInputFormat.<init>()
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:115)
at org.apache.hadoop.mapred.JobConf.getInputFormat(JobConf.java:575)
at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:989)
at org.apache.hadoop.mapred.JobClient.writeSplits(JobClient.java:981)
at org.apache.hadoop.mapred.JobClient.access$600(JobClient.java:174)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:897)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:850)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1093)
at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:850)
at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:824)
at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1261)
at org.myorg.ExampleFileInputFormat.run(ExampleFileInputFormat.java:163)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
at org.myorg.ExampleFileInputFormat.main(ExampleFileInputFormat.java:172)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.util.RunJar.main(RunJar.java:156)
Caused by: java.lang.NoSuchMethodException: org.myorg.ExampleFileInputFormat$WholeFileInputFormat.<init>()
at java.lang.Class.getConstructor0(Class.java:2706)
at java.lang.Class.getDeclaredConstructor(Class.java:1985)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:109)
... 21 more
I'm kind of frustrated because I don't understand what happens, and I can't find anything via web search. Maybe some of you could take a look at my source. It's kind of stripped down at the moment for the purpose of debugging.
package org.myorg;
/*
*
*
*/
import java.io.IOException;
import java.util.*;
import java.io.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.util.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configured;
public class ExampleFileInputFormat extends Configured implements Tool {
/*
* <generics>
*/
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
@Override
protected boolean isSplitable(FileSystem fs, Path filename) {
return false;
}
@Override
public RecordReader<NullWritable, BytesWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
return new WholeFileRecordReader((FileSplit) split, job);
}
}
public class WholeFileRecordReader implements RecordReader<NullWritable, BytesWritable> {
private FileSplit fileSplit;
private Configuration conf;
private boolean processed = false;
public WholeFileRecordReader(FileSplit fileSplit, Configuration conf) throws IOException {
this.fileSplit = fileSplit;
this.conf = conf;
}
@Override
public NullWritable createKey() {
return NullWritable.get();
}
@Override
public BytesWritable createValue() {
return new BytesWritable();
}
@Override
public long getPos() throws IOException {
return processed ? fileSplit.getLength() : 0;
}
@Override
public float getProgress() throws IOException {
return processed ? 1.0f : 0.0f;
}
@Override
public boolean next(NullWritable key, BytesWritable value) throws IOException {
if (!processed) {
byte[] contents = new byte[(int) fileSplit.getLength()];
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream in = null;
try {
in = fs.open(file);
IOUtils.readFully(in, contents, 0, contents.length);
value.set(contents, 0, contents.length);
} finally {
IOUtils.closeStream(in);
}
processed = true;
return true;
}
return false;
}
@Override
public void close() throws IOException {
// do nothing
}
}
/* </generics> */
/*
* <Task1>:
* */
public static class ExampleMap extends MapReduceBase implements Mapper<NullWritable, BytesWritable, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(NullWritable key, BytesWritable value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
output.collect(new Text("test"), one);
}
}
public static class ExampleReduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
/* </Task1> */
/*
* <run>
**/
public int run(String[] args) throws Exception {
if (args.length != 3) {
printUsage();
return 1;
}
String useCase = args[0];
String inputPath = args[1];
String outputPath = args[2];
deleteOldOutput(outputPath);
JobConf conf = new JobConf(ExampleFileInputFormat.class);
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
FileInputFormat.setInputPaths(conf, new Path(inputPath));
/* conf: Task1 */
if (useCase.equals("cc_p")) {
conf.setJobName("WordCount");
/* Output: Key:Text -> Value:Integer */
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setOutputFormat(TextOutputFormat.class);
/* Input: Key.Text -> Value:Text */
conf.setInputFormat(WholeFileInputFormat.class);
conf.setMapperClass(ExampleMap.class);
conf.setReducerClass(ExampleReduce.class);
}
/* default-option: Exit */
else {
printUsage();
return 1;
}
JobClient.runJob(conf);
return 0;
}
/* </run> */
/*
* <Main>
*/
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new ExampleFileInputFormat(), args);
System.exit(res);
}
/* </Main> */
/*
* <Helper>
*/
private void printUsage() {
System.out.println("usage: [usecase] [input-path] [output-path]");
return;
}
private void deleteOldOutput(String outputPath) throws IOException {
// Delete the output directory if it exists already
Path outputDir = new Path(outputPath);
FileSystem.get(getConf()).delete(outputDir, true);
}
/* </Helper-> */
}
Can anyone help me out?
Greetings from Germany,
Alex
You need to make the inner class static:
public static class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
Otherwise javac generates a constructor that expects an enclosing ExampleFileInputFormat instance to be passed in.
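To see why, compare the constructors the compiler generates for non-static and static nested classes (NestingDemo is just an illustrative name):
public class NestingDemo {
    class Inner { }          // non-static: constructor takes a hidden NestingDemo reference
    static class Nested { }  // static: has a true no-argument constructor
    public static void main(String[] args) {
        Inner inner = new NestingDemo().new Inner(); // needs an enclosing instance
        Nested nested = new Nested();                // what ReflectionUtils.newInstance can call
    }
}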
