Passing arguments to a record reader in Hadoop MapReduce

This is my code for passing various arguments to the record reader:
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class Docsparser {
private static String Delimiter;
public static class DocsInputFormat extends FileInputFormat<Text, Text> {
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
return new DocsLineRecordReader();
}
}
public static class DocsLineRecordReader extends RecordReader<Text, Text> {
private Text key = new Text();
private Text value = new Text();
private int currentword = 0;
private String fileline;
private File file = null;
private String line;
private HWPFDocument document;
private WordExtractor extractor = null;
private String[] filedata;
StringBuilder sb = new StringBuilder();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit fileSplit = (FileSplit) split;
final Path file = fileSplit.getPath();
Configuration conf = context.getConfiguration();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream filein = fs.open(fileSplit.getPath());
String Delim = conf.get("Delim");
if (filein != null)
{
HWPFDocument document = new HWPFDocument(filein);
extractor = new WordExtractor(document);
fileline = extractor.getText();
filedata = fileline.split(Delim);
}
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException
{
if (key == null) {
key = new Text();
}
if (value == null) {
value = new Text();
}
if(currentword < filedata.length)
{
for ( currentword=0;currentword < filedata.length; currentword++)
{
sb.append(filedata[currentword] +",");
line = sb.toString();
}
key.set(line);
value.set("");
return true;
}
else
{
key = null;
value = null;
return false;
}
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return (100.0f / filedata.length * currentword) / 100.0f;
}
@Override
public void close() throws IOException {
}
}
public static class Map extends Mapper<Text, Text, Text, Text>{
public void map(Text key, Text value, Context context) throws IOException, InterruptedException
{
context.write(key,value);
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "Docsparser");
job.setJarByClass(Docsparser.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
Delimiter = args[2].toString();
conf.set("Delim",Delimiter);
job.setInputFormatClass(DocsInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Exception details:
15/09/28 03:50:04 INFO mapreduce.Job: Task Id :
attempt_1443193152998_2319_m_000000_2, Status : FAILED Error: java.lang.NullPointerException
at java.lang.String.split(String.java:2272)
at java.lang.String.split(String.java:2355)
at com.nielsen.grfe.Docsparser$DocsLineRecordReader.initialize(Docsparser.java:66)
at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.initialize(MapTask.java:548)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:786)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)

All the configuration values have to be set before the Job object is created, because Job takes a copy of the Configuration it is given. Move
Delimiter = args[2].toString();
conf.set("Delim",Delimiter);
before
Job job = new Job(conf, "Docsparser");

The NullPointerException occurs in the split call on the fileline string: the "Delim" configuration value was never set in the configuration the record reader sees, so your variable Delim is null.
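A minimal sketch of the reordered main method, using the same classes as in the question (alternatively, job.getConfiguration().set("Delim", args[2]) after the Job is created also works, because that writes to the copy the job actually uses):
public static void main(String[] args) throws Exception
{
    Configuration conf = new Configuration();
    // set the delimiter first, so the copy taken by the Job already contains it
    conf.set("Delim", args[2]);
    Job job = new Job(conf, "Docsparser");
    job.setJarByClass(Docsparser.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(Map.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(DocsInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}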

Related

Sort data Hadoop Mapreduce

I have the following algorithm that sorts data in alphabetical order:
public void setup(Context context) throws IOException,
InterruptedException {
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("amasort.case.sensitive", true);
}
@Override
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
word.set(line+"_"+key.toString());
context.write(word, one);
System.out.println("key:"+key.toString()+";value:"+value.toString());
}
}
public static class ForwardReducer
extends Reducer<Text,NullWritable,Text,NullWritable> {
private NullWritable result = NullWritable.get();
public void reduce(Text key, Iterable<NullWritable> values,
Context context
) throws IOException, InterruptedException {
String originalWord = key.toString();
originalWord = originalWord.substring(0, originalWord.lastIndexOf("_"));
key.set(originalWord);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
Job job = Job.getInstance(conf, "word sort");
job.setJarByClass(AmaSort.class);
job.setMapperClass(LineMapper.class);
// job.setCombinerClass(ForwardReducer.class);
job.setReducerClass(ForwardReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
I tried this algorithm to sort my data set, which contains lines like (#xxxxxxx, 0,tcp,xx,1,1,1,2,4,5,...), but in the output all lines that start with # are deleted and the structure of the data lines (0,tcp,x1x1,1,114,...) is modified. I just want to sort my dataset on that specific character (#), so that all lines starting with # come first in the file and the rest keep the same structure.
Can anyone help me modify this algorithm?
You can use the modified code below to perform the sorting:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class AmaSort
{
static Configuration conf = null;
private static boolean caseSensitive;
private static Text word = new Text();
public static class LineMapper extends Mapper<Object, Text, Text, NullWritable>{
public void setup(Context context) throws IOException, InterruptedException
{
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("amasort.case.sensitive", true);
}
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
word.set(line);
context.write(word, NullWritable.get());
}
}
public static class ForwardReducer extends Reducer<Text, NullWritable, Text, NullWritable>
{
private NullWritable result = NullWritable.get();
public void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException
{
context.write(key, result);
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
// Job job = Job.getInstance(conf, "word sort");
Job job = new Job(conf, "word sort");
job.setJarByClass(AmaSort.class);
job.setMapperClass(LineMapper.class);
// job.setCombinerClass(ForwardReducer.class);
job.setReducerClass(ForwardReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
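A note on why this answers the '#' requirement: the modified mapper writes the whole line as the key with a NullWritable value, so the shuffle's byte-wise Text ordering decides the output order, and '#' (0x23) sorts before the digits and letters the data lines begin with; the data lines keep their structure because each line is emitted unchanged as the key and the reducer simply writes the key back out. A tiny illustration (the sample values are made up):
// Text compares raw UTF-8 bytes, so '#' (0x23) < '0' (0x30): header lines sort first
Text header = new Text("#field1,field2,field3");
Text data = new Text("0,tcp,xx,1,1,1,2,4,5");
System.out.println(header.compareTo(data) < 0); // prints true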

Map reduce to duplicate the output files in order to put in different tables in hive

I have a MASTER table and two other tables in hive
Master table contains
MsgId,NbOfTxs,InitgPty,PmtInf,DbtrAcct
Sub Master Table 1
MsgId,NbOfTxs,DbtrAcct
Sub Master Table 2
MsgId,NbOfTxs,InitgPty
The data are in XML format. I have written MapReduce code to parse it. I would like to create different part-r files so that the output can go into the Hive tables directly.
How can I load the output files directly into the corresponding Hive tables using MapReduce, or is there a better way to put these files into Hive tables?
My code below
package xmlcsvMR;
import javax.xml.stream.XMLStreamConstants;//XMLInputFactory;
import java.io.*;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import javax.xml.stream.*;
public class XmlParser11
{
public static class XmlInputFormat1 extends TextInputFormat {
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new XmlRecordReader();
}
/**
* XMLRecordReader class to read through a given xml document to output
* xml blocks as records as specified by the start tag and end tag
*
*/
public static class XmlRecordReader extends
RecordReader<LongWritable, Text> {
private byte[] startTag;
private byte[] endTag;
private long start;
private long end;
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
FileSplit fileSplit = (FileSplit) split;
// open the file and seek to the start of the split
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(fileSplit.getPath());
fsin.seek(start);
}
@Override
public boolean nextKeyValue() throws IOException,
InterruptedException {
if (fsin.getPos() < end) {
if (readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0,
buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length)
return true;
} else
i = 0;
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end)
return false;
}
}
}
}
public static class Map extends Mapper<LongWritable, Text,
Text, Text> {
@Override
protected void map(LongWritable key, Text value,
Mapper.Context context)
throws
IOException, InterruptedException {
String document = value.toString();
System.out.println("‘" + document + "‘");
try {
XMLStreamReader reader =
XMLInputFactory.newInstance().createXMLStreamReader(new
ByteArrayInputStream(document.getBytes()));
String propertyName = "";
String propertyValue = "";
String currentElement = "";
while (reader.hasNext()) {
int code = reader.next();
switch (code) {
case XMLStreamConstants.START_ELEMENT: //START_ELEMENT:
currentElement = reader.getLocalName();
break;
case XMLStreamConstants.CHARACTERS: //CHARACTERS:
if (currentElement.equalsIgnoreCase("MsgId")) {
propertyName += reader.getText();
//System.out.println(propertyName);
} else if (currentElement.equalsIgnoreCase("NbOfTxs")) {
propertyValue += reader.getText();
//System.out.println(propertyValue);
}
break;
}
}
reader.close();
context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
}
catch(Exception e){
throw new IOException(e);
}
}
}
public static class Reduce
extends Reducer<Text, Text, Text, Text> {
private Text outputKey = new Text();
public void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
for (Text value : values) {
outputKey.set(constructPropertyXml(key, value));
context.write(outputKey, null);
}
}
public static String constructPropertyXml(Text name, Text value) {
StringBuilder sb = new StringBuilder();
sb.append("MsgID ").append(name)
.append(" NbOfTxs ").append(value);
return sb.toString();
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
conf.set("xmlinput.start", "<Event>");
conf.set("xmlinput.end", "</Event>");
conf.set("mapred.textoutputformat.separatorText", ",");
Job job = new Job(conf);
job.setJarByClass(XmlParser11.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(XmlParser11.Map.class);
job.setReducerClass(XmlParser11.Reduce.class);
job.setInputFormatClass(XmlInputFormat1.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Try using MultipleOutputs. You can write to different files using this option, and hence produce separate copies of the output to be loaded into Hive.
There is a very good example of this for Hadoop 1.0.2; below is the example taken from the javadocs.
Usage in a Reducer:
<K, V> String generateFileName(K k, V v) {
return k.toString() + "_" + v.toString();
}
public class MOReduce extends
Reducer<WritableComparable, Writable,WritableComparable, Writable> {
private MultipleOutputs mos;
public void setup(Context context) {
...
mos = new MultipleOutputs(context);
}
public void reduce(WritableComparable key, Iterable<Writable> values,
Context context)
throws IOException {
...
mos.write("text", , key, new Text("Hello"));
mos.write("seq", LongWritable(1), new Text("Bye"), "seq_a");
mos.write("seq", LongWritable(2), key, new Text("Chau"), "seq_b");
mos.write(key, new Text("value"), generateFileName(key, new Text("value")));
...
}
public void cleanup(Context context) throws IOException {
mos.close();
...
}
}
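To actually get separate files per Hive table, the named outputs also have to be registered in the driver. A hedged sketch follows; the output names master/submaster1/submaster2 and the base paths are made up for illustration, not taken from the post:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class TableSplitReducer extends Reducer<Text, Text, Text, Text> {
    private MultipleOutputs<Text, Text> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, Text>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            // the base output paths ("master/part", ...) are hypothetical; they create
            // per-table subdirectories under the job output directory
            mos.write("master", key, value, "master/part");
            mos.write("submaster1", key, value, "submaster1/part");
            mos.write("submaster2", key, value, "submaster2/part");
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }

    // In the driver, each named output must be registered before submitting the job:
    public static void registerOutputs(Job job) {
        MultipleOutputs.addNamedOutput(job, "master", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "submaster1", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "submaster2", TextOutputFormat.class, Text.class, Text.class);
    }
}
A Hive external table can then be pointed at each subdirectory, or the files can be loaded into the corresponding tables afterwards.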

XML Parsing through HADOOP

I have a huge dump of Wiki data which I need to analyze. The dumps are basically XML files. How do I parse the XML files using Hadoop MapReduce?
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class XmlDriver
{
public static class XmlInputFormat1 extends TextInputFormat {
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new XmlRecordReader();
}
/**
* XMLRecordReader class to read through a given xml document to output
* xml blocks as records as specified by the start tag and end tag
*
*/
public static class XmlRecordReader extends
RecordReader<LongWritable, Text> {
private byte[] startTag;
private byte[] endTag;
private long start;
private long end;
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
FileSplit fileSplit = (FileSplit) split;
// open the file and seek to the start of the split
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(fileSplit.getPath());
fsin.seek(start);
}
@Override
public boolean nextKeyValue() throws IOException,
InterruptedException {
if (fsin.getPos() < end) {
if (readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0,
buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length)
return true;
} else
i = 0;
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end)
return false;
}
}
}
}
public static class Map extends Mapper<LongWritable, Text,
Text, Text> {
@Override
protected void map(LongWritable key, Text value,
Mapper.Context context)
throws
IOException, InterruptedException {
String document = value.toString();
System.out.println("‘" + document + "‘");
try {
XMLStreamReader reader =
XMLInputFactory.newInstance().createXMLStreamReader(new
ByteArrayInputStream(document.getBytes()));
String propertyName = "";
String propertyValue = "";
String currentElement = "";
while (reader.hasNext()) {
int code = reader.next();
switch (code) {
case XMLStreamConstants.START_ELEMENT: //START_ELEMENT:
currentElement = reader.getLocalName();
break;
case XMLStreamConstants.CHARACTERS: //CHARACTERS:
if (currentElement.equalsIgnoreCase("name")) {
propertyName += reader.getText();
System.out.println("propertName"+propertyName);
} else if (currentElement.equalsIgnoreCase("value")) {
propertyValue += reader.getText();
System.out.println("propertyValue"+propertyValue);
}
break;
}
}
reader.close();
context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
}
catch(Exception e){
throw new IOException(e);
}
}
}
public static class Reduce
extends Reducer<Text, Text, Text, Text> {
@Override
protected void setup(
Context context)
throws IOException, InterruptedException {
context.write(new Text("<configuration>"), null);
}
@Override
protected void cleanup(
Context context)
throws IOException, InterruptedException {
context.write(new Text("</configuration>"), null);
}
private Text outputKey = new Text();
public void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
for (Text value : values) {
outputKey.set(constructPropertyXml(key, value));
context.write(outputKey, null);
}
}
public static String constructPropertyXml(Text name, Text value) {
StringBuilder sb = new StringBuilder();
sb.append("<property><name>").append(name)
.append("</name><value>").append(value)
.append("</value></property>");
return sb.toString();
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
conf.set("xmlinput.start", "<property>");
conf.set("xmlinput.end", "</property>");
Job job = new Job(conf);
job.setJarByClass(XmlDriver.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(XmlDriver.Map.class);
job.setReducerClass(XmlDriver.Reduce.class);
job.setInputFormatClass(XmlInputFormat1.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Arguments: /home/test.xml /home/out
XML file (test.xml):
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs</name>
<value>2</value>
</property>
</configuration>
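Assuming a single reducer, the output for test.xml should look roughly like this (reduce keys arrive sorted, so dfs precedes dfs.replication; this is a sketch, not captured output):
<configuration>
<property><name>dfs</name><value>2</value></property>
<property><name>dfs.replication</name><value>1</value></property>
</configuration>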

hadoop CustomInputFormat Not getting invoked

I have written a custom input format and configured it in the job, yet the input format is not getting invoked. I added some System.out.println statements to be printed while the code runs, but none of them print. Even when I comment out the custom input format in the driver class, the output stays the same. What am I missing?
DriverClass
public class TestDriver {
public static void main(String args[]) throws IOException, InterruptedException, ClassNotFoundException{
Configuration conf = new Configuration();
Job job = new Job(conf,"Custom Format");
job.setMapperClass(CustomInputFormatmapper.class);
job.setReducerClass(CustomInputFormatReducer.class);
job.setInputFormatClass(CustomInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(LongWritable.class);
job.getConfiguration().set("fs.file.impl", "com.learn.WinLocalFileSystem");
String inputPath="In\\VISA_Details.csv";
Path inPath=new Path(inputPath);
String outputPath = "C:\\Users\\Desktop\\Hadoop learning\\output\\run1";
Path outPath=new Path(outputPath);
FileInputFormat.setInputPaths(job, inPath );
FileOutputFormat.setOutputPath(job, outPath);
System.out.println(job.waitForCompletion(true));
}
}
CUSTOM INPUTFORMAT
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class CustomInputFormat extends TextInputFormat{
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
{
System.out.println(" ------------ INSIDE createRecordReader()--------------");
return new CustomRecordReader();
}
}
CUSTOM RECORDREADER
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.LineReader;
public class CustomRecordReader extends RecordReader {
private CompressionCodecFactory compressionCodecs;
private final int NLINESTOPROCESS = 3;
private long start;
private long pos;
private long end;
private LineReader in;
private int maxLineLength;
private LongWritable key;
private Text value;
@Override
public void close() throws IOException {
// TODO Auto-generated method stub
}
@Override
public Object getCurrentKey() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@Override
public Object getCurrentValue() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@Override
public float getProgress() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return 0;
}
@Override
public void initialize(InputSplit inputsplit,TaskAttemptContext taskattemptcontext)
throws IOException, InterruptedException {
System.out.println(" ---------- INSIDE INITILISE: THIS IS NOT PRINTING----------");
FileSplit split = (FileSplit)inputsplit;
Configuration job = taskattemptcontext.getConfiguration();
maxLineLength = job.getInt("mapred.linerecordreader.maxlength", 2147483647);
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
compressionCodecs = new CompressionCodecFactory(job);
CompressionCodec codec = compressionCodecs.getCodec(file);
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
boolean skipFirstLine = false;
if(codec != null)
{
in = new LineReader(codec.createInputStream(fileIn), job);
end = 9223372036854775807L;
} else
{
if(start != 0L)
{
skipFirstLine = true;
start--;
fileIn.seek(start);
}
in = new LineReader(fileIn, job);
}
if(skipFirstLine)
start += in.readLine(new Text(), 0, (int)Math.min(2147483647L, end - start));
pos = start;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
System.out.println(" ---------- INSIDE nextKeyValue()------------");
if(key==null){
key = new LongWritable();
}
if(value==null){
value = new Text();
}
key.set(pos);
value.clear();
final Text newLine = new Text("\n");
Text newVal = new Text();
int newSize = 0;
for(int i =0;i<NLINESTOPROCESS;i++){
Text v = new Text();
while(pos<end){
newSize = in.readLine(v, maxLineLength,Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),maxLineLength));
value.append(v.getBytes(),0, v.getLength());
value.append(newLine.getBytes(),0, newLine.getLength());
if (newSize == 0) {
break;
}
pos += newSize;
if (newSize < maxLineLength) {
break;
}
}
}
return false;
}
}
MAPPER CLASS
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class CustomInputFormatmapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
public void map(LongWritable key, Text val, Context context)throws IOException, InterruptedException{
String value = val.toString();
String[] totalRows = value.split("\n");
int count =totalRows.length;
context.write(new LongWritable(Long.valueOf(count)), new LongWritable(1L));
}
}
REDUCER CLASS
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class CustomInputFormatReducer extends Reducer<LongWritable, LongWritable, LongWritable, LongWritable> {
public void reduce(LongWritable key, Iterable<LongWritable> val, Context context) throws IOException, InterruptedException{
System.out.println(" --------REDUCER--------");
long count =0;
for(LongWritable vals: val){
count++;
}
context.write(key, new LongWritable(count));
}
}
I am answering my own question, as this will help others get through the problem I faced. The problem was with the packages I was importing.
These are the mistakes I made:
CUSTOMINPUTFORMAT CLASS
1) Missed the @Override annotation
2) Imported org.apache.hadoop.mapred.InputSplit instead of org.apache.hadoop.mapreduce.InputSplit
CUSTOMRECORDREADER
1) Imports were done from org.apache.hadoop.mapred.* and not from org.apache.hadoop.mapreduce.*
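For reference, a corrected CustomInputFormat sketched from the fixes above (the record reader should likewise be typed as org.apache.hadoop.mapreduce.RecordReader<LongWritable, Text>):
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CustomInputFormat extends TextInputFormat {
    // with the mapreduce imports the signature matches the parent method, so @Override
    // compiles and the framework actually calls this implementation
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        System.out.println(" ------------ INSIDE createRecordReader()--------------");
        return new CustomRecordReader();
    }
}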

Blank / Empty Values of Key and Value retrieved in the Mapper class

I have written MapReduce code to run on a CDH4 cluster. My requirement was to read the complete file as the value and the file name as the key. For that I wrote custom InputFormat and RecordReader classes.
Custom input format class: FullFileInputFormat.java
import java.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import FullFileRecordReader;
public class FullFileInputFormat extends FileInputFormat<Text, Text> {
@Override
public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf jobConf, Reporter reporter) throws IOException {
reporter.setStatus(split.toString());
return new FullFileRecordReader((FileSplit) split, jobConf);
}
}
And the custom RecordReader class: FullFileRecordReader.java
import java.io.BufferedReader;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
public class FullFileRecordReader implements RecordReader<Text, Text> {
private BufferedReader in;
private boolean processed = false;
private int processedBytes = 0;
private FileSplit fileSplit;
private JobConf conf;
public FullFileRecordReader(FileSplit fileSplit, JobConf conf) {
this.fileSplit = fileSplit;
this.conf = conf;
}
@Override
public void close() throws IOException {
if (in != null) {
in.close();
}
}
@Override
public Text createKey() {
return new Text("");
}
@Override
public Text createValue() {
return new Text("");
}
@Override
public long getPos() throws IOException {
return processedBytes;
}
@Override
public boolean next(Text key, Text value) throws IOException {
Path filePath = fileSplit.getPath();
if (!processed) {
key = new Text(filePath.getName());
value = new Text("");
FileSystem fs = filePath.getFileSystem(conf);
FSDataInputStream fileIn = fs.open(filePath);
byte[] b = new byte[1024];
int numBytes = 0;
while ((numBytes = fileIn.read(b)) > 0) {
value.append(b, 0, numBytes);
processedBytes += numBytes;
}
processed = true;
return true;
}
return false;
}
@Override
public float getProgress() throws IOException {
return 0;
}
}
Whenever I print the key and value in the RecordReader class, I get their values, but when I print the same in the Mapper class, I see blank values for them. I am unable to understand why the Mapper class does not get any data for the keys and values.
Currently I have only a Map job and no reduce job. The code is:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import FullFileInputFormat;
public class Source {
public static class Map extends MapReduceBase implements Mapper<Text, Text, Text, Text> {
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws java.io.IOException {
System.out.println("Processing " + key.toString());
System.out.println("Value: " + value.toString());
}
}
public static void main(String[] args) throws Exception {
JobConf job = new JobConf(Source.class);
job.setJobName("Source");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setJarByClass(Source.class);
job.setInputFormat(FullFileInputFormat.class);
job.setMapperClass(Map.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
JobClient.runJob(job);
}
}
You're creating new instances in your next method. Hadoop re-uses objects, so you are expected to populate the ones passed in. It should be as simple as amending it as follows:
@Override
public boolean next(Text key, Text value) throws IOException {
Path filePath = fileSplit.getPath();
if (!processed) {
// key = new Text(filePath.getName());
key.set(filePath.getName());
// value = new Text("");
value.clear();
}
I would also recommend pre-sizing the value Text to avoid the 'growing' pains of the value's underlying byte array. Text has a private method called setCapacity, so you unfortunately can't call it, but if you used a BytesWritable to buffer the file input, you can call setCapacity inside your next method, passing the fileSplit length (note this may still be wrong if your file is compressed, as the file size is the compressed size).
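A rough sketch of that suggestion, assuming the file is not compressed (so fileSplit.getLength() is the real on-disk size); it replaces the next method of FullFileRecordReader and needs import org.apache.hadoop.io.BytesWritable:
@Override
public boolean next(Text key, Text value) throws IOException {
    if (processed) {
        return false;
    }
    Path filePath = fileSplit.getPath();
    key.set(filePath.getName());

    // pre-size the buffer once instead of letting the value Text grow 1 KB at a time
    BytesWritable buffer = new BytesWritable();
    buffer.setCapacity((int) fileSplit.getLength());

    FileSystem fs = filePath.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(filePath);
    int size = 0;
    try {
        byte[] b = new byte[1024];
        int numBytes;
        while ((numBytes = fileIn.read(b)) > 0) {
            buffer.setSize(size + numBytes); // grows only if the length estimate was too small
            System.arraycopy(b, 0, buffer.getBytes(), size, numBytes);
            size += numBytes;
        }
    } finally {
        fileIn.close();
    }
    value.set(buffer.getBytes(), 0, size); // single copy into the reusable Text value
    processedBytes = size;
    processed = true;
    return true;
}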
