Passing arguments to a record reader in Hadoop MapReduce

This is my code for passing various arguments to the record reader:
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class Docsparser {
private static String Delimiter;
public static class DocsInputFormat extends FileInputFormat<Text, Text> {
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
return new DocsLineRecordReader();
}
}
public static class DocsLineRecordReader extends RecordReader<Text, Text> {
private Text key = new Text();
private Text value = new Text();
private int currentword = 0;
private String fileline;
private File file = null;
private String line;
private HWPFDocument document;
private WordExtractor extractor = null;
private String[] filedata;
StringBuilder sb = new StringBuilder();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit fileSplit = (FileSplit) split;
final Path file = fileSplit.getPath();
Configuration conf = context.getConfiguration();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream filein = fs.open(fileSplit.getPath());
String Delim = conf.get("Delim");
if (filein != null)
{
HWPFDocument document = new HWPFDocument(filein);
extractor = new WordExtractor(document);
fileline = extractor.getText();
filedata = fileline.split(Delim);
}
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException
{
if (key == null) {
key = new Text();
}
if (value == null) {
value = new Text();
}
if(currentword < filedata.length)
{
for ( currentword=0;currentword < filedata.length; currentword++)
{
sb.append(filedata[currentword] +",");
line = sb.toString();
}
key.set(line);
value.set("");
return true;
}
else
{
key = null;
value = null;
return false;
}
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return (100.0f / filedata.length * currentword) / 100.0f;
}
@Override
public void close() throws IOException {
}
}
public static class Map extends Mapper<Text, Text, Text, Text>{
public void map(Text key, Text value, Context context) throws IOException, InterruptedException
{
context.write(key,value);
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "Docsparser");
job.setJarByClass(Docsparser.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
Delimiter = args[2].toString();
conf.set("Delim",Delimiter);
job.setInputFormatClass(DocsInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Exception details:
15/09/28 03:50:04 INFO mapreduce.Job: Task Id :
attempt_1443193152998_2319_m_000000_2, Status : FAILED Error: java.lang.NullPointerException
at java.lang.String.split(String.java:2272)
at java.lang.String.split(String.java:2355)
at com.nielsen.grfe.Docsparser$DocsLineRecordReader.initialize(Docsparser.java:66)
at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.initialize(MapTask.java:548)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:786)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)

All the configuration values have to be set before the Job object is created, because Job takes a copy of the Configuration it is given. Move
Delimiter = args[2].toString();
conf.set("Delim",Delimiter);
before
Job job = new Job(conf, "Docsparser");

The NullPointerException occurs in the split call on the fileline string: the "Delim" configuration value was never set in the configuration the record reader sees, so your variable Delim is null.
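A minimal sketch of the reordered main method, using the same classes as in the question (alternatively, job.getConfiguration().set("Delim", args[2]) after the Job is created also works, because that writes to the copy the job actually uses):
public static void main(String[] args) throws Exception
{
    Configuration conf = new Configuration();
    // set the delimiter first, so the copy taken by the Job already contains it
    conf.set("Delim", args[2]);
    Job job = new Job(conf, "Docsparser");
    job.setJarByClass(Docsparser.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(Map.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(DocsInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}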

Related

Sort data Hadoop Mapreduce

I have the following algorithm that sorts data in alphabetical order:
public void setup(Context context) throws IOException,
InterruptedException {
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("amasort.case.sensitive", true);
}
@Override
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
word.set(line+"_"+key.toString());
context.write(word, one);
System.out.println("key:"+key.toString()+";value:"+value.toString());
}
}
public static class ForwardReducer
extends Reducer<Text,NullWritable,Text,NullWritable> {
private NullWritable result = NullWritable.get();
public void reduce(Text key, Iterable<NullWritable> values,
Context context
) throws IOException, InterruptedException {
String originalWord = key.toString();
originalWord = originalWord.substring(0, originalWord.lastIndexOf("_"));
key.set(originalWord);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
Job job = Job.getInstance(conf, "word sort");
job.setJarByClass(AmaSort.class);
job.setMapperClass(LineMapper.class);
// job.setCombinerClass(ForwardReducer.class);
job.setReducerClass(ForwardReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
I tried this algorithm to sort my data set, which contains lines like (#xxxxxxx, 0,tcp,xx,1,1,1,2,4,5,...), but in the output all lines that start with # are deleted and the structure of the data lines (0,tcp,x1x1,1,114,...) is modified. I just want to sort my dataset on that specific character (#), so that all lines starting with # come first in the file and the rest keep the same structure.
Can anyone help me modify this algorithm?
You can use the modified code below to perform the sorting:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class AmaSort
{
static Configuration conf = null;
private static boolean caseSensitive;
private static Text word = new Text();
public static class LineMapper extends Mapper<Object, Text, Text, NullWritable>{
public void setup(Context context) throws IOException, InterruptedException
{
conf = context.getConfiguration();
caseSensitive = conf.getBoolean("amasort.case.sensitive", true);
}
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
word.set(line);
context.write(word, NullWritable.get());
}
}
public static class ForwardReducer extends Reducer<Text, NullWritable, Text, NullWritable>
{
private NullWritable result = NullWritable.get();
public void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException
{
context.write(key, result);
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
// Job job = Job.getInstance(conf, "word sort");
Job job = new Job(conf, "word sort");
job.setJarByClass(AmaSort.class);
job.setMapperClass(LineMapper.class);
// job.setCombinerClass(ForwardReducer.class);
job.setReducerClass(ForwardReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
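A note on why this answers the '#' requirement: the modified mapper writes the whole line as the key with a NullWritable value, so the shuffle's byte-wise Text ordering decides the output order, and '#' (0x23) sorts before the digits and letters the data lines begin with; the data lines keep their structure because each line is emitted unchanged as the key and the reducer simply writes the key back out. A tiny illustration (the sample values are made up):
// Text compares raw UTF-8 bytes, so '#' (0x23) < '0' (0x30): header lines sort first
Text header = new Text("#field1,field2,field3");
Text data = new Text("0,tcp,xx,1,1,1,2,4,5");
System.out.println(header.compareTo(data) < 0); // prints true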

Map reduce to duplicate the output files in order to put in different tables in hive

I have a MASTER table and two other tables in hive
Master table contains
MsgId,NbOfTxs,InitgPty,PmtInf,DbtrAcct
Sub Master Table 1
MsgId,NbOfTxs,DbtrAcct
Sub Master Table 2
MsgId,NbOfTxs,InitgPty
The data are in XML format. I have written MapReduce code to parse it. I would like to create different part-r files so that the output can go into the Hive tables directly.
How can I load the output files directly into the corresponding Hive tables using MapReduce, or is there a better way to put these files into Hive tables?
My code below
package xmlcsvMR;
import javax.xml.stream.XMLStreamConstants;//XMLInputFactory;
import java.io.*;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import javax.xml.stream.*;
public class XmlParser11
{
public static class XmlInputFormat1 extends TextInputFormat {
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new XmlRecordReader();
}
/**
* XMLRecordReader class to read through a given xml document to output
* xml blocks as records as specified by the start tag and end tag
*
*/
public static class XmlRecordReader extends
RecordReader<LongWritable, Text> {
private byte[] startTag;
private byte[] endTag;
private long start;
private long end;
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
FileSplit fileSplit = (FileSplit) split;
// open the file and seek to the start of the split
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(fileSplit.getPath());
fsin.seek(start);
}
@Override
public boolean nextKeyValue() throws IOException,
InterruptedException {
if (fsin.getPos() < end) {
if (readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0,
buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length)
return true;
} else
i = 0;
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end)
return false;
}
}
}
}
public static class Map extends Mapper<LongWritable, Text,
Text, Text> {
@Override
protected void map(LongWritable key, Text value,
Mapper.Context context)
throws
IOException, InterruptedException {
String document = value.toString();
System.out.println("‘" + document + "‘");
try {
XMLStreamReader reader =
XMLInputFactory.newInstance().createXMLStreamReader(new
ByteArrayInputStream(document.getBytes()));
String propertyName = "";
String propertyValue = "";
String currentElement = "";
while (reader.hasNext()) {
int code = reader.next();
switch (code) {
case XMLStreamConstants.START_ELEMENT: //START_ELEMENT:
currentElement = reader.getLocalName();
break;
case XMLStreamConstants.CHARACTERS: //CHARACTERS:
if (currentElement.equalsIgnoreCase("MsgId")) {
propertyName += reader.getText();
//System.out.println(propertyName);
} else if (currentElement.equalsIgnoreCase("NbOfTxs")) {
propertyValue += reader.getText();
//System.out.println(propertyValue);
}
break;
}
}
reader.close();
context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
}
catch(Exception e){
throw new IOException(e);
}
}
}
public static class Reduce
extends Reducer<Text, Text, Text, Text> {
private Text outputKey = new Text();
public void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
for (Text value : values) {
outputKey.set(constructPropertyXml(key, value));
context.write(outputKey, null);
}
}
public static String constructPropertyXml(Text name, Text value) {
StringBuilder sb = new StringBuilder();
sb.append("MsgID ").append(name)
.append(" NbOfTxs ").append(value);
return sb.toString();
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
conf.set("xmlinput.start", "<Event>");
conf.set("xmlinput.end", "</Event>");
conf.set("mapred.textoutputformat.separatorText", ",");
Job job = new Job(conf);
job.setJarByClass(XmlParser11.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(XmlParser11.Map.class);
job.setReducerClass(XmlParser11.Reduce.class);
job.setInputFormatClass(XmlInputFormat1.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Try using MultipleOutputs. You can write to different files using this option, and hence produce separate copies of the output to be loaded into Hive.
There is a very good example of this for Hadoop 1.0.2; below is the example taken from the javadocs.
Usage in a Reducer:
<K, V> String generateFileName(K k, V v) {
return k.toString() + "_" + v.toString();
}
public class MOReduce extends
Reducer<WritableComparable, Writable,WritableComparable, Writable> {
private MultipleOutputs mos;
public void setup(Context context) {
...
mos = new MultipleOutputs(context);
}
public void reduce(WritableComparable key, Iterable<Writable> values,
Context context)
throws IOException {
...
mos.write("text", , key, new Text("Hello"));
mos.write("seq", LongWritable(1), new Text("Bye"), "seq_a");
mos.write("seq", LongWritable(2), key, new Text("Chau"), "seq_b");
mos.write(key, new Text("value"), generateFileName(key, new Text("value")));
...
}
public void cleanup(Context context) throws IOException {
mos.close();
...
}
}
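To actually get separate files per Hive table, the named outputs also have to be registered in the driver. A hedged sketch follows; the output names master/submaster1/submaster2 and the base paths are made up for illustration, not taken from the post:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class TableSplitReducer extends Reducer<Text, Text, Text, Text> {
    private MultipleOutputs<Text, Text> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, Text>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            // the base output paths ("master/part", ...) are hypothetical; they create
            // per-table subdirectories under the job output directory
            mos.write("master", key, value, "master/part");
            mos.write("submaster1", key, value, "submaster1/part");
            mos.write("submaster2", key, value, "submaster2/part");
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }

    // In the driver, each named output must be registered before submitting the job:
    public static void registerOutputs(Job job) {
        MultipleOutputs.addNamedOutput(job, "master", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "submaster1", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "submaster2", TextOutputFormat.class, Text.class, Text.class);
    }
}
A Hive external table can then be pointed at each subdirectory, or the files can be loaded into the corresponding tables afterwards.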

XML Parsing through HADOOP

I have a huge dump of Wiki data which I need to analyze. The dumps are basically XML files. How do I parse the XML files using Hadoop MapReduce?
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class XmlDriver
{
public static class XmlInputFormat1 extends TextInputFormat {
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new XmlRecordReader();
}
/**
* XMLRecordReader class to read through a given xml document to output
* xml blocks as records as specified by the start tag and end tag
*
*/
public static class XmlRecordReader extends
RecordReader<LongWritable, Text> {
private byte[] startTag;
private byte[] endTag;
private long start;
private long end;
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
FileSplit fileSplit = (FileSplit) split;
// open the file and seek to the start of the split
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(fileSplit.getPath());
fsin.seek(start);
}
@Override
public boolean nextKeyValue() throws IOException,
InterruptedException {
if (fsin.getPos() < end) {
if (readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0,
buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length)
return true;
} else
i = 0;
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end)
return false;
}
}
}
}
public static class Map extends Mapper<LongWritable, Text,
Text, Text> {
@Override
protected void map(LongWritable key, Text value,
Mapper.Context context)
throws
IOException, InterruptedException {
String document = value.toString();
System.out.println("‘" + document + "‘");
try {
XMLStreamReader reader =
XMLInputFactory.newInstance().createXMLStreamReader(new
ByteArrayInputStream(document.getBytes()));
String propertyName = "";
String propertyValue = "";
String currentElement = "";
while (reader.hasNext()) {
int code = reader.next();
switch (code) {
case XMLStreamConstants.START_ELEMENT: //START_ELEMENT:
currentElement = reader.getLocalName();
break;
case XMLStreamConstants.CHARACTERS: //CHARACTERS:
if (currentElement.equalsIgnoreCase("name")) {
propertyName += reader.getText();
System.out.println("propertName"+propertyName);
} else if (currentElement.equalsIgnoreCase("value")) {
propertyValue += reader.getText();
System.out.println("propertyValue"+propertyValue);
}
break;
}
}
reader.close();
context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
}
catch(Exception e){
throw new IOException(e);
}
}
}
public static class Reduce
extends Reducer<Text, Text, Text, Text> {
@Override
protected void setup(
Context context)
throws IOException, InterruptedException {
context.write(new Text("<configuration>"), null);
}
@Override
protected void cleanup(
Context context)
throws IOException, InterruptedException {
context.write(new Text("</configuration>"), null);
}
private Text outputKey = new Text();
public void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
for (Text value : values) {
outputKey.set(constructPropertyXml(key, value));
context.write(outputKey, null);
}
}
public static String constructPropertyXml(Text name, Text value) {
StringBuilder sb = new StringBuilder();
sb.append("<property><name>").append(name)
.append("</name><value>").append(value)
.append("</value></property>");
return sb.toString();
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
conf.set("xmlinput.start", "<property>");
conf.set("xmlinput.end", "</property>");
Job job = new Job(conf);
job.setJarByClass(XmlDriver.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(XmlDriver.Map.class);
job.setReducerClass(XmlDriver.Reduce.class);
job.setInputFormatClass(XmlInputFormat1.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Arguments: /home/test.xml /home/out
XML file (test.xml):
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs</name>
<value>2</value>
</property>
</configuration>
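Assuming a single reducer, the output for test.xml should look roughly like this (reduce keys arrive sorted, so dfs precedes dfs.replication; this is a sketch, not captured output):
<configuration>
<property><name>dfs</name><value>2</value></property>
<property><name>dfs.replication</name><value>1</value></property>
</configuration>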

hadoop CustomInputFormat Not getting invoked

I have written a custom input format and configured it in the job, yet the input format is not getting invoked. I added some System.out.println statements to be printed while the code runs, but none of them print. Even when I comment out the custom input format in the driver class, the output stays the same. What am I missing?
DriverClass
public class TestDriver {
public static void main(String args[]) throws IOException, InterruptedException, ClassNotFoundException{
Configuration conf = new Configuration();
Job job = new Job(conf,"Custom Format");
job.setMapperClass(CustomInputFormatmapper.class);
job.setReducerClass(CustomInputFormatReducer.class);
job.setInputFormatClass(CustomInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(LongWritable.class);
job.getConfiguration().set("fs.file.impl", "com.learn.WinLocalFileSystem");
String inputPath="In\\VISA_Details.csv";
Path inPath=new Path(inputPath);
String outputPath = "C:\\Users\\Desktop\\Hadoop learning\\output\\run1";
Path outPath=new Path(outputPath);
FileInputFormat.setInputPaths(job, inPath );
FileOutputFormat.setOutputPath(job, outPath);
System.out.println(job.waitForCompletion(true));
}
}
CUSTOM INPUTFORMAT
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class CustomInputFormat extends TextInputFormat{
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
{
System.out.println(" ------------ INSIDE createRecordReader()--------------");
return new CustomRecordReader();
}
}
CUSTOM RECORDREADER
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.LineReader;
public class CustomRecordReader extends RecordReader {
private CompressionCodecFactory compressionCodecs;
private final int NLINESTOPROCESS = 3;
private long start;
private long pos;
private long end;
private LineReader in;
private int maxLineLength;
private LongWritable key;
private Text value;
@Override
public void close() throws IOException {
// TODO Auto-generated method stub
}
@Override
public Object getCurrentKey() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@Override
public Object getCurrentValue() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return null;
}
@Override
public float getProgress() throws IOException, InterruptedException {
// TODO Auto-generated method stub
return 0;
}
@Override
public void initialize(InputSplit inputsplit,TaskAttemptContext taskattemptcontext)
throws IOException, InterruptedException {
System.out.println(" ---------- INSIDE INITILISE: THIS IS NOT PRINTING----------");
FileSplit split = (FileSplit)inputsplit;
Configuration job = taskattemptcontext.getConfiguration();
maxLineLength = job.getInt("mapred.linerecordreader.maxlength", 2147483647);
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
compressionCodecs = new CompressionCodecFactory(job);
CompressionCodec codec = compressionCodecs.getCodec(file);
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
boolean skipFirstLine = false;
if(codec != null)
{
in = new LineReader(codec.createInputStream(fileIn), job);
end = 9223372036854775807L;
} else
{
if(start != 0L)
{
skipFirstLine = true;
start--;
fileIn.seek(start);
}
in = new LineReader(fileIn, job);
}
if(skipFirstLine)
start += in.readLine(new Text(), 0, (int)Math.min(2147483647L, end - start));
pos = start;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
System.out.println(" ---------- INSIDE nextKeyValue()------------");
if(key==null){
key = new LongWritable();
}
if(value==null){
value = new Text();
}
key.set(pos);
value.clear();
final Text newLine = new Text("\n");
Text newVal = new Text();
int newSize = 0;
for(int i =0;i<NLINESTOPROCESS;i++){
Text v = new Text();
while(pos<end){
newSize = in.readLine(v, maxLineLength,Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),maxLineLength));
value.append(v.getBytes(),0, v.getLength());
value.append(newLine.getBytes(),0, newLine.getLength());
if (newSize == 0) {
break;
}
pos += newSize;
if (newSize < maxLineLength) {
break;
}
}
}
return false;
}
}
MAPPER CLASS
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class CustomInputFormatmapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
public void map(LongWritable key, Text val, Context context)throws IOException, InterruptedException{
String value = val.toString();
String[] totalRows = value.split("\n");
int count =totalRows.length;
context.write(new LongWritable(Long.valueOf(count)), new LongWritable(1L));
}
}
REDUCER CLASS
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class CustomInputFormatReducer extends Reducer<LongWritable, LongWritable, LongWritable, LongWritable> {
public void reduce(LongWritable key, Iterable<LongWritable> val, Context context) throws IOException, InterruptedException{
System.out.println(" --------REDUCER--------");
long count =0;
for(LongWritable vals: val){
count++;
}
context.write(key, new LongWritable(count));
}
}
I am answering my own question, as this will help others get through the problem I faced. The problem was with the packages I was importing.
These are the mistakes I made:
CUSTOMINPUTFORMAT CLASS
1) Missed the @Override annotation
2) Imported org.apache.hadoop.mapred.InputSplit instead of org.apache.hadoop.mapreduce.InputSplit
CUSTOMRECORDREADER
1) Imports were done from org.apache.hadoop.mapred.* and not from org.apache.hadoop.mapreduce.*
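For reference, a corrected CustomInputFormat sketched from the fixes above (the record reader should likewise be typed as org.apache.hadoop.mapreduce.RecordReader<LongWritable, Text>):
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CustomInputFormat extends TextInputFormat {
    // with the mapreduce imports the signature matches the parent method, so @Override
    // compiles and the framework actually calls this implementation
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        System.out.println(" ------------ INSIDE createRecordReader()--------------");
        return new CustomRecordReader();
    }
}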

Blank / Empty Values of Key and Value retrieved in the Mapper class

I have written MapReduce code to run on a CDH4 cluster. My requirement was to read the complete file as the value and the file name as the key. For that I wrote custom InputFormat and RecordReader classes.
Custom input format class: FullFileInputFormat.java
import java.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import FullFileRecordReader;
public class FullFileInputFormat extends FileInputFormat<Text, Text> {
@Override
public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf jobConf, Reporter reporter) throws IOException {
reporter.setStatus(split.toString());
return new FullFileRecordReader((FileSplit) split, jobConf);
}
}
And the custom RecordReader class: FullFileRecordReader.java
import java.io.BufferedReader;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
public class FullFileRecordReader implements RecordReader<Text, Text> {
private BufferedReader in;
private boolean processed = false;
private int processedBytes = 0;
private FileSplit fileSplit;
private JobConf conf;
public FullFileRecordReader(FileSplit fileSplit, JobConf conf) {
this.fileSplit = fileSplit;
this.conf = conf;
}
@Override
public void close() throws IOException {
if (in != null) {
in.close();
}
}
@Override
public Text createKey() {
return new Text("");
}
@Override
public Text createValue() {
return new Text("");
}
@Override
public long getPos() throws IOException {
return processedBytes;
}
@Override
public boolean next(Text key, Text value) throws IOException {
Path filePath = fileSplit.getPath();
if (!processed) {
key = new Text(filePath.getName());
value = new Text("");
FileSystem fs = filePath.getFileSystem(conf);
FSDataInputStream fileIn = fs.open(filePath);
byte[] b = new byte[1024];
int numBytes = 0;
while ((numBytes = fileIn.read(b)) > 0) {
value.append(b, 0, numBytes);
processedBytes += numBytes;
}
processed = true;
return true;
}
return false;
}
@Override
public float getProgress() throws IOException {
return 0;
}
}
Whenever I print the key and value in the RecordReader class, I get their values, but when I print the same in the Mapper class, I see blank values for them. I am unable to understand why the Mapper class does not get any data for the keys and values.
Currently I have only a Map job and no reduce job. The code is:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import FullFileInputFormat;
public class Source {
public static class Map extends MapReduceBase implements Mapper<Text, Text, Text, Text> {
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws java.io.IOException {
System.out.println("Processing " + key.toString());
System.out.println("Value: " + value.toString());
}
}
public static void main(String[] args) throws Exception {
JobConf job = new JobConf(Source.class);
job.setJobName("Source");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setJarByClass(Source.class);
job.setInputFormat(FullFileInputFormat.class);
job.setMapperClass(Map.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
JobClient.runJob(job);
}
}
You're creating new instances in your next method. Hadoop re-uses objects, so you are expected to populate the ones passed in. It should be as simple as amending it as follows:
@Override
public boolean next(Text key, Text value) throws IOException {
Path filePath = fileSplit.getPath();
if (!processed) {
// key = new Text(filePath.getName());
key.set(filePath.getName());
// value = new Text("");
value.clear();
}
I would also recommend pre-sizing the value Text to avoid the 'growing' pains of the value's underlying byte array. Text has a private method called setCapacity, so you unfortunately can't call it, but if you used a BytesWritable to buffer the file input, you can call setCapacity inside your next method, passing the fileSplit length (note this may still be wrong if your file is compressed, as the file size is the compressed size).
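A rough sketch of that suggestion, assuming the file is not compressed (so fileSplit.getLength() is the real on-disk size); it replaces the next method of FullFileRecordReader and needs import org.apache.hadoop.io.BytesWritable:
@Override
public boolean next(Text key, Text value) throws IOException {
    if (processed) {
        return false;
    }
    Path filePath = fileSplit.getPath();
    key.set(filePath.getName());

    // pre-size the buffer once instead of letting the value Text grow 1 KB at a time
    BytesWritable buffer = new BytesWritable();
    buffer.setCapacity((int) fileSplit.getLength());

    FileSystem fs = filePath.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(filePath);
    int size = 0;
    try {
        byte[] b = new byte[1024];
        int numBytes;
        while ((numBytes = fileIn.read(b)) > 0) {
            buffer.setSize(size + numBytes); // grows only if the length estimate was too small
            System.arraycopy(b, 0, buffer.getBytes(), size, numBytes);
            size += numBytes;
        }
    } finally {
        fileIn.close();
    }
    value.set(buffer.getBytes(), 0, size); // single copy into the reusable Text value
    processedBytes = size;
    processed = true;
    return true;
}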
