isSplitable in CombineFileInputFormat does not work - hadoop

I have thousands of small files, and I want to process them with CombineFileInputFormat.
With CombineFileInputFormat, multiple small files go to one mapper, and each file should not be split.
Here is a snippet from one of the small input files:
vers,3
period,2015-01-26-18-12-00,438469546,449329626,complete
config,libdvm.so,chromeview
pkgproc,com.futuredial.digitchat,10021,,0ns:10860078
pkgpss,com.futuredial.digitchat,10021,,0ns:9:6627:6627:6637:5912:5912:5912
pkgsvc-run,com.futuredial.digitchat,10021,.LiveScreenService,1,0n:10860078
pkgsvc-start,com.futuredial.digitchat,10021,.LiveScreenService,1,0n:10860078
pkgproc,com.google.android.youtube,10103,,0ns:10860078
pkgpss,com.google.android.youtube,10103,,0ns:9:12986:13000:13021:11552:11564:11580
pkgsvc- run,com.google.android.youtube,10103,com.google.android.apps.youtube.app.offline.transfer.OfflineTransferService,1,0n:10860078
pkgsvc- start,com.google.android.youtube,10103,com.google.android.apps.youtube.app.offline.transfer.OfflineTransferService,1,0n:10860078
I want to pass the whole file content to the mapper. However, Hadoop splits the file in half.
For example, the above file may be split into:
vers,3
period,2015-01-26-18-12-00,438469546,449329626,complete
config,libdvm.so,chromeview
pkgproc,com.futuredial.digitchat,#the line has been cut
But I want the content of the whole file to be processed.
Here is my code, which references Reading file as single record in hadoop.
The driver code:
public class CombineSmallfiles {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: combinesmallfiles <in> <out>");
            System.exit(2);
        }
        conf.setInt("mapred.min.split.size", 1);
        conf.setLong("mapred.max.split.size", 26214400); // 25 MB
        //conf.setLong("mapred.max.split.size", 134217728); // 128 MB
        //conf.setInt("mapred.reduce.tasks", 5);
        Job job = new Job(conf, "combine smallfiles");
        job.setJarByClass(CombineSmallfiles.class);
        job.setMapperClass(CombineSmallfileMapper.class);
        //job.setReducerClass(IdentityReducer.class);
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        MultipleOutputs.addNamedOutput(job, "pkgproc", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "pkgpss", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "pkgsvc", TextOutputFormat.class, Text.class, Text.class);
        job.setInputFormatClass(CombineSmallfileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        int exitFlag = job.waitForCompletion(true) ? 0 : 1;
        System.exit(exitFlag);
    }
}
My Mapper code
public class CombineSmallfileMapper extends Mapper<NullWritable, Text, Text, Text> {
    private Text file = new Text();
    private MultipleOutputs mos;
    private String period;
    private Long elapsed;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
        mos = new MultipleOutputs(context);
    }

    @Override
    protected void map(NullWritable key, Text value, Context context) throws IOException, InterruptedException {
        String file_name = context.getConfiguration().get("map.input.file.name");
        String[] filename_tokens = file_name.split("_");
        String uuid = filename_tokens[0];
        String[] datetime_tokens;
        try {
            datetime_tokens = filename_tokens[1].split("-");
        } catch (ArrayIndexOutOfBoundsException err) {
            throw new ArrayIndexOutOfBoundsException(file_name);
        }
        String year, month, day, hour, minute, sec, msec;
        year = datetime_tokens[0];
        month = datetime_tokens[1];
        day = datetime_tokens[2];
        hour = datetime_tokens[3];
        minute = datetime_tokens[4];
        sec = datetime_tokens[5];
        msec = datetime_tokens[6];
        String datetime = year + "-" + month + "-" + day + " " + hour + ":" + minute + ":" + sec + "." + msec;
        String content = value.toString();
        String[] lines = content.split("\n");
        for (int u = 0; u < lines.length; u++) {
            String line = lines[u];
            String[] tokens = line.split(",");
            if (tokens[0].equals("period")) {
                period = tokens[1];
                try {
                    long startTime = Long.valueOf(tokens[2]);
                    long endTime = Long.valueOf(tokens[3]);
                    elapsed = endTime - startTime;
                } catch (NumberFormatException err) {
                    throw new NumberFormatException(line);
                }
            } else if (tokens[0].equals("pkgproc")) {
                String proc_info = "";
                try {
                    proc_info += period + "," + String.valueOf(elapsed) + "," + tokens[2] + "," + tokens[3];
                } catch (ArrayIndexOutOfBoundsException err) {
                    throw new ArrayIndexOutOfBoundsException("pkgproc: " + content + " line: " + line);
                }
                for (int i = 4; i < tokens.length; i++) {
                    String[] state_info = tokens[i].split(":");
                    String state = "";
                    state += "," + state_info[0].charAt(0) + "," + state_info[0].charAt(1) + "," + state_info[0].charAt(2) + "," + state_info[1];
                    mos.write("pkgproc", new Text(tokens[1]), new Text(proc_info + state + ',' + uuid + ',' + datetime));
                }
            } else if (tokens[0].equals("pkgpss")) {
                String proc_info = "";
                proc_info += period + "," + String.valueOf(elapsed) + "," + tokens[2] + "," + tokens[3];
                for (int i = 4; i < tokens.length; i++) {
                    String[] state_info = tokens[i].split(":");
                    String state = "";
                    state += "," + state_info[0].charAt(0) + "," + state_info[0].charAt(1) + "," + state_info[0].charAt(2) + "," + state_info[1] + "," + state_info[2] + "," + state_info[3] + "," + state_info[4] + "," + state_info[5] + "," + state_info[6] + "," + state_info[7];
                    mos.write("pkgpss", new Text(tokens[1]), new Text(proc_info + state + ',' + uuid + ',' + datetime));
                }
            } else if (tokens[0].startsWith("pkgsvc")) {
                String[] stateName = tokens[0].split("-");
                String proc_info = "";
                // tokens[2] = uid, tokens[3] = serviceName
                proc_info += stateName[1] + ',' + period + "," + String.valueOf(elapsed) + "," + tokens[2] + "," + tokens[3];
                String opcount = tokens[4];
                for (int i = 5; i < tokens.length; i++) {
                    String[] state_info = tokens[i].split(":");
                    String state = "";
                    state += "," + state_info[0].charAt(0) + "," + state_info[0].charAt(1) + "," + state_info[1];
                    mos.write("pkgsvc", new Text(tokens[1]), new Text(proc_info + state + ',' + opcount + ',' + uuid + ',' + datetime));
                }
            }
        }
    }
}
My CombineFileInputFormat, which overrides isSplitable and returns false:
public class CombineSmallfileInputFormat extends CombineFileInputFormat<NullWritable, Text> {
    @Override
    public RecordReader<NullWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        return new CombineFileRecordReader<NullWritable, Text>((CombineFileSplit) split, context, WholeFileRecordReader.class);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}
The WholeFileRecordReader
public class WholeFileRecordReader extends RecordReader<NullWritable, Text> {
    //private static final Logger LOG = Logger.getLogger(WholeFileRecordReader.class);

    /** The path to the file to read. */
    private final Path mFileToRead;
    /** The length of this file. */
    private final long mFileLength;
    /** The Configuration. */
    private final Configuration mConf;
    /** Whether this FileSplit has been processed. */
    private boolean mProcessed;
    /** Single Text to store the file name of the current file. */
    // private final Text mFileName;
    /** Single Text to store the value of this file (the value) when it is read. */
    private final Text mFileText;

    /**
     * Implementation detail: This constructor is built to be called via
     * reflection from within CombineFileRecordReader.
     *
     * @param fileSplit The CombineFileSplit that this will read from.
     * @param context The context for this task.
     * @param pathToProcess The path index from the CombineFileSplit to process in this record.
     */
    public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context,
            Integer pathToProcess) {
        mProcessed = false;
        mFileToRead = fileSplit.getPath(pathToProcess);
        mFileLength = fileSplit.getLength(pathToProcess);
        mConf = context.getConfiguration();
        context.getConfiguration().set("map.input.file.name", mFileToRead.getName());
        assert 0 == fileSplit.getOffset(pathToProcess);
        //if (LOG.isDebugEnabled()) {
        //    LOG.debug("FileToRead is: " + mFileToRead.toString());
        //    LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths());
        //    try {
        //        FileSystem fs = FileSystem.get(mConf);
        //        assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
        //    } catch (IOException ioe) {
        //        // oh well, I was just testing.
        //    }
        //}
        //mFileName = new Text();
        mFileText = new Text();
    }

    /** {@inheritDoc} */
    @Override
    public void close() throws IOException {
        mFileText.clear();
    }

    /**
     * Returns the current key, which is always NullWritable here.
     *
     * @return NullWritable.
     * @throws IOException never.
     * @throws InterruptedException never.
     */
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    /**
     * <p>Returns the current value. If the file has been read with a call to nextKeyValue(),
     * this returns the contents of the file as a Text. Otherwise, it returns an
     * empty Text.</p>
     *
     * <p>Throws an IllegalStateException if initialize() is not called first.</p>
     *
     * @return A Text containing the contents of the file to read.
     * @throws IOException never.
     * @throws InterruptedException never.
     */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return mFileText;
    }

    /**
     * Returns whether the file has been processed or not. Since only one record
     * will be generated for a file, progress will be 0.0 if it has not been processed,
     * and 1.0 if it has.
     *
     * @return 0.0 if the file has not been processed. 1.0 if it has.
     * @throws IOException never.
     * @throws InterruptedException never.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return (mProcessed) ? (float) 1.0 : (float) 0.0;
    }

    /**
     * All of the internal state is already set on instantiation. This is a no-op.
     *
     * @param split The InputSplit to read. Unused.
     * @param context The context for this task. Unused.
     * @throws IOException never.
     * @throws InterruptedException never.
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // no-op.
    }

    /**
     * <p>If the file has not already been read, this reads it into memory, so that a call
     * to getCurrentValue() will return the entire contents of this file as Text,
     * and getCurrentKey() will return NullWritable. Then, returns true.
     * If it has already been read, then returns false without updating any internal state.</p>
     *
     * @return Whether the file was read or not.
     * @throws IOException if there is an error reading the file.
     * @throws InterruptedException if there is an error.
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!mProcessed) {
            if (mFileLength > (long) Integer.MAX_VALUE) {
                throw new IOException("File is longer than Integer.MAX_VALUE.");
            }
            byte[] contents = new byte[(int) mFileLength];
            FileSystem fs = mFileToRead.getFileSystem(mConf);
            FSDataInputStream in = null;
            try {
                // Set the contents of this file.
                in = fs.open(mFileToRead);
                IOUtils.readFully(in, contents, 0, contents.length);
                mFileText.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeQuietly(in);
            }
            mProcessed = true;
            return true;
        }
        return false;
    }
}
I want every mapper to parse multiple small files, and each small file must not be split.
However, the above code cuts (splits) my input files, which raises a parsing error (since my parser splits each line into tokens).
My understanding is that CombineFileInputFormat gathers multiple files into one split, and each split is fed to one mapper. Therefore, one mapper can handle multiple files.
In my code, the maximum input split size is set to 25 MB, so I think the problem is that CombineFileInputFormat splits the last part of a small file in order to satisfy the split size limit.
However, I have overridden isSplitable to return false, and it still splits the small files.
What is the correct way to do this?
Is it possible to specify the number of files per mapper, rather than specifying an input split size?

Use the setMaxSplitSize() method in your constructor code; it should work.
It effectively tells CombineFileInputFormat the maximum split size:
public class CFInputFormat extends CombineFileInputFormat<FileLineWritable, Text> {
    public CFInputFormat() {
        super();
        setMaxSplitSize(67108864); // 64 MB, default block size on hadoop
    }

    public RecordReader<FileLineWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        return new CombineFileRecordReader<FileLineWritable, Text>((CombineFileSplit) split, context, CFRecordReader.class);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}
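Applied to the classes in the question, a minimal sketch of the same idea (reusing the WholeFileRecordReader shown above) could look like this:
public class CombineSmallfileInputFormat extends CombineFileInputFormat<NullWritable, Text> {

    public CombineSmallfileInputFormat() {
        super();
        // Upper bound on the size of a combined split; small files are packed together
        // up to this limit, but no individual file is cut because isSplitable() is false.
        setMaxSplitSize(26214400); // 25 MB, the limit the driver tried to set via mapred.max.split.size
    }

    @Override
    public RecordReader<NullWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        return new CombineFileRecordReader<NullWritable, Text>((CombineFileSplit) split, context, WholeFileRecordReader.class);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}
Depending on the Hadoop version, CombineFileInputFormat can also pick up this limit from the mapreduce.input.fileinputformat.split.maxsize property, but calling setMaxSplitSize() in the constructor is the most direct route.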

Related

How to remove r-00000 extension from reducer output in mapreduce

I am able to rename my reducer output file correctly, but the r-00000 suffix still persists.
I have used MultipleOutputs in my reducer class.
Here are the details. I am not sure what I am missing or what extra I have to do.
public class MyReducer extends Reducer<NullWritable, Text, NullWritable, Text> {
    private Logger logger = Logger.getLogger(MyReducer.class);
    private MultipleOutputs<NullWritable, Text> multipleOutputs;
    String strName = "";

    public void setup(Context context) {
        logger.info("Inside Reducer.");
        multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
    }

    @Override
    public void reduce(NullWritable Key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            final String valueStr = value.toString();
            StringBuilder sb = new StringBuilder();
            sb.append(strArrvalueStr[0] + "|!|");
            multipleOutputs.write(NullWritable.get(), new Text(sb.toString()), strName);
        }
    }

    public void cleanup(Context context) throws IOException,
            InterruptedException {
        multipleOutputs.close();
    }
}
I was able to do it explicitly after my job finishes, and that's OK for me, since there is no delay in the job:
if (b) {
    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HHmm");
    Calendar cal = Calendar.getInstance();
    String strDate = dateFormat.format(cal.getTime());
    FileSystem hdfs = FileSystem.get(getConf());
    FileStatus fs[] = hdfs.listStatus(new Path(args[1]));
    if (fs != null) {
        for (FileStatus aFile : fs) {
            if (!aFile.isDir()) {
                hdfs.rename(aFile.getPath(), new Path(aFile.getPath().toString() + ".txt"));
            }
        }
    }
}
A more suitable approach to the problem would be changing the OutputFormat.
For example, if you are using TextOutputFormat, take the source code of the TextOutputFormat class and modify the method below so that it produces the proper filename (without r-00000). The modified output format then needs to be set in the driver.
public synchronized static String getUniqueFile(TaskAttemptContext context, String name, String extension) {
    /*TaskID taskId = context.getTaskAttemptID().getTaskID();
    int partition = taskId.getId();*/
    StringBuilder result = new StringBuilder();
    result.append(name);
    /*
     * result.append('-');
     * result.append(TaskID.getRepresentingCharacter(taskId.getTaskType()));
     * result.append('-'); result.append(NUMBER_FORMAT.format(partition));
     * result.append(extension);
     */
    return result.toString();
}
So whatever name is passed through MultipleOutputs, the filename will be created from it.
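For completeness, a minimal driver sketch under the assumption that the modified class is saved as a hypothetical MyTextOutputFormat (a copy of TextOutputFormat containing the getUniqueFile() change above):
// Sketch only: MyTextOutputFormat is a hypothetical copy of TextOutputFormat
// with the modified getUniqueFile() shown above.
Job job = new Job(conf, "rename-output-example");
job.setOutputFormatClass(MyTextOutputFormat.class);
MultipleOutputs.addNamedOutput(job, "myNamedOutput",
        MyTextOutputFormat.class, NullWritable.class, Text.class);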

ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.hbase.io.ImmutableBytesWritable

I have exported a table from HBase to a file in a format much like org.apache.hadoop.mapreduce.lib.output.TextOutputFormat. To import the exported text-format file, I tweaked the open-source Import code to support importing text-based files instead of SequenceFiles.
job.setInputFormatClass(TextInputFormat.class);
While running the Import class, I get the following exception:
java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.hbase.io.ImmutableBytesWritable
at Import$Importer.map(Import.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)
Here is my Export class, which was tweaked to write the content to a file from the ExporterTable:
public class Export
{
private static final Log LOG = LogFactory.getLog(Export.class);
final static String NAME = "export";
final static String RAW_SCAN = "hbase.mapreduce.include.deleted.rows";
private static OutputStream out;
private static final String utf8 = "UTF-8";
private static final byte[] newline;
private static final byte[] keyValueSeparator;
static {
try {
newline = "\n".getBytes(utf8);
keyValueSeparator = "\t".getBytes(utf8);
}
catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
/**
* Mapper.
*/
static class ExporterTable extends TableMapper<ImmutableBytesWritable, Result>
{
/**
* @param row The current table row key.
* @param value The columns.
* @param context The current context.
* @throws IOException When something is broken with the data.
* @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN,
* org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException {
try {
context.write(row, value);
write(row, value);
System.out.println(row);
System.out.println(value);
}
catch (InterruptedException e) {
e.printStackTrace();
}
}
}
/**
* Sets up the actual job.
*
* @param conf The current configuration.
* @param args The command line parameters.
* @return The newly created job.
* @throws IOException When setting up the job fails.
*/
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
String tableName = args[0];
// this.out = new DataOutputStream(fos);
Path outputDir = new Path(args[1]);
Job job = new Job(conf, NAME + "_" + tableName);
job.setJobName(NAME + "_" + tableName);
job.setJarByClass(ExporterTable.class);
// Set optional scan parameters
Scan s = getConfiguredScanForJob(conf, args);
TableMapReduceUtil.initTableMapperJob(tableName, s, ExporterTable.class, ImmutableBytesWritable.class, IntWritable.class, job);
// No reducers. Just write straight to output files.
job.setNumReduceTasks(0);
job.setOutputValueClass(Text.class);
// FileOutputFormat.setOutputPath(job, outputDir);
job.setOutputFormatClass(NullOutputFormat.class);
TableMapReduceUtil.addHBaseDependencyJars(conf);
TableMapReduceUtil.addDependencyJars(conf, JsonProcessingException.class);
TableMapReduceUtil.addDependencyJars(job);
return job;
}
private static Scan getConfiguredScanForJob(Configuration conf, String[] args) throws IOException {
Scan s = new Scan();
// Optional arguments.
// Set Scan Versions
int versions = args.length > 2 ? Integer.parseInt(args[2]) : 1;
s.setMaxVersions(versions);
// Set Scan Range
long startTime = args.length > 3 ? Long.parseLong(args[3]) : 0L;
long endTime = args.length > 4 ? Long.parseLong(args[4]) : Long.MAX_VALUE;
s.setTimeRange(startTime, endTime);
// Set cache blocks
s.setCacheBlocks(false);
// Set Scan Column Family
boolean raw = Boolean.parseBoolean(conf.get(RAW_SCAN));
if (raw) {
s.setRaw(raw);
}
if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
}
// Set RowFilter or Prefix Filter if applicable.
Filter exportFilter = getExportFilter(args);
if (exportFilter != null) {
LOG.info("Setting Scan Filter for Export.");
s.setFilter(exportFilter);
}
LOG.info("versions=" + versions + ", starttime=" + startTime + ", endtime=" + endTime + ", keepDeletedCells=" + raw);
return s;
}
private static Filter getExportFilter(String[] args) {
Filter exportFilter = null;
String filterCriteria = (args.length > 5) ? args[5] : null;
if (filterCriteria == null)
return null;
if (filterCriteria.startsWith("^")) {
String regexPattern = filterCriteria.substring(1, filterCriteria.length());
exportFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator(regexPattern));
}
else {
exportFilter = new PrefixFilter(Bytes.toBytes(filterCriteria));
}
return exportFilter;
}
/*
* @param errorMsg Error message. Can be null.
*/
private static void usage(final String errorMsg) {
if (errorMsg != null && errorMsg.length() > 0) {
System.err.println("ERROR: " + errorMsg);
}
System.err.println("Usage: Export [-D <property=value>]* <tablename> <outputdir> [<versions> " + "[<starttime> [<endtime>]] [^[regex pattern] or [Prefix] to filter]]\n");
System.err.println(" Note: -D properties will be applied to the conf used. ");
System.err.println(" For example: ");
System.err.println(" -D mapred.output.compress=true");
System.err.println(" -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec");
System.err.println(" -D mapred.output.compression.type=BLOCK");
System.err.println(" Additionally, the following SCAN properties can be specified");
System.err.println(" to control/limit what is exported..");
System.err.println(" -D " + TableInputFormat.SCAN_COLUMN_FAMILY + "=<familyName>");
System.err.println(" -D " + RAW_SCAN + "=true");
System.err.println("For performance consider the following properties:\n" + " -Dhbase.client.scanner.caching=100\n" + " -Dmapred.map.tasks.speculative.execution=false\n" + " -Dmapred.reduce.tasks.speculative.execution=false");
}
/**
* Main entry point.
*
* @param args The command line parameters.
* @throws Exception When running the job fails.
*/
public static void main(String[] args) throws Exception {
Configuration conf = HBaseConfiguration.create();
conf.set("mapreduce.framework.name", "local");
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
usage("Wrong number of arguments: " + otherArgs.length);
System.exit(-1);
}
boolean jobStatus = false;
Job job = createSubmittableJob(conf, otherArgs);
try {
File f = new File("Test");
out = new FileOutputStream(f);
jobStatus = job.waitForCompletion(true);
}
catch (Exception e) {
e.printStackTrace();
}
finally {
IOUtils.closeStream(out);
}
// convertTextToSequence(conf);
System.exit(jobStatus ? 0 : 1);
}
public static void write(ImmutableBytesWritable key, Result value) throws IOException {
boolean nullKey = key == null;
boolean nullValue = value == null;
if (nullKey && nullValue) {
return;
}
if (!nullKey) {
writeObject(key);
}
if (!(nullKey || nullValue)) {
out.write(keyValueSeparator);
}
if (!nullValue) {
writeObject(value);
}
out.write(newline);
}
/**
* Write the object to the byte stream, handling Text as a special
* case.
* @param o the object to print
* @throws IOException if the write throws, we pass it on
*/
private static void writeObject(Object o) throws IOException {
if (o instanceof Text) {
Text to = (Text) o;
out.write(to.getBytes(), 0, to.getLength());
}
else {
out.write(o.toString().getBytes(utf8));
}
}
}
Any help is appreciated.
You have declared the map method as follows, writing the output key as ImmutableBytesWritable:
public void map(ImmutableBytesWritable row, Result value, Context context)
throws IOException {
try {
context.write(row, value);
You have to set the job parameters as follows for the MapOutputKeyClass and MapOutputValueClass:
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Result.class);
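A minimal sketch of where those calls could sit in the question's createSubmittableJob() (the mapper emits Result values, so Result.class is used as the map output value class here):
// Sketch: excerpt of a createSubmittableJob() like the one in the question,
// with the map output key/value classes set explicitly.
TableMapReduceUtil.initTableMapperJob(tableName, s, ExporterTable.class,
        ImmutableBytesWritable.class, Result.class, job);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Result.class);
job.setNumReduceTasks(0); // map-only, as in the question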
Have a look at a working example: 7. Export an HBase table to File.

Hadoop Mapreduce: Custom Input Format

I have a file containing text with "^" characters in between:
SOME TEXT^GOES HERE^
AND A FEW^MORE
GOES HERE
I am writing a custom input format to delimit the rows using the "^" character, i.e. the output of the mapper should be:
SOME TEXT
GOES HERE
AND A FEW
MORE GOES HERE
I have written a custom input format which extends FileInputFormat and also a custom record reader that extends RecordReader. The code for my custom record reader is given below. I don't know how to proceed with this code; I am having trouble with the nextKeyValue() method in the while-loop part. How should I read the data from a split and generate my custom key-value pairs? I am using the new mapreduce package instead of the old mapred package.
public class MyRecordReader extends RecordReader<LongWritable, Text>
{
long start, current, end;
Text value;
LongWritable key;
LineReader reader;
FileSplit split;
Path path;
FileSystem fs;
FSDataInputStream in;
Configuration conf;
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext cont) throws IOException, InterruptedException
{
conf = cont.getConfiguration();
split = (FileSplit)inputSplit;
path = split.getPath();
fs = path.getFileSystem(conf);
in = fs.open(path);
reader = new LineReader(in, conf);
start = split.getStart();
current = start;
end = split.getLength() + start;
}
@Override
public boolean nextKeyValue() throws IOException
{
if(key==null)
key = new LongWritable();
key.set(current);
if(value==null)
value = new Text();
long readSize = 0;
while(current<end)
{
Text tmpText = new Text();
readSize = read //here how should i read data from the split, and generate key-value?
if(readSize==0)
break;
current+=readSize;
}
if(readSize==0)
{
key = null;
value = null;
return false;
}
return true;
}
@Override
public float getProgress() throws IOException
{
}
@Override
public LongWritable getCurrentKey() throws IOException
{
}
@Override
public Text getCurrentValue() throws IOException
{
}
@Override
public void close() throws IOException
{
}
}
There is no need to implement that yourself. You can simply set the configuration value textinputformat.record.delimiter to be the circumflex character.
conf.set("textinputformat.record.delimiter", "^");
This should work fine with the normal TextInputFormat.
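A minimal driver sketch along those lines (driver and mapper class names are placeholders):
// Sketch: records are delimited by '^' instead of newline; the stock TextInputFormat is used.
Configuration conf = new Configuration();
conf.set("textinputformat.record.delimiter", "^");

Job job = new Job(conf, "caret-delimited-records");
job.setJarByClass(MyDriver.class);                  // placeholder driver class
job.setMapperClass(MyMapper.class);                 // receives one '^'-delimited record per map() call
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));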

Writing a value to file without moving to reducer

I have an input of records like this,
a|1|Y,
b|0|N,
c|1|N,
d|2|Y,
e|1|Y
Now, in the mapper, I have to check the value of the third column. If it is 'Y', that record has to be written directly to the output file without going to the reducer; otherwise, i.e. for 'N' records, it has to go to the reducer for further processing.
So,
a|1|Y,
d|2|Y,
e|1|Y
should not go to the reducer, but
b|0|N,
c|1|N
should go to the reducer and then to the output file.
How can I do this?
What you can probably do is use MultipleOutputs to separate out records of the 'Y' and 'N' types into two different files from the mappers.
Next, you run separate jobs for the two newly generated 'Y' and 'N' data sets.
For the 'Y' set, set the number of reducers to 0 so that reducers aren't used. For the 'N' set, do it the way you want using reducers.
Hope this helps.
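A minimal sketch of the mapper side of that idea, assuming a named output called "yrecords" has been registered in the driver via MultipleOutputs.addNamedOutput() with Text keys and values:
// Sketch: 'Y' records are written to a named output from the mapper itself,
// 'N' records are emitted normally and go through the shuffle to the reducer.
public class SplitMapper extends Mapper<LongWritable, Text, Text, Text> {
    private MultipleOutputs<Text, Text> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, Text>(context);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Record format: a|1|Y  (a trailing comma, as in the sample input, is tolerated)
        String[] parts = value.toString().split("\\|");
        if (parts.length >= 3 && parts[2].startsWith("Y")) {
            mos.write("yrecords", new Text(parts[0]), value);  // bypasses the reducer
        } else {
            context.write(new Text(parts[0]), value);          // goes to the reducer
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}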
See if this works,
public class Xxxx {
public static class MyMapper extends
Mapper<LongWritable, Text, LongWritable, Text> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
FileSystem fs = FileSystem.get(context.getConfiguration());
Random r = new Random();
FileSplit split = (FileSplit)context.getInputSplit();
String fileName = split.getPath().getName();
FSDataOutputStream out = fs.create(new Path(fileName + "-m-" + r.nextInt()));
String parts[];
String line = value.toString();
String[] splits = line.split(",");
for(String s : splits) {
parts = s.split("\\|");
if(parts[2].equals("Y")) {
out.writeBytes(line);
}else {
context.write(key, value);
}
}
out.close();
fs.close();
}
}
public static class MyReducer extends
Reducer<LongWritable, Text, LongWritable, Text> {
public void reduce(LongWritable key, Iterable<Text> values,
Context context) throws IOException, InterruptedException {
for(Text t : values) {
context.write(key, t);
}
}
}
/**
* @param args
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// TODO Auto-generated method stub
Configuration conf = new Configuration();
conf.set("fs.default.name", "hdfs://localhost:9000");
conf.set("mapred.job.tracker", "localhost:9001");
Job job = new Job(conf, "Xxxx");
job.setJarByClass(Xxxx.class);
Path outPath = new Path("/output_path");
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
FileInputFormat.addInputPath(job, new Path("/input.txt"));
FileOutputFormat.setOutputPath(job, outPath);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
In your map function, you will get the input line by line. Split it using | as the delimiter (with the String.split() method, to be exact). Note that split() takes a regular expression, so the pipe character has to be escaped.
It will look like this
String[] line = value.toString().split("\\|");
Access the third element of this array with line[2].
Then, using a simple if-else statement, emit only the records with the 'N' value for further processing.
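As a rough sketch of that if-else inside map() (only the 'N' records are forwarded; writing the 'Y' records directly is covered by the MultipleOutputs approach above):
String[] line = value.toString().split("\\|");       // '|' is a regex metacharacter, so escape it
if (line.length >= 3 && line[2].startsWith("N")) {   // startsWith tolerates the trailing comma in the sample
    context.write(key, value);                       // only 'N' records reach the reducer
}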

Custom WritableCompare displays object reference as output

I am new to Hadoop and Java, and I feel there is something obvious I am just missing. I am using Hadoop 1.0.3 if that means anything.
My goal for using Hadoop is to take a bunch of files and parse them one file at a time (as opposed to line by line). Each file will produce multiple key-values, but context from the other lines is important. The key and value are multi-value/composite, so I have implemented WritableComparable for the key and Writable for the value. Because the processing of each file takes a bit of CPU, I want to save the output of the mapper, then run multiple reducers later on.
For the composite keys, I followed http://stackoverflow.com/questions/12427090/hadoop-composite-key
The problem is, the output is just Java object references as opposed to the composite key and value. Example:
LinkKeyWritable#bd2f9730 LinkValueWritable#8752408c
I am not sure if the problem is related to not reducing the data at all, or something else.
Here is my main class:
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(Parser.class);
conf.setJobName("raw_parser");
conf.setOutputKeyClass(LinkKeyWritable.class);
conf.setOutputValueClass(LinkValueWritable.class);
conf.setMapperClass(RawMap.class);
conf.setNumMapTasks(0);
conf.setInputFormat(PerFileInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
PerFileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
And my Mapper class:
public class RawMap extends MapReduceBase implements
Mapper {
public void map(NullWritable key, Text value,
OutputCollector<LinkKeyWritable, LinkValueWritable> output,
Reporter reporter) throws IOException {
String json = value.toString();
SerpyReader reader = new SerpyReader(json);
GoogleParser parser = new GoogleParser(reader);
for (String page : reader.getPages()) {
String content = reader.readPageContent(page);
parser.addPage(content);
}
for (Link link : parser.getLinks()) {
LinkKeyWritable linkKey = new LinkKeyWritable(link);
LinkValueWritable linkValue = new LinkValueWritable(link);
output.collect(linkKey, linkValue);
}
}
}
Link is basically a struct of various information that gets split between LinkKeyWritable and LinkValueWritable.
LinkKeyWritable:
public class LinkKeyWritable implements WritableComparable<LinkKeyWritable>{
protected Link link;
public LinkKeyWritable() {
super();
link = new Link();
}
public LinkKeyWritable(Link link) {
super();
this.link = link;
}
@Override
public void readFields(DataInput in) throws IOException {
link.batchDay = in.readLong();
link.source = in.readUTF();
link.domain = in.readUTF();
link.path = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(link.batchDay);
out.writeUTF(link.source);
out.writeUTF(link.domain);
out.writeUTF(link.path);
}
@Override
public int compareTo(LinkKeyWritable o) {
return ComparisonChain.start().
compare(link.batchDay, o.link.batchDay).
compare(link.domain, o.link.domain).
compare(link.path, o.link.path).
result();
}
@Override
public int hashCode() {
return Objects.hashCode(link.batchDay, link.source, link.domain, link.path);
}
@Override
public boolean equals(final Object obj){
if(obj instanceof LinkKeyWritable) {
final LinkKeyWritable o = (LinkKeyWritable)obj;
return Objects.equal(link.batchDay, o.link.batchDay)
&& Objects.equal(link.source, o.link.source)
&& Objects.equal(link.domain, o.link.domain)
&& Objects.equal(link.path, o.link.path);
}
return false;
}
}
LinkValueWritable:
public class LinkValueWritable implements Writable{
protected Link link;
public LinkValueWritable() {
link = new Link();
}
public LinkValueWritable(Link link) {
this.link = new Link();
this.link.type = link.type;
this.link.description = link.description;
}
@Override
public void readFields(DataInput in) throws IOException {
link.type = in.readUTF();
link.description = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(link.type);
out.writeUTF(link.description);
}
@Override
public int hashCode() {
return Objects.hashCode(link.type, link.description);
}
@Override
public boolean equals(final Object obj){
if(obj instanceof LinkValueWritable) {
final LinkValueWritable o = (LinkValueWritable)obj;
return Objects.equal(link.type, o.link.type)
&& Objects.equal(link.description, o.link.description);
}
return false;
}
}
I think the answer is in the implementation of the TextOutputFormat. Specifically, the LineRecordWriter's writeObject method:
/**
* Write the object to the byte stream, handling Text as a special
* case.
* @param o the object to print
* @throws IOException if the write throws, we pass it on
*/
private void writeObject(Object o) throws IOException {
if (o instanceof Text) {
Text to = (Text) o;
out.write(to.getBytes(), 0, to.getLength());
} else {
out.write(o.toString().getBytes(utf8));
}
}
As you can see, if your key or value is not a Text object, it calls the toString method on it and writes that out. Since you've left toString unimplemented in your key and value, it's using the Object class's implementation, which is writing out the reference.
I'd say that you should try writing an appropriate toString function or using a different OutputFormat.
It looks like you have a list of objects just like you wanted. You need to implement toString() on your writable if you want a human-readable version printed out instead of an ugly java reference.
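A minimal sketch of such a toString() on LinkKeyWritable, assuming the Link fields used in its readFields()/write() methods above:
// Sketch: human-readable form of the key, mirroring the fields serialized in write().
@Override
public String toString() {
    return link.batchDay + "\t" + link.source + "\t" + link.domain + "\t" + link.path;
}
A similar override on LinkValueWritable (using link.type and link.description) gives a readable value column in the TextOutputFormat output.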
