Hadoop: reduce output records=0

I am writing MapReduce code with two mapper classes and one reducer, but I don't know why I get reduce output records=0.
Please tell me how to solve this problem.
package reducesidejoin;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
public class ReduceSideJoinReducer extends Reducer<IntWritable, Text, IntWritable, Text> {

    @Override
    public void reduce(IntWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        String output = null;
        Text achat;
        Text vins;
        Text valeur2;
        Text valeur1;
        Iterator<Text> itr = values.iterator();
        valeur1 = itr.next();
        if (valeur1.charAt(0) == 1) {
            vins = valeur1;
            while (itr.hasNext()) {
                valeur2 = itr.next();
                if (valeur2.charAt(0) == 2) {
                    achat = valeur2;
                    output = vins.toString() + achat.toString();
                    context.write(key, new Text(output));
                }
                context.write(key, new Text(output));
            }
        } else if (valeur1.charAt(0) == 2) {
            achat = valeur1;
            while (itr.hasNext()) {
                valeur2 = itr.next();
                if (valeur2.charAt(0) == 1) {
                    vins = valeur2;
                    output = vins.toString() + achat.toString();
                    System.out.println(key + "," + output);
                }
                context.write(key, new Text(output));
            }
        }
    }
}

The only way your reducer can output anything is if your char comparisons are working. This is assuming you actually have records entering your reducer.
I would have a look at lines like this one: valeur1.charAt(0) == 1
You're comparing a char to an integer, and I suspect you're after the printable character '1' (which is 49 if you do an integer comparison), so you probably want:
valeur1.charAt(0) == '1'
You're also doing this a lot: vins = valeur1; - that is going to cause problems, because Hadoop reuses the Text objects it gives you via the Iterable.
You should change these to vins.set(valeur1);
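Putting both fixes together, here is a minimal sketch of what the reducer could look like, assuming the asker's convention that values starting with '1' are vins records and values starting with '2' are achat records, and buffering Strings instead of holding on to the reused Text instances:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReduceSideJoinReducer extends Reducer<IntWritable, Text, IntWritable, Text> {

    @Override
    public void reduce(IntWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        List<String> vins = new ArrayList<>();
        List<String> achats = new ArrayList<>();

        // Copy each value out of the reused Text object before the next iteration.
        for (Text value : values) {
            String v = value.toString();
            if (v.charAt(0) == '1') {          // char literal, not the integer 1
                vins.add(v);
            } else if (v.charAt(0) == '2') {
                achats.add(v);
            }
        }

        // Emit every vins/achat combination that shares this key.
        for (String vin : vins) {
            for (String achat : achats) {
                context.write(key, new Text(vin + achat));
            }
        }
    }
}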


Reducer not able to group by key for different mappers

Use case:
File 1 contains impression data: trackerId + other fields.
File 2 contains click details: trackerId + clicked.
I am using a different mapper for each of the two files and a single reducer, but the reducer does not seem to be able to combine the data from the two files.
package com.hadoop.intellipaat;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import com.google.common.collect.Lists;
/**
* This job will combine click and impression on TrackerId
*
* @author raghunandangupta
*
*/
public class JoinClickImpressionDetailJob {
public static final String IMPRESSION_PREFIX = "IMPRESSION_PREFIX";
public static final String CLICK_PREFIX = "CLICK_PREFIX";
public static final String SEPERATOR = "~";
private static class ImpressionMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
/**
* Excluding header
*/
if (!(value.toString().indexOf("accountId") != -1)) {
String words[] = value.toString().split(",");
if (words.length > 18) {
context.write(new Text(words[18].trim()), new Text(IMPRESSION_PREFIX + SEPERATOR + value.toString()));
}
} else {
context.write(new Text(""), value);
}
}
}
private static class ClickMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String words[] = value.toString().split(",");
if (words.length > 18) {
context.write(new Text(words[18].trim()), new Text(CLICK_PREFIX + SEPERATOR + value.toString()));
} else {
context.write(new Text(""), new Text("1"));
}
}
}
private static class ImpressionClickReducer extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) {
try {
System.out.println("=========="+key.toString());
if (key.toString().length() != 0) {
List<Text> myList = Lists.newArrayList(values);
for(Text t : myList){
System.out.println("#######"+t.toString());
}
System.out.println("#########################");
if (myList.size() == 2) {
if (myList.get(0).toString().indexOf(IMPRESSION_PREFIX) != -1 && myList.get(1).toString().indexOf(CLICK_PREFIX) != -1) {
String line = myList.get(0).toString().split(SEPERATOR)[1] + ",1";
context.write(key, new Text(line));
} else if (myList.get(1).toString().indexOf(IMPRESSION_PREFIX) != -1
&& myList.get(0).toString().indexOf(CLICK_PREFIX) != -1) {
String line = myList.get(1).toString().split(SEPERATOR)[1] + ",1";
context.write(key, new Text(line));
}
}
}
} catch (Exception exception) {
exception.printStackTrace();
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
// conf.set("mapreduce.output.fileoutputformat.compress", "true");
// conf.set("mapreduce.output.fileoutputformat.compress.codec",
// "org.apache.hadoop.io.compress.GzipCodec");
// conf.set("mapreduce.map.output.compress.codec",
// "org.apache.hadoop.io.compress.SnappyCodec");
// conf.set("mapreduce.output.fileoutputformat.compress.type",
// "BLOCK");
Job job = Job.getInstance(conf, "IMPRESSION_CLICK_COMBINE_JOB");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setReducerClass(ImpressionClickReducer.class);
FileInputFormat.setInputDirRecursive(job, true);
// FileInputFormat.addInputPath(job, new Path(args[0]));
// job.setMapperClass(ImpressionMapper.class);
/**
* Here directory of impressions will be present
*/
MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, ImpressionMapper.class);
/**
* Here directory of clicks will be present
*/
MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, ClickMapper.class);
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.waitForCompletion(true);
} catch (Exception e) {
e.printStackTrace();
}
}
}
Any leads will be appreciated.
E.g. File 1: [trackerId1, record1]
File 2: [trackerId1, clicked]
In the reducer I am getting:
trackerId1, [record1, record1] - ideally it should be trackerId1, [record1, clicked].
Your problem is most likely with this line in the reducer:
List<Text> myList = Lists.newArrayList(values);
The main thing to remember is that Iterable<Text> values reuses the same Text object as you iterate. So you might be adding two Text references to the list, but they both point at the same object.
If you look at how Lists.newArrayList() works, it just adds the object references to the list without making copies.
So if you're going to store Text objects, you need to create a new one each time you add a value to the list. This is typically the reason people use Strings in cases like this. A quick check to see whether this is the problem would be to change the code to something like:
List<Text> myList = new ArrayList<Text>();
for (Text v : values) {
myList.add(new Text(v));
}
Thus, you create a new Text each time.
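If you switch to Strings, as mentioned above, the copy happens implicitly via toString(). A minimal sketch of the same loop (inside reduce(), same variable names as above, purely illustrative):

List<String> myList = new ArrayList<String>();
for (Text v : values) {
    myList.add(v.toString());   // toString() returns an independent String copy
}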

Hadoop Total Order Partitioner

import java.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.mapreduce.lib.partition.*;
import org.apache.hadoop.mapreduce.lib.reduce.*;
import org.apache.hadoop.util.*;
/**
* Demonstrates how to use Total Order Partitioner on Word Count.
*/
public class TotalOrderPartitionerExample {
public static class WordCount extends Configured implements Tool {
private final static int REDUCE_TASKS = 8;
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new WordCount(), args);
System.exit(exitCode);
}
@Override @SuppressWarnings({ "unchecked", "rawtypes" })
public int run(String[] args) throws Exception {
// Check arguments.
if (args.length != 2) {
String usage =
"Usage: " +
"hadoop jar TotalOrderPartitionerExample$WordCount " +
"<input dir> <output dir>\n";
System.out.printf(usage);
System.exit(-1);
}
String jobName = "WordCount";
String mapJobName = jobName + "-Map";
String reduceJobName = jobName + "-Reduce";
// Get user args.
String inputDir = args[0];
String outputDir = args[1];
// Define input path and output path.
Path mapInputPath = new Path(inputDir);
Path mapOutputPath = new Path(outputDir + "-inter");
Path reduceOutputPath = new Path(outputDir);
// Define partition file path.
Path partitionPath = new Path(outputDir + "-part.lst");
// Configure map-only job for sampling.
Job mapJob = new Job(getConf());
mapJob.setJobName(mapJobName);
mapJob.setJarByClass(WordCount.class);
mapJob.setMapperClass(WordMapper.class);
mapJob.setNumReduceTasks(0);
mapJob.setOutputKeyClass(Text.class);
mapJob.setOutputValueClass(IntWritable.class);
TextInputFormat.setInputPaths(mapJob, mapInputPath);
// Set the output format to a sequence file.
mapJob.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(mapJob, mapOutputPath);
// Submit the map-only job.
int exitCode = mapJob.waitForCompletion(true) ? 0 : 1;
if (exitCode != 0) { return exitCode; }
// Set up the second job, the reduce-only.
Job reduceJob = new Job(getConf());
reduceJob.setJobName(reduceJobName);
reduceJob.setJarByClass(WordCount.class);
// Set the input to the previous job's output.
reduceJob.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.setInputPaths(reduceJob, mapOutputPath);
// Set the output path to the final output path.
TextOutputFormat.setOutputPath(reduceJob, reduceOutputPath);
// Use identity mapper for key/value pairs in SequenceFile.
reduceJob.setReducerClass(IntSumReducer.class);
reduceJob.setMapOutputKeyClass(Text.class);
reduceJob.setMapOutputValueClass(IntWritable.class);
reduceJob.setOutputKeyClass(Text.class);
reduceJob.setOutputValueClass(IntWritable.class);
reduceJob.setNumReduceTasks(REDUCE_TASKS);
// Use Total Order Partitioner.
reduceJob.setPartitionerClass(TotalOrderPartitioner.class);
// Generate partition file from map-only job's output.
TotalOrderPartitioner.setPartitionFile(
reduceJob.getConfiguration(), partitionPath);
InputSampler.writePartitionFile(reduceJob, new InputSampler.RandomSampler(
1, 10000));
// Submit the reduce job.
return reduceJob.waitForCompletion(true) ? 0 : 2;
}
}
public static class WordMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
for (String word : line.split("\\W+")) {
if (word.length() == 0) { continue; }
context.write(new Text(word), new IntWritable(1));
}
}
}
}
I got this code from GitHub.
I compared the elapsed time of the maps and reduces.
The regular word count performs better than the total order partitioner.
Why is that?
Are any optimizations or changes needed to reach comparable performance?
HashPartitioner performance vs. TotalOrderPartitioner performance?
Yes, HashPartitioner will perform better than TotalOrderPartitioner, because HashPartitioner does not have the overhead of running the InputSampler, writing the partition file, etc.
TotalOrderPartitioner is only used when you need a globally sorted output and will be slower than HashPartitioner.
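For comparison, the default HashPartitioner is essentially a stateless hash-and-modulo on the map output key, roughly like the sketch below (illustrative, not the exact library source), which is why it needs no sampling pass or partition file:

import org.apache.hadoop.mapreduce.Partitioner;

// Roughly what the default HashPartitioner does: a cheap, stateless hash-mod.
// Fast, but it gives no global ordering across reducers.
public class HashLikePartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}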

Using Multiple Mappers for multiple output directories in Hadoop MapReduce

I want to run two mappers that produce two different outputs in different directories. The output of the first mapper (passed as an argument) should be the input of the second mapper. I have this code in the driver class:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class Export_Column_Mapping
{
private static String[] Detail_output_column_array = new String[27];
private static String[] Shop_output_column_array = new String[8];
private static String details_output = null ;
private static String Shop_output = null;
public static void main(String[] args) throws Exception
{
String Output_filetype = args[3];
String Input_column_number = args[4];
String Output_column_number = args[5];
Configuration Detailsconf = new Configuration(false);
Detailsconf.setStrings("output_filetype",Output_filetype);
Detailsconf.setStrings("Input_column_number",Input_column_number);
Detailsconf.setStrings("Output_column_number",Output_column_number);
Job Details = new Job(Detailsconf," Export_Column_Mapping");
Details.setJarByClass(Export_Column_Mapping.class);
Details.setJobName("DetailsFile_Job");
Details.setMapperClass(DetailFile_Mapper.class);
Details.setNumReduceTasks(0);
Details.setInputFormatClass(TextInputFormat.class);
Details.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(Details, new Path(args[0]));
FileOutputFormat.setOutputPath(Details, new Path(args[1]));
if(Details.waitForCompletion(true))
{
Configuration Shopconf = new Configuration();
Job Shop = new Job(Shopconf,"Export_Column_Mapping");
Shop.setJarByClass(Export_Column_Mapping.class);
Shop.setJobName("ShopFile_Job");
Shop.setMapperClass(ShopFile_Mapper.class);
Shop.setNumReduceTasks(0);
Shop.setInputFormatClass(TextInputFormat.class);
Shop.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(Shop, new Path(args[1]));
FileOutputFormat.setOutputPath(Shop, new Path(args[2]));
MultipleOutputs.addNamedOutput(Shop, "text", TextOutputFormat.class,LongWritable.class, Text.class);
System.exit(Shop.waitForCompletion(true) ? 0 : 1);
}
}
public static class DetailFile_Mapper extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String str_Output_filetype = context.getConfiguration().get("output_filetype");
String str_Input_column_number = context.getConfiguration().get("Input_column_number");
String[] input_columns_number = str_Input_column_number.split(",");
String str_Output_column_number= context.getConfiguration().get("Output_column_number");
String[] output_columns_number = str_Output_column_number.split(",");
String str_line = value.toString();
String[] input_column_array = str_line.split(",");
try
{
for(int i = 0;i<=input_column_array.length+1; i++)
{
int int_outputcolumn = Integer.parseInt(output_columns_number[i]);
int int_inputcolumn = Integer.parseInt(input_columns_number[i]);
if((int_inputcolumn != 0) && (int_outputcolumn != 0) && output_columns_number.length == input_columns_number.length)
{
Detail_output_column_array[int_outputcolumn-1] = input_column_array[int_inputcolumn-1];
if(details_output != null)
{
details_output = details_output+" "+ Detail_output_column_array[int_outputcolumn-1];
Shop_output = Shop_output+" "+ Shop_output_column_array[int_outputcolumn-1];
}else
{
details_output = Detail_output_column_array[int_outputcolumn-1];
Shop_output = Shop_output_column_array[int_outputcolumn-1];
}
}
}
}catch (Exception e)
{
}
context.write(null,new Text(details_output));
}
}
public static class ShopFile_Mapper extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
try
{
for(int i = 0;i<=Shop_output_column_array.length; i++)
{
Shop_output_column_array[0] = Detail_output_column_array[0];
Shop_output_column_array[1] = Detail_output_column_array[1];
Shop_output_column_array[2] = Detail_output_column_array[2];
Shop_output_column_array[3] = Detail_output_column_array[3];
Shop_output_column_array[4] = Detail_output_column_array[14];
if(details_output != null)
{
Shop_output = Shop_output+" "+ Shop_output_column_array[i];
}else
{
Shop_output = Shop_output_column_array[i-1];
}
}
}catch (Exception e){
}
context.write(null,new Text(Shop_output));
}
}
}
I get this error:
Error:org.apache.hadoop.mapreduce.lib.input.InvalidInputException:
Input path does not exist:
file:/home/Barath.B.Natarajan.ap/rules/text.txt
I want to run the jobs one after the other. Can anyone help me with this?
There is something called JobControl with which you will be able to achieve this.
Suppose there are two jobs, A and B:
ControlledJob A= new ControlledJob(JobConf for A);
ControlledJob B= new ControlledJob(JobConf for B);
B.addDependingJob(A);
JobControl jControl = new JobControl("Name");
jControl.addJob(A);
jControl.addJob(B);
Thread runJControl = new Thread(jControl);
runJControl.start();
while (!jControl.allFinished()) {
code = jControl.getFailedJobList().size() == 0 ? 0 : 1;
Thread.sleep(1000);
}
System.exit(code);
Initialize code at the beginning like this:
int code =1;
Let the first job in your case be the first mapper with zero reducers, and the second job be the second mapper with zero reducers. The configuration should be such that the input path of B and the output path of A are the same (a fuller sketch follows below).
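Here is a fuller sketch of that chaining with the new-API JobControl classes; detailsJob and shopJob stand for the two fully configured map-only Job instances from the driver above (the names are illustrative):

import java.io.IOException;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;

public class JobChainSketch {
    // Runs detailsJob first, then shopJob, and returns 0 only if nothing failed.
    public static int runChain(Job detailsJob, Job shopJob) throws IOException, InterruptedException {
        ControlledJob first = new ControlledJob(detailsJob, null);
        ControlledJob second = new ControlledJob(shopJob, null);
        second.addDependingJob(first);          // B depends on A

        JobControl control = new JobControl("Export_Column_Mapping_chain");
        control.addJob(first);
        control.addJob(second);

        // JobControl implements Runnable; run it on its own thread and poll.
        Thread runner = new Thread(control);
        runner.setDaemon(true);
        runner.start();
        while (!control.allFinished()) {
            Thread.sleep(1000);
        }
        control.stop();
        return control.getFailedJobList().isEmpty() ? 0 : 1;
    }
}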

Number of parallel mapper tasks in Hadoop Streaming job

I'm just starting to learn about Hadoop. I'm trying to use the streaming interface in conjunction with a Python script that processes files: for each input file I create an output file with some information about it, so this is a map job with no reducer. What I'm finding is that files are being processed one at a time, which isn't quite what I'd wanted.
I'll explain what I've done, but I'll also post some code afterwards in case there's something I'm missing there.
I've got an input format and record reader that reads whole files and uses their content as values and file names as keys. (The files aren't huge.) On the other end, I've got an output format and record writer that writes out values to files with names based on the keys. I'm using -io rawbytes and my Python script knows how to read and write key/value pairs.
It all works fine in terms of producing the output I'm expecting. If I run with, e.g., 10 input files, I get 10 splits. That means each time my script runs it only gets one key/value pair - which isn't ideal, but it's not a big deal, and I can see that this might be unavoidable. What's less good is that there is only one running instance of the script at any one time. Setting mapreduce.job.maps doesn't make any difference (although I vaguely remember seeing something about this value only being a suggestion, so perhaps Hadoop is making a different decision). What am I missing?
Here's my code:
#!/bin/bash
hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
-libjars mimi.jar \
-D mapreduce.job.reduces=0 \
-files rawbytes_mapper.py,irrelevant.py \
-inputformat "mimi.WholeFileInputFormat" \
-outputformat "mimi.NamedFileOutputFormat" \
-io rawbytes \
-mapper "rawbytes_mapper.py irrelevant blah blah blah" \
-input "input/*.xml" \
-output output
#!/usr/bin/python
def read_raw_bytes(input):
length_bytes = input.read(4)
if len(length_bytes) < 4:
return None
length = 0
for b in length_bytes:
length = (length << 8) + ord(b)
return input.read(length)
def write_raw_bytes(output, s):
length = len(s)
length_bytes = []
for _ in range(4):
length_bytes.append(chr(length & 0xff))
length = length >> 8
length_bytes.reverse()
for b in length_bytes:
output.write(b)
output.write(s)
def read_keys_and_values(input):
d = {}
while True:
key = read_raw_bytes(input)
if key is None: break
value = read_raw_bytes(input)
d[key] = value
return d
def write_keys_and_values(output, d):
for key in d:
write_raw_bytes(output, key)
write_raw_bytes(output, d[key])
if __name__ == "__main__":
import sys
module = __import__(sys.argv[1])
before = read_keys_and_values(sys.stdin)
module.init(sys.argv[2:])
after = module.process(before)
write_keys_and_values(sys.stdout, after)
package mimi;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
public class WholeFileInputFormat extends FileInputFormat<BytesWritable, BytesWritable>
{
private static class WholeFileRecordReader implements RecordReader<BytesWritable, BytesWritable>
{
private FileSplit split;
private JobConf conf;
private boolean processed = false;
public WholeFileRecordReader(FileSplit split, JobConf conf)
{
this.split = split;
this.conf = conf;
}
@Override
public BytesWritable createKey()
{
return new BytesWritable();
}
@Override
public BytesWritable createValue()
{
return new BytesWritable();
}
@Override
public boolean next(BytesWritable key, BytesWritable value) throws IOException
{
if (processed)
{
return false;
}
byte[] contents = new byte[(int) split.getLength()];
Path file = split.getPath();
String name = file.getName();
byte[] bytes = name.getBytes(StandardCharsets.UTF_8);
key.set(bytes, 0, bytes.length);
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream in = null;
try
{
in = fs.open(file);
IOUtils.readFully(in, contents, 0, contents.length);
value.set(contents, 0, contents.length);
}
finally
{
IOUtils.closeStream(in);
}
processed = true;
return true;
}
@Override
public float getProgress() throws IOException
{
return processed ? 1.0f : 0.0f;
}
@Override
public long getPos() throws IOException
{
return processed ? 0l : split.getLength();
}
@Override
public void close() throws IOException
{
// do nothing
}
}
@Override
protected boolean isSplitable(FileSystem fs, Path file)
{
return false;
}
@Override
public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split,
JobConf conf,
Reporter reporter)
throws IOException
{
return new WholeFileRecordReader((FileSplit) split, conf);
}
}
package mimi;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
public class NamedFileOutputFormat extends MultipleOutputFormat<BytesWritable, BytesWritable>
{
private static class BytesValueWriter implements RecordWriter<BytesWritable, BytesWritable>
{
FSDataOutputStream out;
BytesValueWriter(FSDataOutputStream out)
{
this.out = out;
}
@Override
public synchronized void write(BytesWritable key, BytesWritable value) throws IOException
{
out.write(value.getBytes(), 0, value.getLength());
}
@Override
public void close(Reporter reporter) throws IOException
{
out.close();
}
}
@Override
protected String generateFileNameForKeyValue(BytesWritable key, BytesWritable value, String name)
{
return new String(key.getBytes(), 0, key.getLength(), StandardCharsets.UTF_8);
}
@Override
public RecordWriter<BytesWritable, BytesWritable> getBaseRecordWriter(FileSystem ignored,
JobConf conf,
String name,
Progressable progress)
throws IOException
{
Path file = FileOutputFormat.getTaskOutputPath(conf, name);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream out = fs.create(file, progress);
return new BytesValueWriter(out);
}
}
I think I can help you with this part of your problem:
each time my script runs it only gets one key/value pair - which isn't ideal
If the isSplitable method returns false, only one file will be processed per mapper. If you didn't override isSplitable and left it returning true, a single mapper could receive more than one key/value pair. In your case each file is a single key/value pair, so the files can't be split even when isSplitable returns true.
I can't figure out why only one mapper runs at a time, but I'm still thinking about it :)

Why does the last reducer stop with a Java heap error during the merge step?

I keep increasing the number of reducers, and I see that while all but one of the reducers run quickly and finish their job, the last reducer just hangs at the merge step with this message in its tasktracker log:
Down to the last merge-pass, with 3 segments left of total size: 171207264 bytes
... and after staying at this statement for a long time, it throws a Java heap error and starts some cleanup that never finishes.
I increased the child.opts memory to 3.5GB (unable to go beyond this limit) and compressed the map output too.
What might be the cause?
Here is the driver code:
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("mapred.task.timeout", "6000000");
conf.set("mapred.compress.map.output", "true");
Job job = new Job(conf, "FreebasePreprocess_Phase2");
job.setNumReduceTasks(6);
job.setJarByClass(FreebasePreprocess.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path("/user/watsonuser/freebase_data100m120m_output"));
FileOutputFormat.setOutputPath(job, new Path("/user/watsonuser/freebase_data100m120m_output_2"));
job.waitForCompletion(true);
}
Here is the mapper:
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
public class Map extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String[] entities = value.toString().split("\\t");
String[] strings = {"/type/object/type", "/common/topic/notable_for", "/type/user/usergroup"};
List<String> filteredPredicates = Arrays.asList(strings);
FileSplit fileSplit = (FileSplit)context.getInputSplit();
String filename = fileSplit.getPath().getName();
// System.out.println("File name "+filename);
if(filename.startsWith("part-r")) {
// if(filename.equalsIgnoreCase("quad.tsv")) {
//this is a quad dump file
String name = null;
String predicate = null;
String oid = null;
String outVal = null;
String outKey = null;
if(entities.length==3) {
oid = entities[0].trim();
predicate = entities[1].trim();
name = entities[2].trim();
/*if(predicate.contains("/type/object/name/lang"))
{
if(predicate.endsWith("/en"))
{*/
/*outKey = sid;
outVal = oid+"#-#-#-#"+"topic_name";
context.write(new Text(outKey), new Text(outVal));*/
/* }
}*/
outKey = oid;
outVal = predicate+"#-#-#-#"+name;
context.write(new Text(outKey), new Text(outVal));
}
}
else if(filename.equalsIgnoreCase("freebase-simple-topic-dump.tsv")) {
//this is a simple topic dump file
String sid = null;
String name = null;
String outKey = null;
String outVal = null;
if(entities.length>1) {
sid = entities[0];
name = entities[1];
outKey = sid;
outVal = name+"#-#-#-#"+"topic_name";
context.write(new Text(outKey), new Text(outVal));
}
}
}
}
Here is the reducer
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Reduce extends Reducer<Text, Text, Text, Text>
{
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException
{
String name = null;
String sid = null;
String predicate = null;
String oid = null;
String id = null;
String outKey = null;
String outVal = null;
ArrayList<Text> valuesList = new ArrayList<Text>();
Iterator<Text> ite = values.iterator();
while(ite.hasNext()) {
Text t = ite.next();
Text txt = new Text();
txt.set(t.toString());
valuesList.add(txt);
String[] entities = t.toString().split("#-#-#-#");
if(entities[entities.length-1].equalsIgnoreCase("topic_name"))
{
name = entities[0];
}
}
for(int i=0; i<valuesList.size(); i++) {
{
Text t2 = valuesList.get(i);
String[] entities = t2.toString().split("#-#-#-#");
if(!entities[entities.length-1].contains("topic_name"))
{
if(name!=null) {
outKey = entities[1]+"\t"+entities[0]+"\t"+name;
}
else {
outKey = entities[1]+"\t"+entities[0]+"\t"+key.toString();
}
context.write(new Text(outKey), null);
}
}
}
}
}
My guess is that you have a single key with a huge number of values and the following line in your reducer is causing you problems:
valuesList.add(txt);
Let's say you had a key with 100M values: you're trying to build an ArrayList of size 100M, and at some stage your reducer JVM is going to run out of memory.
You can probably confirm this by adding some debug output and inspecting the logs of the reducer that never finishes:
valuesList.add(txt);
if (valuesList.size() % 10000 == 0) {
System.err.println(key + "\t" + valuesList.size());
}
I haven't written raw MR in a while, but I would approach it in a way similar to this:
Keeping all values for a key in memory is always dangerous. I would instead add another MR phase to your job. In the first stage, emit newKey = (key, 0), newValue = value when the value contains "topic_name", and newKey = (key, 1), newValue = value when it doesn't. This requires writing a custom WritableComparable that holds the pair and knows how to sort it.
For the reducer in the next phase, write a partitioner that partitions on the first element of the new key. Because reducer input is sorted by key, you are guaranteed to receive the k,v pair carrying the 'name' before the other k,v pairs for each key, so you have access to the "name" for every value corresponding to that key (see the sketch below).
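A rough sketch of the composite key and partitioner this approach would need (KeyFlagPair and its field names are illustrative, not from the original code). You would typically also register a grouping comparator that compares only the natural key, so all flags for one key arrive in a single reduce() call:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Partitioner;

// Composite key: the natural key plus a 0/1 flag, so the "topic_name" record
// (flag 0) sorts ahead of all other records (flag 1) for the same key.
public class KeyFlagPair implements WritableComparable<KeyFlagPair> {
    private Text naturalKey = new Text();
    private int flag;

    public KeyFlagPair() {}

    public KeyFlagPair(String naturalKey, int flag) {
        this.naturalKey.set(naturalKey);
        this.flag = flag;
    }

    public Text getNaturalKey() { return naturalKey; }

    @Override
    public void write(DataOutput out) throws IOException {
        naturalKey.write(out);
        out.writeInt(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        naturalKey.readFields(in);
        flag = in.readInt();
    }

    @Override
    public int compareTo(KeyFlagPair other) {
        int cmp = naturalKey.compareTo(other.naturalKey);
        return cmp != 0 ? cmp : Integer.compare(flag, other.flag);
    }

    @Override
    public int hashCode() { return naturalKey.hashCode(); }

    @Override
    public boolean equals(Object o) {
        return o instanceof KeyFlagPair && compareTo((KeyFlagPair) o) == 0;
    }
}

// Partition on the natural key only, so every record for a key reaches the
// same reducer regardless of its flag.
class NaturalKeyPartitioner extends Partitioner<KeyFlagPair, Text> {
    @Override
    public int getPartition(KeyFlagPair key, Text value, int numReduceTasks) {
        return (key.getNaturalKey().hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}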
