Hadoop input data problems

I'm having trouble with my map function.
The original data is stored in a TSV file, and I just want the last two columns saved.
The first column is the original node (383), the second is the target (4575), and the third is the weight (1):
383 4575 1
383 4764 1
383 5458 1
383 5491 1
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
    String line = value.toString();
    String[] tokens = line.split("t");
    int weight = Integer.parseInt(tokens[2]);
    int target = Integer.parseInt(tokens[0]);
}
Here is my code:
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    // split the tsv file
    String[] tokens = line.split("/t");
    // save the weight and target
    private Text target = Integer.parsetxt(tokens[0]);
    int weight = Integer.parseInt(tokens[2]);
    context.write(new Text(target), new Intwritable(weight));
}
}
public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // initialize the count variable
        int weightsum = 0;
        for (IntWritable value : values) {
            weightsum += value.get();
        }
        context.write(key, new IntWritable(weightsum));
    }
}

String[] tokens = line.split("t");

should be

String[] tokens = line.split("\t");

If the columns may be separated by arbitrary whitespace rather than a single tab, split on whitespace instead:

String[] tokens = line.split("\\s+");

Also, the target is in the second column, not the first, and Integer.parsetxt is not a real method; build the Text value directly:

Text target = new Text(tokens[1]);
int weight = Integer.parseInt(tokens[2]);
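Putting both fixes together, a minimal sketch of the corrected map method (using the new-style API from the second snippet, and assuming the three columns are tab-separated as in the sample data):

@Override
public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    // split the TSV line on the tab character
    String[] tokens = value.toString().split("\t");
    // keep only the last two columns: tokens[1] is the target, tokens[2] the weight
    Text target = new Text(tokens[1]);
    IntWritable weight = new IntWritable(Integer.parseInt(tokens[2]));
    context.write(target, weight);
}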

Related

Count the occurrences of the key and number format

In my reducer part of MapReduce, I have the code
public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {
    private Text textValue = new Text();
    private FloatWritable floatWritable = new FloatWritable();

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        double total = 0.00;
        int count = 0;
        for (Text val : values) {
            String line = val.toString();
            String[] field = line.split(",");
            count += 1;
            total += Float.parseFloat(field[1]);
        }
        String v = String.valueOf(count) + "," + String.valueOf(total);
        textValue.set(v);
        context.write(key, textValue);
    }
}
However, the output returns 1 for every key, not the number of occurrences of the key. Also, how can I format the total as a number with "," as the thousands separator and 2 decimals?
Current Output
1 1, 1201.22135
Desired Output
1 3, 1,201.22
(3 is the number of occurrences of key 1, and 1,201.22 is the total value for key 1)
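No answer is recorded for this one, but the formatting half is plain Java: String.format with the "%,.2f" pattern inserts grouping separators and rounds to two decimals (Locale.US pins "," as the separator; java.util.Locale must be imported). A minimal sketch of the tail of reduce():

// at the end of reduce(), after the loop over values:
String v = count + "," + String.format(Locale.US, "%,.2f", total); // e.g. "3,1,201.22"
textValue.set(v);
context.write(key, textValue);

As for the count always being 1: one common cause, assuming the driver also registers this class as a combiner, is that each reduce call then only sees the single pre-combined record per key; the snippet alone does not show whether that is the case here.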

Multiple input files, outputting each file's result on 1 line in 1 file (Hadoop: MapReduce)

I am stuck on getting each file's word-count result onto its own line.
I want to output all files' results into one file, with each file's result on one line.
Expected output.txt format
file1 1, 2, 3, 4, …, 100
file2 5, 2, 9, 6, …, 30
Currently, the word counts of both files are summed together, so each line shows the same combined result:
file1 123,22,31,...,100
file2 123,22,31,...,100
run()
MultipleInputs.addInputPath(job, in_path1, TextInputFormat.class, Map.class);
MultipleInputs.addInputPath(job, in_path2, TextInputFormat.class, Map.class);
Map
context.write(new Text("file1"), output);
context.write(new Text("file2"), output);
Reduce
context.write(new Text("file1"), new Text(sp.toString()));
context.write(new Text("file2"), new Text(sp.toString()));
Map
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
    private static final HashMap<String, Object> counter = new HashMap<>();
    private Text output = new Text();

    private String mapToString(HashMap<String, Object> map) {
        StringBuilder sb = new StringBuilder();
        Iterator<Entry<String, Object>> iter = map.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String, Object> entry = iter.next();
            sb.append(entry.getKey());
            sb.append('=');
            sb.append(entry.getValue().toString());
            if (iter.hasNext()) {
                sb.append(';').append(' ');
            }
        }
        return sb.toString();
    }

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // TODO: Get filename
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (!counter.containsKey(token)) {
                counter.put(token, 1);
            } else {
                counter.put(token, (Integer) counter.get(token) + 1);
            }
        }
        output.set(mapToString(counter));
        context.write(new Text("filename1"), output);
    }
}
Reduce
public static class Reduce extends Reducer<Text, Text, Text, Text> {
    // per-word totals, accumulated across all reduce calls
    private final HashMap<String, Integer> resultMap = new HashMap<>();

    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        System.out.println(key);
        for (Text val : values) { // val is one mapper record
            String[] input = val.toString().split(";\\s");
            for (int i = 0; i < input.length; i++) {
                String[] temp = input[i].split("=");
                String topValue = temp[0].replaceAll("[^a-zA-Z0-9]", "").toLowerCase();
                if (resultMap.containsKey(topValue)) {
                    int original = resultMap.get(topValue);
                    int sum = original + Integer.parseInt(temp[1]);
                    resultMap.put(topValue, sum);
                } else {
                    resultMap.put(topValue, Integer.parseInt(temp[1]));
                }
            }
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        StringBuilder sp = new StringBuilder();
        System.out.println("MapSize: " + resultMap);
        Iterator<Entry<String, Integer>> iterator = resultMap.entrySet().iterator();
        while (iterator.hasNext()) {
            Entry<String, Integer> me2 = iterator.next();
            // System.out.println("key : " + me2.getKey());
            sp.append(me2.getKey());
            sp.append(":");
            sp.append(me2.getValue());
            System.out.println(me2.getValue());
            sp.append(",");
        }
        context.write(new Text("file1"), new Text(sp.toString()));
        context.write(new Text("file2"), new Text(sp.toString()));
    }
}
I am stuck on the two files' word counts being combined together. I want to print each file's word count on its own line.
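The usual fix for the // TODO: Get filename line is to key each record by the actual input file name instead of a hard-coded "file1"/"file2". A minimal sketch of such a mapper (assumption: both paths are added with plain FileInputFormat.addInputPath, which works here since both inputs use the same mapper; with MultipleInputs the split arrives wrapped in a private TaggedInputSplit and cannot be cast to FileSplit directly):

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public static class PerFileMap extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Key every word by the file it came from, so the reducer
        // builds one group, and therefore one output line, per file.
        String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
        for (String token : value.toString().split("\\s+")) {
            if (!token.isEmpty()) {
                context.write(new Text(fileName), new Text(token + "=1"));
            }
        }
    }
}

The reducer can then total the word=count pairs inside reduce() itself, using a map local to that call, and write one line per key there, rather than accumulating in a shared resultMap and writing both lines from cleanup().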

Hadoop not all values get assembled for one key

I have some data that I would like to aggregate by key using Mapper code and then perform something on all values that belong to a key using Reducer code. For example if I have:
key = 1, val = 1,
key = 1, val = 2,
key = 1, val = 3
I would like to get key=1, val=[1,2,3] in my Reducer.
The thing is, I get something like
key = 1, val=[1,2]
key = 1, val=[3]
Why is that so?
I thought that all the values for one specific key would be assembled in one reducer, but now it seems there can be several (key, val[]) pairs, since there can be multiple reducers. Is that so?
Should I set the number of reducers to 1?
I'm new to Hadoop so this confuses me.
Here's the code
public class SomeJob {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Job job = new Job();
        job.setJarByClass(SomeJob.class);
        FileInputFormat.addInputPath(job, new Path("/home/pera/data/input/some.csv"));
        FileOutputFormat.setOutputPath(job, new Path("/home/pera/data/output"));
        job.setMapperClass(SomeMapper.class);
        job.setReducerClass(SomeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.waitForCompletion(true);
    }
}
public class SomeMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] parts = line.split(";");
        context.write(new Text(parts[0]), new Text(parts[4]));
    }
}
public class SomeReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        String properties = "";
        for (Text value : values) {
            properties += value + " ";
        }
        context.write(key, new Text(properties));
    }
}
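MapReduce does guarantee that all values for one byte-identical key reach a single reduce call, even with several reducers, so split groups like the above usually mean the keys are not actually identical; stray whitespace in the CSV is a common culprit (a guess here, since the input data is not shown). A hedged tweak to the mapper that normalizes the key before writing:

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    String[] parts = line.split(";");
    // trim so that "1" and "1 " (trailing space) end up in the same group
    context.write(new Text(parts[0].trim()), new Text(parts[4].trim()));
}

Note that setting the number of reducers to 1 would not help here: keys that differ by whitespace remain distinct keys and still form separate groups.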

Can I get the partition number in Hadoop?

I am a Hadoop newbie.
I want to get the partition number in the output file.
At first, I made a customized partitioner.
public static class MyPartitioner extends Partitioner<Text, LongWritable> {
    public int getPartition(Text key, LongWritable value, int numReduceTasks) {
        int numOfChars = key.toString().length();
        return numOfChars % numReduceTasks;
    }
}
It works. But I want to output the partition numbers 'visually' in the Reducer output.
How can I get the partition number?
Below is my reducer source.
public static class MyReducer extends Reducer<Text, LongWritable, Text, Text> {
    private Text textList = new Text();

    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        String list = new String();
        for (LongWritable value : values) {
            list = new String(list + "\t" + value.toString());
        }
        textList.set(list);
        context.write(key, textList);
    }
}
I want to append the partition number to 'list' for each value. It will be '0' or '1'.
list = new String(list + "\t" + value.toString() + "\t" + ??);
It would be great if someone could help me.
Update: Thanks to the answer, I tried a solution. But it didn't work, and I think I did something wrong.
Below is the modified MyPartitioner.
public static class MyPartitioner extends Partitioner {
    public int getPartition(Text key, LongWritable value, int numReduceTasks) {
        int numOfChars = key.toString().length();
        return numOfChars % numReduceTasks;

        private int bring_num = 0;

        public void configure(JobConf job) {
            bring_num = jobConf.getInt(numOfChars & numReduceTasks);
        }
    }
}
Add the code below to the Reducer class to store the partition number in a class variable, which can then be used in the reduce method.

String partition;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    partition = conf.get("mapred.task.partition");
}
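Putting the answer together with the reducer from the question, a minimal sketch (the property is spelled mapred.task.partition in older releases; newer ones use mapreduce.task.partition):

public static class MyReducer extends Reducer<Text, LongWritable, Text, Text> {
    private String partition;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // this reduce task's partition number: 0 .. numReduceTasks-1
        partition = context.getConfiguration().get("mapred.task.partition");
    }

    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        StringBuilder list = new StringBuilder();
        for (LongWritable value : values) {
            // append each value followed by the partition number ('0' or '1')
            list.append('\t').append(value.get()).append('\t').append(partition);
        }
        context.write(key, new Text(list.toString()));
    }
}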

Read values wrapped in Hadoop ArrayWritable

I am new to Hadoop and Java. My mapper outputs Text and ArrayWritable. I am having trouble reading the ArrayWritable values: I am unable to cast the .get() values to int. The mapper and reducer code are attached. Can someone please help me correct my reducer code so that it reads the ArrayWritable values?
public static class Temp2Mapper extends Mapper<LongWritable, Text, Text, ArrayWritable> {
    private static final int MISSING = 9999;

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String date = line.substring(07, 14);
        int maxTemp, minTemp, avgTemp;
        IntArrayWritable carrier = new IntArrayWritable();
        IntWritable[] innercarrier = new IntWritable[3];
        maxTemp = Integer.parseInt(line.substring(39, 45));
        minTemp = Integer.parseInt(line.substring(47, 53));
        avgTemp = Integer.parseInt(line.substring(63, 69));
        if (maxTemp != MISSING)
            innercarrier[0] = new IntWritable(maxTemp); // maximum temperature
        if (minTemp != MISSING)
            innercarrier[1] = new IntWritable(minTemp); // minimum temperature
        if (avgTemp != MISSING)
            innercarrier[2] = new IntWritable(avgTemp); // average temperature over 24 hours
        carrier.set(innercarrier);
        context.write(new Text(date), carrier); // output Text and ArrayWritable
    }
}
public static class Temp2Reducer extends Reducer<Text, ArrayWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<ArrayWritable> values, Context context)
            throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        int[] arr = new int[3];
        for (ArrayWritable val : values) {
            arr = (Int) val.get(); // Error: cannot cast Writable to int
            max = Math.max(max, arr[0]);
        }
        context.write(key, new IntWritable(max));
    }
}
The ArrayWritable#get method returns an array of Writable.
You can't cast an array of Writable to int. What you can do is:
- iterate over this array,
- cast each item (which will be of type Writable) to IntWritable,
- use the IntWritable#get method to get the int value:
for (ArrayWritable val : values) {
    for (Writable writable : val.get()) { // iterate
        IntWritable intWritable = (IntWritable) writable; // cast
        int value = intWritable.get(); // get
        // do your thing with the int value
    }
}
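Applied to the reducer from the question, a minimal sketch (it assumes index 0 of the array holds the maximum temperature, as in the mapper; a null check is included because the mapper leaves a slot unset when the reading equals MISSING):

public static class Temp2Reducer extends Reducer<Text, ArrayWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<ArrayWritable> values, Context context)
            throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (ArrayWritable val : values) {
            Writable[] arr = val.get();        // array of Writable, not int[]
            if (arr[0] != null) {
                max = Math.max(max, ((IntWritable) arr[0]).get()); // cast, then unwrap
            }
        }
        context.write(key, new IntWritable(max));
    }
}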
