Read from HDFS and write to HBASE - hadoop

The Mapper is reading file from two places
1) Articles visited by user(sorting by country)
2) Statistics of country (country wise)
The output of both Mapper is Text, Text
I am running program of Amazon Cluster
My aim is read data from two different set and combine the result and store it in hbase.
HDFS to HDFS is working.
The code is getting stuck at reducing 67% and gives error as
17/02/24 10:45:31 INFO mapreduce.Job: map 0% reduce 0%
17/02/24 10:45:37 INFO mapreduce.Job: map 100% reduce 0%
17/02/24 10:45:49 INFO mapreduce.Job: map 100% reduce 67%
17/02/24 10:46:00 INFO mapreduce.Job: Task Id : attempt_1487926412544_0016_r_000000_0, Status : FAILED
Error: java.lang.IllegalArgumentException: Row length is 0
at org.apache.hadoop.hbase.client.Mutation.checkRow(Mutation.java:565)
at org.apache.hadoop.hbase.client.Put.<init>(Put.java:110)
at org.apache.hadoop.hbase.client.Put.<init>(Put.java:68)
at org.apache.hadoop.hbase.client.Put.<init>(Put.java:58)
at com.happiestminds.hadoop.CounterReducer.reduce(CounterReducer.java:45)
at com.happiestminds.hadoop.CounterReducer.reduce(CounterReducer.java:1)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:635)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:390)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Driver class is
package com.happiestminds.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Main extends Configured implements Tool {
/**
* #param args
* #throws Exception
*/
public static String outputTable = "mapreduceoutput";
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Main(), args);
System.exit(exitCode);
}
#Override
public int run(String[] args) throws Exception {
Configuration config = HBaseConfiguration.create();
try{
HBaseAdmin.checkHBaseAvailable(config);
}
catch(MasterNotRunningException e){
System.out.println("Master not running");
System.exit(1);
}
Job job = Job.getInstance(config, "Hbase Test");
job.setJarByClass(Main.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, ArticleMapper.class);
MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, StatisticsMapper.class);
TableMapReduceUtil.addDependencyJars(job);
TableMapReduceUtil.initTableReducerJob(outputTable, CounterReducer.class, job);
//job.setReducerClass(CounterReducer.class);
job.setNumReduceTasks(1);
return job.waitForCompletion(true) ? 0 : 1;
}
}
Reducer class is
package com.happiestminds.hadoop;
import java.io.IOException;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class CounterReducer extends TableReducer<Text, Text, ImmutableBytesWritable> {
public static final byte[] CF = "counter".getBytes();
public static final byte[] COUNT = "combined".getBytes();
#Override
protected void reduce(Text key, Iterable<Text> values,
Reducer<Text, Text, ImmutableBytesWritable, Mutation>.Context context)
throws IOException, InterruptedException {
String vals = values.toString();
int counter = 0;
StringBuilder sbr = new StringBuilder();
System.out.println(key.toString());
for (Text val : values) {
String stat = val.toString();
if (stat.equals("***")) {
counter++;
} else {
sbr.append(stat + ",");
}
}
sbr.append("Article count : " + counter);
Put put = new Put(Bytes.toBytes(key.toString()));
put.addColumn(CF, COUNT, Bytes.toBytes(sbr.toString()));
if (counter != 0) {
context.write(null, put);
}
}
}
Dependencies
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>1.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.2.2</version>
</dependency>
</dependencies>

A good practice is to validate your values before submitting them somewhere. In your particular case you can validate your key and sbr or wrap them into try-catch section with proper notification policy. You should output them into some log if they are not correct and update you unit tests with new test-cases:
try
{
Put put = new Put(Bytes.toBytes(key.toString()));
put.addColumn(CF, COUNT, Bytes.toBytes(sbr.toString()));
if (counter != 0) {
context.write(null, put);
}
}
catch (IllegalArgumentException ex)
{
System.err.println("Error processing record - Key: "+ key.toString() +", values: " +sbr.ToString());
}

According to the exception thrown by the program it is clear that key length is 0 so before putting into hbase you can check if key length is 0 or not then only you can put into the hbase.
More clarity why key length's 0 is not supported by hbase
Becuase HBase data model does not allow 0-length row key, it should be at least 1 byte. 0-byte row key is reserved for internal usage (to designate empty start key and end keys).

Can you try to check whether you are inserting any null values or not ?
HBase data model does not allow zero length row key, it should be at least 1 byte.
Please check in your reducer code before executing the put command , whether some of the values are populated to null or not.

The error you get is quite self-explanatory. Row keys in HBase can't be empty (though values can be).
#Override
protected void reduce(Text key, Iterable<Text> values,
Reducer<Text, Text, ImmutableBytesWritable, Mutation>.Context context)
throws IOException, InterruptedException {
if (key == null || key.getLength() == 0) {
// Log a warning about the empty key.
return;
}
// Rest of your reducer follows.
}

Related

Hadoop MapReduce Program for removing duplicate records

Could anyone help me to write the mapper and reducer for merging these two files and then removing the duplicate records?
These are the two text files:
file1.txt
2012-3-1a
2012-3-2b
2012-3-3c
2012-3-4d
2012-3-5a
2012-3-6b
2012-3-7c
2012-3-3c
and file2.txt:
2012-3-1b
2012-3-2a
2012-3-3b
2012-3-4d
2012-3-5a
2012-3-6c
2012-3-7d
2012-3-3c
A simple word count program will do the job for you. The only change you need to make is, set the output value of the Reducer as NullWritable.get()
Is there a common key in both the files which helps identify if record matched or not? If so then:
Mappers Input: Standard TextInputFormat
Mapper's Output Key : Common Key and Mapper's output Value : Entire Record.
At reducer : It will not be required to iterate over the Keys just take only 1 instance of the Value for Write.
If the match or duplicacy can be concluded only if complete record matched: then
Mappers Input: Standard TextInputFormat
Mapper's Output Key : Entire Record and Mapper's output Value : NullWritable.
At reducer: It will not be required to iterate over the Keys. Just take only one instance of Key and write that as a Value.
Reducer Output Key: Reducer Input Key, Reducer Output Value : NullWritable
Here's code to remove duplicate lines in large text data, which uses hash for efficiency:
DRMapper.java
import com.google.common.hash.Hashing;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
class DRMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text hashKey = new Text();
private Text mappedValue = new Text();
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
hashKey.set(Hashing.murmur3_32().hashString(line, StandardCharsets.UTF_8).toString());
mappedValue.set(line);
context.write(hashKey, mappedValue);
}
}
DRReducer.java
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class DRReducer extends Reducer<Text, Text, Text, NullWritable> {
#Override
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
Text value;
if (values.iterator().hasNext()) {
value = values.iterator().next();
if (!(value.toString().isEmpty())) {
context.write(value, NullWritable.get());
}
}
}
}
DuplicateRemover.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DuplicateRemover {
private static final int DEFAULT_NUM_REDUCERS = 210;
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: DuplicateRemover <input path> <output path>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(DuplicateRemover.class);
job.setJobName("Duplicate Remover");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(DRMapper.class);
job.setReducerClass(DRReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setNumReduceTasks(DEFAULT_NUM_REDUCERS);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
compile with:
javac -encoding UTF8 -cp $(hadoop classpath) *.java
jar cf dr.jar *.class
Assuming that the input text files are in in_folder, run as:
hadoop jar dr.jar in_folder out_folder

Mapreduce with HCATALOG integration with oozie in MAPR

I have written a mapreduce program that reads the data from hive table using HCATLOG and writes into HBase. This is a map only job with no reducers. I have ran the program from command line and it works as expected(Created a fat jar to avoid Jar issues). I wanted to integrate it oozie (with Help of HUE) . I have two options to run it
Use Mapreduce Action
Use Java Action
Since my Mapreduce program has a driver method that holds the below code
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
public class HBaseValdiateInsertDriver {
public static void main(String[] args) throws Exception {
String dbName = "Test";
String tableName = "emp";
Configuration conf = new Configuration();
args = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "HBase Get Put Demo");
job.setInputFormatClass(HCatInputFormat.class);
HCatInputFormat.setInput(job, dbName, tableName, null);
job.setJarByClass(HBaseValdiateInsertDriver.class);
job.setMapperClass(HBaseValdiateInsert.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setNumReduceTasks(0);
FileInputFormat.addInputPath(job, new Path("maprfs:///user/input"));
FileOutputFormat.setOutputPath(job, new Path("maprfs:///user/output"));
job.waitForCompletion(true);
}
}
How do i specify the driver method in oozie, All that i can see is to specify mapper and reducer class.Can someone guide me how do i set the properties ?
Using java action i can specify my driver class as the main class and get this executed , but i face errors like table not found, HCATLOG jars not found etc. I have include hive-site.xml in the workflow(Using Hue) but i feel the system is not able to pick up the properties. Can someone advise me what all do i have to take care of, are there any other configuration properties that i need to include ?
Also the sample program i referred in cloudera website uses
HCatInputFormat.setInput(job, InputJobInfo.create(dbName,
inputTableName, null));
where as i use the below (I dont see a method that accept the above input
HCatInputFormat.setInput(job, dbName, tableName, null);
Below is my mapper code
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.HCatRecord;
public class HBaseValdiateInsert extends Mapper<WritableComparable, HCatRecord, Text, Text> {
static HTableInterface table;
static HTableInterface inserted;
private String hbaseDate = null;
String existigValue=null;
List<Put> putList = new ArrayList<Put>();
#Override
public void setup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
String tablename = "dev_arch186";
Utils.getHBConnection();
table = Utils.getTable(tablename);
table.setAutoFlushTo(false);
}
#Override
public void cleanup(Context context) {
try {
table.put(putList);
table.flushCommits();
table.close();
} catch (IOException e) {
e.printStackTrace();
}
Utils.closeConnection();
}
#Override
public void map(WritableComparable key, HCatRecord value, Context context) throws IOException, InterruptedException {
String name_hive = (String) value.get(0);
String id_hive = (String) value.get(1);
String rec[] = test.toString().split(",");
Get g = new Get(Bytes.toBytes(name_hive));
existigValue=getOneRecord(Bytes.toBytes("Info"),Bytes.toBytes("name"),name_hive);
if (existigValue.equalsIgnoreCase("NA") || !existigValue.equalsIgnoreCase(id_hive)) {
Put put = new Put(Bytes.toBytes(rec[0]));
put.add(Bytes.toBytes("Info"),
Bytes.toBytes("name"),
Bytes.toBytes(rec[1]));
put.setDurability(Durability.SKIP_WAL);
putList.add(put);
if(putList.size()>25000){
table.put(putList);
table.flushCommits();
}
}
}
public String getOneRecord(byte[] columnFamily, byte[] columnQualifier, String rowKey)
throws IOException {
Get get = new Get(rowKey.getBytes());
get.setMaxVersions(1);
Result rs = table.get(get);
rs.getColumn(columnFamily, columnQualifier);
System.out.println(rs.containsColumn(columnFamily, columnQualifier));
KeyValue result = rs.getColumnLatest(columnFamily,columnQualifier);
if (rs.containsColumn(columnFamily, columnQualifier))
return (Bytes.toString(result.getValue()));
else
return "NA";
}
public boolean columnQualifierExists(String tableName, String ColumnFamily,
String ColumnQualifier, String rowKey) throws IOException {
Get get = new Get(rowKey.getBytes());
Result rs = table.get(get);
return(rs.containsColumn(ColumnFamily.getBytes(),ColumnQualifier.getBytes()));
}
}
Note:
I use MapR (M3) Cluster with HUE as the interface for oozie.
Hive Version : 1-0
HCAT Version: 1-0
I couldn't find any way to initialize HCatInputFormat from Oozie mapreduce action.
But I have a workaround as below.
Created LazyHCatInputFormat by extending HCatInputFormat.
Override the getJobInfo method, to handle initalization. This will be called as part of getSplits(..) call.
private static void lazyInit(Configuration conf){
try{
if(conf==null){
conf = new Configuration(false);
}
conf.addResource(new Path(System.getProperty("oozie.action.conf.xml")));
conf.addResource(new org.apache.hadoop.fs.Path("hive-config.xml"));
String databaseName = conf.get("LazyHCatInputFormat.databaseName");
String tableName = conf.get("LazyHCatInputFormat.tableName");
String partitionFilter = conf.get("LazyHCatInputFormat.partitionFilter");
setInput(conf, databaseName, tableName);
//setFilter(partitionFilter);
//System.out.println("After lazyinit : "+conf.get("mapreduce.lib.hcat.job.info"));
}catch(Exception e){
System.out.println("*** LAZY INIT FAILED ***");
//e.printStackTrace();
}
}
public static InputJobInfo getJobInfo(Configuration conf)
throws IOException {
String jobString = conf.get("mapreduce.lib.hcat.job.info");
if (jobString == null) {
lazyInit(conf);
jobString = conf.get("mapreduce.lib.hcat.job.info");
if(jobString == null){
throw new IOException("job information not found in JobContext. HCatInputFormat.setInput() not called?");
}
}
return (InputJobInfo) HCatUtil.deserialize(jobString);
}
In the oozie map-redcue action, configured as below.
<property>
<name>mapreduce.job.inputformat.class</name>
<value>com.xyz.LazyHCatInputFormat</value>
</property>
<property>
<name>LazyHCatInputFormat.databaseName</name>
<value>HCAT DatabaseNameHere</value>
</property>
<property>
<name>LazyHCatInputFormat.tableName</name>
<value>HCAT TableNameHere</value>
</property>
This might not be the best implementation, but a quick hack to make it work.

Setting number of Reduce tasks using command line

I am a beginner in Hadoop. When trying to set the number of reducers using command line using Generic Options Parser, the number of reducers is not changing. There is no property set in the configuration file "mapred-site.xml" for the number of reducers and I think, that would make the number of reducers=1 by default. I am using cloudera QuickVM and hadoop version : "Hadoop 2.5.0-cdh5.2.0".
Pointers Appreciated. Also my issue was I wanted to know the preference order of the ways to set the number of reducers.
Using configuration File "mapred-site.xml"
mapred.reduce.tasks
By specifying in the driver class
job.setNumReduceTasks(4)
By specifying at the command line using Tool interface:
-Dmapreduce.job.reduces=2
Mapper :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
//Split the line into words
for(String word: line.split("\\W+"))
{
//Make sure that the word is legitimate
if(word.length() > 0)
{
//Emit the word as you see it
context.write(new Text(word), new IntWritable(1));
}
}
}
}
Reducer :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
//Initializing the word count to 0 for every key
int count=0;
for(IntWritable value: values)
{
//Adding the word count counter to count
count += value.get();
}
//Finally write the word and its count
context.write(key, new IntWritable(count));
}
}
Driver :
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCount extends Configured implements Tool
{
public int run(String[] args) throws Exception
{
//Instantiate the job object for configuring your job
Job job = new Job();
//Specify the class that hadoop needs to look in the JAR file
//This Jar file is then sent to all the machines in the cluster
job.setJarByClass(WordCount.class);
//Set a meaningful name to the job
job.setJobName("Word Count");
//Add the apth from where the file input is to be taken
FileInputFormat.addInputPath(job, new Path(args[0]));
//Set the path where the output must be stored
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//Set the Mapper and the Reducer class
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
//Set the type of the key and value of Mapper and reducer
/*
* If the Mapper output type and Reducer output type are not the same then
* also include setMapOutputKeyClass() and setMapOutputKeyValue()
*/
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//job.setNumReduceTasks(4);
//Start the job and wait for it to finish. And exit the program based on
//the success of the program
System.exit(job.waitForCompletion(true)?0:1);
return 0;
}
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
}
And I have tried the following commands to run the job :
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -Dmapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take13
and
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -D mapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take14
Answering your query on order. It would always be 2>3>1
The option specified in your driver class takes precedence over the ones you specify as an argument to your GenOptionsParser or the ones you specify in your site specific config.
I would recommend debugging the configurations inside your driver class by printing it out before you submit the job. This way , you can be sure what the configurations are , right before you submit the job to the cluster.
Configuration conf = getConf(); // This is available to you since you extended Configured
for(Entry entry: conf)
//Sysout the entries here

Hadoop ClassCastException for default value of InputFormat

I'm having a issue getting started with my first map-reduce code on Hadoop. I copied the following code from "Hadoop: The definitive guide" but I'm not able to run it on my single node Hadoop installation.
My Code snippet:
Main:
Job job = new Job();
job.setJarByClass(MaxTemperature.class);
job.setJobName("Max temperature");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
Mapper:
public void map(LongWritable key, Text value, Context context)
Reducer:
public void reduce(Text key, Iterable<IntWritable> values,
Context context)
Implementations of map and reduce function are also picked from the book only. But when I try to execute this code, this is the error I get:
INFO mapred.JobClient: Task Id : attempt_201304021022_0016_m_000000_0, Status : FAILED
java.lang.ClassCastException: interface javax.xml.soap.Text
at java.lang.Class.asSubclass(Class.java:3027)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:774)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:959)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:674)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:756)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1149)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Answers to similar questions in the past (Hadoop type mismatch in key from map expected value Text received value LongWritable) helped me to figure out that InputFormatClass should match the input to the map function. So I also tried using job.setInputFormatClass(TextInputFormat.class); in my main method, but it also did not solve the issue. What could be the issue here?
Here is the implementation of the Mapper class
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String year = line.substring(15, 19);
int airTemperature;
if (line.charAt(45) == '+') { // parseInt doesn't like leading plus signs
airTemperature = Integer.parseInt(line.substring(46, 50));
} else {
airTemperature = Integer.parseInt(line.substring(45, 50));
}
String quality = line.substring(50, 51);
if (airTemperature != MISSING && quality.matches("[01459]")) {
context.write(new Text(year), new IntWritable(airTemperature));
}
}
}
You auto imported the wrong import. Instead of import org.apache.hadoop.io.Text you imported import javax.xml.soap.Text
You can find a sample wrong import in this blog.
Looks like you have the wrong Text class imported (javax.xml.soap.Text). You want org.apache.hadoop.io.Text

Hadoop type mismatch in key from map expected value Text received value LongWritable

Anyone have any idea why I would be getting this error? I have looked at alot of other similar posts but most of them did not apply to me, I also tried the few solutions that were posted that did apply to me but they did not work, I'm sure I'm just missing something stupid, thanks for the help
chris#chrisUHadoop:/usr/local/hadoop-1.0.3/build$ hadoop MaxTemperature 1901 output4
12/07/03 17:23:08 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
12/07/03 17:23:08 INFO input.FileInputFormat: Total input paths to process : 1
12/07/03 17:23:08 INFO util.NativeCodeLoader: Loaded the native-hadoop library
12/07/03 17:23:08 WARN snappy.LoadSnappy: Snappy native library not loaded
12/07/03 17:23:09 INFO mapred.JobClient: Running job: job_201207031642_0005
12/07/03 17:23:10 INFO mapred.JobClient: map 0% reduce 0%
12/07/03 17:23:28 INFO mapred.JobClient: Task Id : attempt_201207031642_0005_m_000000_0, Status : FAILED
java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1014)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:691)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:124)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Program:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String year = line.substring(15,19);
int airTemperature;
if (line.charAt(87) == '+')
{
airTemperature = Integer.parseInt(line.substring(88,92));
}
else
{
airTemperature = Integer.parseInt(line.substring(87,92));
}
String quality = line.substring(92,93);
if (airTemperature != MISSING && quality.matches("[01459]"))
{
context.write(new Text(year), new IntWritable(airTemperature));
}
}
}
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int maxValue = Integer.MIN_VALUE;
for (IntWritable value : values)
{
maxValue = Math.max(maxValue, value.get());
}
context.write(key, new IntWritable(maxValue));
}
}
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTemperature
{
public static void main(String[] args) throws Exception
{
if (args.length != 2)
{
System.out.println("Usage: MaxTemperature <input path> <output path>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(MaxTemperature.class);
job.setJobName("Max temperature");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
You appear to be missing a number of configuration properties:
Mapper and Reducer classes? - if not defined, you'll be defaulted to the 'Identity' Mapper / Reducer
Your specific error message is because the identity mapper just outputs the same key / value types it was passed in, in this case probably a key of type LongWritable and value of type Text (as you haven't defined an Input format, the default is probably TextInputFormat). In your configuration you have defined the output key type as Text, but the mapper is outputting LongWritable, hence the error message.
You should set the following property in job.xml
<property>
<name>mapred.input.format.class</name>
<value>org.apache.hadoop.mapred.TextInputFormat</value>
<description>The full class name of the InputFormat class to be used for obtaining the input to the mapper.</description>
</property>

Resources