DistributedCache - third party jar not found - hadoop

I'm trying to get a hold of DistributedCache. I'm using Apache Hadoop 1.2.1 on two nodes.
I referred to the Cloudera post, which is simply extended in other posts that explain how to use third-party jars via -libjars.
Note:
In my jar, I haven't bundled any library jars - neither Hadoop core nor Commons Lang.
The code:
public class WordCounter extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
// Job job = new Job(getConf(), args[0]);
Job job = new Job(super.getConf(), args[0]);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setJarByClass(WordCounter.class);
FileInputFormat.setInputPaths(job, new Path(args[1]));
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
int jobState = job.waitForCompletion(true) ? 0 : 1;
return jobState;
}
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
if (args == null || args.length < 3) {
System.out.println("The below three arguments are expected");
System.out.println("<job name> <hdfs path of the input file> <hdfs path of the output file>");
return;
}
WordCounter wordCounter = new WordCounter();
// System.exit(ToolRunner.run(wordCounter, args));
System.exit(ToolRunner.run(new Configuration(), wordCounter, args));
}
}
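As an aside, Hadoop 1.x also offers a programmatic route for putting a third-party jar on the task classpath through the distributed cache. A minimal sketch, assuming the jar has already been uploaded to HDFS (the /libs path below is only a placeholder):
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
public class CacheJarSketch {
    // Adds a jar that already lives in HDFS to the classpath of every map and reduce task.
    // Placeholder path; the jar would first be uploaded, e.g. hadoop fs -put commons-lang3-3.1.jar /libs/
    public static void addCommonsLang(Configuration conf) throws IOException {
        DistributedCache.addFileToClassPath(new Path("/libs/commons-lang3-3.1.jar"), conf);
    }
}
Note that HADOOP_CLASSPATH only affects the client JVM that submits the job, not the task JVMs on the other node.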
The Mapper class is naive; it is only attempting to use StringUtils from Apache Commons (and NOT Hadoop):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
* @author 298790
*
*/
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static IntWritable one = new IntWritable(1);
@Override
protected void map(
LongWritable key,
Text value,
org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
StringTokenizer strTokenizer = new StringTokenizer(value.toString());
Text token = new Text();
while (strTokenizer.hasMoreTokens()) {
token.set(strTokenizer.nextToken());
context.write(token, one);
}
System.out.println("Converting " + value + " to upper case "
+ StringUtils.upperCase(value.toString()));
}
}
The commands that I use:
bigdata@slave3:~$ export HADOOP_CLASSPATH=dumphere/lib/commons-lang3-3.1.jar
bigdata@slave3:~$
bigdata@slave3:~$ echo $HADOOP_CLASSPATH
dumphere/lib/commons-lang3-3.1.jar
bigdata@slave3:~$
bigdata@slave3:~$ echo $LIBJARS
dumphere/lib/commons-lang3-3.1.jar
bigdata@slave3:~$ hadoop jar dumphere/code/jars/hdp_3rdparty.jar com.hadoop.basics.WordCounter "WordCount" "/input/dumphere/Childhood_days.txt" "/output/dumphere/wc" -libjars ${LIBJARS}
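For what it's worth, ToolRunner hands the command line to GenericOptionsParser, which only recognizes generic options such as -libjars when they come before the application's own arguments. A tiny sketch of that parsing step, for illustration only:
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;
public class GenericOptionsSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Consumes leading generic options (-libjars, -files, -D ...) into conf
        // and returns whatever remains for the application itself.
        String[] appArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        System.out.println("Application arguments: " + Arrays.toString(appArgs));
    }
}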
The exception I get:
Warning: $HADOOP_HOME is deprecated.
14/08/13 21:56:05 INFO input.FileInputFormat: Total input paths to process : 1
14/08/13 21:56:05 INFO util.NativeCodeLoader: Loaded the native-hadoop library
14/08/13 21:56:05 WARN snappy.LoadSnappy: Snappy native library not loaded
14/08/13 21:56:05 INFO mapred.JobClient: Running job: job_201408111719_0190
14/08/13 21:56:06 INFO mapred.JobClient: map 0% reduce 0%
14/08/13 21:56:37 INFO mapred.JobClient: Task Id : attempt_201408111719_0190_m_000000_0, Status : FAILED
Error: java.lang.ClassNotFoundException: org.apache.commons.lang3.StringUtils
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at com.hadoop.basics.WCMapper.map(WCMapper.java:40)
at com.hadoop.basics.WCMapper.map(WCMapper.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
14/08/13 21:56:42 INFO mapred.JobClient: Task Id : attempt_201408111719_0190_m_000000_1, Status : FAILED
Error: java.lang.ClassNotFoundException: org.apache.commons.lang3.StringUtils
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:423)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:356)
at com.hadoop.basics.WCMapper.map(WCMapper.java:40)
at com.hadoop.basics.WCMapper.map(WCMapper.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
The Cloudera post mentions:
The jar will be placed in distributed cache and will be made available to all of the job’s task attempts. More specifically, you will find the JAR in one of the ${mapred.local.dir}/taskTracker/archive/${user.name}/distcache/… subdirectories on local nodes.
But at that path, I'm not able to find commons-lang3-3.1.jar.
What am I missing?

Related

Kite Dataset map-reduce

I am trying to do map-reduce with the kite-dataset API.
I have followed the below URLs for reference:
https://community.cloudera.com/t5/Kite-SDK-includes-Morphlines/Map-Reduce-with-Kite/td-p/22165
https://github.com/kite-sdk/kite/blob/master/kite-data/kite-data-mapreduce/src/test/java/org/kitesdk/data/mapreduce/TestMapReduce.java
My code snippet is below:
public class MapReduce {
private static final String sourceDatasetURI = "dataset:hive:test_avro";
private static final String destinationDatasetURI = "dataset:hive:test_parquet";
private static class LineCountMapper
extends Mapper<GenericData.Record, Void, Text, IntWritable> {
@Override
protected void map(GenericData.Record record, Void value,
Context context)
throws IOException, InterruptedException {
System.out.println("Record is "+record);
context.write(new Text(record.get("index").toString()), new IntWritable(1));
}
}
private Job createJob() throws Exception {
System.out.println("Inside Create Job");
Job job = new Job();
job.setJarByClass(getClass());
Dataset<GenericData.Record> inputDataset = Datasets.load(sourceDatasetURI, GenericData.Record.class);
Dataset<GenericData.Record> outputDataset = Datasets.load(destinationDatasetURI, GenericData.Record.class);
DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class);
job.setMapperClass(LineCountMapper.class);
DatasetKeyOutputFormat.configure(job).writeTo(outputDataset).withType(GenericData.Record.class);
job.waitForCompletion(true);
return job;
}
public static void main(String[] args) throws Exception {
MapReduce httAvroToParquet = new MapReduce();
httAvroToParquet.createJob();
}
}
I am using an HDP 2.3.2 box, creating an assembly jar, and submitting my application through spark-submit.
I am getting the below error when I submit my application:
2015-12-18 04:09:07,156 WARN [main] org.apache.hadoop.hdfs.shortcircuit.DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
2015-12-18 04:09:07,282 INFO [main] org.apache.hadoop.mapreduce.v2.app.MRAppMaster: OutputCommitter set in config null
2015-12-18 04:09:07,333 WARN [main] org.kitesdk.data.spi.Registration: Not loading URI patterns in org.kitesdk.data.spi.hive.Loader
2015-12-18 04:09:07,334 INFO [main] org.apache.hadoop.service.AbstractService: Service org.apache.hadoop.mapreduce.v2.app.MRAppMaster failed in state INITED; cause: org.apache.hadoop.yarn.exceptions.YarnRuntimeException: org.kitesdk.data.DatasetNotFoundException: Unknown dataset URI: hive://{}:9083/default/test_parquet. Check that JARs for hive datasets are on the classpath.
org.apache.hadoop.yarn.exceptions.YarnRuntimeException: org.kitesdk.data.DatasetNotFoundException: Unknown dataset URI: hive://{}:9083/default/test_parquet. Check that JARs for hive datasets are on the classpath.
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster$1.call(MRAppMaster.java:478)
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster$1.call(MRAppMaster.java:458)
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster.callWithJobClassLoader(MRAppMaster.java:1560)
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster.createOutputCommitter(MRAppMaster.java:458)
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster.serviceInit(MRAppMaster.java:377)
at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster$4.run(MRAppMaster.java:1518)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster.initAndStartAppMaster(MRAppMaster.java:1515)
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster.main(MRAppMaster.java:1448)
Caused by: org.kitesdk.data.DatasetNotFoundException: Unknown dataset URI: hive://{}:9083/default/test_parquet. Check that JARs for hive datasets are on the classpath.
at org.kitesdk.data.spi.Registration.lookupDatasetUri(Registration.java:109)
at org.kitesdk.data.Datasets.load(Datasets.java:103)
at org.kitesdk.data.Datasets.load(Datasets.java:165)
at org.kitesdk.data.mapreduce.DatasetKeyOutputFormat.load(DatasetKeyOutputFormat.java:510)
at org.kitesdk.data.mapreduce.DatasetKeyOutputFormat.getOutputCommitter(DatasetKeyOutputFormat.java:473)
at org.apache.hadoop.mapreduce.v2.app.MRAppMaster$1.call(MRAppMaster.java:476)
... 11 more
I am not getting what's wrong. Is there a classpath problem? If yes, then where do I set it?
You effectively have a classpath problem.
Your project is missing org.kitesdk:kite-data-hive.
You can either:
Add this jar to your fat jar before submitting it to Spark, or
Tell Spark to add it to your classpath when you submit.

getting Error while importing data from mongodb to hdfs

I am getting errors while importing data from mongodb to hdfs.
I am using:
Ambari Sandbox [Hortonworks] Hadoop 2.7
MongoDB version 3.0
These are the jar files I am including:
mongo-java-driver-2.11.4.jar
mongo-hadoop-core-1.3.0.jar
Here is the code I am using:
package com.mongo.test;
import java.io.*;
import org.apache.commons.logging.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.mapreduce.*;
import org.bson.*;
import com.mongodb.MongoClient;
import com.mongodb.hadoop.*;
import com.mongodb.hadoop.util.*;
public class ImportFromMongoToHdfs {
private static final Log log = LogFactory.getLog(ImportFromMongoToHdfs.class);
public static class ReadEmpDataFromMongo extends Mapper<Object, BSONObject, Text, Text> {
public void map(Object key, BSONObject value, Context context) throws IOException, InterruptedException {
System.out.println("Key: " + key);
System.out.println("Value: " + value);
String md5 = value.get("md5").toString();
String name = value.get("name").toString();
String dev = value.get("dev").toString();
String salary = value.get("salary").toString();
String location = value.get("location").toString();
String output = "\t" + name + "\t" + dev + "\t" + salary + "\t" + location;
context.write( new Text(md5), new Text(output));
}
}
public static void main(String[] args)throws Exception {
final Configuration conf = new Configuration();
MongoConfigUtil.setInputURI(conf, "mongodb://10.25.3.196:27017/admin.emp");
MongoConfigUtil.setCreateInputSplits(conf, false);
System.out.println("Configuration: " + conf);
final Job job = new Job(conf, "ReadWeblogsFromMongo");
Path out = new Path("/mongodb3");
FileOutputFormat.setOutputPath(job, out);
job.setJarByClass(ImportFromMongoToHdfs.class);
job.setMapperClass(ReadEmpDataFromMongo.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(com.mongodb.hadoop.MongoInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setNumReduceTasks(0);
System.exit(job.waitForCompletion(true) ? 0 : 1 );
}
}
This is the error I am getting back:
[root@sandbox ~]# hadoop jar /mongoinput/mongdbconnect.jar com.mongo.test.ImportFromMongoToHdfs
WARNING: Use "yarn jar" to launch YARN applications.
Configuration: Configuration: core-default.xml, core-site.xml
15/09/09 09:22:51 INFO impl.TimelineClientImpl: Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
15/09/09 09:22:53 INFO client.RMProxy: Connecting to ResourceManager at sandbox.hortonworks.com/10.25.3.209:8050
15/09/09 09:22:53 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
15/09/09 09:22:54 INFO splitter.SingleMongoSplitter: SingleMongoSplitter calculating splits for mongodb://10.25.3.196:27017/admin.emp
15/09/09 09:22:54 INFO mapreduce.JobSubmitter: number of splits:1
15/09/09 09:22:55 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1441784509780_0003
15/09/09 09:22:55 INFO impl.YarnClientImpl: Submitted application application_1441784509780_0003
15/09/09 09:22:55 INFO mapreduce.Job: The url to track the job: http://sandbox.hortonworks.com:8088/proxy/application_1441784509780_0003/
15/09/09 09:22:55 INFO mapreduce.Job: Running job: job_1441784509780_0003
15/09/09 09:23:05 INFO mapreduce.Job: Job job_1441784509780_0003 running in uber mode : false
15/09/09 09:23:05 INFO mapreduce.Job: map 0% reduce 0%
15/09/09 09:23:12 INFO mapreduce.Job: Task Id : attempt_1441784509780_0003_m_000000_0, Status : FAILED
Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.mapreduce.task.JobContextImpl.getInputFormatClass(JobContextImpl.java:174)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:749)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 8 more
15/09/09 09:23:18 INFO mapreduce.Job: Task Id : attempt_1441784509780_0003_m_000000_1, Status : FAILED
Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.mapreduce.task.JobContextImpl.getInputFormatClass(JobContextImpl.java:174)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:749)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 8 more
15/09/09 09:23:24 INFO mapreduce.Job: Task Id : attempt_1441784509780_0003_m_000000_2, Status : FAILED
Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.mapreduce.task.JobContextImpl.getInputFormatClass(JobContextImpl.java:174)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:749)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657) at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 8 more
15/09/09 09:23:32 INFO mapreduce.Job: map 100% reduce 0%
15/09/09 09:23:32 INFO mapreduce.Job: Job job_1441784509780_0003 failed with state FAILED due to: Task failed task_1441784509780_0003_m_000000
Job failed as tasks failed. failedMaps:1 failedReduces:0
15/09/09 09:23:32 INFO mapreduce.Job: Counters: 9
Job Counters
Failed map tasks=4
Launched map tasks=4
Other local map tasks=3
Rack-local map tasks=1
Total time spent by all maps in occupied slots (ms)=16996
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=16996
Total vcore-seconds taken by all map tasks=16996
Total megabyte-seconds taken by all map tasks=4249000
[root@sandbox ~]#
Does anyone know what is wrong?
Make sure you keep the mongo-hadoop jar in the Hadoop classpath and restart Hadoop.
The error java.lang.ClassNotFoundException: Class com.mongodb.hadoop.MongoInputFormat should then be resolved.
You are getting a ClassNotFoundException because the job is unable to reach the jar "mongo-hadoop-core*.jar". You have to make "mongo-hadoop-core*.jar" available to your code.
There are several ways to resolve this error:
Create a fat jar for your program. The fat jar will contain all the necessary dependent jars. You can easily create a fat jar if you are using an IDE.
Use the "-libjars" argument while submitting your YARN job (a Tool/ToolRunner sketch for this follows the list).
Copy the mongo jars to the HADOOP_CLASSPATH location.
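For the -libjars route, the job log above already hints at what is needed ("Implement the Tool interface and execute your application with ToolRunner"). A minimal sketch of that wiring, with an illustrative class name; the MongoConfigUtil and Job setup from the question would move into run():
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MongoImportTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf(); // generic options such as -libjars have already been applied here
        // ... build the Job from this conf, as in the question, and call waitForCompletion ...
        return 0;
    }
    public static void main(String[] args) throws Exception {
        // ToolRunner strips the generic options (-libjars, -D, -files, ...) before calling run()
        System.exit(ToolRunner.run(new Configuration(), new MongoImportTool(), args));
    }
}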
I have just resolved a problem like this. In fact, this is an error at run time: setting HADOOP_CLASSPATH to point to the necessary external jar files was not enough, because I think at run time Hadoop looks for jar files in the folders where Hadoop is installed. I realized that we need to copy all the necessary external jar files into the Hadoop installation folders. So:
First, check your HADOOP_CLASSPATH by typing:
- hadoop classpath
Then copy the necessary external jar files into one of the HADOOP_CLASSPATH directories. For example, I copied mongo-hadoop-1.5.1.jar and some other jar files to the folder /usr/local/hadoop/share/hadoop/mapreduce.
Then it worked for me!

How to import table data stored in Hive in my MapReduce job?

I am using a single node cluster setup of Apache Hadoop 2.5.0 on Ubuntu 14.04
I stored tweets in my HDFS using Flume.
Then, I used the following Hive commands to create a table in Hive which stores all the tweets in tabular format:
CREATE EXTERNAL TABLE tweets (
id BIGINT,
created_at STRING,
source STRING,
favorited BOOLEAN,
retweet_count INT,
retweeted_status STRUCT<
text:STRING,
user:STRUCT<screen_name:STRING,name:STRING>>,
entities STRUCT<
urls:ARRAY<STRUCT<expanded_url:STRING>>,
user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
hashtags:ARRAY<STRUCT<text:STRING>>>,
text STRING,
user STRUCT<
screen_name:STRING,
name:STRING,
friends_count:INT,
followers_count:INT,
statuses_count:INT,
verified:BOOLEAN,
utc_offset:INT,
time_zone:STRING>,
in_reply_to_screen_name STRING
)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe'
LOCATION '/user/flume/tweets';
I have verified that the data exists in the table 'tweets' by querying the database using HiveQL (from the Hive Command Line Interface). I also created an output table using the following command:
CREATE TABLE outputtable (
a STRING,
b INT );
I am using Apache Hive 0.13.1 which already has HCatalog in it. After all this, I am trying to write a MapReduce Job using Java language in Eclipse. I have added the following libraries to my project as external jars:
All the libraries present in path-of-installation-of-hadoop/share/hadoop/common
All the libraries present in path-of-installation-of-hadoop/share/hadoop/mapreduce
All the libraries present in the lib folder of Hive
All the libraries present in path-of-installation-of-Hive/hcatalog/share/hcatalog
My MapReduce code is trying to import the text of the tweets from the table 'tweets' and then process it. The code is:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.*;
import org.apache.hcatalog.common.*;
import org.apache.hcatalog.mapreduce.*;
import org.apache.hcatalog.data.*;
import org.apache.hcatalog.data.schema.*;
public class UseHCat extends Configured implements Tool {
public static class Map extends Mapper<WritableComparable, HCatRecord, Text, IntWritable> {
String tweetText;
@Override
protected void map( WritableComparable key,
HCatRecord value,
org.apache.hadoop.mapreduce.Mapper<WritableComparable, HCatRecord,
Text, IntWritable>.Context context)
throws IOException, InterruptedException {
tweetText = (String) value.get(7);
int i = 1;
context.write(new Text(tweetText), new IntWritable(i));
}
}
public static class Reduce extends Reducer<Text, IntWritable,
WritableComparable, HCatRecord> {
protected void reduce( Text key,
java.lang.Iterable<IntWritable> values,
org.apache.hadoop.mapreduce.Reducer<Text, IntWritable,
WritableComparable, HCatRecord>.Context context)
throws IOException, InterruptedException {
Iterator<IntWritable> iter = values.iterator();
IntWritable iw = iter.next();
int id = iw.get();
HCatRecord record = new DefaultHCatRecord(2);
record.set(0, key.toString());
record.set(1, id);
context.write(null, record);
}
}
public int run(String[] args) throws Exception {
Configuration conf = getConf();
String inputTableName = "tweets";
String outputTableName = "outputtable";
String dbName = null;
Job job = new Job(conf, "UseHCat");
HCatInputFormat.setInput(job, InputJobInfo.create(dbName, inputTableName, null));
job.setJarByClass(UseHCat.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
// An HCatalog record as input
job.setInputFormatClass(HCatInputFormat.class);
// Mapper emits a string as key and an integer as value
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// Ignore the key for the reducer output; emitting an HCatalog record as value
job.setOutputKeyClass(WritableComparable.class);
job.setOutputValueClass(DefaultHCatRecord.class);
job.setOutputFormatClass(HCatOutputFormat.class);
HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null));
HCatSchema s = HCatOutputFormat.getTableSchema(job);
System.err.println("INFO: output schema explicitly set for writing:" + s);
HCatOutputFormat.setSchema(job, s);
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new UseHCat(), args);
System.exit(exitCode);
}
}
The first problem that we are facing is that we are getting many warnings specifying that some of the types and constructors are deprecated. We ignored the warnings and created a jar file of our Project whose main class is 'UseHCat'. Then we browsed to the location where the jar file was created using the terminal provided in Ubuntu and ran the following command:
hadoop jar MyProject.jar
We got the following error:
14/11/16 17:17:29 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/metadata/HiveStorageHandler
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:800)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:449)
at java.net.URLClassLoader.access$100(URLClassLoader.java:71)
at java.net.URLClassLoader$1.run(URLClassLoader.java:361)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
at org.apache.hcatalog.mapreduce.InitializeInput.getInputJobInfo(InitializeInput.java:146)
at org.apache.hcatalog.mapreduce.InitializeInput.setInput(InitializeInput.java:86)
at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:86)
at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:55)
at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:47)
at UseHCat.run(UseHCat.java:64)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
at UseHCat.main(UseHCat.java:91)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.main(RunJar.java:212)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.metadata.HiveStorageHandler
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
... 26 more
Hive is designed to minimize the writing of MapReduce programs; you can do the processing with Hive queries, which are internally converted into MapReduce jobs.
However, if you do want to access the Hive data from your own MapReduce job, you can. Hive is not a database: all the data is stored in readable form under the warehouse directory (or, for an external table, under its LOCATION), so you can give that full path as the input to your MapReduce program.
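As a rough illustration of that warehouse-path idea (a sketch only, not a complete job): since the tweets table in the question is an EXTERNAL table with LOCATION '/user/flume/tweets', a plain MapReduce job can read the backing files by pointing its input format at that directory:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class RawTweetsJobSketch {
    // Reads the raw JSON files that back the Hive 'tweets' table directly from HDFS.
    public static Job createJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "read-raw-tweets");
        job.setJarByClass(RawTweetsJobSketch.class);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("/user/flume/tweets"));
        // a mapper that parses each JSON line would be configured here
        return job;
    }
}
The HCatalog approach in the question is still the cleaner option once the Hive and HCatalog jars are actually on the runtime classpath; this sketch only shows the raw-files alternative mentioned above.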
Have you tried a sample MapReduce program in Eclipse? You can build the Hadoop plugin, or use an existing plugin in your Eclipse, to run MapReduce.

Getting error while running storm connection with accumulo

I have a Storm bolt as follows:
package storm.bolt;
import java.util.Map;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.MultiTableBatchWriter;
import org.apache.accumulo.core.client.TableExistsException;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.ZooKeeperInstance;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.hadoop.io.Text;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Tuple;
public class AccumuloBolt implements IRichBolt {
private static final long serialVersionUID = 1L;
private OutputCollector collector;
private ZooKeeperInstance instance;
private Connector connector;
private BatchWriter bw;
private Text colf;
private MultiTableBatchWriter mtbw;
private final String instanceName;
private final String zooServers;
private final String userName;
private final String password;
Map<String, Integer> counters;
/**
* @param zooServers The host on which Zookeeper is running.
* @param userName The Accumulo username.
* @param password The Accumulo password.
* Example values:
* String instanceName = "myistance";
* String zooServers = "192.168.1.81:2181";
* String userName = "root";
* String password = "aryadevi";
*/
public AccumuloBolt(String instanceName, String zooServers, String userName,
String password) {
this.instanceName = instanceName;
this.zooServers = zooServers;
this.userName = userName;
this.password = password;
}
public void prepare( Map stormConf, TopologyContext context, OutputCollector collector) {
this.collector = collector;
try {
//this.instance = new ZooKeeperInstance(instanceName, zooServers);
this.instance = new ZooKeeperInstance("myistance", "192.168.1.81:2181");
//this.connector= instance.getConnector(userName, password);
this.connector= instance.getConnector("root", "aryadevi");
this.mtbw=connector.createMultiTableBatchWriter(200000l, 300, 4);
this.bw=null;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
public void execute(Tuple input) {
if (shouldActOnInput(input)) {
try{
if (!this.connector.tableOperations().exists("new2"))
this.connector.tableOperations().create("new2");
this.bw = this.mtbw.getBatchWriter("new2");
this.colf=new Text("colfam");
System.out.println("writing ...");
String str = input.getString(0);
if(!counters.containsKey(str)){
counters.put(str, 1);
}else{
Integer c = counters.get(str) + 1;
counters.put(str, c);
}
}catch (Exception e) {
throw new RuntimeException(e);
}
//DBObject updateObj = getDBObjectForInput(input);
//this.bw.addMutation(m);
} else {
collector.ack(input);
}
}
public void cleanup() {
try{
for(Map.Entry<String, Integer> entry : counters.entrySet()){
Mutation m = new Mutation(new Text(String.format("row_%d",entry.getKey() )));
m.put(this.colf, new Text(String.format("colqual_%d", entry.getKey())), new Value((String.format("value_%d", entry.getValue())).getBytes()));
System.out.println(entry.getKey()+": "+entry.getValue());
bw.addMutation(m);
}
this.mtbw.close();
}catch (Exception e) {
throw new RuntimeException(e);
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// TODO Auto-generated method stub
}
public boolean shouldActOnInput(Tuple input) {
return true;
}
public Map<String, Object> getComponentConfiguration() {
// TODO Auto-generated method stub
return null;
}
}
I compile this Storm project using "mvn compile" and create a package using "mvn package".
Then I run the topology using the following command:
storm jar target/storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar storm.TwitterStorm
After running this command I get errors like the following:
java.lang.NoClassDefFoundError: Could not initialize class org.apache.accumulo.core.client.ZooKeeperInstance
at storm.bolt.AccumuloBolt.prepare(AccumuloBolt.java:60) ~[storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar:na]
at backtype.storm.daemon.executor$fn__5641$fn__5653.invoke(executor.clj:690) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at backtype.storm.util$async_loop$fn__457.invoke(util.clj:429) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_55]
46217 [Thread-8-count] ERROR backtype.storm.util - Async loop died!
java.lang.ExceptionInInitializerError: null
at org.apache.log4j.Logger.getLogger(Logger.java:39) ~[log4j-over-slf4j-1.6.6.jar:1.6.6]
at org.apache.log4j.Logger.getLogger(Logger.java:43) ~[log4j-over-slf4j-1.6.6.jar:1.6.6]
at org.apache.accumulo.core.client.ZooKeeperInstance.<clinit>(ZooKeeperInstance.java:63) ~[storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar:na]
at storm.bolt.AccumuloBolt.prepare(AccumuloBolt.java:60) ~[storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar:na]
at backtype.storm.daemon.executor$fn__5641$fn__5653.invoke(executor.clj:690) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at backtype.storm.util$async_loop$fn__457.invoke(util.clj:429) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_55]
Caused by: java.lang.IllegalStateException: Detected both log4j-over-slf4j.jar AND slf4j-log4j12.jar on the class path, preempting StackOverflowError. See also http://www.slf4j.org/codes.html#log4jDelegationLoop for more details.
at org.apache.log4j.Log4jLoggerFactory.<clinit>(Log4jLoggerFactory.java:49) ~[log4j-over-slf4j-1.6.6.jar:1.6.6]
... 8 common frames omitted
46218 [Thread-10-count] ERROR backtype.storm.daemon.executor -
java.lang.NoClassDefFoundError: Could not initialize class org.apache.accumulo.core.client.ZooKeeperInstance
at storm.bolt.AccumuloBolt.prepare(AccumuloBolt.java:60) ~[storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar:na]
at backtype.storm.daemon.executor$fn__5641$fn__5653.invoke(executor.clj:690) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at backtype.storm.util$async_loop$fn__457.invoke(util.clj:429) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_55]
46218 [Thread-8-count] ERROR backtype.storm.daemon.executor -
java.lang.ExceptionInInitializerError: null
at org.apache.log4j.Logger.getLogger(Logger.java:39) ~[log4j-over-slf4j-1.6.6.jar:1.6.6]
at org.apache.log4j.Logger.getLogger(Logger.java:43) ~[log4j-over-slf4j-1.6.6.jar:1.6.6]
at org.apache.accumulo.core.client.ZooKeeperInstance.<clinit>(ZooKeeperInstance.java:63) ~[storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar:na]
at storm.bolt.AccumuloBolt.prepare(AccumuloBolt.java:60) ~[storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar:na]
at backtype.storm.daemon.executor$fn__5641$fn__5653.invoke(executor.clj:690) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at backtype.storm.util$async_loop$fn__457.invoke(util.clj:429) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_55]
Caused by: java.lang.IllegalStateException: Detected both log4j-over-slf4j.jar AND slf4j-log4j12.jar on the class path, preempting StackOverflowError. See also http://www.slf4j.org/codes.html#log4jDelegationLoop for more details.
at org.apache.log4j.Log4jLoggerFactory.<clinit>(Log4jLoggerFactory.java:49) ~[log4j-over-slf4j-1.6.6.jar:1.6.6]
... 8 common frames omitted
46218 [Thread-6] INFO backtype.storm.daemon.executor - Loading executor count:[4 4]
46219 [Thread-6] INFO backtype.storm.daemon.task - Emitting: count __system ["startup"]
46220 [Thread-6] INFO backtype.storm.daemon.executor - Loaded executor tasks count:[4 4]
46224 [Thread-6] INFO backtype.storm.daemon.executor - Finished loading executor count:[4 4]
46224 [Thread-12-count] INFO backtype.storm.daemon.executor - Preparing bolt count:(4)
46225 [Thread-12-count] ERROR backtype.storm.util - Async loop died!
java.lang.NoClassDefFoundError: Could not initialize class org.apache.accumulo.core.client.ZooKeeperInstance
at storm.bolt.AccumuloBolt.prepare(AccumuloBolt.java:60) ~[storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar:na]
at backtype.storm.daemon.executor$fn__5641$fn__5653.invoke(executor.clj:690) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at backtype.storm.util$async_loop$fn__457.invoke(util.clj:429) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_55]
46226 [Thread-12-count] ERROR backtype.storm.daemon.executor -
java.lang.NoClassDefFoundError: Could not initialize class org.apache.accumulo.core.client.ZooKeeperInstance
at storm.bolt.AccumuloBolt.prepare(AccumuloBolt.java:60) ~[storm-twitter-0.0.1-SNAPSHOT-jar-with-dependencies.jar:na]
at backtype.storm.daemon.executor$fn__5641$fn__5653.invoke(executor.clj:690) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at backtype.storm.util$async_loop$fn__457.invoke(util.clj:429) ~[storm-core-0.9.2-incubating.jar:0.9.2-incubating]
at clojure.lang.AFn.run(AFn.java:24) [clojure-1.5.1.jar:na]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_55]
46321 [Thread-10-count] INFO backtype.storm.util - Halting process: ("Worker died")
46321 [Thread-8-count] INFO backtype.storm.util - Halting process: ("Worker died")
It looks like this Storm ticket has a relevant discussion:
https://issues.apache.org/jira/browse/STORM-122
I think Accumulo has an slf4j-log4j12 dependency, and Storm uses log4j-over-slf4j which is incompatible. The discussion seems to suggest excluding logging dependencies like slf4j-log4j12 and log4j from your Accumulo dependency. I don't know if this will work, but it's worth a try.

hadoop not running in the multinode cluster

I have a jar file "Tsp.jar" that I made myself. This same jar files executes well in single node cluster setup of hadoop. However when I run it on a cluster comprising 2 machines, a laptop and desktop it gives me an exception when the map function reach 50%. Here is the output
`hadoop#psycho-O:/usr/local/hadoop$ bin/hadoop jar Tsp.jar clust-Tsp_ip1 clust_Tsp_op4
11/04/27 16:13:06 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
11/04/27 16:13:06 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
11/04/27 16:13:06 INFO mapred.FileInputFormat: Total input paths to process : 1
11/04/27 16:13:06 INFO mapred.JobClient: Running job: job_201104271608_0001
11/04/27 16:13:07 INFO mapred.JobClient: map 0% reduce 0%
11/04/27 16:13:17 INFO mapred.JobClient: map 50% reduce 0%
11/04/27 16:13:20 INFO mapred.JobClient: Task Id : attempt_201104271608_0001_m_000001_0, Status : FAILED
java.lang.RuntimeException: java.lang.RuntimeException: java.lang.ClassNotFoundException: Tsp$TspReducer
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:841)
at org.apache.hadoop.mapred.JobConf.getCombinerClass(JobConf.java:853)
at org.apache.hadoop.mapred.Task$CombinerRunner.create(Task.java:1100)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:812)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:350)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:307)
at org.apache.hadoop.mapred.Child.main(Child.java:170)
Caused by: java.lang.RuntimeException: java.lang.ClassNotFoundException: Tsp$TspReducer
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:809)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:833)
... 6 more
Caused by: java.lang.ClassNotFoundException: Tsp$TspReducer
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:762)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:807)
... 7 more
11/04/27 16:13:20 WARN mapred.JobClient: Error reading task outputemil-desktop
11/04/27 16:13:20 WARN mapred.JobClient: Error reading task outputemil-desktop
^Z
[1]+ Stopped bin/hadoop jar Tsp.jar clust-Tsp_ip1 clust_Tsp_op4
hadoop@psycho-O:~$ jps
4937 Jps
3976 RunJar
Also, the cluster worked fine executing the wordcount example, so I guess it's a problem with the Tsp.jar file.
1) Is it necessary to have a jar file to run on a cluster?
2) Here I tried to run a jar file in the cluster which I made, but it still gives a warning that the jar file is not found. Why is that?
3) What all should be taken care of when running a jar file? What must it contain other than the program I wrote? My jar file contains a Tsp.class, a Tsp$TspReducer.class and a Tsp$TspMapper.class. The terminal says it can't find Tsp$TspReducer when it is already there in the jar file.
Thank you
EDIT
public class Tsp {
public static void main(String[] args) throws IOException {
JobConf conf = new JobConf(Tsp.class);
conf.setJobName("Tsp");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(TspMapper.class);
conf.setCombinerClass(TspReducer.class);
conf.setReducerClass(TspReducer.class);
FileInputFormat.addInputPath(conf,new Path(args[0]));
FileOutputFormat.setOutputPath(conf,new Path(args[1]));
JobClient.runJob(conf);
}
public static class TspMapper extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
function findCost() {
}
public void map(LongWritable key,Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
find adjacency matrix from the input;
for(int i = 0; ...) {
.....
output.collect(new Text(string1), new Text(string2));
}
}
}
public static class TspReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
Text t1 = new Text();
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
String a;
a = values.next().toString();
output.collect(key,new Text(a));
}
}
}
You currently have
conf.setJobName("Tsp");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(TspMapper.class);
conf.setCombinerClass(TspReducer.class);
conf.setReducerClass(TspReducer.class);
and, as the error states (No job jar file set), you are not setting a jar.
You will need to add something similar to
conf.setJarByClass(Tsp.class);
From what I'm seeing, that should resolve the error seen here.
11/04/27 16:13:06 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
Do what they say: when setting up your job, set the jar in which the class is contained. Hadoop copies the jar into the DistributedCache (making it available on the local filesystem of every node) and uses the classes out of it.
I had the exact same issue. Here is how I solved the problem (imagine your MapReduce class is called A). After creating the job, call:
job.setJarByClass(A.class);

Resources