Getting trouble on running hadoop word count program - hadoop

I am trying to run the word count program given in puma benchmark
The WordCount.java file is as follows:
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
List<String> other_args = new ArrayList<String>();
for(int i=0; i < args.length; ++i) {
try {
if ("-r".equals(args[i])) {
job.setNumReduceTasks(Integer.parseInt(args[++i]));
} else {
other_args.add(args[i]);
}
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of " + args[i]);
System.err.println("Usage: wordcount <numReduces> <in> <out>");
System.exit(2);
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from " +
args[i-1]);
System.err.println("Usage: wordcount <numReduces> <in> <out>");
System.exit(2);
}
}
// Make sure there are exactly 2 parameters left.
if (other_args.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: " +
other_args.size() + " instead of 2.");
System.err.println("Usage: wordcount <numReduces> <in> <out>");
System.exit(2);
}
FileInputFormat.addInputPath(job, new Path(other_args.get(0)));
FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
Date startIteration = new Date();
Boolean waitforCompletion = job.waitForCompletion(true) ;
Date endIteration = new Date();
System.out.println("The iteration took "
+ (endIteration.getTime() - startIteration.getTime()) / 1000
+ " seconds.");
System.exit(waitforCompletion ? 0 : 1);
}
}
I used the following commands and got the following result:
#javac -cp /opt/local/share/java/hadoop-1.2.1/hadoop-core-1.2.1.jar -d wordcount_classes WordCount.java
#jar -cvf wordcount.jar -C wordcount_classes/ .
and output that i got is:
added manifest
adding: org/(in = 0) (out= 0)(stored 0%)
adding: org/apache/(in = 0) (out= 0)(stored 0%)
adding: org/apache/hadoop/(in = 0) (out= 0)(stored 0%)
adding: org/apache/hadoop/examples/(in = 0) (out= 0)(stored 0%)
adding: org/apache/hadoop/examples/WordCount$IntSumReducer.class(in = 1793) (out= 750)(deflated 58%)
adding: org/apache/hadoop/examples/WordCount$TokenizerMapper.class(in = 1790) (out= 764)(deflated 57%)
adding: org/apache/hadoop/examples/WordCount.class(in = 3131) (out= 1682)(deflated 46%)
adding: org/myorg/(in = 0) (out= 0)(stored 0%)
adding: org/myorg/WordCount$IntSumReducer.class(in = 1759) (out= 745)(deflated 57%)
adding: org/myorg/WordCount$TokenizerMapper.class(in = 1756) (out= 759)(deflated 56%)
adding: org/myorg/WordCount.class(in = 3080) (out= 1676)(deflated 45%)
#hadoop jar wordcount.jar WordCount ../input/file01.txt ../output/
I got the following output:
Exception in thread "main" java.lang.NoClassDefFoundError: WordCount (wrong name: org/apache/hadoop/examples/WordCount)
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:800)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:449)
at java.net.URLClassLoader.access$100(URLClassLoader.java:71)
at java.net.URLClassLoader$1.run(URLClassLoader.java:361)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:412)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:270)
at org.apache.hadoop.util.RunJar.main(RunJar.java:205)
I applied all the procedure described before in this site but nothing is working for me.
I would be very thankful if anyone tells me how to solve this problem.

Change the package statement to
package org.myorg;
And run the program with the full class name.
Looking at your output, you seem to include the WordCount class twice in different paths (= packages), but when you run the program, you don't specify any package.

hadoop jar wordcount.jar org/apache/hadoop/examples/WordCount ../input/file01.txt ../output/
I think the problem is there, because you are not using the full class name.

Your wordcount.jar contains two WordCount classes; specify the fully qualified name of the one you want to run.
e.g
hadoop jar wordcount.jar org.apache.hadoop.examples.WordCount ../input/file01.txt ../output/
or
hadoop jar wordcount.jar org.myorg.WordCount ../input/file01.txt ../output/

Your WordCount class has got two nested classes inside itself, i.e.: TokenizerMapper and IntSumReducer.
You need to make sure that these classes are included in the jar file you are generating. try this:
jar cvf WordCount.jar WordCount.class WordCount\$TokenizerMapper.class WordCount\$IntSumReducer.class

Related

Hadoop WordCount Tutorial java.lang.ClassNotFoundException

I'm relatively new to hadoop and I'm struggling a little bit to understand the ClassNotFoundException I get when trying to run the job. I'm using the standard tutorial found here and here is my WordCount class (running on ubuntu 16.04 hadoop 2.7.3 distributed cluster mode):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Minimal word-count job: the mapper emits {@code (token, 1)} for each
 * whitespace-separated token and the reducer (also used as combiner) adds
 * the counts up per token.
 */
public class WordCount {

  /** Splits each input line into tokens and writes every token with a count of one. */
  public static class TokenizerMapper
      extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // A for-loop over the tokenizer; same iteration as a while/hasMoreTokens loop.
      for (StringTokenizer tokens = new StringTokenizer(value.toString());
          tokens.hasMoreTokens(); ) {
        word.set(tokens.nextToken());
        context.write(word, one);
      }
    }
  }

  /** Adds together all counts received for one key. */
  public static class IntSumReducer
      extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int total = 0;
      for (IntWritable count : values) {
        total = total + count.get();
      }
      result.set(total);
      context.write(key, result);
    }
  }

  /**
   * Runs the job with {@code args[0]} as the input path and {@code args[1]}
   * as the output path; the process exit status is 0 on success, 1 otherwise.
   */
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "word count");
    job.setJarByClass(WordCount.class);

    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    boolean completed = job.waitForCompletion(true);
    if (completed) {
      System.exit(0);
    } else {
      System.exit(1);
    }
  }
}
To try and remain organized, I added a couple paths to my ~/.bashrc file:
hduser@mynode:~$ cd $HADOOP_CODE
hduser@mynode:/usr/local/hadoop/code$
This is one directory down from the $HADOOP_HOME directory. To compile the WordCount.JAVA file, I ran:
hduser@mynode:/usr/local/hadoop$ hadoop com.sun.tools.javac.Main $HADOOP_CODE/WordCount.java
hduser@mynode:/usr/local/hadoop$ jar cf wc.jar $HADOOP_CODE/WordCount*.class
I then tried:
hduser@mynode:/usr/local/hadoop$ hadoop jar $HADOOP_CODE/wc.jar $HADOOP_CODE/WordCount /home/hduser/input /home/hduser/output/wordcount
which bombed with the following error:
Exception in thread "main" java.lang.ClassNotFoundException: /usr/local/hadoop/code/WordCount
EDIT
This gave me the same error:
hduser@mynode:/usr/local/hadoop/code$ hadoop jar $HADOOP_CODE/wc.jar WordCount /home/hduser/input /home/hduser/output/wordcount
To get it to run without error, I moved the WordCount.Java file up one directory to the default hadoop ($HADOOP_HOME) folder. I also know from here and here that the solution is to add a package to the file.
What I'm trying to understand is why that is the solution. With no package name, where is hadoop looking for the specified package, and why can't I pass it a full path to get it to run correctly? This may be a basic java question (apologies - I'm from the python world), but what is the package name doing during the compile process that makes it so I could run without a path name, but leaving off the package name means it has to be in that default directory? I'd prefer not to have to add a package name to every job I run. An explanation would be greatly appreciated!

Map-reduce job giving ClassNotFound exception even though mapper is present when running with yarn?

I am running a hadoop job which is working fine when I am running it without yarn in pseudo-distributed mode, but it is giving me class not found exception when running with yarn
16/03/24 01:43:40 INFO mapreduce.Job: Task Id : attempt_1458775953882_0002_m_000003_1, Status : FAILED
Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.hadoop.keyword.count.ItemMapper not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.mapreduce.task.JobContextImpl.getMapperClass(JobContextImpl.java:186)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:745)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassNotFoundException: Class com.hadoop.keyword.count.ItemMapper not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 8 more
Here is the source-code for the job
// Driver fragment: builds the "item count" job and blocks until it finishes.
Configuration conf = new Configuration();
// The third command-line argument is stored under the "keywords" key of the job configuration.
conf.set("keywords", args[2]);
Job job = Job.getInstance(conf, "item count");
// The job jar is located from the jar that contains ItemImpl.
job.setJarByClass(ItemImpl.class);
job.setMapperClass(ItemMapper.class);
job.setReducerClass(ItemReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// args[0] is the input path, args[1] the output path.
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Exit status 0 when the job reports success, 1 otherwise.
System.exit(job.waitForCompletion(true) ? 0 : 1);
Here is the command I am running
hadoop jar ~/itemcount.jar /user/rohit/tweets /home/rohit/outputs/23mar-yarn13 vodka,wine,whisky
Edit Code, after suggestion
package com.hadoop.keyword.count;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
public class ItemImpl {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("keywords", args[2]);
Job job = Job.getInstance(conf, "item count");
job.setJarByClass(ItemImpl.class);
job.setMapperClass(ItemMapper.class);
job.setReducerClass(ItemReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static class ItemMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
JSONParser parser = new JSONParser();
#Override
public void map(Object key, Text value, Context output) throws IOException,
InterruptedException {
JSONObject tweetObject = null;
String[] keywords = this.getKeyWords(output);
try {
tweetObject = (JSONObject) parser.parse(value.toString());
} catch (ParseException e) {
e.printStackTrace();
}
if (tweetObject != null) {
String tweetText = (String) tweetObject.get("text");
if(tweetText == null){
return;
}
tweetText = tweetText.toLowerCase();
/* StringTokenizer st = new StringTokenizer(tweetText);
ArrayList<String> tokens = new ArrayList<String>();
while (st.hasMoreTokens()) {
tokens.add(st.nextToken());
}*/
for (String keyword : keywords) {
keyword = keyword.toLowerCase();
if (tweetText.contains(keyword)) {
output.write(new Text(keyword), one);
}
}
output.write(new Text("count"), one);
}
}
String[] getKeyWords(Mapper<Object, Text, Text, IntWritable>.Context context) {
Configuration conf = (Configuration) context.getConfiguration();
String param = conf.get("keywords");
return param.split(",");
}
}
public static class ItemReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
#Override
protected void reduce(Text key, Iterable<IntWritable> values, Context output)
throws IOException, InterruptedException {
int wordCount = 0;
for (IntWritable value : values) {
wordCount += value.get();
}
output.write(key, new IntWritable(wordCount));
}
}
}
Running in full distributed mode your TaskTracker/NodeManager (the thing running your mapper) is running in a separate JVM and it sounds like your class is not making it onto that JVM's classpath.
Try using the -libjars <csv,list,of,jars> command line arg on job invocation. This will have Hadoop distribute the jar to the TaskTracker JVM and load your classes from that jar. (Note, this copies the jar out to each node in your cluster and makes it available only for that specific job. If you have common libraries that would need to be invoked for a lot of jobs, you'd want to look into using the Hadoop distributed cache.)
You may also want to try yarn jar ... when launching your job versus hadoop jar ... since that's the new/preferred way to launch yarn jobs.
Can you check the content of your itemcount.jar ?( jar -tvf itemcount.jar). I faced this issue once only to find that the .class was missing from the jar.
I had the same error a few days ago.
Changing map and reduce classes to static fixed my problem.
Make your map and reduce classes inner classes.
Control constructors of map and reduce classes (i/o values and override statement)
Check your jar command
old one
hadoop jar ~/itemcount.jar /user/rohit/tweets /home/rohit/outputs/23mar-yarn13 vodka,wine,whisky
new
hadoop jar ~/itemcount.jar com.hadoop.keyword.count.ItemImpl /user/rohit/tweets /home/rohit/outputs/23mar-yarn13 vodka,wine,whisky
add packageName.mainclass after you specified .jar file
Try-catch
try {
tweetObject = (JSONObject) parser.parse(value.toString());
} catch (Exception e) { **// Change ParseException to Exception if you don't only expect Parse error**
e.printStackTrace();
return; **// return from function in case of any error**
}
}
extends Configured and implement Tool
public class ItemImpl extends Configured implements Tool{
/** Entry point: runs this tool through ToolRunner and exits with the status it returns. */
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new ItemImpl(), args));
}
#Override
public int run(String[] args) throws Exception {
Job job=Job.getInstance(getConf(),"ItemImpl ");
job.setJarByClass(this.getClass());
job.setJarByClass(ItemImpl.class);
job.setMapperClass(ItemMapper.class);
job.setReducerClass(ItemReducer.class);
job.setMapOutputKeyClass(Text.class);//probably not essential but make it certain and clear
job.setMapOutputValueClass(IntWritable.class); //probably not essential but make it certain and clear
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
add public static map
add public static reduce
I'm not an expert on this topic, but this implementation is from one of my working projects. Try it; if it doesn't work for you, I would suggest checking the libraries you added to your project.
The first step will probably solve it, but
if these steps don't work, share the code with us.

Setting number of Reduce tasks using command line

I am a beginner in Hadoop. When trying to set the number of reducers using command line using Generic Options Parser, the number of reducers is not changing. There is no property set in the configuration file "mapred-site.xml" for the number of reducers and I think, that would make the number of reducers=1 by default. I am using cloudera QuickVM and hadoop version : "Hadoop 2.5.0-cdh5.2.0".
Pointers Appreciated. Also my issue was I wanted to know the preference order of the ways to set the number of reducers.
Using configuration File "mapred-site.xml"
mapred.reduce.tasks
By specifying in the driver class
job.setNumReduceTasks(4)
By specifying at the command line using Tool interface:
-Dmapreduce.job.reduces=2
Mapper :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
//Split the line into words
for(String word: line.split("\\W+"))
{
//Make sure that the word is legitimate
if(word.length() > 0)
{
//Emit the word as you see it
context.write(new Text(word), new IntWritable(1));
}
}
}
}
Reducer :
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
//Initializing the word count to 0 for every key
int count=0;
for(IntWritable value: values)
{
//Adding the word count counter to count
count += value.get();
}
//Finally write the word and its count
context.write(key, new IntWritable(count));
}
}
Driver :
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCount extends Configured implements Tool
{
public int run(String[] args) throws Exception
{
//Instantiate the job object for configuring your job
Job job = new Job();
//Specify the class that hadoop needs to look in the JAR file
//This Jar file is then sent to all the machines in the cluster
job.setJarByClass(WordCount.class);
//Set a meaningful name to the job
job.setJobName("Word Count");
//Add the apth from where the file input is to be taken
FileInputFormat.addInputPath(job, new Path(args[0]));
//Set the path where the output must be stored
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//Set the Mapper and the Reducer class
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
//Set the type of the key and value of Mapper and reducer
/*
* If the Mapper output type and Reducer output type are not the same then
* also include setMapOutputKeyClass() and setMapOutputKeyValue()
*/
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//job.setNumReduceTasks(4);
//Start the job and wait for it to finish. And exit the program based on
//the success of the program
System.exit(job.waitForCompletion(true)?0:1);
return 0;
}
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
}
And I have tried the following commands to run the job :
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -Dmapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take13
and
hadoop jar /home/cloudera/Misc/wordCount.jar WordCount -D mapreduce.job.reduces=2 hdfs:/Input/inputdata hdfs:/Output/wordcount_tool_D=2_take14
Answering your query on order. It would always be 2>3>1
The option specified in your driver class takes precedence over the ones you specify as an argument to the GenericOptionsParser or the ones you specify in your site-specific config.
I would recommend debugging the configurations inside your driver class by printing it out before you submit the job. This way , you can be sure what the configurations are , right before you submit the job to the cluster.
Configuration conf = getConf(); // This is available to you since you extended Configured
// Configuration is iterable over its key/value entries; print each one.
for (java.util.Map.Entry<String, String> entry : conf) {
    System.out.println(entry.getKey() + "=" + entry.getValue());
}

Hadoop Custom Java Program

I have a simple java program called putmerge that I am trying to execute. I have been at it for like 6hrs, researched many places on the web but could not find solution. Basically I try to build the jar with all class libraries with the following command:
javac -classpath *:lib/* -d playground/classes playground/src/PutMerge.java
And then I build the jar with the following command.
jar -cvf playground/putmerge.jar -C playground/classes/ .
And then I try to execute it with the following command:
bin/hadoop jar playground/putmerge.jar org.scd.putmerge "..inputPath.." "..outPath"
..
Exception in thread "main" java.lang.ClassNotFoundException: com.scd.putmerge
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:270)
at org.apache.hadoop.util.RunJar.main(RunJar.java:153)
I tried every permutation/combination to run this simple jar, however I always get some kind of exception as shown above.
My source code:
package org.scd.putmerge;
import java.io.IOException;
import java.util.Scanner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Concatenates every file of a local directory into a single file on the
 * default (HDFS) file system.
 *
 * <p>Command line: {@code <local input dir> <destination file>}.
 *
 * @author Anup V. Saumithri
 */
public class PutMerge
{
    /**
     * @param args {@code args[0]} local input directory, {@code args[1]} destination file
     * @throws IOException if the file systems cannot be obtained
     */
    public static void main(String[] args) throws IOException
    {
        if (args.length != 2)
        {
            System.err.println("Usage: PutMerge <local input dir> <destination file>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);
        FileSystem local = FileSystem.getLocal(conf);
        Path inputDir = new Path(args[0]);
        Path hdfsFile = new Path(args[1]);
        try
        {
            // List first, so a bad input directory does not leave an empty output file.
            FileStatus[] inputFiles = local.listStatus(inputDir);
            byte[] buffer = new byte[4096];
            // try-with-resources closes the streams even when a copy fails midway.
            try (FSDataOutputStream out = hdfs.create(hdfsFile))
            {
                for (FileStatus inputFile : inputFiles)
                {
                    System.out.println(inputFile.getPath().getName());
                    try (FSDataInputStream in = local.open(inputFile.getPath()))
                    {
                        int bytesRead;
                        while ((bytesRead = in.read(buffer)) != -1)
                        {
                            out.write(buffer, 0, bytesRead);
                        }
                    }
                }
            }
        }
        catch (IOException ex)
        {
            ex.printStackTrace();
            // Report the failure to the caller instead of exiting with status 0.
            System.exit(1);
        }
    }
}
The way you are putting your PutMerge class inside the jar may be a little incorrect.
If you do a jar tf putmerge.jar, you must see the PutMerge class inside the path mentioned in your package (org.scd.putmerge) in your code (i.e. org/scd/putmerge).
If not try doing the following to achieve that. Make sure you have copied PutMerge.class inside org/scd/putmerge/ directory.
jar -cvf playground/putmerge.jar org/scd/putmerge/PutMerge.class
Next, verify again with jar tf putmerge.jar to check if now see org/scd/putmerge/PutMerge.class in the output.
If everything's fine, you can try to run the hadoop jar again. But looking at the errors, I see that you haven't actually included the PutMerge class with the package. You should use org.scd.putmerge.PutMerge. So, the correct way should be something like --
bin/hadoop jar playground/putmerge.jar org.scd.putmerge.PutMerge "..inputPath.." "..outPath"

How to use Hadoop FS API inside Storm Bolt in java

I want to store the data in hdfs which is emitted by Storm Spout. I have added hadoop FS API code in Bolt Class, but It is throwing compilation error with storm.
Following is the Storm bolt Class :
package bolts;
import java.io.*;
import java.util.*;
import java.net.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
/**
 * Bolt that, for every tuple it receives, copies a fixed local CSV file to a
 * fixed HDFS destination.
 */
public class DataNormalizer extends BaseBasicBolt {

    /**
     * Copies the local file to the destination path: streams it when the
     * destination does not exist yet, otherwise calls
     * {@code copyFromLocalFile}.
     *
     * @param input     the incoming tuple; its first field is read as a string
     * @param collector unused by this method
     */
    public void execute(Tuple input, BasicOutputCollector collector) {
        // NOTE(review): the tuple payload is read but not used by the copy below.
        String sentence = input.getString(0);
        // The original declared this as "filepath" but referred to it as "source"
        // everywhere else, which did not compile.
        String source = "/root/data/top_output.csv";
        String dest = "hdfs://localhost:9000/user/root/nishu/top_output/top_output_1.csv";
        try {
            Configuration conf = new Configuration();
            FileSystem fileSystem = FileSystem.get(conf);
            System.out.println(fileSystem);
            Path srcPath = new Path(source);
            Path dstPath = new Path(dest);
            try {
                if (!fileSystem.exists(dstPath)) {
                    streamLocalFile(source, fileSystem, dstPath);
                } else {
                    fileSystem.copyFromLocalFile(srcPath, dstPath);
                }
            } catch (Exception e) {
                System.err.println("Exception caught! :" + e);
                // NOTE(review): this terminates the whole JVM hosting the bolt —
                // confirm that is intended rather than failing just this tuple.
                System.exit(1);
            } finally {
                // NOTE(review): FileSystem.get may hand out a shared cached
                // instance; closing it here could affect other users — confirm.
                fileSystem.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Streams the local file into a newly created destination file, closing both streams. */
    private static void streamLocalFile(String source, FileSystem fileSystem, Path dstPath)
            throws IOException {
        FSDataOutputStream out = fileSystem.create(dstPath, true);
        try {
            InputStream in = new BufferedInputStream(
                    new FileInputStream(new File(source)));
            try {
                byte[] b = new byte[1024];
                int numBytes;
                while ((numBytes = in.read(b)) > 0) {
                    out.write(b, 0, numBytes);
                }
            } finally {
                in.close();
            }
        } finally {
            out.close();
        }
    }
}
I have added hadoop jars in CLASSPATH also..
Following is the value of classpath :
$STORM_HOME/storm-0.8.1.jar:$JAVA_HOME/lib/:$HADOOP_HOME/hadoop-core-1.0.4.jar:$HADOOP_HOME/lib/:$STORM_HOME/lib/
I also copied the Hadoop libraries hadoop-core-1.0.4.jar, commons-collections-3.2.1.jar and commons-cli-1.2.jar into the Storm/lib directory.
When I am building this project, It is throwing following error :
3006 [Thread-16] ERROR backtype.storm.daemon.executor -
java.lang.NoClassDefFoundError: org/apache/commons/configuration/Configuration
at org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.<init>(DefaultMetricsSystem.java:37)
at org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.<clinit>(DefaultMetricsSystem.java:34)
at org.apache.hadoop.security.UgiInstrumentation.create(UgiInstrumentation.java:51)
at org.apache.hadoop.security.UserGroupInformation.initialize(UserGroupInformation.java:216)
at org.apache.hadoop.security.UserGroupInformation.ensureInitialized(UserGroupInformation.java:184)
at org.apache.hadoop.security.UserGroupInformation.isSecurityEnabled(UserGroupInformation.java:236)
at org.apache.hadoop.security.UserGroupInformation.getLoginUser(UserGroupInformation.java:466)
at org.apache.hadoop.security.UserGroupInformation.getCurrentUser(UserGroupInformation.java:452)
at org.apache.hadoop.fs.FileSystem$Cache$Key.<init>(FileSystem.java:1494)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:1395)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:254)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:123)
at bolts.DataNormalizer.execute(DataNormalizer.java:67)
at backtype.storm.topology.BasicBoltExecutor.execute(BasicBoltExecutor.java:32)
......................
The error message tells you that Apache commons configuration is missing. You have to add it to the classpath.
More generally, you should add all Hadoop dependencies to your classpath. You can find them using a dependency manager (Maven, Ivy, Gradle etc.) or look into /usr/lib/hadoop/lib on a machine on which Hadoop is installed.

Resources