Hadoop Exception in thread "main" java.io.FileNotFoundException: hadoop-mapreduce-client-core-2.6.0.jar even though that file exists - hadoop

I am working with hadoop-2.6.0 and hbase-0.98.9. While running a Hadoop job, it throws java.io.FileNotFoundException even though the file exists locally and is on the classpath; the job still looks for it under an hdfs:// path. What could be the problem? I checked a similar question, but that one is about third-party jars, whereas here the jar is already on the classpath. Here is the error:
15/05/23 02:08:39 INFO zookeeper.ZooKeeper: Initiating client connection, connectString=localhost:2181 sessionTimeout=90000 watcher=hconnection-0x6737ca, quorum=localhost:2181, baseZNode=/hbase
15/05/23 02:08:39 INFO zookeeper.ClientCnxn: Opening socket connection to server localhost/127.0.0.1:2181. Will not attempt to authenticate using SASL (unknown error)
15/05/23 02:08:39 INFO zookeeper.ClientCnxn: Socket connection established to localhost/127.0.0.1:2181, initiating session
15/05/23 02:08:39 INFO zookeeper.ClientCnxn: Session establishment complete on server localhost/127.0.0.1:2181, sessionid = 0x14d7fd352eb000c, negotiated timeout = 40000
15/05/23 02:08:40 INFO mapreduce.TableOutputFormat: Created table instance for Energy
15/05/23 02:08:40 WARN mapreduce.JobSubmitter: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
15/05/23 02:08:40 INFO mapreduce.JobSubmitter: Cleaning up the staging area file:/home/vijaykumar/hadoop/hadoop_tmpdir/mapred/staging/vijaykumar1706101359/.staging/job_local1706101359_0001
Exception in thread "main" java.io.FileNotFoundException: File does not exist: hdfs://localhost:54310/home/vijaykumar/hadoop/hadoop-2.6.0/share/hadoop/mapreduce/hadoop-mapreduce-client-core-2.6.0.jar
at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:1122)
at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:1114)
at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1114)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:288)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:224)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestamps(ClientDistributedCacheManager.java:93)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(ClientDistributedCacheManager.java:57)
at org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:269)
at org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:390)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:483)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1296)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1293)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1293)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1314)
at habseWrite.run(habseWrite.java:142)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at habseWrite.main(habseWrite.java:107)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
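For context on why a local jar shows up under an hdfs:// URI at all: the job client qualifies any classpath or distributed-cache path that carries no scheme against fs.defaultFS, which is what the ClientDistributedCacheManager frames in the trace above are doing. A minimal sketch of that resolution, with an illustrative jar path and an assumed fs.defaultFS of hdfs://localhost:54310:

// Minimal sketch (not the poster's code): a Path with no scheme is qualified against
// fs.defaultFS, so a plain local path is rewritten to hdfs://localhost:54310/... before
// the distributed-cache timestamp check in the stack trace runs. The jar path is illustrative.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class SchemeResolutionDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:54310");  // assumed cluster setting
        Path noScheme = new Path("/home/user/share/hadoop/mapreduce/some-client.jar");
        System.out.println(noScheme.getFileSystem(conf).makeQualified(noScheme));
        // prints hdfs://localhost:54310/home/user/share/hadoop/mapreduce/some-client.jar

        Path withScheme = new Path("file:///home/user/share/hadoop/mapreduce/some-client.jar");
        System.out.println(withScheme.getFileSystem(conf).makeQualified(withScheme));
        // stays on the local filesystem because the scheme is explicit
    }
}

Paths that must stay local therefore need an explicit file:// scheme, or must not reach the job's classpath/tmpjars settings as bare local paths.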
Mapper class
public static class WriteMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
    IntWritable k = new IntWritable();
    Text res = new Text();

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String val = value.toString();
        int row = val.hashCode();
        k.set(row);
        res.set(val);
        context.write(k, res);
    }
}
Reducer code
public static class WriteReducer extends TableReducer<IntWritable, Text, Text> {
    public static final byte[] area = "Area".getBytes();
    public static final byte[] prop = "Property".getBytes();
    private Text rowkey = new Text();
    private int rowCount = 0;

    public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        String X1 = "", X2 = "", X3 = "", X4 = "", X5 = "",
               X6 = "", X7 = "", X8 = "", Y1 = "", Y2 = "";
        for (Text val : values) {
            String[] v = val.toString().split("\t");
            X1 = v[0];
            X2 = v[1];
            X3 = v[2];
            X4 = v[3];
            X5 = v[4];
            X6 = v[5];
            X7 = v[6];
            X8 = v[7];
            Y1 = v[8];
            Y2 = v[9];
        }
        String k = "row" + rowCount;
        Put put = new Put(Bytes.toBytes(k.toString()));
        put.add(area, "X1".getBytes(), Bytes.toBytes(X1));
        put.add(area, "X5".getBytes(), Bytes.toBytes(X5));
        put.add(area, "X6".getBytes(), Bytes.toBytes(X6));
        put.add(area, "Y1".getBytes(), Bytes.toBytes(Y1));
        put.add(area, "Y2".getBytes(), Bytes.toBytes(Y2));
        put.add(prop, "X2".getBytes(), Bytes.toBytes(X2));
        put.add(prop, "X3".getBytes(), Bytes.toBytes(X3));
        put.add(prop, "X4".getBytes(), Bytes.toBytes(X4));
        put.add(prop, "X7".getBytes(), Bytes.toBytes(X7));
        put.add(prop, "X8".getBytes(), Bytes.toBytes(X8));
        rowCount++;
        rowkey.set(k);
        context.write(rowkey, put);
    }
}
And main
public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new habseWrite(), args);
    System.exit(res);
}

public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = HBaseConfiguration.create();
    String inputPath = args[0];
    Job job = new Job(conf, "HBase_write");
    job.setInputFormatClass(TextInputFormat.class);
    job.setJarByClass(habseWrite.class);
    job.setMapperClass(habseWrite.WriteMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    TableMapReduceUtil.initTableReducerJob(
            "Energy",            // output table
            WriteReducer.class,  // reducer class
            job);
    job.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(job, inputPath);
    return (job.waitForCompletion(true) ? 0 : 1);
}
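One detail worth noting from the log above: the warning "Hadoop command-line option parsing not performed" appears because run() builds a fresh HBaseConfiguration.create() instead of starting from the configuration that ToolRunner parsed, so generic options such as -conf, -D and -libjars never reach the submitted job. A hedged sketch of a run() that keeps them (assuming habseWrite extends Configured and implements Tool, as the ToolRunner call suggests); this addresses the warning, not necessarily the FileNotFoundException itself:

// Hedged sketch only, not the accepted fix: build the job from the Configuration
// that ToolRunner populated (getConf()), merged into the HBase configuration, so
// the generic options -conf, -D and -libjars take effect for the submitted job.
public int run(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create(getConf());
    Job job = Job.getInstance(conf, "HBase_write");
    job.setJarByClass(habseWrite.class);
    // ... same mapper/reducer/table wiring as in the run() above ...
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    return job.waitForCompletion(true) ? 0 : 1;
}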

Related

How to fix this issue that I'm getting while executing map-reduce java code in Eclipse in Windows 10?

I am trying to execute a Java program that counts the words in an input file using MapReduce in Hadoop. I am using Windows 10 and the Eclipse IDE.
I am getting a FileNotFoundException when the reducer starts executing; the mapper completes fine. Please help me resolve the issue, I have been stuck here for quite a while.
public class CountMax {
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                value.set(tokenizer.nextToken());
                context.write(value, new IntWritable(1));
            }
            System.out.println("In mapper");
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        public int maxCount = 0;
        public String maxCountWord = "";

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable x : values)
                sum += x.get();
            if (sum > maxCount) {
                maxCountWord = key.toString();
                maxCount = sum;
                System.out.println(sum);
                System.out.println(key);
            }
            System.out.println(maxCountWord);
        }

        public void setup(Context context) throws IOException, InterruptedException {
            System.out.println("in SETUP");
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new Text(maxCountWord), new IntWritable(maxCount));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(CountMax.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        Path inp = new Path("C:/input.txt");
        Path out = new Path("C:/output");
        FileInputFormat.addInputPath(job, inp);
        FileOutputFormat.setOutputPath(job, out);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
This is what I'm getting in the console:
2019-10-07 00:07:39,461 INFO mapred.LocalJobRunner (LocalJobRunner.java:run(249)) - Finishing task: attempt_local2096667908_0001_m_000000_0
2019-10-07 00:07:39,461 INFO mapred.LocalJobRunner (LocalJobRunner.java:runTasks(456)) - map task executor complete.
2019-10-07 00:07:39,463 INFO mapred.LocalJobRunner (LocalJobRunner.java:runTasks(448)) - Waiting for reduce tasks
2019-10-07 00:07:39,463 INFO mapred.LocalJobRunner (LocalJobRunner.java:run(302)) - Starting task: attempt_local2096667908_0001_r_000000_0
2019-10-07 00:07:39,472 INFO output.FileOutputCommitter (FileOutputCommitter.java:<init>(108)) - File Output Committer Algorithm version is 1
2019-10-07 00:07:39,472 INFO util.ProcfsBasedProcessTree (ProcfsBasedProcessTree.java:isAvailable(192)) - ProcfsBasedProcessTree currently is supported only on Linux.
2019-10-07 00:07:39,499 INFO mapred.Task (Task.java:initialize(614)) - Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree#7734a2ef
2019-10-07 00:07:39,501 INFO mapred.ReduceTask (ReduceTask.java:run(362)) - Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle#18daa432
2019-10-07 00:07:39,509 INFO reduce.MergeManagerImpl (MergeManagerImpl.java:<init>(205)) - MergerManager: memoryLimit=1314232704, maxSingleShuffleLimit=328558176, mergeThreshold=867393600, ioSortFactor=10, memToMemMergeOutputsThreshold=10
2019-10-07 00:07:39,510 INFO reduce.EventFetcher (EventFetcher.java:run(61)) - attempt_local2096667908_0001_r_000000_0 Thread started: EventFetcher for fetching Map Completion Events
2019-10-07 00:07:39,528 INFO mapred.LocalJobRunner (LocalJobRunner.java:runTasks(456)) - reduce task executor complete.
2019-10-07 00:07:39,532 WARN mapred.LocalJobRunner (LocalJobRunner.java:run(560)) - job_local2096667908_0001
java.lang.Exception: org.apache.hadoop.mapreduce.task.reduce.Shuffle$ShuffleError: error in shuffle in localfetcher#1
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:529)
Caused by: org.apache.hadoop.mapreduce.task.reduce.Shuffle$ShuffleError: error in shuffle in localfetcher#1
at org.apache.hadoop.mapreduce.task.reduce.Shuffle.run(Shuffle.java:134)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:376)
at org.apache.hadoop.mapred.LocalJobRunner$Job$ReduceTaskRunnable.run(LocalJobRunner.java:319)
at java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.io.FileNotFoundException: C:/tmp/hadoop-SahilJ%20PC/mapred/local/localRunner/SahilJ%20PC/jobcache/job_local2096667908_0001/attempt_local2096667908_0001_m_000000_0/output/file.out.index
at org.apache.hadoop.fs.RawLocalFileSystem.open(RawLocalFileSystem.java:200)
at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:768)
at org.apache.hadoop.io.SecureIOUtils.openFSDataInputStream(SecureIOUtils.java:155)
at org.apache.hadoop.mapred.SpillRecord.<init>(SpillRecord.java:71)
at org.apache.hadoop.mapred.SpillRecord.<init>(SpillRecord.java:62)
at org.apache.hadoop.mapred.SpillRecord.<init>(SpillRecord.java:57)
at org.apache.hadoop.mapreduce.task.reduce.LocalFetcher.copyMapOutput(LocalFetcher.java:124)
at org.apache.hadoop.mapreduce.task.reduce.LocalFetcher.doCopy(LocalFetcher.java:102)
at org.apache.hadoop.mapreduce.task.reduce.LocalFetcher.run(LocalFetcher.java:85)
Making a new user account without any space in the name, giving it all the admin rights, and doing the whole setup again under the new account worked and solved the issue.
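For reference, the failing path C:/tmp/hadoop-SahilJ%20PC/... comes from hadoop.tmp.dir, whose default is /tmp/hadoop-${user.name}; the %20 is the URL-encoded space in the user name, which is why a space-free account fixes it. A commonly suggested alternative (a sketch only, untested here) is to point that directory at a space-free location before creating the job; C:/hadoop/tmp below is just an example:

// Hedged sketch: relocate the local scratch space used by LocalJobRunner so the
// path contains no spaces. The directory is an example, not a required location.
Configuration conf = new Configuration();
conf.set("hadoop.tmp.dir", "C:/hadoop/tmp"); // default: /tmp/hadoop-${user.name}
Job job = Job.getInstance(conf, "word count");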

Hadoop mapreduce - mapping NullPointerException

I need to write a simple map-reduce program that, given as input a directed graph represented as a list of edges, produces the same graph where each edge (x,y) with x>y is replaced by (y,x) and there are no repeated edges in the output graph.
INPUT
1;3
2;1
0;1
3;1
2;0
1;1
2;1
OUTPUT
1;3
1;2
0;1
0;2
1;1
This is the code:
public class ExamGraph {
    // mapper class
    public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            value = new Text(value.toString());
            String[] campi = value.toString().split(";");
            if (Integer.getInteger(campi[0]) > Integer.getInteger(campi[1]))
                context.write(new Text(campi[1] + ";" + campi[0]), NullWritable.get());
            else context.write(new Text(campi[0] + ";" + campi[1]), NullWritable.get());
        }
    }

    // reducer class
    public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        // create new job
        Job job = Job.getInstance(new Configuration());
        // job is based on jar containing this class
        job.setJarByClass(ExamGraph.class);
        // for logging purposes
        job.setJobName("ExamGraph");
        // set input path in HDFS
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // set output path in HDFS (destination must not exist)
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // set mapper and reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // An InputFormat for plain text files.
        // Files are broken into lines. Either linefeed or carriage-return are used
        // to signal end of line. Keys are the position in the file, and values
        // are the line of text.
        job.setInputFormatClass(TextInputFormat.class);
        // set type of output keys and values for both mappers and reducers
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // start job
        job.waitForCompletion(true);
    }
}
When I run the jar file using:
hadoop jar path/jar JOBNAME /inputlocation /outputlocation
I got this error:
18/05/22 02:13:11 INFO mapreduce.Job: Task Id : attempt_1526979627085_0001_m_000000_1, Status : FAILED
Error: java.lang.NullPointerException
at ExamGraph$MyMapper.map(ExamGraph.java:38)
at ExamGraph$MyMapper.map(ExamGraph.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:793)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1917)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
But I could not find the error in the code.
Found the problem: I had confused the method getInteger() with parseInt() in the mapper.
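For anyone hitting the same thing: Integer.getInteger(String) looks up a system property with that name and returns null when it is not defined, and auto-unboxing that null in the > comparison is what throws the NullPointerException; Integer.parseInt(String) actually parses the text. A tiny standalone illustration:

// Illustration of the mix-up described above (standalone, not part of the job).
public class GetIntegerVsParseInt {
    public static void main(String[] args) {
        System.out.println(Integer.getInteger("3")); // null: reads a system property named "3"
        System.out.println(Integer.parseInt("3"));   // 3: parses the string
        // so the mapper comparison should read:
        // if (Integer.parseInt(campi[0]) > Integer.parseInt(campi[1])) { ... }
    }
}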

apache storm, load balance, json

I am using Kafka with Storm; Kafka sends/emits JSON strings to Storm. In Storm, I want to distribute the load to a couple of workers based on a key/field in the JSON. How can I do that? In my case, it is the groupid field in the JSON string.
For example, I have JSON like this:
{groupid: 1234, userid: 145, comments:"I want to distribute all this group 1234 to one worker", size:50,type:"group json"}
{groupid: 1235, userid: 134, comments:"I want to distribute all this group 1234 to another worker", size:90,type:"group json"}
{groupid: 1234, userid: 158, comments:"I want to be sent to same worker as group 1234", size:50,type:"group json"}
I tried to use the following code:
1. TopologyBuilder builder = new TopologyBuilder();
2. builder.setSpout(SPOUTNAME, kafkaSpout, 1);
3. builder.setBolt(MYDISTRIBUTEDWORKER, new DistributedBolt()).setFieldsGroup(SPOUTNAME,new Fields("groupid")); <---???
I am wondering how to pass the arguments to the setFieldsGroup method in line 3. Could someone give me a hint?
Juhani
==Testing using storm 0.9.4 ============
=============source codes==============
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
public class KafkaBoltMain {
    private static final String SPOUTNAME = "TopicSpout";
    private static final String ANALYSISBOLT = "AnalysisWorker";
    private static final String CLIENTID = "Storm";
    private static final String TOPOLOGYNAME = "LocalTopology";

    private static class AppAnalysisBolt extends BaseRichBolt {
        private static final long serialVersionUID = -6885792881303198646L;
        private OutputCollector _collector;
        private long groupid = -1L;
        private String log = "test";

        public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
            _collector = collector;
        }

        public void execute(Tuple tuple) {
            List<Object> objs = tuple.getValues();
            int i = 0;
            for (Object obj : objs) {
                System.out.println("" + i + "th object's value is:" + obj.toString());
                i++;
            }
            // _collector.emit(new Values(groupid,log));
            _collector.ack(tuple);
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("groupid", "log"));
        }
    }

    public static void main(String[] args) {
        String zookeepers = null;
        String topicName = null;
        if (args.length == 2) {
            zookeepers = args[0];
            topicName = args[1];
        } else if (args.length == 1 && args[0].equalsIgnoreCase("help")) {
            System.out.println("xxxx");
            System.exit(0);
        } else {
            System.out.println("You need to have two arguments: kafka zookeeper:port and topic name");
            System.out.println("xxxx");
            System.exit(-1);
        }

        SpoutConfig spoutConfig = new SpoutConfig(new ZkHosts(zookeepers),
                topicName,
                "", // zookeeper root path for offset storing
                CLIENTID);
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout(SPOUTNAME, kafkaSpout, 1);
        builder.setBolt(ANALYSISBOLT, new AppAnalysisBolt(), 2)
               .fieldsGrouping(SPOUTNAME, new Fields("groupid"));

        // Configuration
        Config conf = new Config();
        conf.setDebug(false);
        // Topology run
        conf.put(Config.TOPOLOGY_MAX_SPOUT_PENDING, 1);
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology(TOPOLOGYNAME, conf, builder.createTopology());
    }
}
==================================================
When I start to submit the topology (local cluster), it gives the following error:
11658 [SyncThread:0] INFO org.apache.storm.zookeeper.server.ZooKeeperServer - Established session 0x14d097d338c0009 with negotiated timeout 20000 for client /127.0.0.1:34656
11658 [main-SendThread(localhost:2000)] INFO org.apache.storm.zookeeper.ClientCnxn - Session establishment complete on server localhost/127.0.0.1:2000, sessionid = 0x14d097d338c0009, negotiated timeout = 20000
11659 [main-EventThread] INFO org.apache.storm.curator.framework.state.ConnectionStateManager - State change: CONNECTED
12670 [main] INFO backtype.storm.daemon.supervisor - Starting supervisor with id ccc57de0-29ff-4cb4-89de-fea1ea9b6e28 at host storm-VirtualBox
12794 [main] WARN backtype.storm.daemon.nimbus - Topology submission exception. (topology name='LocalTopology') #<InvalidTopologyException InvalidTopologyException(msg:Component: [AnalysisWorker] subscribes from stream: [default] of component [TopicSpout] with non-existent fields: #{"groupid"})>
12800 [main] ERROR org.apache.storm.zookeeper.server.NIOServerCnxnFactory - Thread Thread[main,5,main] died
backtype.storm.generated.InvalidTopologyException: null
at backtype.storm.daemon.common$validate_structure_BANG_.invoke(common.clj:178) ~[storm-core-0.9.4.jar:0.9.4]
at backtype.storm.daemon.common$system_topology_BANG_.invoke(common.clj:307) ~[storm-core-0.9.4.jar:0.9.4]
at backtype.storm.daemon.nimbus$fn__4290$exec_fn__1754__auto__$reify__4303.submitTopologyWithOpts(nimbus.clj:948) ~[storm-core-0.9.4.jar:0.9.4]
at backtype.storm.daemon.nimbus$fn__4290$exec_fn__1754__auto__$reify__4303.submitTopology(nimbus.clj:966) ~[storm-core-0.9.4.jar:0.9.4]
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[na:1.7.0_80]
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) ~[na:1.7.0_80]
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[na:1.7.0_80]
at java.lang.reflect.Method.invoke(Method.java:606) ~[na:1.7.0_80]
at clojure.lang.Reflector.invokeMatchingMethod(Reflector.java:93) ~[clojure-1.5.1.jar:na]
at clojure.lang.Reflector.invokeInstanceMethod(Reflector.java:28) ~[clojure-1.5.1.jar:na]
at backtype.storm.testing$submit_local_topology.invoke(testing.clj:264) ~[storm-core-0.9.4.jar:0.9.4]
at backtype.storm.LocalCluster$_submitTopology.invoke(LocalCluster.clj:43) ~[storm-core-0.9.4.jar:0.9.4]
at backtype.storm.LocalCluster.submitTopology(Unknown Source) ~[storm-core-0.9.4.jar:0.9.4]
at com.callstats.stream.analyzer.KafkaBoltMain.main(KafkaBoltMain.java:94) ~[StreamAnalyzer-1.0-SNAPSHOT-jar-with-dependencies.jar:na]
I'm not sure which version of Storm you are using; as of 0.9.4, your requirement can be implemented as follows.
builder.setBolt(MYDISTRIBUTEDWORKER, new DistributedBolt()).fieldsGrouping(SPOUTNAME, new Fields("groupid"));
In DistributedBolt, declare the output fields:
public void declareOutputFields(OutputFieldsDeclarer declarer) {
    declarer.declare(new Fields("groupid", "log"));
}
Somewhere in its execute method, you will call
collector.emit(new Values(groupid, log));
Then tuples that have the same groupid will be delivered to the same instance of the next bolt.
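To make that concrete, here is a hedged sketch (assuming the Storm 0.9.4 backtype.storm API used in the question; JsonSplitterBolt and its naive groupid extraction are illustrative stand-ins for a real JSON parser). Grouping directly on the spout fails because StringScheme declares only a single string field, so an intermediate bolt has to parse the JSON and declare/emit a "groupid" field that the next bolt can group on.

// Hedged sketch: an intermediate bolt that parses the Kafka string and emits a
// "groupid" field so a downstream bolt can use fieldsGrouping on it.
import java.util.Map;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

public class JsonSplitterBolt extends BaseRichBolt {
    private OutputCollector collector;

    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    public void execute(Tuple tuple) {
        String json = tuple.getString(0);          // StringScheme emits one string field
        long groupid = extractGroupId(json);       // stand-in for a real JSON parser
        collector.emit(new Values(groupid, json)); // field order matches the declaration below
        collector.ack(tuple);
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("groupid", "log"));
    }

    private long extractGroupId(String json) {
        // naive extraction for illustration only: first number in the sample JSON is groupid
        String digits = json.replaceAll("[^0-9,]", "");
        return Long.parseLong(digits.split(",")[0]);
    }
}

The wiring would then look like builder.setBolt("parser", new JsonSplitterBolt(), 1).shuffleGrouping(SPOUTNAME); followed by builder.setBolt(ANALYSISBOLT, new AppAnalysisBolt(), 2).fieldsGrouping("parser", new Fields("groupid"));.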

Getting Hbase Exception No regions passed

Hi, I am new to HBase and I'm trying to learn how to bulk-load data into an HBase table using MapReduce, but I am getting the exception below:
Exception in thread "main" java.lang.IllegalArgumentException: No regions passed
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2.writePartitions(HFileOutputFormat2.java:307)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2.configurePartitioner(HFileOutputFormat2.java:527)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2.configureIncrementalLoad(HFileOutputFormat2.java:391)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2.configureIncrementalLoad(HFileOutputFormat2.java:356)
at JobDriver.run(JobDriver.java:108)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
at JobDriver.main(JobDriver.java:34)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.main(RunJar.java:212)
This is my mapper code:
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    System.out.println("Value in Mapper" + value.toString());
    String[] values = value.toString().split(",");
    byte[] row = Bytes.toBytes(values[0]);
    ImmutableBytesWritable k = new ImmutableBytesWritable(row);
    KeyValue kvProtocol = new KeyValue(row, "PROTOCOLID".getBytes(), "PROTOCOLID".getBytes(),
            values[1].getBytes());
    context.write(k, kvProtocol);
}
This is my job configuration:
public class JobDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        // TODO Auto-generated method stub
        ToolRunner.run(new JobDriver(), args);
        System.exit(0);
    }

    @Override
    public int run(String[] arg0) throws Exception {
        // TODO Auto-generated method stub
        // HBase Configuration
        System.out.println("**********Starting Hbase*************");
        Configuration conf = HBaseConfiguration.create();
        Job job = new Job(conf, "TestHFileToHBase");
        job.setJarByClass(JobDriver.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);
        job.setMapperClass(LoadMapper.class);
        job.setOutputFormatClass(HFileOutputFormat2.class);
        HTable table = new HTable(conf, "kiran");
        FileInputFormat.addInputPath(job, new Path("hdfs://192.168.61.62:9001/sampledata.csv"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.61.62:9001/deletions_6.csv"));
        HFileOutputFormat2.configureIncrementalLoad(job, table);
        // System.exit(job.waitForCompletion(true) ? 0 : 1);
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
Can anyone please help me resolve the exception?
You have to create the table first. You can do it with the code below:
// Create table and do pre-split
HTableDescriptor descriptor = new HTableDescriptor(
        Bytes.toBytes(tableName)
);
descriptor.addFamily(
        new HColumnDescriptor(Constants.COLUMN_FAMILY_NAME)
);
HBaseAdmin admin = new HBaseAdmin(config);
byte[] startKey = new byte[16];
Arrays.fill(startKey, (byte) 0);
byte[] endKey = new byte[16];
Arrays.fill(endKey, (byte) 255);
admin.createTable(descriptor, startKey, endKey, REGIONS_COUNT);
admin.close();
or directly from the hbase shell with the command:
create 'kiran', 'colfam1'
The exception is caused by the startkeys list being empty (see line 306).
More info can be found here.
Note that the table name must be the same as the one you use in your code (kiran).
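If you want to confirm the pre-split from the driver before calling configureIncrementalLoad, a small hedged check (HBase 0.98-era client API; not part of the answer above) is to print the table's start keys, which is the same information the partitioner is built from; an existing, healthy table reports at least one region:

// Hedged diagnostic sketch: print how many regions the table exposes to the client.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;

public class RegionCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "kiran");   // table name from the question
        byte[][] startKeys = table.getStartKeys(); // region start keys the incremental-load setup uses
        System.out.println("regions for 'kiran': " + startKeys.length);
        table.close();
    }
}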

Avro AvroMultipleOutputs part-r-00000: File is not open for writing Exception

I have written a MapReduce job with Avro 1.7.4 on Hadoop 2.3.0. In the first step I wrote all the Avro results into an AvroSequenceFile, and everything worked without problems.
Then I tried to use the AvroMultipleOutputs class to write the results into different files. I wrote the same MapReduce job without Avro and had no problem writing the data into two separate files (by the way, part-r-00000 was created in HDFS but left empty).
The Avro variant throws exceptions when I write data in the reducer (if I comment out the lines that write the data, I get no exceptions).
Here is the job configuration:
this.conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ArchiveDataProcessorMR.class);
Path inPath = new Path(props.getProperty("inPath").trim());
Path outPath = new Path(props.getProperty("outPath").trim());
Path outPathMeta = new Path(props.getProperty("outPath.meta").trim());
Path outPathPayload = new Path(props.getProperty("outPath.payload").trim());
// cleanup resources from previous run
FileSystem fs = FileSystem.get(conf);
fs.delete(outPath, true);
FileInputFormat.setInputPaths(job, inPath);
FileOutputFormat.setOutputPath(job, outPath);
AvroSequenceFileInputFormat<DataExportKey,DataExportValue> sequenceInputFormat = new AvroSequenceFileInputFormat<DataExportKey,DataExportValue>();
job.setInputFormatClass(sequenceInputFormat.getClass());
AvroJob.setInputValueSchema(job, DataExportValue.getClassSchema());
AvroJob.setInputKeySchema(job, DataExportKey.getClassSchema());
AvroJob.setMapOutputValueSchema(job, DataExportValue.getClassSchema());
AvroJob.setMapOutputKeySchema(job, DataExportKey.getClassSchema());
job.setMapperClass(ArchiveDataMapper.class);
job.setReducerClass(ArchiveDataReducer.class);
AvroSequenceFileOutputFormat<DataExportKey,DataExportValue> sequenceOutputFormat = new AvroSequenceFileOutputFormat<DataExportKey,DataExportValue>();
AvroJob.setOutputKeySchema(job, DataExportKey.getClassSchema());
AvroJob.setOutputValueSchema(job, DataExportValue.getClassSchema());
job.setOutputFormatClass(AvroSequenceFileOutputFormat.class);
AvroMultipleOutputs.addNamedOutput(job, "meta", sequenceOutputFormat.getClass(), DataExportKey.getClassSchema(), DataExportValue.getClassSchema());
AvroMultipleOutputs.addNamedOutput(job, "payload", sequenceOutputFormat.getClass(), DataExportKey.getClassSchema(), DataExportValue.getClassSchema());
The reducer code (without the business logic) looks like this:
public static class ArchiveDataReducer extends Reducer<AvroKey<DataExportKey>, AvroValue<DataExportValue>, AvroKey<DataExportKey>, AvroValue<DataExportValue>> {

    private AvroMultipleOutputs amos;

    public void setup(Context context) throws IOException, InterruptedException {
        this.amos = new AvroMultipleOutputs(context);
    }

    public void cleanup(Context context) throws IOException, InterruptedException {
        this.amos.close();
    }

    /**
     * @param key
     */
    public void reduce(AvroKey<DataExportKey> key, Iterable<AvroValue<DataExportValue>> xmlIter, Context context) throws IOException, InterruptedException {
        try {
            DataExportValue newValue = new DataExportValue();
            if (key.datum()......) {
                ... snip...
                amos.write("meta", key, new AvroValue<DataExportValue>(newValue));
            } else {
                ... snip...
                amos.write("payload", key, new AvroValue<DataExportValue>(newValue));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
} // class ArchiveDataReducer
The exception message is
14/05/04 06:52:58 INFO mapreduce.Job: map 100% reduce 0%
14/05/04 06:53:09 INFO mapreduce.Job: map 100% reduce 91%
14/05/04 06:53:09 INFO mapreduce.Job: Task Id : attempt_1399104292130_0016_r_000000_1, Status : FAILED
Error: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException): No lease on /applications/wd/data_avro/_temporary/1/_temporary/attempt_1399104292130_0016_r_000000_1/part-r-00000: File is not open for writing. Holder DFSClient_attempt_1399104292130_0016_r_000000_1_338983539_1 does not have any open files.
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2856)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.analyzeFileState(FSNamesystem.java:2667)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:2573)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:563)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:407)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:928)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1962)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1958)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1548)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1956)
at org.apache.hadoop.ipc.Client.call(Client.java:1406)
at org.apache.hadoop.ipc.Client.call(Client.java:1359)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)
at com.sun.proxy.$Proxy10.addBlock(Unknown Source)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
at com.sun.proxy.$Proxy10.addBlock(Unknown Source)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:348)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.locateFollowingBlock(DFSOutputStream.java:1264)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1112)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:522)
Any hints on how to solve this problem? Do you have another example with AvroMultipleOutputs running? I would like to see your code.
Kind regards,
Martin
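No answer is recorded for this one, but one commonly suggested adjustment when every record goes through AvroMultipleOutputs named outputs (a hedged sketch, not a confirmed fix for this LeaseExpiredException) is to register the output format lazily, so the framework never opens the default part-r-00000 file that the reducer itself does not write to:

// Hedged sketch (org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat): replaces
// job.setOutputFormatClass(AvroSequenceFileOutputFormat.class) so the default (empty)
// part-r-00000 is only created if something is actually written to it; all records
// then flow through the named outputs "meta" and "payload" declared above.
LazyOutputFormat.setOutputFormatClass(job, AvroSequenceFileOutputFormat.class);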
