How can I configure the number of classification classes/labels for StreamingLogisticRegressionWithSGD - apache-spark-mllib

I am new to Spark MLlib. I am trying to implement a StreamingLogisticRegressionWithSGD model. Very little information is provided for it in the Spark docs. When I enter 2,22-22-22 on the socket stream I get
ERROR DataValidators: Classification labels should be 0 or 1. Found 1 invalid labels
I understand that it is expecting me to enter features with label 0 or 1 but I really want to know if I can configure it for more labels.
I do not know how to set number of classes for classification for StreamingLogisticRegressionWithSGD.
Thanks!
Code
package test;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.StreamingContext;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
public class SLRPOC {
private static StreamingLogisticRegressionWithSGD slrModel;
private static int numFeatures = 3;
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setMaster("local[3]").setAppName("SLRPOC");
SparkContext sc = new SparkContext(sparkConf);
StreamingContext ssc = new StreamingContext(sc, Durations.seconds(10));
JavaStreamingContext jssc = new JavaStreamingContext(ssc);
slrModel = new StreamingLogisticRegressionWithSGD().setStepSize(0.5).setNumIterations(10).setInitialWeights(Vectors.zeros(numFeatures));
slrModel.trainOn(getDStreamTraining(jssc));
slrModel.predictOn(getDStreamPrediction(jssc)).foreachRDD(new Function<JavaRDD<Double>, Void>() {
private static final long serialVersionUID = 5287086933555760190L;
#Override
public Void call(JavaRDD<Double> v1) throws Exception {
List<Double> list = v1.collect();
for (Double d : list) {
System.out.println(d);
}
return null;
}
});
jssc.start();
jssc.awaitTermination();
}
public static JavaDStream<LabeledPoint> getDStreamTraining(JavaStreamingContext context) {
JavaReceiverInputDStream<String> lines = context.socketTextStream("localhost", 9998);
return lines.map(new Function<String, LabeledPoint>() {
private static final long serialVersionUID = 1268686043314386060L;
#Override
public LabeledPoint call(String data) throws Exception {
System.out.println("Inside LabeledPoint call : ----- ");
String arr[] = data.split(",");
double vc[] = new double[3];
String vcS[] = arr[1].split("-");
int i = 0;
for (String vcSi : vcS) {
vc[i++] = Double.parseDouble(vcSi);
}
return new LabeledPoint(Double.parseDouble(arr[0]), Vectors.dense(vc));
}
});
}
public static JavaDStream<Vector> getDStreamPrediction(JavaStreamingContext context) {
JavaReceiverInputDStream<String> lines = context.socketTextStream("localhost", 9999);
return lines.map(new Function<String, Vector>() {
private static final long serialVersionUID = 1268686043314386060L;
#Override
public Vector call(String data) throws Exception {
System.out.println("Inside Vector call : ----- ");
String vcS[] = data.split("-");
double vc[] = new double[3];
int i = 0;
for (String vcSi : vcS) {
vc[i++] = Double.parseDouble(vcSi);
}
return Vectors.dense(vc);
}
});
}
}
Exception
Inside LabeledPoint call : ----- 16/05/18 17:51:10 INFO Executor:
Finished task 0.0 in stage 4.0 (TID 4). 953 bytes result sent to
driver 16/05/18 17:51:10 INFO TaskSetManager: Finished task 0.0 in
stage 4.0 (TID 4) in 8 ms on localhost (1/1) 16/05/18 17:51:10 INFO
TaskSchedulerImpl: Removed TaskSet 4.0, whose tasks have all
completed, from pool 16/05/18 17:51:10 INFO DAGScheduler: ResultStage
4 (trainOn at SLRPOC.java:33) finished in 0.009 s 16/05/18 17:51:10
INFO DAGScheduler: Job 6 finished: trainOn at SLRPOC.java:33, took
0.019578 s 16/05/18 17:51:10 ERROR DataValidators: Classification labels should be 0 or 1. Found 1 invalid labels 16/05/18 17:51:10 INFO
JobScheduler: Starting job streaming job 1463574070000 ms.1 from job
set of time 1463574070000 ms 16/05/18 17:51:10 ERROR JobScheduler:
Error running job streaming job 1463574070000 ms.0
org.apache.spark.SparkException: Input validation failed. at
org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm.run(GeneralizedLinearAlgorithm.scala:251)
at
org.apache.spark.mllib.regression.StreamingLinearAlgorithm$$anonfun$trainOn$1.apply(StreamingLinearAlgorithm.scala:94)
at
org.apache.spark.mllib.regression.StreamingLinearAlgorithm$$anonfun$trainOn$1.apply(StreamingLinearAlgorithm.scala:92)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:42)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:40)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:40)
at
org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:399)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:40)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at scala.util.Try$.apply(Try.scala:161) at
org.apache.spark.streaming.scheduler.Job.run(Job.scala:34) at
org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:207)
at
org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:207)
at
org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:207)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) at
org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:206)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745) Exception in thread "main"
org.apache.spark.SparkException: Input validation failed. at
org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm.run(GeneralizedLinearAlgorithm.scala:251)
at
org.apache.spark.mllib.regression.StreamingLinearAlgorithm$$anonfun$trainOn$1.apply(StreamingLinearAlgorithm.scala:94)
at
org.apache.spark.mllib.regression.StreamingLinearAlgorithm$$anonfun$trainOn$1.apply(StreamingLinearAlgorithm.scala:92)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:42)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:40)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:40)
at
org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:399)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:40)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at
org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at scala.util.Try$.apply(Try.scala:161) at
org.apache.spark.streaming.scheduler.Job.run(Job.scala:34) at
org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:207)
at
org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:207)
at
org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:207)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) at
org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:206)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745) 16/05/18 17:51:10 INFO
StreamingContext: Invoking stop(stopGracefully=false) from shutdown
hook 16/05/18 17:51:10 INFO SparkContext: Starting job: foreachRDD at
SLRPOC.java:34 16/05/18 17:51:10 INFO DAGScheduler: Job 7 finished:
foreachRDD at SLRPOC.java:34, took 0.000020 s 16/05/18 17:51:10 INFO
JobScheduler: Finished job streaming job 1463574070000 ms.1 from job
set of time 1463574070000 ms 16/05/18 17:51:10 INFO ReceiverTracker:
Sent stop signal to all 2 receivers 16/05/18 17:51:10 INFO
ReceiverSupervisorImpl: Received stop signal 16/05/18 17:51:10 INFO
ReceiverSupervisorImpl: Stopping receiver with message: Stopped by
driver: 16/05/18 17:51:10 INFO ReceiverSupervisorImpl: Called
receiver onStop 16/05/18 17:51:10 INFO ReceiverSupervisorImpl:
Deregistering receiver 1 16/05/18 17:51:10 INFO
ReceiverSupervisorImpl: Received stop signal 16/05/18 17:51:10 INFO
ReceiverSupervisorImpl: Stopping receiver with message: Stopped by
driver: 16/05/18 17:51:10 INFO ReceiverSupervisorImpl: Called
receiver onStop 16/05/18 17:51:10 INFO ReceiverSupervisorImpl:
Deregistering receiver 0 16/05/18 17:51:10 ERROR ReceiverTracker:
Deregistered receiver for stream 1: Stopped by driver 16/05/18
17:51:10 INFO ReceiverSupervisorImpl: Stopped receiver 1 16/05/18
17:51:10 ERROR ReceiverTracker: Deregistered receiver for stream 0:
Stopped by driver

Not sure if you figured this out already, but you're using a binary algorithm, which only allows two classes, 0 or 1. If you want more, you need to use a multiclass classification algorithm:
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
// Batch (non-streaming) logistic regression configured with 10 classes via setNumClasses.
// NOTE(review): presumably labels must then be 0..9 — confirm against the MLlib docs.
new LogisticRegressionWithLBFGS().setNumClasses(10)

Related

Failed Jobs not being executed by Quartz in a clustered environment

I have built a project using Spring Quartz in a clustered environment. I am trying to test whether jobs can be picked up by another node in case the server that initiated them shuts down. While this works as expected for Cron triggers, the same cannot be said for SimpleTrigger jobs.
For Cron Triggers which have not been executed yet, Quartz runs the job without any hassle.
Steps to Reproduce:
Start the servers at port 8080 and port 8081.
Schedule a job using server at port 8081.
Shut down the server while the job is running.
This is what I get when the ClusterManager picks up the jobs:
2022-06-22 13:37:52.659 INFO 23852 --- [_ClusterManager] o.s.s.quartz.LocalDataSourceJobStore : ClusterManager: detected 2 failed or restarted instances.
2022-06-22 13:37:52.661 INFO 23852 --- [_ClusterManager] o.s.s.quartz.LocalDataSourceJobStore : ClusterManager: Scanning for instance "MacBook-Pro.local1655885157031"'s failed in-progress jobs.
2022-06-22 13:37:52.677 INFO 23852 --- [_ClusterManager] o.s.s.quartz.LocalDataSourceJobStore : ClusterManager: Scanning for instance "MacBook-Pro.local1655885169333"'s failed in-progress jobs.
2022-06-22 13:37:52.720 INFO 23852 --- [_ClusterManager] o.s.s.quartz.LocalDataSourceJobStore : ClusterManager: ......Deleted 1 complete triggers(s).
2022-06-22 13:37:52.722 INFO 23852 --- [_ClusterManager] o.s.s.quartz.LocalDataSourceJobStore : ClusterManager: ......Cleaned-up 1 other failed job(s).
This is how my Job looks like:
public class ApiJob implements Job {
final static Logger log = LoggerFactory.getLogger(ApiJob.class);
#Override
public void execute(JobExecutionContext context) {
this.context=context;
log.info("Job Execution Started");
JobDataMap map=context.getMergedJobDataMap();
ApiRequest request=new ApiRequest(map.getString("message"));
try {
Thread.sleep(2*60*1000);
log.info("Job scheduled...{}",context.getJobDetail().getKey().getName());
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
Trigger:
// Simple schedule that uses the "fire now" misfire handling instruction.
SimpleScheduleBuilder fireNowOnMisfire =
    SimpleScheduleBuilder.simpleSchedule().withMisfireHandlingInstructionFireNow();
// The trigger reuses the job's name as its own identity within the "quartz-jobs-triggers" group.
String triggerName = jobDetail.getKey().getName();
Date firstFireTime = Date.from(startTime.toInstant());
//Add custom logic here?
Trigger trigger = TriggerBuilder.newTrigger()
    .forJob(jobDetail)
    .withIdentity(triggerName, "quartz-jobs-triggers")
    .withDescription("Random trigger")
    .startAt(firstFireTime)
    .withSchedule(fireNowOnMisfire)
    .build();
And finally how I setup the clustering in the application.properties:
#Quartz Properties
# Store jobs and triggers through JDBC (the logs show LocalDataSourceJobStore in use).
spring.quartz.job-store-type=jdbc
# Size of the Quartz worker thread pool.
spring.quartz.properties.org.quartz.threadPool.threadCount=5
# Let Quartz generate the scheduler instance id for each node.
spring.quartz.properties.org.quartz.scheduler.instanceId=AUTO
# Turn on clustering for the job store.
spring.quartz.properties.org.quartz.jobStore.isClustered = true
# Cluster check-in interval. NOTE(review): presumably milliseconds (20 s) - confirm in Quartz docs.
spring.quartz.properties.org.quartz.jobStore.clusterCheckinInterval = 20000

KafkaProducer InterruptedException during graceful shutdown of a Spring Boot application

For a project we are sending some events to kafka. We use spring-kafka 2.6.2.
Due to usage of spring-vault we have to restart/kill the application before the end of credentials lease (application is automatically restarted by kubernetes).
Our problem is that when using applicationContext.close() to proceed with our graceful shutdown, KafkaProducer gets an InterruptedException ("Interrupted while joining ioThread") inside its close() method.
It means that in our case some pending events are not sent to kafka before shutdown as it's forced to close due to an error during destroy.
Below is a stack trace:
2020-12-18 13:57:29.007 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.s.b.w.e.tomcat.GracefulShutdown : Commencing graceful shutdown. Waiting for active requests to complete
2020-12-18 13:57:29.009 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.apache.catalina.core.StandardService : Stopping service [Tomcat]
2020-12-18 13:57:29.013 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.a.c.c.C.[Tomcat].[localhost].[/] : Destroying Spring FrameworkServlet 'dispatcherServlet'
2020-12-18 13:57:29.014 INFO [titan-producer,,,] 1 --- [tomcat-shutdown] o.s.b.w.e.tomcat.GracefulShutdown : Graceful shutdown complete
2020-12-18 13:57:29.020 WARN [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.a.c.loader.WebappClassLoaderBase : The web application [ROOT] appears to have started a thread named [kafka-producer-network-thread | titan-producer-1] but has failed to stop it. This is very likely to create a memory leak. Stack trace of thread:
java.base#11.0.9.1/sun.nio.ch.EPoll.wait(Native Method)
java.base#11.0.9.1/sun.nio.ch.EPollSelectorImpl.doSelect(Unknown Source)
java.base#11.0.9.1/sun.nio.ch.SelectorImpl.lockAndDoSelect(Unknown Source)
java.base#11.0.9.1/sun.nio.ch.SelectorImpl.select(Unknown Source)
org.apache.kafka.common.network.Selector.select(Selector.java:873)
org.apache.kafka.common.network.Selector.poll(Selector.java:469)
org.apache.kafka.clients.NetworkClient.poll(NetworkClient.java:544)
org.apache.kafka.clients.producer.internals.Sender.runOnce(Sender.java:325)
org.apache.kafka.clients.producer.internals.Sender.run(Sender.java:240)
java.base#11.0.9.1/java.lang.Thread.run(Unknown Source)
2020-12-18 13:57:29.021 WARN [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.a.c.loader.WebappClassLoaderBase : The web application [ROOT] appears to have started a thread named [micrometer-kafka-metrics] but has failed to stop it. This is very likely to create a memory leak. Stack trace of thread:
java.base#11.0.9.1/jdk.internal.misc.Unsafe.park(Native Method)
java.base#11.0.9.1/java.util.concurrent.locks.LockSupport.parkNanos(Unknown Source)
java.base#11.0.9.1/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(Unknown Source)
java.base#11.0.9.1/java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(Unknown Source)
java.base#11.0.9.1/java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(Unknown Source)
java.base#11.0.9.1/java.util.concurrent.ThreadPoolExecutor.getTask(Unknown Source)
java.base#11.0.9.1/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
java.base#11.0.9.1/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
java.base#11.0.9.1/java.lang.Thread.run(Unknown Source)
2020-12-18 13:57:29.046 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.s.s.concurrent.ThreadPoolTaskExecutor : Shutting down ExecutorService 'applicationTaskExecutor'
2020-12-18 13:57:29.048 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.s.s.c.ThreadPoolTaskScheduler : Shutting down ExecutorService 'taskScheduler'
2020-12-18 13:57:29.051 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.a.k.clients.producer.KafkaProducer : [Producer clientId=titan-producer-1] Closing the Kafka producer with timeoutMillis = 30000 ms.
2020-12-18 13:57:29.055 ERROR [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.a.k.clients.producer.KafkaProducer : [Producer clientId=titan-producer-1] Interrupted while joining ioThreadjava.lang.InterruptedException: null
at java.base/java.lang.Object.wait(Native Method)
at java.base/java.lang.Thread.join(Unknown Source)
at org.apache.kafka.clients.producer.KafkaProducer.close(KafkaProducer.java:1205)
at org.apache.kafka.clients.producer.KafkaProducer.close(KafkaProducer.java:1182)
at org.springframework.kafka.core.DefaultKafkaProducerFactory$CloseSafeProducer.closeDelegate(DefaultKafkaProducerFactory.java:901)
at org.springframework.kafka.core.DefaultKafkaProducerFactory.destroy(DefaultKafkaProducerFactory.java:428)
at org.springframework.beans.factory.support.DisposableBeanAdapter.destroy(DisposableBeanAdapter.java:258)
at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.destroyBean(DefaultSingletonBeanRegistry.java:587)
at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.destroySingleton(DefaultSingletonBeanRegistry.java:559)
at org.springframework.beans.factory.support.DefaultListableBeanFactory.destroySingleton(DefaultListableBeanFactory.java:1092)
at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.destroySingletons(DefaultSingletonBeanRegistry.java:520)
at org.springframework.beans.factory.support.DefaultListableBeanFactory.destroySingletons(DefaultListableBeanFactory.java:1085)
at org.springframework.context.support.AbstractApplicationContext.destroyBeans(AbstractApplicationContext.java:1061)
at org.springframework.context.support.AbstractApplicationContext.doClose(AbstractApplicationContext.java:1030)
at org.springframework.boot.web.servlet.context.ServletWebServerApplicationContext.doClose(ServletWebServerApplicationContext.java:170)
at org.springframework.context.support.AbstractApplicationContext.close(AbstractApplicationContext.java:979)
at org.springframework.cloud.sleuth.instrument.async.TraceRunnable.run(TraceRunnable.java:68)
at org.springframework.scheduling.support.DelegatingErrorHandlingRunnable.run(DelegatingErrorHandlingRunnable.java:54)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source)
at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
at java.base/java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(Unknown Source)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.base/java.lang.Thread.run(Unknown Source)2020-12-18 13:57:29.055 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.a.k.clients.producer.KafkaProducer : [Producer clientId=titan-producer-1] Proceeding to force close the producer since pending requests could not be completed within timeout 30000 ms.
2020-12-18 13:57:29.056 WARN [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.s.b.f.support.DisposableBeanAdapter : Invocation of destroy method failed on bean with name 'kafkaProducerFactory': org.apache.kafka.common.errors.InterruptException: java.lang.InterruptedException
2020-12-18 13:57:29.064 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] o.s.s.concurrent.ThreadPoolTaskExecutor : Shutting down ExecutorService
2020-12-18 13:57:29.065 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] c.l.t.p.zookeeper.ZookeeperManagerImpl : Closing zookeeperConnection
2020-12-18 13:57:29.197 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] org.apache.zookeeper.ZooKeeper : Session: 0x30022348ba6000b closed
2020-12-18 13:57:29.197 INFO [titan-producer,,,] 1 --- [d-1-EventThread] org.apache.zookeeper.ClientCnxn : EventThread shut down for session: 0x30022348ba6000b
2020-12-18 13:57:29.206 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] com.zaxxer.hikari.HikariDataSource : loadtest_fallback_titan_pendingEvents - Shutdown initiated...
2020-12-18 13:57:29.221 INFO [titan-producer,222efdd2a07966ce,222efdd2a07966ce,true] 1 --- [ scheduling-1] com.zaxxer.hikari.HikariDataSource : loadtest_fallback_titan_pendingEvents - Shutdown completed.
Here is my configuration class
#Flogger
#EnableKafka
#Configuration
#RequiredArgsConstructor
#ConditionalOnProperty(
name = "titan.producer.kafka.enabled",
havingValue = "true",
matchIfMissing = true)
public class KafkaConfiguration {
#Bean
DefaultKafkaProducerFactoryCustomizer kafkaProducerFactoryCustomizer(ObjectMapper mapper) {
return producerFactory -> producerFactory.setValueSerializer(new JsonSerializer<>(mapper));
}
#Bean
public NewTopic createTopic(TitanProperties titanProperties, KafkaProperties kafkaProperties) {
TitanProperties.Kafka kafka = titanProperties.getKafka();
String defaultTopic = kafkaProperties.getTemplate().getDefaultTopic();
int numPartitions = kafka.getNumPartitions();
short replicationFactor = kafka.getReplicationFactor();
log.atInfo()
.log("Creating Kafka Topic %s with %s partitions and %s replicationFactor", defaultTopic, numPartitions, replicationFactor);
return TopicBuilder.name(defaultTopic)
.partitions(numPartitions)
.replicas(replicationFactor)
.config(MESSAGE_TIMESTAMP_TYPE_CONFIG, LOG_APPEND_TIME.name)
.build();
}
}
and my application.yaml
spring:
  application:
    name: titan-producer
  kafka:
    client-id: ${spring.application.name}
    producer:
      key-serializer: org.apache.kafka.common.serialization.UUIDSerializer
      value-serializer: org.springframework.kafka.support.serializer.JsonSerializer
      properties:
        max.block.ms: 2000
        request.timeout.ms: 2000
        delivery.timeout.ms: 2000 #must be greater or equal to request.timeout.ms + linger.ms
    template:
      default-topic: titan-dev
Below is our Vault configuration, which executes applicationContext.close() from a scheduled task. We pick the moment somewhat randomly because multiple replicas of the app run in parallel and we want to avoid all of them being killed at the same time.
#Flogger
#Configuration
#ConditionalOnBean(SecretLeaseContainer.class)
#ConditionalOnProperty(
name = "titan.producer.scheduling.enabled",
havingValue = "true",
matchIfMissing = true)
public class VaultConfiguration {
#Bean
public Lifecycle scheduledAppRestart(Clock clock, TitanProperties properties, TaskScheduler scheduler, ConfigurableApplicationContext applicationContext) {
Instant now = clock.instant();
Duration maxTTL = properties.getVaultConfig().getCredsMaxLease();
Instant start = now.plusSeconds(maxTTL.dividedBy(2).toSeconds());
Instant end = now.plusSeconds(maxTTL.minus(properties.getVaultConfig().getCredsMaxLeaseExpirationThreshold()).toSeconds());
Instant randomInstant = randBetween(start, end);
return new ScheduledLifecycle(scheduler, applicationContext::close, "application restart before lease expiration", randomInstant);
}
private Instant randBetween(Instant startInclusive, Instant endExclusive) {
long startSeconds = startInclusive.getEpochSecond();
long endSeconds = endExclusive.getEpochSecond();
long random = RandomUtils.nextLong(startSeconds, endSeconds);
return Instant.ofEpochSecond(random);
}
}
The ScheduledLifecycle class we use to run the scheduledtasks
import lombok.extern.flogger.Flogger;
import org.springframework.context.SmartLifecycle;
import org.springframework.scheduling.TaskScheduler;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.ScheduledFuture;
#Flogger
public class ScheduledLifecycle implements SmartLifecycle {
private ScheduledFuture<?> future = null;
private Duration delay = null;
private final TaskScheduler scheduler;
private final Runnable command;
private final String commandDesc;
private final Instant startTime;
public ScheduledLifecycle(TaskScheduler scheduler, Runnable command, String commandDesc, Instant startTime) {
this.scheduler = scheduler;
this.command = command;
this.commandDesc = commandDesc;
this.startTime = startTime;
}
public ScheduledLifecycle(TaskScheduler scheduler, Runnable command, String commandDesc, Instant startTime, Duration delay) {
this(scheduler, command, commandDesc, startTime);
this.delay = delay;
}
#Override
public void start() {
if (delay != null) {
log.atInfo().log("Scheduling %s: starting at %s, running every %s", commandDesc, startTime, delay);
future = scheduler.scheduleWithFixedDelay(command, startTime, delay);
} else {
log.atInfo().log("Scheduling %s: execution at %s", commandDesc, startTime);
future = scheduler.schedule(command, startTime);
}
}
#Override
public void stop() {
if (future != null) {
log.atInfo().log("Stop %s", commandDesc);
future.cancel(true);
}
}
#Override
public boolean isRunning() {
boolean running = future != null && (!future.isDone() && !future.isCancelled());
log.atFine().log("is %s running? %s", running);
return running;
}
}
Is there a bug with spring-kafka? Any idea?
Thanks
future.cancel(true);
This is interrupting the producer thread and is likely the root cause of the problem.
You should use future.cancel(false); to allow the task to terminate in an orderly fashion, without interruption.
/**
* Attempts to cancel execution of this task. This attempt will
* fail if the task has already completed, has already been cancelled,
* or could not be cancelled for some other reason. If successful,
* and this task has not started when {@code cancel} is called,
* this task should never run. If the task has already started,
* then the {@code mayInterruptIfRunning} parameter determines
* whether the thread executing this task should be interrupted in
* an attempt to stop the task.
*
* <p>After this method returns, subsequent calls to {@link #isDone} will
* always return {@code true}. Subsequent calls to {@link #isCancelled}
* will always return {@code true} if this method returned {@code true}.
*
* @param mayInterruptIfRunning {@code true} if the thread executing this
* task should be interrupted; otherwise, in-progress tasks are allowed
* to complete
* @return {@code false} if the task could not be cancelled,
* typically because it has already completed normally;
* {@code true} otherwise
*/
boolean cancel(boolean mayInterruptIfRunning);
EDIT
In addition, the ThreadPoolTaskScheduler.waitForTasksToCompleteOnShutdown is false by default.
/**
* Set whether to wait for scheduled tasks to complete on shutdown,
* not interrupting running tasks and executing all tasks in the queue.
* <p>Default is "false", shutting down immediately through interrupting
* ongoing tasks and clearing the queue. Switch this flag to "true" if you
* prefer fully completed tasks at the expense of a longer shutdown phase.
* <p>Note that Spring's container shutdown continues while ongoing tasks
* are being completed. If you want this executor to block and wait for the
* termination of tasks before the rest of the container continues to shut
* down - e.g. in order to keep up other resources that your tasks may need -,
* set the {@link #setAwaitTerminationSeconds "awaitTerminationSeconds"}
* property instead of or in addition to this property.
* @see java.util.concurrent.ExecutorService#shutdown()
* @see java.util.concurrent.ExecutorService#shutdownNow()
*/
public void setWaitForTasksToCompleteOnShutdown(boolean waitForJobsToCompleteOnShutdown) {
this.waitForTasksToCompleteOnShutdown = waitForJobsToCompleteOnShutdown;
}
You might also have to set awaitTerminationSeconds.

Why is the MapReduce job pointing to localhost:8080?

I am working with Map Reduce job and executing it using ToolRunner's run method.
Here is my code:
public class MaxTemperature extends Configured implements Tool {
public static void main(String[] args) throws Exception {
System.setProperty("hadoop.home.dir", "/");
int exitCode = ToolRunner.run(new MaxTemperature(), args);
System.exit(exitCode);
}
#Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: MaxTemperature <input path> <output path>");
System.exit(-1);
}
System.out.println("Starting job");
Job job = new Job();
job.setJarByClass(MaxTemperature.class);
job.setJobName("Max temperature");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
int returnValue = job.waitForCompletion(true) ? 0:1;
if(job.isSuccessful()) {
System.out.println("Job was successful");
} else if(!job.isSuccessful()) {
System.out.println("Job was not successful");
}
return returnValue;
}
}
The job executed as expected. But when I looked into the logs, which display information about job tracking, I found that MapReduce is pointing to localhost:8080 for tracking the job.
Here is the snapshot of logs:
20521 [main] INFO org.apache.hadoop.mapreduce.JobSubmitter - number of splits:1
20670 [main] INFO org.apache.hadoop.mapreduce.JobSubmitter - Submitting tokens for job: job_local1454583076_0001
20713 [main] WARN org.apache.hadoop.conf.Configuration - file:/tmp/hadoop-KV/mapred/staging/KV1454583076/.staging/job_local1454583076_0001/job.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.retry.interval; Ignoring.
20716 [main] WARN org.apache.hadoop.conf.Configuration - file:/tmp/hadoop-KV/mapred/staging/KV1454583076/.staging/job_local1454583076_0001/job.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.attempts; Ignoring.
20818 [main] WARN org.apache.hadoop.conf.Configuration - file:/tmp/hadoop-KV/mapred/local/localRunner/KV/job_local1454583076_0001/job_local1454583076_0001.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.retry.interval; Ignoring.
20820 [main] WARN org.apache.hadoop.conf.Configuration - file:/tmp/hadoop-KV/mapred/local/localRunner/KV/job_local1454583076_0001/job_local1454583076_0001.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.attempts; Ignoring.
**20826 [main] INFO org.apache.hadoop.mapreduce.Job - The url to track the job: http://localhost:8080/**
20827 [main] INFO org.apache.hadoop.mapreduce.Job - Running job: job_local1454583076_0001
20829 [Thread-10] INFO org.apache.hadoop.mapred.LocalJobRunner - OutputCommitter set in config null
So my question is why is map reduce pointing to localhost:8080
The url to track the job: http://localhost:8080/
There is no configuration file or properties file where I manually set this. Also, is it possible to change it to some other port? If yes, how can I achieve this?
The ports are configured in yarn-site.xml.
Check : yarn.resourcemanager.webapp.address
We need to change the default configuration and create a Configuration object and set the properties to this configuration object and then create a Job object using this Configuration as follows:
// Start from the configuration held by the enclosing object and override the cluster addresses.
Configuration configuration = getConf();
//configuration.set("fs.defaultFS", "hdfs://192.**.***.2**");
//configuration.set("mapred.job.tracker", "jobtracker:jtPort");
configuration.set("mapreduce.jobtracker.address", "localhost:54311");
configuration.set("mapreduce.framework.name", "yarn");
configuration.set("yarn.resourcemanager.address", "127.0.0.1:8032");
//configuration.set("yarn.resourcemanager.webapp.address", "127.0.0.1:8032");
//Initialize the Hadoop job from this configuration; the jar and the job name are set on it afterwards
// Job.getInstance replaces the deprecated Job(Configuration) constructor.
Job job = Job.getInstance(configuration);

Unable to Configure Number of Reducers In WordCount Job in hadoop

I am using a single-node cluster (Hadoop 2.7.0) on my Linux machine.
My code for the WordCount job runs fine with 1 reducer,
but it does not work if I increase the number of reducers.
It shows the following error:
15/05/25 21:15:10 INFO util.NativeCodeLoader: Loaded the native-hadoop library
15/05/25 21:15:10 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
15/05/25 21:15:10 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
15/05/25 21:15:10 WARN snappy.LoadSnappy: Snappy native library is available
15/05/25 21:15:10 INFO snappy.LoadSnappy: Snappy native library loaded
15/05/25 21:15:10 INFO mapred.FileInputFormat: Total input paths to process : 1
15/05/25 21:15:10 INFO mapred.JobClient: Running job: job_local_0001
15/05/25 21:15:11 INFO util.ProcessTree: setsid exited with exit code 0
15/05/25 21:15:11 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin#5f1fd699
15/05/25 21:15:11 INFO mapred.MapTask: numReduceTasks: 1
15/05/25 21:15:11 INFO mapred.MapTask: io.sort.mb = 100
15/05/25 21:15:11 INFO mapred.MapTask: data buffer = 79691776/99614720
15/05/25 21:15:11 INFO mapred.MapTask: record buffer = 262144/327680
15/05/25 21:15:11 WARN mapred.LocalJobRunner: job_local_0001
java.io.IOException: Illegal partition for am (1)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1073)
at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:592)
at WordMapper.map(WordMapper.java:24)
at WordMapper.map(WordMapper.java:1)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:436)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)
My getPartition Method Looks like this:
/**
 * Chooses a reduce partition from the length of the key.
 *
 * Keys of length 1, 2 and 3 map to buckets 0, 1 and 2; every other length maps
 * to bucket 3. The bucket is then folded into the range [0, numRedTasks) so the
 * returned index is always valid, even when the job runs with fewer than four
 * reducers. With four or more reducers the result is the same as the raw bucket.
 *
 * @param key         map output key whose string length selects the bucket
 * @param value       map output value (not used)
 * @param numRedTasks number of reduce tasks the job is actually running with
 * @return a partition index in [0, numRedTasks), or 0 when numRedTasks is not positive
 */
public int getPartition(Text key, IntWritable value, int numRedTasks) {
    // Avoid a division by zero below; there is nothing to choose between anyway.
    if (numRedTasks <= 0) {
        return 0;
    }

    final int bucket;
    switch (key.toString().length()) {
        case 1:
            bucket = 0;
            break;
        case 2:
            bucket = 1;
            break;
        case 3:
            bucket = 2;
            break;
        default:
            bucket = 3;
            break;
    }

    // Returning a bucket >= numRedTasks makes the framework fail with
    // "Illegal partition", so wrap it into the valid range.
    return bucket % numRedTasks;
}
Run Method in WordCount.class File:
// Both an input path and an output path are required.
if (input.length < 2) {
    System.out.println("Please provide valid input");
    return -1;
}

// Passing the driver class lets Hadoop work out which jar holds the user classes;
// the no-arg constructor leaves the job jar unset.
JobConf config = new JobConf(WordCount.class);
FileInputFormat.setInputPaths(config, new Path(input[0]));
FileOutputFormat.setOutputPath(config, new Path(input[1]));

config.setMapperClass(WordMapper.class);
config.setReducerClass(WordReducer.class);

// NOTE(review): the posted log shows "numReduceTasks: 1" under LocalJobRunner,
// so this setting appears not to be honored in local mode - confirm on a cluster.
config.setNumReduceTasks(4);
config.setPartitionerClass(MyPartitioner.class);

config.setMapOutputKeyClass(Text.class);
config.setMapOutputValueClass(IntWritable.class);
config.setOutputKeyClass(Text.class);
config.setOutputValueClass(IntWritable.class);

JobClient.runJob(config);
return 0;
}
My Mapper and Reducer Code is fine because Wordcount Job with 1 reducer is running fine.
Any One able to figure it out?
This may be due to pig fails in the operation due to high default_parallel could be set in it.
Thanks,
Shailesh.
You need to use ToolRunner in your driver class and invoke it from your main method. You can do this while also using a combiner as part of the workflow. Below is the driver class code; as you can see, along with the mapper and reducer calls there is a combiner call as well. The exit code in main comes from "int exitCode = ToolRunner.run(new Configuration(), new WordCountWithCombiner(), args);", which invokes ToolRunner at run time, so you can specify the number of reducers or mappers you would like to use with the "-D" option when running the word count program. A sample command line would look like "-D mapred.reduce.tasks=2 input output".
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
public class WordCountWithCombiner extends Configured
implements Tool{
#Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
Job job = new Job(conf, "MyJob");
job.setJarByClass(WordCount.class);
job.setJobName("Word Count With Combiners");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(WordCountMapper.class);
job.setCombinerClass(WordCountReducer.class);
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new WordCountWithCombiner(), args);
System.exit(exitCode);
}
}

FileNotFoundException while running Hadoop MR Job

I am writing one mapper class which should read files from a HDFS Location and create a record (using custom class) for each file. The Code for Mapper class :-
package com.nayan.bigdata.hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
/**
* #file : FileToRecordMapper.java
* #author : nayan
* #version : 1.0.0
* #date : 27-Aug-2013 12:13:44 PM
* #desc : Mapper class to read files and convert it into records.
*/
public class FileToRecordMapper extends
Mapper<LongWritable, Text, Text, RecordWritable> {
private static Logger logger = Logger.getLogger(FileToRecordMapper.class);
List<Path> allPaths;
FileSystem fs;
#Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
logger.info("Inside cleanup method.");
}
#Override
protected void map(LongWritable key, Text value,
Context context)
throws IOException, InterruptedException {
logger.info("Starting map method of FileToRecordMapper class.");
for(Path path : allPaths) {
FSDataInputStream in = this.fs.open(path);
Text filePath = new Text(path.getName());
Text directoryPath = new Text(path.getParent().getName());
Text filename = new Text(path.getName().substring(path.getName().lastIndexOf('/') + 1,
path.getName().length()));
byte[] b = new byte[1024];
StringBuilder contentBuilder = new StringBuilder();
while ((in.read(b)) > 0) {
contentBuilder.append(new String(b, "UTF-8"));
}
Text fileContent = new Text(contentBuilder.toString());
in.close();
RecordWritable record = new RecordWritable(filePath, filename,
fileContent, new LongWritable(System.currentTimeMillis()));
logger.info("Record Created : " + record);
context.write(directoryPath, record);
logger.info("map method of FileToRecordMapper class completed.");
}
}
#Override
public void run(Context context)
throws IOException, InterruptedException {
logger.info("Inside run method.");
}
#Override
protected void setup(Context context)
throws IOException, InterruptedException {
logger.info("Inside setup method.");
try {
logger.info("Starting configure method of FileToRecordMapper class.");
fs = FileSystem.get(context.getConfiguration());
Path path = new Path(context.getConfiguration().get("mapred.input.dir"));
allPaths = getAllPaths(path);
} catch (IOException e) {
logger.error("Error while fetching paths.", e);
}
logger.info("Paths : " + ((null != allPaths) ? allPaths : "null"));
logger.info("configure method of FileToRecordMapper class completed.");
super.setup(context);
}
private List<Path> getAllPaths(Path path) throws IOException {
ArrayList<Path> paths = new ArrayList<Path>();
getAllPaths(path, paths);
return paths;
}
private void getAllPaths(Path path, List<Path> paths) throws IOException{
try {
if (!this.fs.isFile(path)) {
for (FileStatus s : fs.listStatus(path)) {
getAllPaths(s.getPath(), paths);
}
} else {
paths.add(path);
}
} catch (IOException e) {
logger.error("File System Exception.", e);
throw e;
}
}
}
Class for record is :-
package com.nayan.bigdata.hadoop;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
* #file : RecordWritable.java
* #author : nayan
* #version : 1.0.0
* #date : 21-Aug-2013 1:53:12 PM
* #desc : Class to create a record in Accumulo
*/
public class RecordWritable implements Writable {
private Text filePath;
private Text fileName;
private Text fileContent;
private LongWritable timeStamp;
public RecordWritable() {
this.filePath = new Text();
this.fileName = new Text();
this.fileContent = new Text();
this.timeStamp = new LongWritable(System.currentTimeMillis());
}
/**
* #param filePath
* #param fileName
* #param fileContent
* #param timeStamp
*/
public RecordWritable(Text filePath, Text fileName, Text fileContent,
LongWritable timeStamp) {
this.filePath = filePath;
this.fileName = fileName;
this.fileContent = fileContent;
this.timeStamp = timeStamp;
}
public Text getFilePath() {
return filePath;
}
public void setFilePath(Text filePath) {
this.filePath = filePath;
}
public Text getFileName() {
return fileName;
}
public void setFileName(Text fileName) {
this.fileName = fileName;
}
public Text getFileContent() {
return fileContent;
}
public void setFileContent(Text fileContent) {
this.fileContent = fileContent;
}
public LongWritable getTimeStamp() {
return timeStamp;
}
public void setTimeStamp(LongWritable timeStamp) {
this.timeStamp = timeStamp;
}
#Override
public int hashCode() {
return this.filePath.getLength() + this.fileName.getLength() + this.fileContent.getLength();
}
#Override
public boolean equals(Object obj) {
if(obj instanceof RecordWritable) {
RecordWritable otherRecord = (RecordWritable) obj;
return this.filePath.equals(otherRecord.filePath) && this.fileName.equals(otherRecord.fileName);
}
return false;
}
#Override
public String toString() {
StringBuilder recordDesc = new StringBuilder("Record Details ::\t");
recordDesc.append("File Path + ").append(this.filePath).append("\t");
recordDesc.append("File Name + ").append(this.fileName).append("\t");
recordDesc.append("File Content Length + ").append(this.fileContent.getLength()).append("\t");
recordDesc.append("File TimeStamp + ").append(this.timeStamp).append("\t");
return recordDesc.toString();
}
#Override
public void readFields(DataInput din) throws IOException {
filePath.readFields(din);
fileName.readFields(din);
fileContent.readFields(din);
timeStamp.readFields(din);
}
#Override
public void write(DataOutput dout) throws IOException {
filePath.write(dout);
fileName.write(dout);
fileContent.write(dout);
timeStamp.write(dout);
}
}
Job Runner class :-
package com.nayan.bigdata.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
/**
* #file : HadoopJobRunner.java
* #author : nayan
* #version : 1.0.0
* #date : 22-Aug-2013 12:45:15 PM
* #desc : Class to run Hadoop MR job.
*/
public class HadoopJobRunner extends Configured implements Tool {
private static Logger logger = Logger.getLogger(HadoopJobRunner.class);
/**
* #param args
* #throws Exception
*/
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new HadoopJobRunner(), args);
System.exit(res);
}
#Override
public int run(String[] arg0) throws Exception {
logger.info("Initiating Hadoop Job.");
Configuration conf = new Configuration(true);
conf.setStrings("mapred.output.dir", arg0[1]);
conf.setStrings("mapred.input.dir", arg0[0]);
Job mrJob = new Job(conf, "FileRecordsJob");
mrJob.setJarByClass(HadoopJobRunner.class);
mrJob.setMapOutputKeyClass(Text.class);
mrJob.setMapOutputValueClass(RecordWritable.class);
mrJob.setMapperClass(FileToRecordMapper.class);
mrJob.setReducerClass(FileRecordsReducer.class);
mrJob.setOutputKeyClass(Text.class);
mrJob.setOutputValueClass(RecordWritable.class);
logger.info("MapRed Job Configuration : " + mrJob.getConfiguration().toString());
logger.info("Input Path : " + mrJob.getConfiguration().get("mapred.input.dir"));
return mrJob.waitForCompletion(true) ? 0 : 1;
}
}
Pom file for the project :-
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates; the build produces a plain jar. -->
<groupId>com.nayan.bigdata</groupId>
<artifactId>BigDataOperations</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>BigDataOperations</name>
<!-- Shared properties; hadoop.version is referenced by the hadoop-core dependency below. -->
<properties>
<hadoop.version>0.20.2</hadoop.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<dependencies>
<!-- Compile-time libraries: Hadoop core and log4j. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<!-- Test-scoped libraries: Hamcrest and JUnit. -->
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
<version>1.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<pluginManagement>
<plugins>
<!-- Jar plugin configuration: names HadoopJobRunner as the main class in the jar manifest. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>com.nayan.bigdata.hadoop.HadoopJobRunner</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
When I run the jar, I am getting output on console :-
[root#koversevm tmp]# hadoop jar BigDataOperations-1.0-SNAPSHOT.jar /usr/hadoop/sample /usr/hadoop/jobout
13/08/28 18:33:57 INFO hadoop.HadoopJobRunner: Initiating Hadoop Job.
13/08/28 18:33:57 INFO hadoop.HadoopJobRunner: Setting the input/output path.
13/08/28 18:33:57 INFO hadoop.HadoopJobRunner: MapRed Job Configuration : Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml
13/08/28 18:33:57 INFO hadoop.HadoopJobRunner: Input Path : null
13/08/28 18:33:58 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/08/28 18:33:58 INFO input.FileInputFormat: Total input paths to process : 8
13/08/28 18:33:58 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
13/08/28 18:33:58 WARN snappy.LoadSnappy: Snappy native library not loaded
13/08/28 18:33:58 INFO mapred.JobClient: Running job: job_201308281800_0008
13/08/28 18:33:59 INFO mapred.JobClient: map 0% reduce 0%
13/08/28 18:34:06 INFO mapred.JobClient: map 25% reduce 0%
13/08/28 18:34:13 INFO mapred.JobClient: map 50% reduce 0%
13/08/28 18:34:17 INFO mapred.JobClient: map 75% reduce 0%
13/08/28 18:34:23 INFO mapred.JobClient: map 100% reduce 0%
13/08/28 18:34:24 INFO mapred.JobClient: map 100% reduce 33%
13/08/28 18:34:26 INFO mapred.JobClient: map 100% reduce 100%
13/08/28 18:34:27 INFO mapred.JobClient: Job complete: job_201308281800_0008
13/08/28 18:34:27 INFO mapred.JobClient: Counters: 25
13/08/28 18:34:27 INFO mapred.JobClient: Job Counters
13/08/28 18:34:27 INFO mapred.JobClient: Launched reduce tasks=1
13/08/28 18:34:27 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=44066
13/08/28 18:34:27 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
13/08/28 18:34:27 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
13/08/28 18:34:27 INFO mapred.JobClient: Launched map tasks=8
13/08/28 18:34:27 INFO mapred.JobClient: Data-local map tasks=8
13/08/28 18:34:27 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=19034
13/08/28 18:34:27 INFO mapred.JobClient: FileSystemCounters
13/08/28 18:34:27 INFO mapred.JobClient: FILE_BYTES_READ=6
13/08/28 18:34:27 INFO mapred.JobClient: HDFS_BYTES_READ=1011
13/08/28 18:34:27 INFO mapred.JobClient: FILE_BYTES_WRITTEN=549207
13/08/28 18:34:27 INFO mapred.JobClient: Map-Reduce Framework
13/08/28 18:34:27 INFO mapred.JobClient: Map input records=0
13/08/28 18:34:27 INFO mapred.JobClient: Reduce shuffle bytes=48
13/08/28 18:34:27 INFO mapred.JobClient: Spilled Records=0
13/08/28 18:34:27 INFO mapred.JobClient: Map output bytes=0
13/08/28 18:34:27 INFO mapred.JobClient: CPU time spent (ms)=3030
13/08/28 18:34:27 INFO mapred.JobClient: Total committed heap usage (bytes)=1473413120
13/08/28 18:34:27 INFO mapred.JobClient: Combine input records=0
13/08/28 18:34:27 INFO mapred.JobClient: SPLIT_RAW_BYTES=1011
13/08/28 18:34:27 INFO mapred.JobClient: Reduce input records=0
13/08/28 18:34:27 INFO mapred.JobClient: Reduce input groups=0
13/08/28 18:34:27 INFO mapred.JobClient: Combine output records=0
13/08/28 18:34:27 INFO mapred.JobClient: Physical memory (bytes) snapshot=1607675904
13/08/28 18:34:27 INFO mapred.JobClient: Reduce output records=0
13/08/28 18:34:27 INFO mapred.JobClient: Virtual memory (bytes) snapshot=23948111872
13/08/28 18:34:27 INFO mapred.JobClient: Map output records=0
But when I look into logs I found following exception :-
Task Logs: 'attempt_201308281800_0008_m_000000_0'
stdout logs
2013-08-28 18:34:01 DEBUG Child:82 - Child starting
2013-08-28 18:34:02 DEBUG Groups:136 - Creating new Groups object
2013-08-28 18:34:02 DEBUG Groups:59 - Group mapping impl=org.apache.hadoop.security.ShellBasedUnixGroupsMapping; cacheTimeout=300000
2013-08-28 18:34:02 DEBUG UserGroupInformation:193 - hadoop login
2013-08-28 18:34:02 DEBUG UserGroupInformation:142 - hadoop login commit
2013-08-28 18:34:02 DEBUG UserGroupInformation:172 - using local user:UnixPrincipal: mapred
2013-08-28 18:34:02 DEBUG UserGroupInformation:664 - UGI loginUser:mapred (auth:SIMPLE)
2013-08-28 18:34:02 DEBUG FileSystem:1598 - Creating filesystem for file:///var/lib/hadoop-0.20/cache/mapred/mapred/local/taskTracker/root/jobcache/job_201308281800_0008/jobToken
2013-08-28 18:34:02 DEBUG TokenCache:182 - Task: Loaded jobTokenFile from: /var/lib/hadoop-0.20/cache/mapred/mapred/local/taskTracker/root/jobcache/job_201308281800_0008/jobToken; num of sec keys = 0 Number of tokens 1
2013-08-28 18:34:02 DEBUG Child:106 - loading token. # keys =0; from file=/var/lib/hadoop-0.20/cache/mapred/mapred/local/taskTracker/root/jobcache/job_201308281800_0008/jobToken
2013-08-28 18:34:02 DEBUG UserGroupInformation:1300 - PriviledgedAction as:job_201308281800_0008 (auth:SIMPLE) from:org.apache.hadoop.mapred.Child.main(Child.java:121)
2013-08-28 18:34:02 DEBUG Client:256 - The ping interval is60000ms.
2013-08-28 18:34:02 DEBUG Client:299 - Use SIMPLE authentication for protocol TaskUmbilicalProtocol
2013-08-28 18:34:02 DEBUG Client:569 - Connecting to /127.0.0.1:50925
2013-08-28 18:34:02 DEBUG Client:762 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008: starting, having connections 1
2013-08-28 18:34:02 DEBUG Client:808 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008 sending #0
2013-08-28 18:34:02 DEBUG Client:861 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008 got value #0
2013-08-28 18:34:02 DEBUG RPC:230 - Call: getProtocolVersion 98
2013-08-28 18:34:02 DEBUG Client:808 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008 sending #1
2013-08-28 18:34:02 DEBUG Client:861 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008 got value #1
2013-08-28 18:34:02 DEBUG SortedRanges:347 - currentIndex 0 0:0
2013-08-28 18:34:02 DEBUG Counters:177 - Creating group org.apache.hadoop.mapred.Task$Counter with bundle
2013-08-28 18:34:02 DEBUG Counters:314 - Adding SPILLED_RECORDS
2013-08-28 18:34:02 DEBUG Counters:177 - Creating group org.apache.hadoop.mapred.Task$Counter with bundle
2013-08-28 18:34:02 DEBUG SortedRanges:347 - currentIndex 0 0:0
2013-08-28 18:34:02 DEBUG SortedRanges:347 - currentIndex 1 0:0
2013-08-28 18:34:02 DEBUG RPC:230 - Call: getTask 208
2013-08-28 18:34:03 DEBUG TaskRunner:653 - mapred.local.dir for child : /var/lib/hadoop-0.20/cache/mapred/mapred/local/taskTracker/root/jobcache/job_201308281800_0008/attempt_201308281800_0008_m_000000_0
2013-08-28 18:34:03 DEBUG NativeCodeLoader:40 - Trying to load the custom-built native-hadoop library...
2013-08-28 18:34:03 DEBUG NativeCodeLoader:47 - Failed to load native-hadoop with error: java.lang.UnsatisfiedLinkError: no hadoop in java.library.path
2013-08-28 18:34:03 DEBUG NativeCodeLoader:48 - java.library.path=/usr/java/jdk1.6.0_45/jre/lib/amd64/server:/usr/java/jdk1.6.0_45/jre/lib/amd64:/usr/java/jdk1.6.0_45/jre/../lib/amd64:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib:/var/lib/hadoop-0.20/cache/mapred/mapred/local/taskTracker/root/jobcache/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/work
2013-08-28 18:34:03 WARN NativeCodeLoader:52 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2013-08-28 18:34:03 DEBUG TaskRunner:709 - Fully deleting contents of /var/lib/hadoop-0.20/cache/mapred/mapred/local/taskTracker/root/jobcache/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/work
2013-08-28 18:34:03 INFO JvmMetrics:71 - Initializing JVM Metrics with processName=MAP, sessionId=
2013-08-28 18:34:03 DEBUG Child:251 - Creating remote user to execute task: root
2013-08-28 18:34:03 DEBUG UserGroupInformation:1300 - PriviledgedAction as:root (auth:SIMPLE) from:org.apache.hadoop.mapred.Child.main(Child.java:260)
2013-08-28 18:34:03 DEBUG FileSystem:1598 - Creating filesystem for hdfs://localhost:8020
2013-08-28 18:34:04 DEBUG Client:256 - The ping interval is60000ms.
2013-08-28 18:34:04 DEBUG Client:299 - Use SIMPLE authentication for protocol ClientProtocol
2013-08-28 18:34:04 DEBUG Client:569 - Connecting to localhost/127.0.0.1:8020
2013-08-28 18:34:04 DEBUG Client:808 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root sending #2
2013-08-28 18:34:04 DEBUG Client:762 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root: starting, having connections 2
2013-08-28 18:34:04 DEBUG Client:861 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root got value #2
2013-08-28 18:34:04 DEBUG RPC:230 - Call: getProtocolVersion 18
2013-08-28 18:34:04 DEBUG DFSClient:274 - Short circuit read is false
2013-08-28 18:34:04 DEBUG DFSClient:280 - Connect to datanode via hostname is false
2013-08-28 18:34:04 DEBUG Task:516 - using new api for output committer
2013-08-28 18:34:04 INFO ProcessTree:65 - setsid exited with exit code 0
2013-08-28 18:34:04 INFO Task:539 - Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin#79ee2c2c
2013-08-28 18:34:04 DEBUG ProcfsBasedProcessTree:238 - [ 16890 ]
2013-08-28 18:34:04 DEBUG Client:808 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root sending #3
2013-08-28 18:34:04 DEBUG Client:861 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root got value #3
2013-08-28 18:34:04 DEBUG RPC:230 - Call: getBlockLocations 12
2013-08-28 18:34:04 DEBUG DFSClient:2595 - Connecting to /127.0.0.1:50010
2013-08-28 18:34:04 DEBUG FSInputChecker:1653 - DFSClient readChunk got seqno 0 offsetInBlock 0 lastPacketInBlock false packetLen 520
2013-08-28 18:34:04 DEBUG Counters:314 - Adding SPLIT_RAW_BYTES
2013-08-28 18:34:04 DEBUG DFSClient:2529 - Client couldn't reuse - didnt send code
2013-08-28 18:34:04 INFO MapTask:613 - Processing split: hdfs://localhost:8020/usr/hadoop/sample/2012MTCReportFINAL.pdf:0+1419623
2013-08-28 18:34:04 DEBUG Counters:314 - Adding MAP_INPUT_RECORDS
2013-08-28 18:34:04 DEBUG FileSystem:1598 - Creating filesystem for file:///
2013-08-28 18:34:04 INFO MapTask:803 - io.sort.mb = 100
2013-08-28 18:34:05 INFO MapTask:815 - data buffer = 79691776/99614720
2013-08-28 18:34:05 INFO MapTask:816 - record buffer = 262144/327680
2013-08-28 18:34:05 DEBUG Counters:314 - Adding MAP_OUTPUT_BYTES
2013-08-28 18:34:05 DEBUG Counters:314 - Adding MAP_OUTPUT_RECORDS
2013-08-28 18:34:05 DEBUG Counters:314 - Adding COMBINE_INPUT_RECORDS
2013-08-28 18:34:05 DEBUG Counters:314 - Adding COMBINE_OUTPUT_RECORDS
2013-08-28 18:34:05 WARN LoadSnappy:46 - Snappy native library not loaded
2013-08-28 18:34:05 DEBUG Client:808 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root sending #4
2013-08-28 18:34:05 DEBUG Client:861 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root got value #4
2013-08-28 18:34:05 DEBUG RPC:230 - Call: getBlockLocations 4
2013-08-28 18:34:05 INFO FileToRecordMapper:65 - Inside run method.
2013-08-28 18:34:05 INFO MapTask:1142 - Starting flush of map output
2013-08-28 18:34:05 INFO Task:830 - Task:attempt_201308281800_0008_m_000000_0 is done. And is in the process of commiting
2013-08-28 18:34:05 DEBUG Counters:177 - Creating group FileSystemCounters with nothing
2013-08-28 18:34:05 DEBUG Counters:314 - Adding FILE_BYTES_WRITTEN
2013-08-28 18:34:05 DEBUG Counters:314 - Adding HDFS_BYTES_READ
2013-08-28 18:34:05 DEBUG Counters:314 - Adding COMMITTED_HEAP_BYTES
2013-08-28 18:34:05 DEBUG ProcfsBasedProcessTree:238 - [ 16890 ]
2013-08-28 18:34:05 DEBUG Counters:314 - Adding CPU_MILLISECONDS
2013-08-28 18:34:05 DEBUG Counters:314 - Adding PHYSICAL_MEMORY_BYTES
2013-08-28 18:34:05 DEBUG Counters:314 - Adding VIRTUAL_MEMORY_BYTES
2013-08-28 18:34:05 DEBUG Client:808 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root sending #5
2013-08-28 18:34:05 DEBUG Client:861 - IPC Client (47) connection to localhost/127.0.0.1:8020 from root got value #5
2013-08-28 18:34:05 DEBUG RPC:230 - Call: getFileInfo 2
2013-08-28 18:34:05 DEBUG Task:658 - attempt_201308281800_0008_m_000000_0 Progress/ping thread exiting since it got interrupted
2013-08-28 18:34:05 DEBUG Client:808 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008 sending #6
2013-08-28 18:34:05 DEBUG Client:861 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008 got value #6
2013-08-28 18:34:05 DEBUG RPC:230 - Call: statusUpdate 3
2013-08-28 18:34:05 DEBUG Client:808 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008 sending #7
2013-08-28 18:34:05 DEBUG Client:861 - IPC Client (47) connection to /127.0.0.1:50925 from job_201308281800_0008 got value #7
2013-08-28 18:34:05 DEBUG RPC:230 - Call: done 1
2013-08-28 18:34:05 INFO Task:942 - Task 'attempt_201308281800_0008_m_000000_0' done.
2013-08-28 18:34:05 INFO TaskLogsTruncater:69 - Initializing logs' truncater with mapRetainSize=-1 and reduceRetainSize=-1
2013-08-28 18:34:05 DEBUG TaskLogsTruncater:174 - Truncation is not needed for /usr/lib/hadoop-0.20/logs/userlogs/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/stdout
2013-08-28 18:34:05 DEBUG TaskLogsTruncater:174 - Truncation is not needed for /usr/lib/hadoop-0.20/logs/userlogs/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/stderr
2013-08-28 18:34:05 DEBUG TaskLogsTruncater:202 - Cannot open /usr/lib/hadoop-0.20/logs/userlogs/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/syslog for reading. Continuing with other log files
java.io.FileNotFoundException: /usr/lib/hadoop-0.20/logs/userlogs/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/syslog (No such file or directory)
at java.io.FileInputStream.open(Native Method)
at java.io.FileInputStream.<init>(FileInputStream.java:120)
at org.apache.hadoop.mapred.TaskLogsTruncater.truncateLogs(TaskLogsTruncater.java:199)
at org.apache.hadoop.mapred.Child$4.run(Child.java:271)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1278)
at org.apache.hadoop.mapred.Child.main(Child.java:260)
2013-08-28 18:34:05 DEBUG TaskLogsTruncater:202 - Cannot open /usr/lib/hadoop-0.20/logs/userlogs/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/profile.out for reading. Continuing with other log files
java.io.FileNotFoundException: /usr/lib/hadoop-0.20/logs/userlogs/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/profile.out (No such file or directory)
at java.io.FileInputStream.open(Native Method)
at java.io.FileInputStream.<init>(FileInputStream.java:120)
at org.apache.hadoop.mapred.TaskLogsTruncater.truncateLogs(TaskLogsTruncater.java:199)
at org.apache.hadoop.mapred.Child$4.run(Child.java:271)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1278)
at org.apache.hadoop.mapred.Child.main(Child.java:260)
2013-08-28 18:34:05 DEBUG TaskLogsTruncater:202 - Cannot open /usr/lib/hadoop-0.20/logs/userlogs/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/debugout for reading. Continuing with other log files
java.io.FileNotFoundException: /usr/lib/hadoop-0.20/logs/userlogs/job_201308281800_0008/attempt_201308281800_0008_m_000000_0/debugout (No such file or directory)
at java.io.FileInputStream.open(Native Method)
at java.io.FileInputStream.<init>(FileInputStream.java:120)
at org.apache.hadoop.mapred.TaskLogsTruncater.truncateLogs(TaskLogsTruncater.java:199)
at org.apache.hadoop.mapred.Child$4.run(Child.java:271)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1278)
at org.apache.hadoop.mapred.Child.main(Child.java:260)
I have checked the permission and it works fine for the Sample WordCount program. I am new to Hadoop. I googled but could not find anything substantial. I am using hadoop-0.20.2-cdh3u6 on a single node setup.

Resources