How to pass object from controller to step in Spring Batch - spring

I want to pass reqData from my controller class to a step of my job. Is there any way to achieve this? Any help will be appreciated. I have an HttpRequestData object which I have received in the controller. Thanks
package com.npst.imps.controller;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobExecution;
import org.springframework.batch.core.JobParameters;
import org.springframework.batch.core.JobParametersBuilder;
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.batch.item.ExecutionContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import com.npst.imps.utils.HttpRequestData;
import com.npst.imps.utils.TransactionResponseData;
import javax.servlet.http.HttpSession;
public class HttpRequestController {
TransactionResponseData transactionResponseData;
HttpSession session;
JobExecution jobExecution;
JobLauncher jobLauncher;
Job fundtrans;
String test;
public String handleHttpRequest(#RequestBody HttpRequestData reqData) throws Exception {
Logger logger = LoggerFactory.getLogger(this.getClass());
try {
JobParameters jobParameters = new JobParametersBuilder().addLong("time", System.currentTimeMillis()).toJobParameters();
jobExecution =, jobParameters);
ExecutionContext context= jobExecution.getExecutionContext();
//context.put("reqData", reqData);
transactionResponseData=(TransactionResponseData) context.get("transactionData");
} catch (Exception e) {;
return reqData+" "+transactionResponseData.getMsg()+",Tid="+transactionResponseData.getTid();
Below is my step class
I want to get the same reqData in my step class; from there onwards I will put it into the StepExecution object in the doAfter method.
package com.npst.imps.action;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.servlet.http.HttpSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.batch.core.ExitStatus;
import org.springframework.batch.core.StepContribution;
import org.springframework.batch.core.StepExecution;
import org.springframework.batch.core.StepExecutionListener;
import org.springframework.batch.core.scope.context.ChunkContext;
import org.springframework.batch.core.step.tasklet.Tasklet;
import org.springframework.batch.repeat.RepeatStatus;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.npst.imps.service.TransactionService;
import com.npst.imps.utils.GenericTicketKey;
import com.npst.imps.utils.HttpRequestData;
import com.npst.imps.utils.TicketGenerator;
import com.npst.imps.utils.TransactionResponseData;
public class PrepareTransactionId implements Tasklet,StepExecutionListener{
static Logger logger = LoggerFactory.getLogger(PrepareTransactionId.class);
String appId;
private static TicketGenerator ticketGenerator = null;
private static GenericTicketKey genericTicketKey = null;
HttpSession session;
TransactionService transactionService;
public ExitStatus afterStep(StepExecution stepExecution) {
try {
DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
Date date = new Date();
String ticket;
System.out.println("transactionService:: PrepareTransactionId"+transactionService);
TransactionResponseData transactionData=new TransactionResponseData();
long value=transactionService.getMaxTid(appId);"Max id From db::"+value);
if (value == 0) {
value = System.currentTimeMillis() / 10000;
long l = value;
long l = value + 1;
ticketGenerator = TicketGenerator.getInstance(9999999999L, 0, l);
genericTicketKey = new GenericTicketKey(0, false, 10);
ticket = ticketGenerator.getNextEdgeTicketFor(genericTicketKey);
stepExecution.getJobExecution().getExecutionContext().put("ticket", ticket);
stepExecution.getJobExecution().getExecutionContext().put("tid", ticket);
stepExecution.getJobExecution().getExecutionContext().put("reqData", reqData);
transactionData.setMsg("Request Recived...");
stepExecution.getJobExecution().getExecutionContext().put("transactionData", transactionData);"Request Recived with tid::"+ticket);
ExitStatus exist=new ExitStatus("SUCCESS", "success");
return exist.replaceExitCode("SUCCESS");
catch(Exception e) {
return ExitStatus.FAILED;
public String getAppId() {
return appId;
public void setAppId(String appId) {
this.appId = appId;
public void beforeStep(StepExecution arg0) {
// TODO Auto-generated method stub
public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception {
return null;

TL;DR -> You can't.
JobParameters instances can only hold values of types: String, Long, Date, and Double.
The reason behind it is primarily persistence. Remember that all spring batch metadata (including job parameters) goes to a datasource.
To use custom objects, you would need to make sure that your object is immutable and thread-safe.
JobParameters documentation states:
Value object representing runtime parameters to a batch job. Because
the parameters have no individual meaning outside of the JobParameters
they are contained within, it is a value object rather than an entity.
It is also extremely important that a parameters object can be
reliably compared to another for equality, in order to determine if
one JobParameters object equals another. Furthermore, because these
parameters will need to be persisted, it is vital that the types added
are restricted. This class is immutable and therefore thread-safe.
JobParametersBuilder documentation states as well:
Helper class for creating JobParameters. Useful because all
JobParameter objects are immutable, and must be instantiated
separately to ensure typesafety. Once created, it can be used in the
same was a java.lang.StringBuilder (except, order is irrelevant), by
adding various parameter types and creating a valid JobParameters once finished.
But I promise my objects are OK. Can I use them?
You could, but the Spring developers decided not to support this feature a long time ago.
This was discussed in spring forums and even a JIRA ticket was created - status Won't fix.
Related Links
Spring - JobParameters JavaDocs
Spring - JobParametersBuilder JavaDocs
Spring - JIRA Ticket
Spring - Forums Discussion

I would not suggest passing the complete HttpRequestData. Rather, pass only the required information to the batch job. You can pass this information using JobParameters.
sample code
JobParameters parameters = new JobParametersBuilder().addString("key1",HttpRequestData.gteData)
now in step you can get JobParameters from StepExecution
putting custom object in JobParameters
HashMap<String, JobParameter>();
JobParameter myParameter = new JobParameter(your custom object);
map.put("myobject", myParameter);
JobParameters jobParameters = new JobParameters(map);


Spring-data-elasticsearch: cannot convert from Flux<SearchHit<Sugestao>> to Flux<Sugestao> after updated to 7.6.2. How deal with SearchHit?

Context: I want to use Elasticsearch in a fully reactive stack composed of Elasticsearch and Spring WebFlux.
I have worked with a reactive stack using MongoDB before, but this is my first time with Elasticsearch.
I have successfully followed a tutorial using ReactiveElasticsearchOperations with spring-data-elasticsearch-3.2.6 and elasticsearch-6.8.7 (Elastic Tutorial).
The findAll/findById methods work properly with elasticsearch-6.8.7 and spring-data-elasticsearch-3.2.6:
private final ReactiveElasticsearchOperations reactiveElasticsearchOperations;
private final ReactiveElasticsearchClient reactiveElasticsearchClient;
public MyModelServiceImpl(ReactiveElasticsearchOperations reactiveElasticsearchOperations,
ReactiveElasticsearchClient reactiveElasticsearchClient) {
this.reactiveElasticsearchOperations = reactiveElasticsearchOperations;
this.reactiveElasticsearchClient = reactiveElasticsearchClient;
public Mono<MyModel> findMyModelById(String id){
return reactiveElasticsearchOperations.findById(
).doOnError(throwable -> logger.error(throwable.getMessage(), throwable));
public Flux<MyModel> findAllMyModels(String field, String value){
NativeSearchQueryBuilder query = new NativeSearchQueryBuilder();
if (!StringUtils.isEmpty(field) && !StringUtils.isEmpty(value)) {
query.withQuery(QueryBuilders.matchQuery(field, value));
return reactiveElasticsearchOperations.find(,
).doOnError(throwable -> logger.error(throwable.getMessage(), throwable));
I tried to follow the same idea with updated versions (spring-data-elasticsearch-4 and elasticsearch-7.6.2). There I read "Deprecated. since 4.0, use search(Query, ...) Flux emitting matching entities one by one wrapped in a SearchHit.", and then I got completely stuck because the result is wrapped in SearchHit. Searching around, I didn't understand why there is such a wrapper, nor how to convert/map/flatMap/etc. it to a Flux of my model to be returned by the controller method.
Here is my attempt, which causes the issue mentioned in the title of this question:
import com.poc.favoritos.model.Sugestao;
import org.elasticsearch.index.query.QueryBuilders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
public class SugestaoServiceImpl implements SugestaoService{
private static final Logger logger = LoggerFactory.getLogger(SugestaoServiceImpl.class);
private final ReactiveElasticsearchOperations reactiveElasticsearchOperations;
private final ReactiveElasticsearchClient reactiveElasticsearchClient;
public SugestaoServiceImpl(ReactiveElasticsearchOperations reactiveElasticsearchOperations,
ReactiveElasticsearchClient reactiveElasticsearchClient) {
this.reactiveElasticsearchOperations = reactiveElasticsearchOperations;
this.reactiveElasticsearchClient = reactiveElasticsearchClient;
public Mono<Sugestao> findSugestaoById(String id) {
return reactiveElasticsearchOperations.get(id, Sugestao.class)
.doOnError(throwable -> logger.error(throwable.getMessage(), throwable));
public Flux<Sugestao> findAllMySugestoes(String field, String value) {
NativeSearchQueryBuilder query = new NativeSearchQueryBuilder();
if (!StringUtils.isEmpty(field) && !StringUtils.isEmpty(value)) {
query.withQuery(QueryBuilders.matchQuery(field, value));
return, Sugestao.class);
ElasticsearchConfig was originally copied from the same tutorial mentioned above. Honestly, I am not sure why I need it and what this config adds to my project. BTW, I am also studying it in the operations reference.
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.web.reactive.function.client.ExchangeStrategies;
public class ElasticsearchConfig {
public ReactiveElasticsearchClient reactiveElasticsearchClient() {
ClientConfiguration clientConfiguration = ClientConfiguration.builder()
.withWebClientConfigurer(webClient -> {
ExchangeStrategies exchangeStrategies = ExchangeStrategies.builder()
.codecs(configurer -> configurer.defaultCodecs()
return webClient.mutate().exchangeStrategies(exchangeStrategies).build();
return ReactiveRestClients.create(clientConfiguration);
public ElasticsearchConverter elasticsearchConverter() {
return new MappingElasticsearchConverter(elasticsearchMappingContext());
public SimpleElasticsearchMappingContext elasticsearchMappingContext() {
return new SimpleElasticsearchMappingContext();
public ReactiveElasticsearchOperations reactiveElasticsearchOperations() {
return new ReactiveElasticsearchTemplate(reactiveElasticsearchClient(), elasticsearchConverter());
private String elassandraHostAndPort;
As for the SearchHit: this class contains information from a search result that is not part of the entity but part of the search result itself, such as score, sort values, and highlight entries.
If you don't need this and just want to have a Flux with the entity alone:
Flux<SearchHit<Entity>> fluxSearchHits = ...
Flux<Entity> fluxEntity = -> searchHit.getContent);
As for the configuration:
you need the ReactiveElasticsearchClient bean to configure Spring Data Elasticsearch. The other 3 beans: Don't know why they are there; they are not needed for Spring Data Elasticsearch 4.0
Edit 16.05.2020:
The configuration: You should derive your configuration class from AbstractReactiveElasticsearchConfiguration, then you don't need the other beans, because the base class defines the necessary things:
public class ElasticsearchConfig extends AbstractReactiveElasticsearchConfiguration{
private String elassandraHostAndPort;
public ReactiveElasticsearchClient reactiveElasticsearchClient() {
ClientConfiguration clientConfiguration = ClientConfiguration.builder()
return ReactiveRestClients.create(clientConfiguration);
and the customized WebClientConfiguration is only needed if you retrieve large result sets and the default memory size for the result buffer is too low.

Need a way to prevent unwanted job param from propagating to next execution of spring boot batch job

I am running a batch app using spring boot 2.1.2 and spring batch 4.1.1. The app uses a MySQL database for the spring batch metadata data source.
First, I run the job with this command:
java -jar target/batchdemo-0.0.1-SNAPSHOT.jar -Dspring.batch.job.names=echo com.paypal.batch.batchdemo.BatchdemoApplication myparam1=value1 myparam2=value2
Notice I am passing two params: myparam1=value1 and myparam2=value2.
Since the job uses RunIdIncrementer, the actual params used by the app are logged as:
Job: [SimpleJob: [name=echo]] completed with the following parameters: [{myparam2=value2, run.id=1, myparam1=value1}]
Next I run the job again, this time dropping myparam2:
java -jar target/batchdemo-0.0.1-SNAPSHOT.jar -Dspring.batch.job.names=echo com.paypal.batch.batchdemo.BatchdemoApplication myparam1=value1
This time the job again runs with param2 still included:
Job: [SimpleJob: [name=echo]] completed with the following parameters: [{myparam2=value2, run.id=2, myparam1=value1}]
This causes business logic to be invoked as if I had again passed myparam2 to the app.
Is there a way to drop the job parameter and have it not be passed to the next instance?
App code:
package com.paypal.batch.batchdemo;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.Step;
import org.springframework.batch.core.configuration.annotation.EnableBatchProcessing;
import org.springframework.batch.core.configuration.annotation.JobBuilderFactory;
import org.springframework.batch.core.configuration.annotation.StepBuilderFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;
public class BatchdemoApplication {
public static void main(String[] args) {, args);
JobBuilderFactory jobBuilder;
StepBuilderFactory stepBuilder;
ParamEchoTasklet paramEchoTasklet;
public RunIdIncrementer incrementer() {
return new RunIdIncrementer();
public Job job() {
return jobBuilder.get("echo").incrementer(incrementer()).start(echoParamsStep()).build();
public Step echoParamsStep() {
return stepBuilder.get("echoParams").tasklet(paramEchoTasklet).build();
package com.paypal.batch.batchdemo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.batch.core.StepContribution;
import org.springframework.batch.core.scope.context.ChunkContext;
import org.springframework.batch.core.step.tasklet.Tasklet;
import org.springframework.batch.repeat.RepeatStatus;
import org.springframework.stereotype.Component;
public class ParamEchoTasklet implements Tasklet {
public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception {"ParamEchoTasklet BEGIN");
chunkContext.getStepContext().getJobParameters().entrySet().stream().forEachOrdered((entry) -> {
String key = entry.getKey();
Object value = entry.getValue();"Param {} = {}", key, value);
});"ParamEchoTasklet END");
return RepeatStatus.FINISHED;
private Logger LOGGER = LoggerFactory.getLogger(ParamEchoTasklet.class);
I debugged the spring batch and spring boot code, and here is what is happening. JobParametersBuilder line 273 adds the params from the most recent prior job instance to the nextParameters map along with any params added by the JobParametersIncrementer:
List<JobExecution> previousExecutions = this.jobExplorer.getJobExecutions(lastInstances.get(0));
if (previousExecutions.isEmpty()) {
// Normally this will not happen - an instance exists with no executions
nextParameters = incrementer.getNext(new JobParameters());
else {
JobExecution previousExecution = previousExecutions.get(0);
nextParameters = incrementer.getNext(previousExecution.getJobParameters());
Then since I am using spring boot, JobLauncherCommandLineRunner line 213 merges the prior params with the new params passed for the new execution, which results in the old param being passed to the new execution:
return merge(nextParameters, jobParameters);
It appears to be impossible to run the job ever again without the param unless I am missing something. Could it be a bug in spring batch?
The normal behavior for RunIdIncrementer appears to increment the run id for the JobExecution and pass along the remaining prior JobParameters. I would not call this a bug.
Keep in mind that the idea behind the RunIdIncrementer is simply to change one identifying parameter to allow a job to be run again, even if a prior run with the same (other) parameters completed successfully and restart has not been configured.
You could always create a customized incrementer by implementing JobParametersIncrementer.
Another alternative is to use the JobParametersBuilder to build a JobParameters object and then use the JobLauncher to run your job with those parameters. I often use the current system time in milliseconds to create uniqueness if I'm running jobs that will otherwise have the same JobParameters. You will obviously have to figure out the logic for pulling your specific parameters from the command line (or wherever else) and iterating over them to populate the JobParameters object.
public JobExecution executeJob(Job job) {
JobExecution jobExecution = null;
try {
JobParameters jobParameters =
new JobParametersBuilder()
.addLong( "time.millis", System.currentTimeMillis(), true)
.addString( "param1", "value1", true)
jobExecution = job, jobParameters );
} catch ( JobInstanceAlreadyCompleteException | JobRestartException | JobParametersInvalidException | JobExecutionAlreadyRunningException e ) {
return jobExecution;

Is using sqs-consumer to detect receiveMessage events in SQS scalable?

I am using AWS SQS as a message queue. After sqs.sendMessage sends the data, I want to detect sqs.receiveMessage via either an infinite loop or event triggering, in a scalable way. Then I came across sqs-consumer
to handle sqs.receiveMessage events the moment it receives the messages. But I was wondering: is it the most suitable way to handle message passing between microservices, or is there a better way to handle this?
I have written code in Java to fetch data from the SQS queue with AmazonSQSBufferedAsyncClient; the advantage of using this API is that it buffers the messages in async mode.
package com.sxm.aota.tsc.config;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonWebServiceRequest;
import com.amazonaws.ClientConfiguration;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.auth.InstanceProfileCredentialsProvider;
import com.amazonaws.regions.Region;
import com.amazonaws.regions.Regions;
import com.amazonaws.retry.RetryPolicy;
import com.amazonaws.retry.RetryPolicy.BackoffStrategy;
public class SQSConfiguration {
/** The properties cache config. */
private PropertiesCacheConfig propertiesCacheConfig;
public AmazonSQSAsync amazonSQSClient() {
// Create Client Configuration
ClientConfiguration clientConfig = new ClientConfiguration()
.withRetryPolicy(new RetryPolicy(
new BackoffStrategy() {
public long delayBeforeNextRetry(AmazonWebServiceRequest req,
AmazonClientException exception, int retries) {
// Delay between retries is 10s unless it is UnknownHostException
// for which retry is 60s
return exception.getCause() instanceof UnknownHostException ? 60_000L : 10_000L;
}, 10, true));
// Create Amazon client
AmazonSQSAsync asyncSqsClient = null;
if (propertiesCacheConfig.isIamRole()) {
asyncSqsClient = new AmazonSQSAsyncClient(new InstanceProfileCredentialsProvider(true), clientConfig);
} else {
asyncSqsClient = new AmazonSQSAsyncClient(
new BasicAWSCredentials("sceretkey", "accesskey"));
final Regions regions = Regions.fromName(propertiesCacheConfig.getRegionName());
// Buffer for request batching
final QueueBufferConfig bufferConfig = new QueueBufferConfig();
// Ensure visibility timeout is maintained
// Enable long polling
// Set batch parameters
// bufferConfig.setMaxBatchOpenMs(500);
// Set to receive messages only on demand
// bufferConfig.setMaxDoneReceiveBatches(0);
// bufferConfig.setMaxInflightReceiveBatches(0);
return new AmazonSQSBufferedAsyncClient(asyncSqsClient, bufferConfig);
Then I wrote a scheduler which executes every 2 seconds and fetches the data from the queue, processes it, and deletes it from the queue before the visibility timeout; otherwise the message becomes available for processing again when the visibility timeout expires.
package com.sxm.aota.tsc.sqs;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import javax.annotation.PostConstruct;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.DependsOn;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import com.fasterxml.jackson.databind.ObjectMapper;
* The Class TSCDataSenderScheduledTask.
* Sends the aggregated Vehicle data to TSC in batches
#DependsOn({ "propertiesCacheConfig", "amazonSQSClient" })
public class SQSScheduledTask {
private static final Logger LOGGER = LoggerFactory.getLogger(SQSScheduledTask.class);
private PropertiesCacheConfig propertiesCacheConfig;
public AmazonSQSAsync amazonSQSClient;
* Timer Task that will run after specific interval of time Majorly
* responsible for sending the data in batches to TSC.
private String queueUrl;
private final ObjectMapper mapper = new ObjectMapper();
public void initialize() throws Exception {"SQS-Publisher", "Publisher initializing for queue " + propertiesCacheConfig.getSQSQueueName(),
"Publisher initializing for queue " + propertiesCacheConfig.getSQSQueueName());
// Get queue URL
final GetQueueUrlRequest request = new GetQueueUrlRequest().withQueueName(propertiesCacheConfig.getSQSQueueName());
final GetQueueUrlResult response = amazonSQSClient.getQueueUrl(request);
queueUrl = response.getQueueUrl();"SQS-Publisher", "Publisher initialized for queue " + propertiesCacheConfig.getSQSQueueName(),
"Publisher initialized for queue " + propertiesCacheConfig.getSQSQueueName() + ", URL = " + queueUrl);
#Scheduled(fixedDelayString = "${sqs.consumer.delay}")
public void timerTask() {
final ReceiveMessageResult receiveResult = getMessagesFromSQS();
String messageBody = null;
if (receiveResult != null && receiveResult.getMessages() != null && !receiveResult.getMessages().isEmpty()) {
try {
messageBody = receiveResult.getMessages().get(0).getBody();
String messageReceiptHandle = receiveResult.getMessages().get(0).getReceiptHandle();
Vehicles vehicles = mapper.readValue(messageBody, Vehicles.class);
} catch (Exception e) {
LOGGER.error("Exception while processing SQS message : {}", messageBody);
// Message is not deleted on SQS and will be processed again after visibility timeout
public void processMessage(List<Vehicle> vehicles,String messageReceiptHandle) throws InterruptedException {
//processing code
//delete the sqs message as the processing is completed
//Need to create atomic counter that will be increamented by all TS.. Once it will be 0 then we will be deleting the messages
amazonSQSClient.deleteMessage(new DeleteMessageRequest(queueUrl, messageReceiptHandle));
private ReceiveMessageResult getMessagesFromSQS() {
try {
// Create new request and fetch data from Amazon SQS queue
final ReceiveMessageResult receiveResult = amazonSQSClient
.receiveMessage(new ReceiveMessageRequest().withMaxNumberOfMessages(1).withQueueUrl(queueUrl));
return receiveResult;
} catch (Exception e) {
LOGGER.error("Error while fetching data from SQS", e);
return null;

HBase map-side join - one of the tables is not getting read? Read from HBase and write the result into HBase

I am trying to do a map-side join of two tables located in HBase. My aim is to keep the records of the small table in a HashMap and compare them with the big table, and once matched, write the record into another table in HBase. I wrote similar code for the join operation using both a Mapper and a Reducer and it worked well, with both tables scanned in the mapper class. But since a reduce-side join is not efficient at all, I want to join the tables on the mapper side only. In the following code, the commented-out "if" block is there just to show that it always returns false and that the first table (the small one) is not getting read. Any hints or help are appreciated. I am using the HDP sandbox.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
//import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.mapred.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
public class JoinDriver extends Configured implements Tool {
static int row_index = 0;
public static class JoinJobMapper extends TableMapper<ImmutableBytesWritable, Put> {
private static byte[] big_table_bytarr = Bytes.toBytes("big_table");
private static byte[] small_table_bytarr = Bytes.toBytes("small_table");
HashMap<String,String> myHashMap = new HashMap<String, String>();
byte[] c1_value;
byte[] c2_value;
String big_table;
String small_table;
String big_table_c1;
String big_table_c2;
String small_table_c1;
String small_table_c2;
Text mapperKeyS;
Text mapperValueS;
Text mapperKeyB;
Text mapperValueB;
public void map(ImmutableBytesWritable rowKey, Result columns, Context context) {
TableSplit currentSplit = (TableSplit) context.getInputSplit();
byte[] tableName = currentSplit.getTableName();
try {
Put put = new Put(Bytes.toBytes(++row_index));
// put small table into hashmap - myhashMap
if (Arrays.equals(tableName, small_table_bytarr)) {
c1_value = columns.getValue(Bytes.toBytes("s_cf"), Bytes.toBytes("s_cf_c1"));
c2_value = columns.getValue(Bytes.toBytes("s_cf"), Bytes.toBytes("s_cf_c2"));
small_table_c1 = new String(c1_value);
small_table_c2 = new String(c2_value);
mapperKeyS = new Text(small_table_c1);
mapperValueS = new Text(small_table_c2);
} else if (Arrays.equals(tableName, big_table_bytarr)) {
c1_value = columns.getValue(Bytes.toBytes("b_cf"), Bytes.toBytes("b_cf_c1"));
c2_value = columns.getValue(Bytes.toBytes("b_cf"), Bytes.toBytes("b_cf_c2"));
big_table_c1 = new String(c1_value);
big_table_c2 = new String(c2_value);
mapperKeyB = new Text(big_table_c1);
mapperValueB = new Text(big_table_c2);
// if (set.containsKey(big_table_c1)){
put.addColumn(Bytes.toBytes("join"), Bytes.toBytes("join_c1"), Bytes.toBytes(big_table_c1));
context.write(new ImmutableBytesWritable(mapperKeyB.getBytes()), put );
put.addColumn(Bytes.toBytes("join"), Bytes.toBytes("join_c2"), Bytes.toBytes(big_table_c2));
context.write(new ImmutableBytesWritable(mapperKeyB.getBytes()), put );
put.addColumn(Bytes.toBytes("join"), Bytes.toBytes("join_c3"),Bytes.toBytes((myHashMap.get(big_table_c1))));
context.write(new ImmutableBytesWritable(mapperKeyB.getBytes()), put );
// }
} catch (Exception e) {
// TODO : exception handling logic
public int run(String[] args) throws Exception {
List<Scan> scans = new ArrayList<Scan>();
Scan scan1 = new Scan();
scan1.setAttribute("", Bytes.toBytes("small_table"));
Scan scan2 = new Scan();
scan2.setAttribute("", Bytes.toBytes("big_table"));
Configuration conf = new Configuration();
Job job = new Job(conf);
TableMapReduceUtil.initTableMapperJob(scans, JoinJobMapper.class, ImmutableBytesWritable.class, Put.class, job);
TableMapReduceUtil.initTableReducerJob("joined_table", null, job);
return 0;
public static void main(String[] args) throws Exception {
JoinDriver runJob = new JoinDriver();;
From reading your problem statement, I believe you have misunderstood how multiple HBase table input is meant to be used.
I suggest you load the small table into a HashMap in the setup method of the mapper class. Then run a map-only job on the big table; in the map method you can fetch the corresponding values from the HashMap you loaded earlier.
Let me know how this works out.

Mapreduce with HCATALOG integration with oozie in MAPR

I have written a MapReduce program that reads data from a Hive table using HCatalog and writes it into HBase. This is a map-only job with no reducers. I have run the program from the command line and it works as expected (I created a fat jar to avoid jar issues). I want to integrate it with Oozie (with the help of Hue). I have two options to run it:
Use Mapreduce Action
Use Java Action
Since my Mapreduce program has a driver method that holds the below code
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
public class HBaseValdiateInsertDriver {
public static void main(String[] args) throws Exception {
String dbName = "Test";
String tableName = "emp";
Configuration conf = new Configuration();
args = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "HBase Get Put Demo");
HCatInputFormat.setInput(job, dbName, tableName, null);
FileInputFormat.addInputPath(job, new Path("maprfs:///user/input"));
FileOutputFormat.setOutputPath(job, new Path("maprfs:///user/output"));
How do I specify the driver method in Oozie? All that I can see is a way to specify the mapper and reducer classes. Can someone guide me on how to set the properties?
Using the Java action I can specify my driver class as the main class and get it executed, but I face errors like "table not found", "HCatalog jars not found", etc. I have included hive-site.xml in the workflow (using Hue), but I feel the system is not able to pick up the properties. Can someone advise me on what I have to take care of? Are there any other configuration properties that I need to include?
Also the sample program i referred in cloudera website uses
HCatInputFormat.setInput(job, InputJobInfo.create(dbName,
inputTableName, null));
whereas I use the call below (I don't see a method that accepts the above input):
HCatInputFormat.setInput(job, dbName, tableName, null);
Below is my mapper code
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Mapper;
public class HBaseValdiateInsert extends Mapper<WritableComparable, HCatRecord, Text, Text> {
static HTableInterface table;
static HTableInterface inserted;
private String hbaseDate = null;
String existigValue=null;
List<Put> putList = new ArrayList<Put>();
public void setup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
String tablename = "dev_arch186";
table = Utils.getTable(tablename);
public void cleanup(Context context) {
try {
} catch (IOException e) {
public void map(WritableComparable key, HCatRecord value, Context context) throws IOException, InterruptedException {
String name_hive = (String) value.get(0);
String id_hive = (String) value.get(1);
String rec[] = test.toString().split(",");
Get g = new Get(Bytes.toBytes(name_hive));
if (existigValue.equalsIgnoreCase("NA") || !existigValue.equalsIgnoreCase(id_hive)) {
Put put = new Put(Bytes.toBytes(rec[0]));
public String getOneRecord(byte[] columnFamily, byte[] columnQualifier, String rowKey)
throws IOException {
Get get = new Get(rowKey.getBytes());
Result rs = table.get(get);
rs.getColumn(columnFamily, columnQualifier);
System.out.println(rs.containsColumn(columnFamily, columnQualifier));
KeyValue result = rs.getColumnLatest(columnFamily,columnQualifier);
if (rs.containsColumn(columnFamily, columnQualifier))
return (Bytes.toString(result.getValue()));
return "NA";
public boolean columnQualifierExists(String tableName, String ColumnFamily,
String ColumnQualifier, String rowKey) throws IOException {
Get get = new Get(rowKey.getBytes());
Result rs = table.get(get);
I use MapR (M3) Cluster with HUE as the interface for oozie.
Hive Version : 1-0
HCAT Version: 1-0
I couldn't find any way to initialize HCatInputFormat from an Oozie MapReduce action.
But I have a workaround, as described below.
I created LazyHCatInputFormat by extending HCatInputFormat.
I overrode the getJobInfo method to handle initialization. This will be called as part of the getSplits(..) call.
private static void lazyInit(Configuration conf){
conf = new Configuration(false);
conf.addResource(new Path(System.getProperty("oozie.action.conf.xml")));
conf.addResource(new org.apache.hadoop.fs.Path("hive-config.xml"));
String databaseName = conf.get("LazyHCatInputFormat.databaseName");
String tableName = conf.get("LazyHCatInputFormat.tableName");
String partitionFilter = conf.get("LazyHCatInputFormat.partitionFilter");
setInput(conf, databaseName, tableName);
//System.out.println("After lazyinit : "+conf.get(""));
}catch(Exception e){
System.out.println("*** LAZY INIT FAILED ***");
public static InputJobInfo getJobInfo(Configuration conf)
throws IOException {
String jobString = conf.get("");
if (jobString == null) {
jobString = conf.get("");
if(jobString == null){
throw new IOException("job information not found in JobContext. HCatInputFormat.setInput() not called?");
return (InputJobInfo) HCatUtil.deserialize(jobString);
In the Oozie map-reduce action, it is configured as below.
<value>HCAT DatabaseNameHere</value>
<value>HCAT TableNameHere</value>
This might not be the best implementation, but it is a quick hack to make it work.
