// Capture packets
DataStream<byte[]> logSource = env.addSource(new PacpParalleSourceFunction()).setParallelism(CAPTURE_PACKAGE_PARALLELISM);
// Parse
XmlLogParse xmlLogParse = new XmlLogParse();
DataStream<JSONObject> objectDataStream = logSource.map(new Log2ObjFunction(xmlLogParse)).setParallelism(PARSE_LOG_PARALLELISM);
// Write CSV ("The parallelism of non parallel operator must be 1")
LogStorage logStorage = new WriteFileStorage();
objectDataStream.countWindowAll(ConstantField.BATCH_DEAL_COUNT).apply(
new AllWindowFunction<JSONObject, Object, GlobalWindow>() {
@Override
public void apply(GlobalWindow globalWindow, Iterable<JSONObject> iterable, Collector<Object> collector) throws Exception {
ArrayList<JSONObject> list = Lists.newArrayList(iterable);
if (list.size() > 0) {
long start = System.currentTimeMillis();
logStorage.doStorage(list);
logger.info("写入文件 " + list.size() + " 耗时" + (System.currentTimeMillis() - start) + "ms " + Thread.currentThread().getName());
}
}
}
).name("out put to csv").setParallelism(1);
//写kafka
Properties props = new Properties();
props.put("bootstrap.servers", KAFKA_HOST);
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
props.put("topic", KAFKA_DST_TOPIC_NAME);
DataStreamSink<JSONObject> obj2kafka = objectDataStream.addSink(new KafkaSinkFunction(props)).setParallelism(WRITE_KAFKA_PARALLELISM);
obj2kafka.name("output2kafka");
// execute application
env.execute("elksoc-log4p");
How can I make these two tasks parallel?
Perhaps you should use Side Outputs; for more information, see https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/dev/datastream/side_output/
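A rough sketch of what that side-output approach could look like here (Flink 1.14 DataStream API). It reuses the question's objectDataStream, props, and parallelism constants; the tag name is illustrative, and JSONObject is whatever JSON class the job already uses.
final OutputTag<JSONObject> csvTag = new OutputTag<JSONObject>("csv-branch") {};
SingleOutputStreamOperator<JSONObject> mainStream = objectDataStream
        .process(new ProcessFunction<JSONObject, JSONObject>() {
            @Override
            public void processElement(JSONObject value, Context ctx, Collector<JSONObject> out) {
                out.collect(value);          // main output feeds the Kafka sink
                ctx.output(csvTag, value);   // side output feeds the CSV writer
            }
        });
DataStream<JSONObject> csvStream = mainStream.getSideOutput(csvTag);
// The CSV branch keeps the forced parallelism of 1, the Kafka branch keeps its own:
// csvStream.countWindowAll(ConstantField.BATCH_DEAL_COUNT).apply(...).setParallelism(1);
// mainStream.addSink(new KafkaSinkFunction(props)).setParallelism(WRITE_KAFKA_PARALLELISM);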
Hi, I have a MapReduce application that bulk loads data into HBase.
I have 142 text files with a total size of 200 GB.
My mappers complete within 5 minutes, and so do all the reducers except the last one, which is stuck at 100%.
It is taking a very long time and has been running for the past 24 hours.
I have one column family.
My row keys look like this:
48433197315|1972-03-31T00:00:00Z|4
48433197315|1972-03-31T00:00:00Z|38
48433197315|1972-03-31T00:00:00Z|41
48433197315|1972-03-31T00:00:00Z|23
48433197315|1972-03-31T00:00:00Z|7
48433336118|1972-03-31T00:00:00Z|17
48433197319|1972-03-31T00:00:00Z|64
48433197319|1972-03-31T00:00:00Z|58
48433197319|1972-03-31T00:00:00Z|61
48433197319|1972-03-31T00:00:00Z|73
48433197319|1972-03-31T00:00:00Z|97
48433336119|1972-03-31T00:00:00Z|7
I have created my table like this:
private static Configuration getHbaseConfiguration() {
try {
if (hbaseConf == null) {
System.out.println(
"UserId= " + USERID + " \t keytab file =" + KEYTAB_FILE + " \t conf =" + KRB5_CONF_FILE);
HBaseConfiguration.create();
hbaseConf = HBaseConfiguration.create();
hbaseConf.set("mapreduce.job.queuename", "root.fricadev");
hbaseConf.set("mapreduce.child.java.opts", "-Xmx6553m");
hbaseConf.set("mapreduce.map.memory.mb", "8192");
hbaseConf.setInt(MAX_FILES_PER_REGION_PER_FAMILY, 1024);
System.setProperty("java.security.krb5.conf", KRB5_CONF_FILE);
UserGroupInformation.loginUserFromKeytab(USERID, KEYTAB_FILE);
}
} catch (Exception e) {
e.printStackTrace();
}
return hbaseConf;
}
/**
* HBase bulk import example Data preparation MapReduce job driver
*
* args[0]: HDFS input path args[1]: HDFS output path
*
* @throws Exception
*
*/
public static void main(String[] args) throws Exception {
if (hbaseConf == null)
hbaseConf = getHbaseConfiguration();
String outputPath = args[2];
hbaseConf.set("data.seperator", DATA_SEPERATOR);
hbaseConf.set("hbase.table.name", args[0]);
hbaseConf.setInt(MAX_FILES_PER_REGION_PER_FAMILY, 1024);
Job job = new Job(hbaseConf);
job.setJarByClass(HBaseBulkLoadDriver.class);
job.setJobName("Bulk Loading HBase Table::" + args[0]);
job.setInputFormatClass(TextInputFormat.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapperClass(HBaseBulkLoadMapperUnzipped.class);
// job.getConfiguration().set("mapreduce.job.acl-view-job",
// "bigdata-app-fricadev-sdw-u6034690");
if (HbaseBulkLoadMapperConstants.FUNDAMENTAL_ANALYTIC.equals(args[0])) {
HTableDescriptor descriptor = new HTableDescriptor(Bytes.toBytes(args[0]));
descriptor.addFamily(new HColumnDescriptor(COLUMN_FAMILY));
HBaseAdmin admin = new HBaseAdmin(hbaseConf);
byte[] startKey = new byte[16];
Arrays.fill(startKey, (byte) 0);
byte[] endKey = new byte[16];
Arrays.fill(endKey, (byte) 255);
admin.createTable(descriptor, startKey, endKey, REGIONS_COUNT);
admin.close();
// HColumnDescriptor hcd = new
// HColumnDescriptor(COLUMN_FAMILY).setMaxVersions(1);
// createPreSplitLoadTestTable(hbaseConf, descriptor, hcd);
}
job.getConfiguration().setBoolean("mapreduce.compress.map.output", true);
job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", true);
job.getConfiguration().setClass("mapreduce.map.output.compression.codec",
org.apache.hadoop.io.compress.GzipCodec.class, org.apache.hadoop.io.compress.CompressionCodec.class);
job.getConfiguration().set("hfile.compression", Compression.Algorithm.LZO.getName());
// Connection connection =
// ConnectionFactory.createConnection(hbaseConf);
// Table table = connection.getTable(TableName.valueOf(args[0]));
FileInputFormat.setInputPaths(job, args[1]);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.setMapOutputValueClass(Put.class);
HFileOutputFormat.configureIncrementalLoad(job, new HTable(hbaseConf, args[0]));
System.exit(job.waitForCompletion(true) ? 0 : -1);
System.out.println("job is successfull..........");
// LoadIncrementalHFiles loader = new LoadIncrementalHFiles(hbaseConf);
// loader.doBulkLoad(new Path(outputPath), (HTable) table);
HBaseBulkLoad.doBulkLoad(outputPath, args[0]);
}
/**
* Enum of counters.
* It used for collect statistics
*/
public static enum Counters {
/**
* Counts data format errors.
*/
WRONG_DATA_FORMAT_COUNTER
}
}
There is no reducer in my code, only a mapper.
My mapper code is like this:
public class FundamentalAnalyticLoader implements TableLoader {
private ImmutableBytesWritable hbaseTableName;
private Text value;
private Mapper<LongWritable, Text, ImmutableBytesWritable, Put>.Context context;
private String strFileLocationAndDate;
#SuppressWarnings("unchecked")
public FundamentalAnalyticLoader(ImmutableBytesWritable hbaseTableName, Text value, Context context,
String strFileLocationAndDate) {
//System.out.println("Constructing Fundalmental Analytic Load");
this.hbaseTableName = hbaseTableName;
this.value = value;
this.context = context;
this.strFileLocationAndDate = strFileLocationAndDate;
}
#SuppressWarnings("deprecation")
public void load() {
if (!HbaseBulkLoadMapperConstants.FF_ACTION.contains(value.toString())) {
String[] values = value.toString().split(HbaseBulkLoadMapperConstants.DATA_SEPERATOR);
String[] strArrFileLocationAndDate = strFileLocationAndDate
.split(HbaseBulkLoadMapperConstants.FIELD_SEPERATOR);
if (17 == values.length) {
String strKey = values[5].trim() + "|" + values[0].trim() + "|" + values[3].trim() + "|"
+ values[4].trim() + "|" + values[14].trim() + "|" + strArrFileLocationAndDate[0].trim() + "|"
+ strArrFileLocationAndDate[2].trim();
//String strRowKey=StringUtils.leftPad(Integer.toString(Math.abs(strKey.hashCode() % 470)), 3, "0") + "|" + strKey;
byte[] hashedRowKey = HbaseBulkImportUtil.getHash(strKey);
Put put = new Put((hashedRowKey));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FUNDAMENTAL_SERIES_ID),
Bytes.toBytes(values[0].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FUNDAMENTAL_SERIES_ID_OBJECT_TYPE_ID),
Bytes.toBytes(values[1].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FUNDAMENTAL_SERIES_ID_OBJECT_TYPE),
Bytes.toBytes(values[2]));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FINANCIAL_PERIOD_END_DATE),
Bytes.toBytes(values[3].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FINANCIAL_PERIOD_TYPE),
Bytes.toBytes(values[4].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.LINE_ITEM_ID), Bytes.toBytes(values[5].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_ITEM_INSTANCE_KEY),
Bytes.toBytes(values[6].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_VALUE), Bytes.toBytes(values[7].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_CONCEPT_CODE),
Bytes.toBytes(values[8].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_VALUE_CURRENCY_ID),
Bytes.toBytes(values[9].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_IS_ESTIMATED),
Bytes.toBytes(values[10].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_AUDITABILITY_EQUATION),
Bytes.toBytes(values[11].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FINANCIAL_PERIOD_TYPE_ID),
Bytes.toBytes(values[12].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_CONCEPT_ID),
Bytes.toBytes(values[13].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_LINE_ITEM_IS_YEAR_TO_DATE),
Bytes.toBytes(values[14].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.IS_ANNUAL), Bytes.toBytes(values[15].trim()));
// put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
// Bytes.toBytes(HbaseBulkLoadMapperConstants.TAXONOMY_ID),
// Bytes.toBytes(values[16].trim()));
//
// put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
// Bytes.toBytes(HbaseBulkLoadMapperConstants.INSTRUMENT_ID),
// Bytes.toBytes(values[17].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FF_ACTION),
Bytes.toBytes(values[16].substring(0, values[16].length() - 3)));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FILE_PARTITION),
Bytes.toBytes(strArrFileLocationAndDate[0].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FILE_PARTITION_DATE),
Bytes.toBytes(strArrFileLocationAndDate[2].trim()));
try {
context.write(hbaseTableName, put);
} catch (IOException e) {
context.getCounter(Counters.WRONG_DATA_FORMAT_COUNTER).increment(1);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
System.out.println("Values length is less 15 and value is " + value.toString());
}
}
}
Any help to improve the speed is highly appreciated.
(Counter screenshot attached here.)
I suspect that all records go into a single region.
When you created the empty table, HBase split the key address space into even ranges. But because all your actual keys share the same prefix, they all go into a single region. That means a single region/reduce task does all the work while the other regions/reduce tasks do nothing useful. You can check this hypothesis by looking at the Hadoop counters: how many bytes the slow reduce task read/wrote compared to the other reduce tasks.
If this is the problem, then you need to manually prepare split keys and create the table using createTable(HTableDescriptor desc, byte[][] splitKeys). The split keys should evenly divide your actual dataset for optimal performance.
Example #1. If your keys were ordinary English words, it would be easy to split the table into 26 regions by first character (split keys 'a', 'b', ..., 'z'), or into 26*26 regions by the first two characters ('aa', 'ab', ..., 'zz'). The regions would not necessarily be even, but this would still be better than having only a single region.
Example #2. If your keys were 4-byte hashes, it would be easy to split the table into 256 regions by the first byte (0x00, 0x01, ..., 0xff), or into 2^16 regions by the first two bytes.
In your particular case, I see two options:
1. Find the smallest key and the largest key (in sorted order) in your dataset and use them as startKey and endKey for Admin.createTable(). This will work well only if the keys are uniformly distributed between startKey and endKey.
2. Prefix your keys with hash(key) and use the method from Example #2. This should work well, but you won't be able to make semantic range queries like (KEY >= ${first} and KEY <= ${last}); see the sketch below.
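A minimal sketch of option 2, using the same older HBaseAdmin/HTableDescriptor API as the question. The 16-region split and the class/method names are illustrative assumptions, not your exact setup.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
public class PreSplitTableCreator {
    public static void createPreSplitTable(Configuration conf, String tableName, String columnFamily) throws Exception {
        // 15 split points -> 16 regions, split on the first byte of the hashed row key
        byte[][] splitKeys = new byte[15][];
        for (int i = 0; i < splitKeys.length; i++) {
            splitKeys[i] = new byte[] { (byte) ((i + 1) * 16) };   // 0x10, 0x20, ..., 0xF0
        }
        HTableDescriptor descriptor = new HTableDescriptor(TableName.valueOf(tableName));
        descriptor.addFamily(new HColumnDescriptor(columnFamily));
        HBaseAdmin admin = new HBaseAdmin(conf);
        try {
            admin.createTable(descriptor, splitKeys);   // createTable(HTableDescriptor, byte[][])
        } finally {
            admin.close();
        }
    }
}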
Usually, if a job hangs in the last minute or second, the issue is a particular node or a resource with concurrency problems, etc.
A small checklist could be:
1. Try again with a smaller data set. This will rule out basic functioning of the code.
2. Since most of the job is done, the mapper and reducer are probably fine. You can try running the job with the same volume a few times. The logs can help you identify whether the same node has issues across repeated runs.
3. Verify that the output is being generated as expected.
4. You can also reduce the number of columns you are trying to add to HBase. This will relieve the load for the same volume.
Jobs hanging can be caused by a variety of issues, but troubleshooting mostly consists of the steps above: verifying whether the cause is data related, resource related, specific to a node, memory related, etc.
For a research project, I tried sorting the elements in an RDD using two different approaches.
In the first approach, I applied a mapPartitions() function to the RDD so that each partition sorts its contents and emits its sorted list as a single record of the resulting RDD. Then I applied a reduce function that merges the sorted lists.
I ran these experiments on an EC2 cluster containing 30 nodes. I set it up using the spark ec2 script. The data file was stored in HDFS.
In the second approach I used the sortBy method in Spark.
I performed these operation on the US census data(100MB) found here
A single line looks like this:
9, Not in universe, 0, 0, Children, 0, Not in universe, Never married, Not in universe or children, Not in universe, White, All other, Female, Not in universe, Not in universe, Children or Armed Forces, 0, 0, 0, Nonfiler, Not in universe, Not in universe, Child <18 never marr not in subfamily, Child under 18 never married, 1758.14, Nonmover, Nonmover, Nonmover, Yes, Not in universe, 0, Both parents present, United-States, United-States, United-States, Native- Born in the United States, 0, Not in universe, 0, 0, 94, - 50000.
I sorted based on the 25th value in the CSV; in this line that is 1758.14.
I noticed that sortBy performs worse than the other method. Is this the expected behavior? If it is, why isn't the mapPartitions() and reduce() combination the default sorting approach?
Here is my implementation
public static void sortBy(JavaSparkContext sc){
JavaRDD<String> rdd = sc.textFile("/data.txt",32);
long start = System.currentTimeMillis();
rdd.sortBy(new Function<String, Double>(){
@Override
public Double call(String v1) throws Exception {
// TODO Auto-generated method stub
String [] arr = v1.split(",");
return Double.parseDouble(arr[24]);
}
}, true, 9).collect();
long end = System.currentTimeMillis();
System.out.println("SortBy: " + (end - start));
}
public static void sortList(JavaSparkContext sc){
JavaRDD<String> rdd = sc.textFile("/data.txt",32); //parallelize(l, 8);
long start = System.currentTimeMillis();
JavaRDD<LinkedList<Tuple2<Double, String>>> rdd3 = rdd.mapPartitions(new FlatMapFunction<Iterator<String>, LinkedList<Tuple2<Double, String>>>(){
@Override
public Iterable<LinkedList<Tuple2<Double, String>>> call(Iterator<String> t)
throws Exception {
// TODO Auto-generated method stub
LinkedList<Tuple2<Double, String>> lines = new LinkedList<Tuple2<Double, String>>();
while(t.hasNext()){
String s = t.next();
String arr1[] = s.split(",");
Tuple2<Double, String> t1 = new Tuple2<Double, String>(Double.parseDouble(arr1[24]),s);
lines.add(t1);
}
Collections.sort(lines, new IncomeComparator());
LinkedList<LinkedList<Tuple2<Double, String>>> list = new LinkedList<LinkedList<Tuple2<Double, String>>>();
list.add(lines);
return list;
}
});
rdd3.reduce(new Function2<LinkedList<Tuple2<Double, String>>, LinkedList<Tuple2<Double, String>>, LinkedList<Tuple2<Double, String>>>(){
@Override
public LinkedList<Tuple2<Double, String>> call(
LinkedList<Tuple2<Double, String>> a,
LinkedList<Tuple2<Double, String>> b) throws Exception {
// TODO Auto-generated method stub
LinkedList<Tuple2<Double, String>> result = new LinkedList<Tuple2<Double, String>>();
while (a.size() > 0 && b.size() > 0) {
if (a.getFirst()._1.compareTo(b.getFirst()._1) <= 0)
result.add(a.poll());
else
result.add(b.poll());
}
while (a.size() > 0)
result.add(a.poll());
while (b.size() > 0)
result.add(b.poll());
return result;
}
});
long end = System.currentTimeMillis();
System.out.println("MapPartitions: " + (end - start));
}
Collect() is a major bottleneck as it returns all the results to the driver.
It incurs both an I/O hit and additional network traffic to a single destination (in this case, the driver).
It also blocks other operations.
Instead of collect() in your first sortBy() code segment,
try performing a parallel operation such as saveAsTextFile(tmp) and then reading back with sc.textFile(tmp), as sketched at the end of this answer.
The other code segment uses both the mapPartitions() and reduce() parallel APIs, so all of the work is done in parallel.
It would seem that this is the cause for the difference in end-to-end performance times.
Note that your findings don't necessarily mean that the sum of execution times over all machines is worse.
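As a minimal sketch of the suggestion above, the question's sortBy() method could swap collect() for a parallel action. The /tmp/sorted output path is a placeholder, and the imports are the same Spark Java API classes the question already uses.
public static void sortByAndSave(JavaSparkContext sc) {
    JavaRDD<String> rdd = sc.textFile("/data.txt", 32);
    long start = System.currentTimeMillis();
    rdd.sortBy(new Function<String, Double>() {
        @Override
        public Double call(String line) throws Exception {
            return Double.parseDouble(line.split(",")[24]);
        }
    }, true, 9).saveAsTextFile("/tmp/sorted");   // parallel write, no driver bottleneck
    long end = System.currentTimeMillis();
    System.out.println("SortBy + saveAsTextFile: " + (end - start));
    // If the sorted data is needed afterwards, read it back in parallel:
    // JavaRDD<String> reloaded = sc.textFile("/tmp/sorted");
}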
I am aware that the Hadoop REST API provides access to job status programmatically.
Similarly, is there any way to get the Spark job status from within a program?
It is not similar to a REST API, but you can track the status of jobs from inside the application by registering a SparkListener with SparkContext.addSparkListener. It goes something like this:
sc.addSparkListener(new SparkListener {
override def onStageCompleted(event: SparkListenerStageCompleted) = {
if (event.stageInfo.stageId == myStage) {
println(s"Stage $myStage is done.")
}
}
})
Providing the answer for Java. In Scala it would be almost identical, just using SparkContext instead of JavaSparkContext.
Assume you have a JavaSparkContext:
private final JavaSparkContext sc;
The following code allows you to get all the info available from the Jobs and Stages tabs:
JavaSparkStatusTracker statusTracker = sc.statusTracker();
for(int jobId: statusTracker.getActiveJobIds()) {
SparkJobInfo jobInfo = statusTracker.getJobInfo(jobId);
log.info("Job " + jobId + " status is " + jobInfo.status().name());
log.info("Stages status:");
for(int stageId: jobInfo.stageIds()) {
SparkStageInfo stageInfo = statusTracker.getStageInfo(stageId);
log.info("Stage id=" + stageId + "; name = " + stageInfo.name()
+ "; completed tasks:" + stageInfo.numCompletedTasks()
+ "; active tasks: " + stageInfo.numActiveTasks()
+ "; all tasks: " + stageInfo.numTasks()
+ "; submission time: " + stageInfo.submissionTime());
}
}
Unfortunately, everything else is accessible only from the Scala SparkContext, so there could be some difficulties working with the provided structures from Java.
Pools list: sc.sc().getAllPools()
Executor Memory Status: sc.sc().getExecutorMemoryStatus()
Executor ids: sc.sc().getExecutorIds()
Storage info: sc.sc().getRddStorageInfo()
... you can try to find there more useful info.
There's an (almost) undocumented REST API feature that delivers almost everything you can see on the Spark UI:
http://<sparkMasterHost>:<uiPort>/api/v1/...
For local installation you can start from here:
http://localhost:8080/api/v1/applications
Possible end points you can find here: https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala
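For completeness, here is a rough sketch of reading that endpoint from a plain Java program (no extra libraries assumed; the localhost URL matches the local-installation example above, and the class name is just for illustration):
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
public class SparkStatusProbe {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://localhost:8080/api/v1/applications");
        try (BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);   // raw JSON; parse it with your favorite JSON library
            }
        }
    }
}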
There's an (almost) undocumented REST API feature on the Spark UI that delivers metrics about the job and its performance.
You can access it with:
http://<driverHost>:<uiPort>/metrics/json/
(UIPort is 4040 by default)
You can also get the Spark job status without using the Spark History Server. You can use SparkLauncher 2.0.1 (even the Spark 1.6 version will work) to launch your Spark job from a Java program:
SparkAppHandle appHandle = sparkLauncher.startApplication();
You can also pass a listener to the startApplication() method:
SparkAppHandle appHandle = sparkLauncher.startApplication(sparkAppListener);
The listener has two methods that will inform you about job state changes and info changes.
I implemented this using a CountDownLatch, and it works as expected. This is for SparkLauncher version 2.0.1, and it works in yarn-cluster mode too.
...
final CountDownLatch countDownLatch = new CountDownLatch(1);
SparkAppListener sparkAppListener = new SparkAppListener(countDownLatch);
SparkAppHandle appHandle = sparkLauncher.startApplication(sparkAppListener);
Thread sparkAppListenerThread = new Thread(sparkAppListener);
sparkAppListenerThread.start();
long timeout = 120;
countDownLatch.await(timeout, TimeUnit.SECONDS);
...
private static class SparkAppListener implements SparkAppHandle.Listener, Runnable {
private static final Log log = LogFactory.getLog(SparkAppListener.class);
private final CountDownLatch countDownLatch;
public SparkAppListener(CountDownLatch countDownLatch) {
this.countDownLatch = countDownLatch;
}
@Override
public void stateChanged(SparkAppHandle handle) {
String sparkAppId = handle.getAppId();
State appState = handle.getState();
if (sparkAppId != null) {
log.info("Spark job with app id: " + sparkAppId + ",\t State changed to: " + appState + " - "
+ SPARK_STATE_MSG.get(appState));
} else {
log.info("Spark job's state changed to: " + appState + " - " + SPARK_STATE_MSG.get(appState));
}
if (appState != null && appState.isFinal()) {
countDownLatch.countDown();
}
}
@Override
public void infoChanged(SparkAppHandle handle) {}
@Override
public void run() {}
}
This is a weird situation that I would normally never create, but our system unfortunately now requires this kind of scenario.
The System
We are running a Spring/Hibernate application that uses OpenSessionInView and TransactionInterceptor to manage our transactions. For the most part it works great. However, we recently needed to spawn a number of threads to make concurrent HTTP requests to providers.
The Problem
We need the entity that is passed into the thread to have all of the data that we have updated in our current transaction. The problem is we spawn the thread deep down in the guts of our service layer, and it's very difficult to carve out a smaller transaction to make this work. Originally we tried just passing the entity to the thread and calling:
leadDao.update(lead);
The problem is that we then get the error about the entity living in two sessions. Next, we tried committing the original transaction and reopening it as soon as the threads complete.
This is what I have listed here:
try {
logger.info("------- BEGIN MULTITHREAD PING for leadId:" + lead.getId());
start = new Date();
leadDao.commitTransaction();
List<Future<T>> futures = pool.invokeAll(buyerClientThreads, lead.getAffiliate().getPingTimeout(), TimeUnit.SECONDS);
for (int i = 0; i < futures.size(); i++) {
Future<T> future = futures.get(i);
T leadStatus = null;
try {
leadStatus = future.get();
if (logger.isDebugEnabled())
logger.debug("Retrieved results from thread buyer" + leadStatus.getLeadBuyer().getName() + " leadId:" + leadStatus.getLead().getId() + " time:" + DateUtils.formatDate(start, "HH:mm:ss"));
} catch (CancellationException e) {
leadStatus = extractErrorPingLeadStatus(lead, "Timeout - CancellationException", buyerClientThreads.get(i).getBuyerClient().getLeadBuyer(), buyerClientThreads.get(i).getBuyerClient().constructPingLeadStatusInstance());
leadStatus.setTimeout(true);
leadStatus.setResponseTime(new Date().getTime() - start.getTime());
logger.debug("We had a ping that didn't make it in time");
}
if (leadStatus != null) {
completed.add(leadStatus);
}
}
} catch (InterruptedException e) {
logger.debug("There was a problem calling the pool of pings", e);
} catch (ExecutionException e) {
logger.error("There was a problem calling the pool of pings", e);
}
leadDao.beginNewTransaction();
The beginNewTransaction method looks like this:
public void beginNewTransaction() {
if (getCurrentSession().isConnected()) {
logger.info("Session is not connected");
getCurrentSession().reconnect();
if (getCurrentSession().isConnected()) {
logger.info("Now connected!");
} else {
logger.info("STill not connected---------------");
}
} else if (getCurrentSession().isOpen()) {
logger.info("Session is not open");
}
getCurrentSession().beginTransaction();
logger.info("BEGINNING TRANSAACTION - " + getCurrentSession().getTransaction().isActive());
}
The threads are using TransactionTemplates since my buyerClient object is not managed by spring (long involved requirements).
Here is that code:
#SuppressWarnings("unchecked")
private T processPing(Lead lead) {
Date now = new Date();
if (logger.isDebugEnabled()) {
logger.debug("BEGIN PINGING BUYER " + getLeadBuyer().getName() + " for leadId:" + lead.getId() + " time:" + DateUtils.formatDate(now, "HH:mm:ss:Z"));
}
Object leadStatus = transaction(lead);
if (logger.isDebugEnabled()) {
logger.debug("PING COMPLETE FOR BUYER " + getLeadBuyer().getName() + " for leadId:" + lead.getId() + " time:" + DateUtils.formatDate(now, "HH:mm:ss:Z"));
}
return (T) leadStatus;
}
public T transaction(final Lead incomingLead) {
final T pingLeadStatus = this.constructPingLeadStatusInstance();
Lead lead = leadDao.fetchLeadById(incomingLead.getId());
T object = transactionTemplate.execute(new TransactionCallback<T>() {
@Override
public T doInTransaction(TransactionStatus status) {
Date startTime = null, endTime = null;
logger.info("incomingLead obfid:" + incomingLead.getObfuscatedAffiliateId() + " affiliateId:" + incomingLead.getAffiliate().getId());
T leadStatus = null;
if (leadStatus == null) {
leadStatus = filterLead(incomingLead);
}
if (leadStatus == null) {
leadStatus = pingLeadStatus;
leadStatus.setLead(incomingLead);
...LOTS OF CODE
}
if (logger.isDebugEnabled())
logger.debug("RETURNING LEADSTATUS FOR BUYER " + getLeadBuyer().getName() + " for leadId:" + incomingLead.getId() + " time:" + DateUtils.formatDate(new Date(), "HH:mm:ss:Z"));
return leadStatus;
}
});
if (logger.isDebugEnabled()) {
logger.debug("Transaction complete for buyer:" + getLeadBuyer().getName() + " leadId:" + incomingLead.getId() + " time:" + DateUtils.formatDate(new Date(), "HH:mm:ss:Z"));
}
return object;
}
However, when we begin our new transaction we get this error:
org.springframework.transaction.TransactionSystemException: Could not commit Hibernate transaction; nested exception is org.hibernate.TransactionException: Transaction not successfully started
at org.springframework.orm.hibernate3.HibernateTransactionManager.doCommit(HibernateTransactionManager.java:660)
at org.springframework.transaction.support.AbstractPlatformTransactionManager.processCommit(AbstractPlatformTransactionManager.java:754)
at org.springframework.transaction.support.AbstractPlatformTransactionManager.commit(AbstractPlatformTransactionManager.java:723)
at org.springframework.transaction.interceptor.TransactionAspectSupport.commitTransactionAfterReturning(TransactionAspectSupport.java:393)
at org.springframework.transaction.interceptor.TransactionInterceptor.invoke(TransactionInterceptor.java:120)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:172)
at org.springframework.aop.interceptor.ExposeInvocationInterceptor.invoke(ExposeInvocationInterceptor.java:90)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:172)
at org.springframework.aop.framework.JdkDynamicAopProxy.invoke(JdkDynamicAopProxy.java:202)
My Goal
My goal is to have that entity fully initialized on the other side. Does anyone have any ideas on how I can commit the data to the database so the thread can have a fully populated object, or a way to query for a full object?
Thanks, I know this is really involved. I apologize if I haven't been clear enough.
I have tried:
Hibernate.initialize()
saveWithFlush()
update(lead)
I didn't follow everything, but you can try one of these to work around the issue you get about the same object being associated with two sessions.
// do this in the main thread to detach the object
// from the current session
// if it has associations that also need to be handled the cascade=evict should
// be specified. Other option is to do flush & clear on the session.
session.evict(object);
// pass the object to the other thread
// in the other thread - use merge
session.merge(object)
Second approach: create a deep copy of the object and pass the copy. This can be easily achieved if your entity classes are serializable: just serialize the object and deserialize it, as in the sketch below.
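For reference, a minimal sketch of that serialization round-trip, assuming the entity graph (the Lead and everything it references) implements java.io.Serializable; the DeepCopyUtil name is just for illustration.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
public final class DeepCopyUtil {
    @SuppressWarnings("unchecked")
    public static <T> T deepCopy(T entity) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(entity);   // serialize the detached object graph
        }
        try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
            return (T) in.readObject();   // a fresh copy, bound to no Hibernate session
        }
    }
    private DeepCopyUtil() {}
}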
Thanks @gkamal for your help.
For everyone living in posterity: the answer to my dilemma was a leftover call to hibernateTemplate instead of getCurrentSession(). I made the switch about a year and a half ago and for some reason missed a few key places. This was generating a second session and transaction. After fixing that, I was able to use @gkamal's suggestion to evict the object and grab it again.
This post helped me figure it out:
http://forum.springsource.org/showthread.php?26782-Illegal-attempt-to-associate-a-collection-with-two-open-sessions
So far I have used AJAX-enabled WCF services to get records from the DB and display them in the client, without using the AsyncPattern property of OperationContractAttribute.
When should I consider the AsyncPattern property?
A sample of my OperationContract methods:
[OperationContract]
public string GetDesignationData()
{
DataSet dt = GetDesignationViewData();
return GetJSONString(dt.Tables[0]);
}
public string GetJSONString(DataTable Dt)
{
string[] StrDc = new string[Dt.Columns.Count];
string HeadStr = string.Empty;
for (int i = 0; i < Dt.Columns.Count; i++)
{
StrDc[i] = Dt.Columns[i].Caption;
HeadStr += "\"" + StrDc[i] + "\" : \"" + StrDc[i] + i.ToString() + "¾" + "\",";
}
HeadStr = HeadStr.Substring(0, HeadStr.Length - 1);
StringBuilder Sb = new StringBuilder();
Sb.Append("{\"" + Dt.TableName + "\" : [");
for (int i = 0; i < Dt.Rows.Count; i++)
{
string TempStr = HeadStr;
Sb.Append("{");
for (int j = 0; j < Dt.Columns.Count; j++)
{
if (Dt.Rows[i][j].ToString().Contains("'") == true)
{
Dt.Rows[i][j] = Dt.Rows[i][j].ToString().Replace("'", "");
}
TempStr = TempStr.Replace(Dt.Columns[j] + j.ToString() + "¾", Dt.Rows[i][j].ToString());
}
Sb.Append(TempStr + "},");
}
Sb = new StringBuilder(Sb.ToString().Substring(0, Sb.ToString().Length - 1));
Sb.Append("]}");
return Sb.ToString();
}
public DataSet GetDesignationViewData()
{
try
{
string connectionString = System.Configuration.ConfigurationManager.ConnectionStrings["connectionString"].ConnectionString;
return SqlHelper.ExecuteDataset(connectionString, CommandType.StoredProcedure, DataTemplate.spDesignation_View);
}
catch (Exception err)
{
throw err;
}
}
AsyncPattern has a few uses. It's mainly a server performance optimization that allows you to free up worker pool request threads on blocking operations. For example, when a long-running blocking operation like DB access occurs, if you're using an async DB API on the server with AsyncPattern, the worker thread can return to the pool and service other requests. The original request is "awakened" later on another worker thread when the DB access completes, and the rest of the work is done (the service client just patiently waits; this is all transparent to it unless you're using an AsyncPattern-aware client and binding). This CAN allow your service to process more requests, if done carefully.
To take advantage of it, you need to be using APIs on the server that have native async implementations. The only candidate I see is the DB call happening in your SqlHelper.ExecuteDataset method; you'd have to read up on the underlying API to make sure a TRUE asynchronous option is available (the presence of BeginXXX/EndXXX methods doesn't necessarily mean it's a TRUE async implementation). The System.Data.SqlClient stuff is truly async.
A word of caution: you have to be processing a lot of requests to make this worthwhile; there's a significant cost in code complexity and readability to splitting things up this way. You also need to understand multi-threaded programming very well; there are numerous pitfalls around locking, error handling, etc., that are well outside the scope of an SO post.
Good luck!