Beginner at Spark big data programming (Spark code) - Hadoop

I'm learning Spark for distributed systems. I ran this code and it worked.
I know that it counts the words in the input files, but I have trouble understanding how the methods are written and what the use of JavaRDD is.
public class JavaWordCount {
public static void main(String[] args) throws Exception {
System.out.print("le programme commence");
//String inputFile = "/mapr/demo.mapr.com/TestMapr/Input/alice.txt";
String inputFile = args[0];
String outputFile = args[1];
// Create a Java Spark Context.
System.out.print("le programme cree un java spark contect");
SparkConf conf = new SparkConf().setAppName("JavaWordCount");
JavaSparkContext sc = new JavaSparkContext(conf);
// Load our input data.
System.out.print("Context créeS");
JavaRDD<String> input = sc.textFile(inputFile);
// map/split each line to multiple words
System.out.print("le programme divise le document en multiple line");
JavaRDD<String> words = input.flatMap(
new FlatMapFunction<String, String>() {
#Override
public Iterable<String> call(String x) {
return Arrays.asList(x.split(" "));
}
}
);
System.out.print("Turn the words into (word, 1) pairse");
// Turn the words into (word, 1) pairs
JavaPairRDD<String, Integer> wordOnePairs = words.mapToPair(
new PairFunction<String, String, Integer>() {
#Override
public Tuple2<String, Integer> call(String x) {
return new Tuple2(x, 1);
}
}
);
System.out.print(" // reduce add the pairs by key to produce counts");
// reduce add the pairs by key to produce counts
JavaPairRDD<String, Integer> counts = wordOnePairs.reduceByKey(
new Function2<Integer, Integer, Integer>() {
#Override
public Integer call(Integer x, Integer y) {
return x + y;
}
}
);
System.out.print(" Save the word count back out to a text file, causing evaluation.");
// Save the word count back out to a text file, causing evaluation.
counts.saveAsTextFile(outputFile);
System.out.println(counts.collect());
sc.close();
}
}

As mentioned by PinoSan, this question is probably too generic, and you should be able to find your answer in any Spark getting-started guide or tutorial.
Let me point you to some interesting content:
Spark Quick Start Guide
Getting Started with Apache Spark, ebook
Introduction to Apache Spark with Examples and Use Cases
Disclaimer: I work for MapR, which is why I have linked online resources on Spark from the MapR site.

Related

Spring Batch partitioning of DB not working properly

I have configured a job as follows; it reads from the DB and writes into files, partitioning the data on the basis of a sequence.
//Job Config
#Bean
public Job job(JobBuilderFactory jobBuilderFactory) throws Exception {
Flow masterFlow1 = (Flow) new FlowBuilder<Object>("masterFlow1").start(masterStep()).build();
return (jobBuilderFactory.get("Partition-Job")
.incrementer(new RunIdIncrementer())
.start(masterFlow1)
.build()).build();
}
#Bean
public Step masterStep() throws Exception
{
return stepBuilderFactory.get(MASTERPPREPAREDATA)
//.listener(customSEL)
.partitioner(STEPPREPAREDATA,new DBPartitioner())
.step(prepareDataForS1())
.gridSize(gridSize)
.taskExecutor(new SimpleAsyncTaskExecutor("Thread"))
.build();
}
#Bean
public Step prepareDataForS1() throws Exception
{
return stepBuilderFactory.get(STEPPREPAREDATA)
//.listener(customSEL)
.<InputData,InputData>chunk(chunkSize)
.reader(JDBCItemReader(0,0))
.writer(writer(null))
.build();
}
#Bean(destroyMethod="")
#StepScope
public JdbcCursorItemReader<InputData> JDBCItemReader(#Value("#{stepExecutionContext[startingIndex]}") int startingIndex,
#Value("#{stepExecutionContext[endingIndex]}") int endingIndex)
{
JdbcCursorItemReader<InputData> ir = new JdbcCursorItemReader<>();
ir.setDataSource(batchDataSource);
ir.setMaxItemCount(DBPartitioner.partitionSize);
ir.setSaveState(false);
ir.setRowMapper(new InputDataRowMapper());
ir.setSql("SELECT * FROM FIF_INPUT fi WHERE fi.SEQ > ? AND fi.SEQ < ?");
ir.setPreparedStatementSetter(new PreparedStatementSetter() {
#Override
public void setValues(PreparedStatement ps) throws SQLException {
ps.setInt(1, startingIndex);
ps.setInt(2, endingIndex);
}
});
return ir;
}
#Bean
#StepScope
public FlatFileItemWriter<InputData> writer(#Value("#{stepExecutionContext[index]}") String index)
{
System.out.println("writer initialized!!!!!!!!!!!!!"+index);
//Create writer instance
FlatFileItemWriter<InputData> writer = new FlatFileItemWriter<>();
//Set output file location
writer.setResource(new FileSystemResource(batchDirectory+relativeInputDirectory+index+inputFileForS1));
//All job repetitions should "append" to same output file
writer.setAppendAllowed(false);
//Name field values sequence based on object properties
writer.setLineAggregator(customLineAggregator);
return writer;
}
The partitioner used for partitioning the DB is written separately in another file, as follows:
//PartitionDb.java
public class DBPartitioner implements Partitioner{
public static int partitionSize;
private static Log log = LogFactory.getLog(DBPartitioner.class);
#SuppressWarnings("unchecked")
#Override
public Map<String, ExecutionContext> partition(int gridSize) {
log.debug("START: Partition"+"grid size:"+gridSize);
#SuppressWarnings("rawtypes")
Map partitionMap = new HashMap<>();
int startingIndex = -1;
int endSize = partitionSize+1;
for(int i=0; i< gridSize; i++){
ExecutionContext ctxMap = new ExecutionContext();
ctxMap.putInt("startingIndex",startingIndex);
ctxMap.putInt("endingIndex", endSize);
ctxMap.put("index", i);
startingIndex = endSize-1;
endSize += partitionSize;
partitionMap.put("Thread:-"+i, ctxMap);
}
log.debug("END: Created Partitions of size: "+ partitionMap.size());
return partitionMap;
}
}
This executes properly, but the problem is that even after partitioning on the basis of the sequence, I am getting the same rows in multiple files, which is not right since I am providing a different set of data for each partition. Can anyone tell me what's wrong? I am using HikariCP for DB connection pooling and Spring Batch 4.
This one is executing properly but problem is even after partitioning on the basis of sequence i am getting same rows in multiple files which is not right as i am providing different set of data for each partition.
I'm not sure your partitioner is working properly. A quick test shows that it is not providing different sets of data as you are claiming:
// Print every partition name together with the execution context generated for it.
Map<String, ExecutionContext> partition = new DBPartitioner().partition(5);
for (Map.Entry<String, ExecutionContext> entry : partition.entrySet()) {
    System.out.println(entry.getKey() + " : " + entry.getValue());
}
This prints:
Thread:-0 : {endingIndex=1, index=0, startingIndex=-1}
Thread:-1 : {endingIndex=1, index=1, startingIndex=0}
Thread:-2 : {endingIndex=1, index=2, startingIndex=0}
Thread:-3 : {endingIndex=1, index=3, startingIndex=0}
Thread:-4 : {endingIndex=1, index=4, startingIndex=0}
As you can see, almost all partitions will have the same startingIndex and endingIndex.
I recommend you unit test your partitioner before using it in a partitioned step.

Partitioner is not working correctly

I am trying to code a MapReduce scenario in which I have created some user clickstream data in the form of JSON. After that, I wrote a Mapper class to fetch the required data from the file. My mapper code is:
// Keys of the JSON fields that the mapper copies from each click-stream record.
private static final String URL = "u";
private static final String Country_Code = "c";
private static final String Known_User = "nk";
private static final String Session_Start_time = "hc";
private static final String User_Id = "user";
private static final String Event_Id = "event";
/**
 * Parses one JSON click-stream record and emits its selected fields as a
 * comma-separated line.
 *
 * <p>The emitted key is {@code user,event,url,nk,hc,c,} (note the trailing
 * comma); the emitted value is the incoming {@code key}. Records that fail to
 * parse, or that lack one of the fields, are reported via a stack trace and
 * skipped.
 */
public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    String rawRecord = value.toString();
    try {
        JSONObject record = new JSONObject(rawRecord);
        String csvLine = new StringBuilder()
                .append(record.get(User_Id).toString()).append(",")
                .append(record.get(Event_Id).toString()).append(",")
                .append(record.get(URL).toString()).append(",")
                .append(record.get(Known_User)).append(",")
                .append(record.get(Session_Start_time)).append(",")
                .append(record.get(Country_Code)).append(",")
                .toString();
        context.write(new Text(csvLine), key);
        System.out.println(csvLine);
    } catch (JSONException e) {
        e.printStackTrace();
    }
}
}
And my reducer code is :-
/**
 * Writes each distinct key once, trimmed, with an empty text value.
 * The grouped values are not read.
 */
public void reduce(Text key, Iterable<LongWritable> values,
        Context context) throws IOException, InterruptedException {
    Text trimmedRecord = new Text(key.toString().trim());
    Text emptyValue = new Text("");
    context.write(trimmedRecord, emptyValue);
}
And my partitioner code is :-
/**
 * Routes records containing {@code Country_code_Us} to partition 0 and every
 * other record to partition 1.
 *
 * <p>NOTE(review): the match is a substring test on the whole record, and
 * {@code numPartitions} is not consulted - confirm both are intended.
 */
public int getPartition(Text key, LongWritable value, int numPartitions) {
    boolean isUsRecord = key.toString().contains(Country_code_Us);
    return isUsRecord ? 0 : 1;
}
And here is my driver code
/**
 * Configures and runs the "Click Stream Analyzer" job with two reducers.
 * {@code args[0]} is the input path and {@code args[1]} the output path.
 * Exits with status 0 on success and 1 on failure.
 */
public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "Click Stream Analyzer");
    job.setNumReduceTasks(2);
    job.setJarByClass(ClickStreamDriver.class);

    // Processing classes.
    job.setMapperClass(ClickStreamMapper.class);
    job.setReducerClass(ClickStreamReducer.class);
    job.setPartitionerClass(ClickStreamPartitioner.class);

    // Intermediate (map output) and final (reduce output) types.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    boolean succeeded = job.waitForCompletion(true);
    int exitCode;
    if (succeeded) {
        exitCode = 0;
    } else {
        exitCode = 1;
    }
    System.exit(exitCode);
}
Here I am trying to partition my data on the basis of country code. But it's not working: it is sending every record to a single reducer file, I think the one other than the file created for the US reducer.
One more thing: when I look at the output of the mappers, it shows some extra space added at the end of each record.
Please tell me if I am making a mistake here.
Your problem with the partitioning is due to the number of reducers. If it is 1, all your data will be sent to it, independently to what you return from your partitioner. Thus, setting mapred.reduce.tasks to 2 will solve this issue. Or you can simply write:
job.setNumReduceTasks(2);
In order to have 2 reducers as you want.
Unless you have very specific requirement, you can set reducers as below for job parameters.
mapred.reduce.tasks (in 1.x) & mapreduce.job.reduces(2.x)
Or
job.setNumReduceTasks(2) as per mark91 answer.
But leave the job to the Hadoop framework by using the API below. The framework will decide the number of reducers based on the file and block sizes.
job.setPartitionerClass(HashPartitioner.class);
I have used NullWritable and it works. Now I can see the records being partitioned into different files. Since I was using LongWritable as a null value instead of NullWritable, a space was added at the end of each line; because of this, US was listed as "US " and the partitioner was not able to divide the orders.

Implement Hadoop Map with JavaPairRDD as Spark Way

I have an RDD:
JavaPairRDD<Long, ViewRecord> myRDD
which is created via the newAPIHadoopRDD method. I have an existing map function which I want to implement the Spark way:
LongWritable one = new LongWritable(1L);
/**
 * Emits {@code (tuple, one)} for a view record, where {@code tuple} is filled
 * with the record's URL as key part and its day as value part.
 * The incoming {@code key} is not used.
 */
protected void map(Long key, ViewRecord viewRecord, Context context)
        throws IOException, InterruptedException {
    final String pageUrl = viewRecord.getUrl();
    final long viewDay = viewRecord.getDay();
    // Reuse the shared tuple instance rather than allocating one per record.
    tuple.getKey().set(pageUrl);
    tuple.getValue().set(viewDay);
    context.write(tuple, one);
}
PS: tuple is derived from:
KeyValueWritable<Text, LongWritable>
and can be found here: TextLong.java
I don't know what tuple is but if you just want to map record to tuple with key (url, day) and value 1L you can do it like this:
result = myRDD
.values()
.mapToPair(viewRecord -> {
String url = viewRecord.getUrl();
long day = viewRecord.getDay();
return new Tuple2<>(new Tuple2<>(url, day), 1L);
})
//java 7 style
JavaPairRDD<Pair, Long> result = myRDD
.values()
.mapToPair(new PairFunction<ViewRecord, Pair, Long>() {
#Override
public Tuple2<Pair, Long> call(ViewRecord record) throws Exception {
String url = record.getUrl();
Long day = record.getDay();
return new Tuple2<>(new Pair(url, day), 1L);
}
}
);

Getting friends within a specified degree with MapReduce

Do you know how can I implement this algorithm using the MapReduce paradigm?
def getFriends(self, degree):
    """Return this node followed by everything reachable within ``degree`` hops.

    The list is built depth-first by ``_getFriends``. A node that can be
    reached along several paths appears once per path.
    """
    friendList = []
    self._getFriends(degree, friendList)
    return friendList


def _getFriends(self, degree, friendList):
    """Append ``self`` to ``friendList``, then recurse into ``self.friends``.

    Recursion stops when ``degree`` reaches 0; each level uses ``degree - 1``.
    """
    friendList.append(self)
    if degree:
        for friend in self.friends:
            friend._getFriends(degree - 1, friendList)
Let's say that we have the following bi-directional friendships:
(1,2), (1,3), (1,4), (4,5), (4,6), (5,7), (5,8)
How can I, for example, get the 1st, 2nd and 3rd degree connections of user 1? The answer must be 1 -> 2, 3, 4, 5, 7, 8
Thanks
Maybe you can use Hive, which supports SQL-like queries!
As far as I understand, you want to collect all friends in the n-th circle of some person in a social graph. Most graph algorithms are recursive, and recursion is not well suited to the MapReduce way of solving tasks.
I suggest using Apache Giraph to solve this problem (it actually uses MapReduce under the hood). It's mostly asynchronous, and you write your jobs by describing the behaviour of a single node, like this:
1. Send a message from root node to all friends to get their friendlist.
2.1. Each friend sends a message with friendlist to root node.
2.2. Each friend sends a message to all its sub-friends to get their friendlist.
3.1. Each sub-friend sends a message with friendlist to root node.
3.2. Each sub-friend sends a message to all its sub-sub-friends to get their friendlist.
...
N. Root node collects all these messages and merges them in a single list.
You can also use a cascade of MapReduce jobs to collect the circles, but it's not a very efficient way to solve the task:
Export root user friends to a file circle-001
Use circle-001 as an input to a job that exports each user friends from circle-001 to a circle-002
Do the same, but use circle-002 as an input
...
Repeat N times
The first approach is more suitable if you have a lot of users whose circles you need to calculate. The second has the huge overhead of starting multiple MR jobs, but it's much simpler and is OK for a small input set of users.
I am a novice in this field, but here are my thoughts on that.
You could use a conventional BFS algorithm following the below pseudo code.
At each iteration you launch an Hadoop job that discovers all the child nodes of the current working set that were not yet visited.
// Pseudocode: returns every node visited after expanding curNodes depth more levels.
BFS (list curNodes, list visited, int depth){
if (depth <= 0){
return visited;
}
//run Hadoop job on the current working set curNodes restricted by visited
//the job will populate some result list with the list of child nodes of the current working set
//then,
visited.addAll(result);
// the newly discovered nodes become the next working set
curNodes.clear();
curNodes.addAll(result);
// propagate the final visited list back to the caller
return BFS(curNodes, visited, depth-1);
}
The mapper and reducer of this job will look as below.
In this example I just used static members to hold the working set, visited and result sets.
It should have been implemented using a temp file. Probably there are ways to optimize the persistence of the temporary data accumulated from one iteration to the next.
The input file I used for the job contains a list of tuples, one tuple per line, e.g.
1,2
2,3
5,4
...
...
public static class VertexMapper extends
Mapper<Object, Text, IntWritable, IntWritable> {
private static Set<IntWritable> curVertex = null;
private static IntWritable curLevel = null;
private static Set<IntWritable> visited = null;
private IntWritable key = new IntWritable();
private IntWritable value = new IntWritable();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString(), ",");
if (itr.countTokens() == 2) {
String keyStr = itr.nextToken();
String valueStr = itr.nextToken();
try {
this.key.set(Integer.parseInt(keyStr));
this.value.set(Integer.parseInt(valueStr));
if (VertexMapper.curVertex.contains(this.key)
&& !VertexMapper.visited.contains(this.value)
&& !key.equals(value)) {
context.write(VertexMapper.curLevel, this.value);
}
} catch (NumberFormatException e) {
System.err.println("Found key,value <" + keyStr + "," + valueStr
+ "> which cannot be parsed as int");
}
} else {
System.err.println("Found malformed line: " + value.toString());
}
}
}
/**
 * Collects every received vertex id into the static {@code result} set.
 * Nothing is written to the job output.
 */
public static class UniqueReducer extends
        Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    private static Set<IntWritable> result = new HashSet<IntWritable>();

    public void reduce(IntWritable key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {
        for (IntWritable vertex : values) {
            // Store a copy of the id, not the iterated object itself.
            int vertexId = vertex.get();
            IntWritable copy = new IntWritable(vertexId);
            UniqueReducer.result.add(copy);
        }
        // context.write(key, key);
    }
}
Running a job will be something like that
// Reset the shared result set, then seed one BFS iteration: level 1, with
// vertex 1 as the only member of both the working set and the visited set.
UniqueReducer.result.clear();
VertexMapper.curLevel = new IntWritable(1);
VertexMapper.curVertex = new HashSet<IntWritable>(1);
VertexMapper.curVertex.add(new IntWritable(1));
VertexMapper.visited = new HashSet<IntWritable>(1);
VertexMapper.visited.add(new IntWritable(1));
// Configure the job; UniqueReducer is registered as both combiner and reducer.
Configuration conf = getConf();
Job job = new Job(conf, "BFS");
job.setJarByClass(BFSExample.class);
job.setMapperClass(VertexMapper.class);
job.setCombinerClass(UniqueReducer.class);
job.setReducerClass(UniqueReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
// No output files are configured; the job uses NullOutputFormat.
job.setOutputFormatClass(NullOutputFormat.class);
boolean result = job.waitForCompletion(true);
// Launch the tool via ToolRunner (presumably from main(), a different scope
// than the statements above - TODO confirm).
BFSExample bfs = new BFSExample();
ToolRunner.run(new Configuration(), bfs, args);

Any suggestions for reading two different dataset into Hadoop at the same time?

Dear hadooper:
I'm new to Hadoop and have recently been trying to implement an algorithm.
This algorithm needs to calculate a matrix which represents the rating difference between every pair of songs. I have already done this, and the output is a 600000*600000 sparse matrix which I stored in HDFS. Let's call this dataset A (size = 160 GB).
Now I need to read the users' profiles to predict their rating for a specific song. So I need to read the users' profiles first (5 GB in size), let's call this dataset B, and then do the calculation using dataset A.
But I don't know how to read the two datasets from a single Hadoop program. Or can I read dataset B into RAM and then do the calculation? (I guess I can't, because HDFS is a distributed system, and I can't read dataset B into a single machine's memory.)
Any suggestions?
You can use two map functions; each one can process one dataset if you want to apply different processing. You need to register each mapper with your job conf. For example:
/**
 * Mapper for the person/book input of the join.
 *
 * <p>Each line is expected to be {@code person_name,book_title}. It is emitted
 * as {@code (book_title, "person_book#" + person_name)}. A line with fewer than
 * two fields is emitted under the empty title with the placeholder name
 * {@code "student name missing"}.
 */
public static class FullOuterJoinStdDetMapper extends MapReduceBase implements Mapper <LongWritable ,Text ,Text, Text>
{
    private static final String FILE_TAG = "person_book#";
    private static final String MISSING_NAME = "student name missing";

    public void map(LongWritable key, Text values, OutputCollector<Text,Text> output, Reporter reporter)
    throws IOException
    {
        String[] personDetail = values.toString().split(",");
        // Per-record locals: the former instance fields kept the previous record's
        // title (or null on the first record) whenever a line was malformed.
        String personName;
        String bookTitle;
        if (personDetail.length >= 2)
        {
            personName = personDetail[0].trim();
            bookTitle = personDetail[1].trim();
        }
        else
        {
            personName = MISSING_NAME;
            bookTitle = "";
        }
        output.collect(new Text(bookTitle), new Text(FILE_TAG + personName));
    }
}
public static class FullOuterJoinResultDetMapper extends MapReduceBase implements Mapper <LongWritable ,Text ,Text, Text>
{
private String author_name, book_title,file_tag="auth_book#";
private String emit_value = new String();
// emit_value = "";
public void map(LongWritable key, Text values, OutputCollectoroutput, Reporter reporter)
throws IOException
{
String line = values.toString();
try
{
String[] author_detail = line.split(",");
author_name = author_detail[1].trim();
book_title = author_detail[0].trim();
}
catch (ArrayIndexOutOfBoundsException e)
{
author_name = "Not Appeared in Exam";
}
emit_value = file_tag + author_name;
output.collect(new Text(book_title), new Text(emit_value));
}
}
/**
 * Runs the join job.
 *
 * <p>After Hadoop's generic options are stripped, exactly three arguments are
 * expected: the person/book input, the book/author input and the output path.
 * Otherwise a message is printed and the JVM exits with status -1.
 */
public static void main(String args[])
throws Exception
{
    Configuration conf = new Configuration();
    // Parse generic options first so that they do not count as job arguments.
    String[] argum = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (argum.length != 3)
    {
        System.out.println("Input outpur file missing");
        System.exit(-1);
    }
    conf.set("mapred.textoutputformat.separator", ",");
    // Build the job from conf; a bare JobConf would drop the separator setting
    // and every generic option parsed above.
    JobConf mrjob = new JobConf(conf);
    mrjob.setJobName("Inner_Join");
    mrjob.setJarByClass(FullOuterJoin.class);
    MultipleInputs.addInputPath(mrjob, new Path(argum[0]), TextInputFormat.class, FullOuterJoinStdDetMapper.class);
    MultipleInputs.addInputPath(mrjob, new Path(argum[1]), TextInputFormat.class, FullOuterJoinResultDetMapper.class);
    // Use the parsed arguments for the output too, consistent with the inputs.
    FileOutputFormat.setOutputPath(mrjob, new Path(argum[2]));
    mrjob.setReducerClass(FullOuterJoinReducer.class);
    mrjob.setOutputKeyClass(Text.class);
    mrjob.setOutputValueClass(Text.class);
    JobClient.runJob(mrjob);
}
Hadoop allows you to use different map input formats for different folders. So you can read from several data sources and then cast to the specific type in the map function, i.e. in one case you get (String, User), in the other (String, SongSongRating), and your map signature is (String, Object).
The second step is to select a recommendation algorithm and join the data in such a way that the aggregator has the minimum information it needs to calculate a recommendation.

Resources