Finding the mean using Pig or Hadoop

I have a huge text file of the form below; the data is saved in a directory as data/data1.txt, data2.txt, and so on:
merchant_id, user_id, amount
1234, 9123, 299.2
1233, 9199, 203.2
1234, 0124, 230
and so on..
What I want to do is, for each merchant, find the average amount. In the end I want to save the output to a file, something like:
merchant_id, average_amount
1234, avg_amt_1234
and so on.
How do I calculate the standard deviation as well?
Sorry for asking such a basic question. :(
Any help would be appreciated. :)

Apache Pig is well suited for such tasks. See the example:
inpt = load '~/pig_data/pig_fun/input/group.txt' as (amnt:double, id:chararray, c2:chararray);
grp = group inpt by id;
mean = foreach grp {
    sum = SUM(inpt.amnt);
    count = COUNT(inpt);
    generate group as id, sum/count as mean, sum as sum, count as count;
};
Pay special attention to the data type of the amnt column, as it determines which implementation of the SUM function Pig will invoke.
Pig can also do something that SQL cannot: it can put the mean against each input row without using any inner joins. That is useful if you are calculating z-scores using the standard deviation.
mean = foreach grp {
    sum = SUM(inpt.amnt);
    count = COUNT(inpt);
    generate FLATTEN(inpt), sum/count as mean, sum as sum, count as count;
};
FLATTEN(inpt) does the trick; now you have access to the original amounts that contributed to the group's average, sum, and count.
UPDATE 1:
Calculating variance and standard deviation:
inpt = load '~/pig_data/pig_fun/input/group.txt' as (amnt:double, id:chararray, c2:chararray);
grp = group inpt by id;
mean = foreach grp {
    sum = SUM(inpt.amnt);
    count = COUNT(inpt);
    generate flatten(inpt), sum/count as avg, count as count;
};
tmp = foreach mean {
    dif = (amnt - avg) * (amnt - avg);
    generate *, dif as dif;
};
grp = group tmp by id;
standard_tmp = foreach grp generate flatten(tmp), SUM(tmp.dif) as sqr_sum;
standard = foreach standard_tmp generate *, sqr_sum / count as variance, SQRT(sqr_sum / count) as standard;
This uses 2 jobs. I have not figured out how to do it in one; I need to spend more time on it.

So what do you want: the running Java code or the abstract map-reduce process? For the second:
The map step:
record -> (merchant_id as key, amount as value)
The reduce step:
(merchant_id, amount) -> (merchant_id, aggregate the value you want)
In the reduce step you are provided with a stream of records that share the same key, so you can compute almost any aggregate you want, including the average and the variance.
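And for the first, here is a minimal Hadoop MapReduce sketch of the average calculation, assuming comma-separated lines of merchant_id, user_id, amount; the class names and the header/bad-record handling are illustrative, not from the question:

import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class MerchantAverage {

    // map: record -> (merchant_id, amount)
    public static class AvgMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context ctx)
                throws IOException, InterruptedException {
            String[] parts = value.toString().split(",");
            if (parts.length != 3) {
                return; // skip malformed lines
            }
            try {
                double amount = Double.parseDouble(parts[2].trim());
                ctx.write(new Text(parts[0].trim()), new DoubleWritable(amount));
            } catch (NumberFormatException e) {
                // skip the header row or bad records
            }
        }
    }

    // reduce: (merchant_id, [amounts]) -> (merchant_id, average amount)
    public static class AvgReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        @Override
        protected void reduce(Text merchant, Iterable<DoubleWritable> amounts, Context ctx)
                throws IOException, InterruptedException {
            double sum = 0.0;
            long count = 0;
            for (DoubleWritable a : amounts) {
                sum += a.get();
                count++;
            }
            ctx.write(merchant, new DoubleWritable(sum / count));
        }
    }
}

The same reducer could also accumulate a sum of squares alongside sum and count if you want the variance and standard deviation in the same pass.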

You can calculate the standard deviation in just one pass, using the formula var(X) = E(X^2) - (E(X))^2:
inpt = load '~/pig_data/pig_fun/input/group.txt' as (amnt:double, id:chararray, c2:chararray);
grp = group inpt by id;
mean = foreach grp {
    sq = foreach inpt generate amnt * amnt as amnt2;
    sum = SUM(inpt.amnt);
    sum2 = SUM(sq.amnt2);
    count = COUNT(inpt);
    generate flatten(inpt), sum/count as avg, count as count,
             sum2/count - (sum/count) * (sum/count) as variance,
             SQRT(sum2/count - (sum/count) * (sum/count)) as std;
};
that's it!

I calculated all the stats (min, max, mean and standard deviation) in just one pass. FILTER_DATA contains the data set.
GROUP_SYMBOL_YEAR = GROUP FILTER_DATA BY (SYMBOL, SUBSTRING(TIMESTAMP,0,4));
STATS_ALL = FOREACH GROUP_SYMBOL_YEAR {
    MINIMUM = MIN(FILTER_DATA.CLOSE);
    MAXIMUM = MAX(FILTER_DATA.CLOSE);
    MEAN = AVG(FILTER_DATA.CLOSE);
    CNT = COUNT(FILTER_DATA.CLOSE);
    CSQ = FOREACH FILTER_DATA GENERATE CLOSE * CLOSE AS (CC:DOUBLE);
    GENERATE group.$0 AS (SYMBOL:CHARARRAY), MINIMUM AS (MIN:DOUBLE), MAXIMUM AS (MAX:DOUBLE),
             ROUND_TO(MEAN,6) AS (MEAN:DOUBLE),
             ROUND_TO(SQRT(SUM(CSQ.CC) / (CNT * 1.0) - (MEAN * MEAN)),6) AS (STDDEV:DOUBLE),
             group.$1 AS (YEAR:INT);
};

Related

Maximum concurrent count record in Elasticsearch?

How do I find the maximum concurrent connected client count, based on the networkId field, over 1 hour in Elasticsearch?
I have tried this query:
GET /sample_index/_search {"size":0,"query":{"bool":{"range":{"#timestamp":{"gte":"now-1h","lt":"now"}}}},"aggregations":{"maxconnectclient":{"terms":{"field":"networkId.keyword","size":10},"aggregations":{"wlan0clintCount":{"sum":{"field":"wlan0.clients"}},"wlan1clintCount":{"sum":{"field":"wlan1.clients"}},"totalClientCount":{"bucket_script":{"buckets_path":{"wlan0clintCount":"wlan0clintCount","wlan1clintCount":"wlan1clintCount"},"script":{"source":"double sum = 0.0; sum = params.wlan0clintCount + params.wlan1clintCount+p; return (sum);","lang":"painless"}}}}}}}
but it only found the overall max record for the 1-hour window.
Or should I use this query:
GET sample_index/_search
{"size":0,"query":{"bool":{"filter":[{"range":{"#timestamp":{"gte":"now-1h","lt":"now"}}}]}},"aggregations":{"maxconnectclient":{"terms":{"field":"networkId.keyword","size":10,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]},"aggregations":{"wlan0clintCount":{"sum":{"field":"wlan0.clients"}},"wlan1clintCount":{"sum":{"field":"wlan1.clients"}},"wlan2clintCount":{"sum":{"field":"wlan2.clients"}},"wlan2_6clintCount":{"sum":{"field":"wlan2_6.clients"}},"totalClientCount":{"bucket_script":{"buckets_path":{"wlan0clintCount":"wlan0clintCount","wlan1clintCount":"wlan1clintCount","wlan2clintCount":"wlan2clintCount","wlan2_6clintCount":"wlan2_6clintCount"},"script":{"source":"double sum = 0.0; sum = params.wlan0clintCount + params.wlan1clintCount+params.wlan2clintCount+params.wlan2_6clintCount; return (sum);","lang":"painless"},"gap_policy":"skip"}},"sum_bucket_sort":{"bucket_sort":{"sort":[{"totalClientCount":{"order":"desc"}}],"from":0,"gap_policy":"SKIP"}}}}}}

Non-associative RDom parallelization in Halide

I am trying to write a decoder for the GPU. My encoding scheme has data dependencies between lines, so when decoding columns of data, each column depends on the previous one. I want to parallelize the internal computation of each column but execute the columns one by one, sequentially; however, I am having trouble getting this right.
Below I have modeled a toy example to show the problem:
Func f;
Var x,y;
RDom r(1,3,1,3); // goes from (1,1) to (3,3)
f(x,y) = 0;
f(0,y) = y;
Expr p_1 = f(r.x-1,r.y);
Expr p_2 = f(r.x-1,r.y-1);
f(r.x,r.y) = p_1 + p_2;
Buffer<int32_t> output_2D = f.realize({4,4});
A visualization of this program can be seen here: Serial Computation Visualisation
This reduction should give the following array:
int expected_output[4][4] = {{0,0,0,0},
{1,1,1,1},
{2,3,4,5},
{3,5,8,12}};
Checking with Catch2, I can see that it actually calculates this correctly:
for(int j = 0; j < output_2D.height(); j++){
    for(int i = 0; i < output_2D.width(); i++){
        CAPTURE(i,j);
        REQUIRE(expected_output[j][i]==output_2D(i,j));
    }
}
My task is to speed this computation up. Since column one depends on column zero, I have to calculate each column in series. I can, however, calculate all the values within a column in parallel. Please see Computation Steps Parallel and Desired Pipeline to see how I want Halide to compute the pipeline.
I tried doing this in Halide using f.update(1).allow_race_conditions().parallel(r.y), and it does almost what I want.
f(r.x,r.y) = p_1 + p_2;
f.update(1).allow_race_conditions().parallel(r.y);
f.trace_stores();
Buffer<int32_t> output_2D = f.realize({4,4});
For some reason, however, it seems that parallel(r.y) executes the columns in a seemingly random order.
It yields the following store trace:
Init Image:
Store f29.0(0, 0) = 0
Store f29.0(1, 0) = 0
....
Store f29.0(3, 3) = 0
Init first row:
Store f29.0(0, 0) = 0
Store f29.0(1, 0) = 1
Store f29.0(2, 0) = 2
Store f29.0(3, 0) = 3
Start Parallel Computation:
Store f29.0(1, 1) = 1 // First parallel column
Store f29.0(2, 1) = 1
Store f29.0(3, 1) = 1
Store f29.0(1, 3) = 5 // Second parallel column: THIS IS MY PROBLEM
Store f29.0(2, 3) = 5 // This should be column 2 not column 3.
Store f29.0(3, 3) = 5
Store f29.0(1, 2) = 3
Store f29.0(2, 2) = 4
Store f29.0(3, 2) = 5
A visualization of this pattern can be seen here in this figure: Current Pipeline.
I know that I am explicitly enabling race conditions, so I must be doing something wrong, but I don't know the right way to do this, and this is the closest I have got. I could vectorize() with respect to y, and that gives the correct evaluation, but I want to use parallel() to gain a greater speedup for larger matrices/images. rfactor() might be a solution, since my problem should be associative in the y direction, but it might not work because it is non-associative in the x direction (each column depends on the previous one). Does anyone know how to be serial in x and parallel in y when using RDoms?

Rx LINQ: partition the input stream

I have an input stream where each element consists of Date, Depth and Area.
I want to plot the Area against the Depth, and I therefore want to take out a window of Depth, e.g. between 1.0 and 100.0 m.
The problem is that I want to down-sample the input stream, since there can be many inputs with close Depth values.
I want to partition the input into x bins; e.g. 2 bins means all depth values between 1 and 50 are averaged into the first bin and those between 51 and 100 into the second.
I was thinking something like this:
var q = from e in input
        where (e.Depth > 1) && (e.Depth <= 100)
        // here I need some way of partitioning the sequence into bins
        // and averaging the elements.
Split a collection into `n` parts with LINQ? wants to do something similar, but without Rx.
Modified answer as per your comment. steps = number of buckets.
int min = 1, max = 100;
int steps = 10;
var f = (max - min + 1) / steps; // The extra 1 is really an epsilon. #hack
var q = from e in input
        where e.Depth > 1 && e.Depth <= 100
        let x = e.Depth - min
        group e by x < max ? (x - (x % f)) : (max - min) - ((max - min) % f);
This is the function we're grouping by for the given e.Depth.
This probably won't work so great with floating point values (due to precision), unless you floor/ceil the selection, but then you might run out of integers, so you may need to scale a bit... something like group e by Math.Floor((x - (x % f)) * scaleFactor).
This should do what you want:
static int GetBucket(double value, double min, double max, int bucketCount)
{
    return (int)((value - min) / (max - min) * bucketCount + 0.5);
}
var grouped = input.GroupBy(e => GetBucket(e.Depth, 1, 100, 50));
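Setting Rx and C# aside, the bucketing plus per-bucket averaging itself can be sketched as follows (a minimal Java sketch with illustrative names; equal-width bins over (min, max]):

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class DepthBinning {
    // A sample of the stream: only depth and area matter for the plot.
    record Sample(double depth, double area) {}

    // Bins samples into bucketCount equal-width depth bins over (min, max]
    // and averages the area within each bin.
    static Map<Integer, Double> averageByBucket(List<Sample> samples,
                                                double min, double max, int bucketCount) {
        double[] sums = new double[bucketCount];
        int[] counts = new int[bucketCount];
        for (Sample s : samples) {
            if (s.depth() <= min || s.depth() > max) continue; // outside the window
            int bucket = (int) ((s.depth() - min) / (max - min) * bucketCount);
            if (bucket == bucketCount) bucket--; // clamp the upper edge into the last bin
            sums[bucket] += s.area();
            counts[bucket]++;
        }
        Map<Integer, Double> averages = new TreeMap<>();
        for (int b = 0; b < bucketCount; b++) {
            if (counts[b] > 0) averages.put(b, sums[b] / counts[b]);
        }
        return averages;
    }
}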

Flooding Bayesian rating creates values out of range

I'm trying to apply the Bayesian rating formula, but if I submit hundreds of thousands of 1-out-of-5 ratings, the final rating comes out greater than 5.
For example, a given item has no votes, and after voting 170,000 times with 1 star its final rating is 5.23. If I rate it 100 times, it has a normal value.
Here is what I have in PHP:
<?php
// these values came from DB
$total_votes = 2936; // total of votes for all items
$total_rating = 582.955; // sum of all ratings
$total_items = 202;
// now the specific item, it has no votes yet
$this_num_votes = 0;
$this_score = 0;
$this_rating = 0;
// simulating a lot of votes with 1 star
for ($i=0; $i < 170000; $i++) {
    $rating_sent = 1; // the new rating, always 1
    $total_votes++; // adding 1 to total
    $total_rating = $total_rating+$rating_sent; // adding 1 to total
    $avg_num_votes = ($total_votes/$total_items); // Average number of votes in all items
    $avg_rating = ($total_rating/$total_items); // Average rating for all items
    $this_num_votes = $this_num_votes+1; // Number of votes for this item
    $this_score = $this_score+$rating_sent; // Sum of all votes for this item
    $this_rating = $this_score/$this_num_votes; // Rating for this item
    $bayesian_rating = ( ($avg_num_votes * $avg_rating) + ($this_num_votes * $this_rating) ) / ($avg_num_votes + $this_num_votes);
}
echo $bayesian_rating;
?>
Even if I flood with 1 or 2:
$rating_sent = rand(1,2)
The final rating after 100,000 votes is over 5.
I just did a new test using
$rating_sent = rand(1,5)
And after 100,000 votes I got a value completely out of range (10.53). I know that in a normal situation no item will get 170,000 votes while all the other items get none, but I wonder if there is something wrong with my code or if this is expected behaviour of the Bayesian formula given the massive number of votes.
Edit
Just to make it clear, here is a better explanation for some variables.
$avg_num_votes // SUM(votes given to all items)/COUNT(all items)
$avg_rating // SUM(rating of all items)/COUNT(all items)
$this_num_votes // COUNT(votes given for this item)
$this_score // SUM(rating for this item)
$bayesian_rating // is the formula itself
The formula is: ( (avg_num_votes * avg_rating) + (this_num_votes * this_rating) ) / (avg_num_votes + this_num_votes). Taken from here
You need to divide by total_votes rather than total_items when calculating avg_rating.
I made the changes and got something that behaves much better here.
http://codepad.org/gSdrUhZ2
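For reference, a minimal sketch of the corrected calculation (written in Java rather than PHP, with illustrative names; the only change to the logic is that avg_rating is the sum of all ratings divided by the total number of votes):

public class BayesianRating {
    public static void main(String[] args) {
        // seed values from the question's database
        double totalVotes = 2936;     // total votes across all items
        double totalRating = 582.955; // sum of all ratings across all items
        double totalItems = 202;

        // the specific item, with no votes yet
        double thisNumVotes = 0;
        double thisScore = 0;

        double bayesian = 0;
        for (int i = 0; i < 170000; i++) {
            double ratingSent = 1; // always vote 1 star
            totalVotes++;
            totalRating += ratingSent;
            thisNumVotes++;
            thisScore += ratingSent;

            double avgNumVotes = totalVotes / totalItems; // average votes per item
            double avgRating = totalRating / totalVotes;  // average rating per vote (the fix)
            double thisRating = thisScore / thisNumVotes;

            bayesian = (avgNumVotes * avgRating + thisNumVotes * thisRating)
                     / (avgNumVotes + thisNumVotes);
        }
        System.out.println(bayesian); // stays in range, close to 1 for this flood of 1-star votes
    }
}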

Algorithm to order 'tag line' campaigns based on resulting sales

I want to be able to introduce new 'tag lines' into a database that are shown 'randomly' to users. (These tag lines are shown as an introduction as animated text.)
Based upon the number of sales that result from those taglines I'd like the good ones to trickle to the top, but still show the others less frequently.
I could come up with a basic algorithm quite easily, but I want something that's a little more 'statistically accurate'.
I don't really know where to start. It's been a while since I've done anything more than basic statistics. My model would need to be sensitive to tolerances, but obviously it doesn't need to be worthy of a PhD.
Edit: I am currently tracking a 'conversion rate' - i.e. hits per order. This value would probably be best calculated as a cumulative 'all time' conversion rate to be fed into the algorithm.
Looking at your problem, I would modify the requirements a bit:
1) The most popular one should be shown most often.
2) Taglines should "age", so one that got a lot of votes (purchases) in the past but none recently should be shown less often.
3) Brand new taglines should be shown more often during their first days.
If you agree on those, then an algorithm could be something like:
START:
    x = random(1, 3);
    if x == 3 goto NEW else goto NORMAL
NEW:
    TagVec = Taglines.filterYounger(5 days); // I'm taking a LOT of liberties with the pseudo code...
    x = random(1, TagVec.Length);
    return TagVec[x-1]; // 0-indexed vectors, even in a made-up language.
NORMAL:
    // Similar to EBGREEN above
    sum = 0;
    ForEach(TagLine in TagLines) {
        sum += TagLine.noOfPurchases;
    }
    x = random(1, sum);
    ForEach(TagLine in TagLines) {
        x -= TagLine.noOfPurchases;
        if (x <= 0) return TagLine; // find the TagLine that our random number landed on
    }
Now, as a setup, I would give every new tagline 10 purchases to avoid getting a really big slant from one single purchase.
For the aging process, I would count a purchase older than a week as 0.8 purchases per week of age. So 1 week old gives 0.8 points, 2 weeks gives 0.8*0.8 = 0.64, and so forth...
You would have to play around with the initial purchases parameter (10 in my example), the aging speed (1 week here) and the aging factor (0.8 here) to find something that suits you.
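A rough sketch of that aged weight in Java (the names, the seed of 10, the one-week bucket and the 0.8 factor simply mirror the parameters above):

import java.time.Duration;
import java.time.Instant;
import java.util.List;

public class TaglineWeight {
    static final double SEED_PURCHASES = 10.0; // every new tagline starts with 10 virtual purchases
    static final double AGING_FACTOR = 0.8;    // each full week of age discounts a purchase by 0.8

    static double weight(List<Instant> purchaseTimes, Instant now) {
        double w = SEED_PURCHASES;
        for (Instant t : purchaseTimes) {
            long weeksOld = Duration.between(t, now).toDays() / 7;
            w += Math.pow(AGING_FACTOR, weeksOld); // 1.0 if under a week old, 0.8 at 1 week, 0.64 at 2 weeks...
        }
        return w;
    }
}

The resulting weight can be fed straight into the NORMAL branch above in place of noOfPurchases.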
I would suggest randomly choosing with a weighting factor based on previous sales. So let's say you had this:
tag1 = 1 sale
tag2 = 0 sales
tag3 = 1 sale
tag4 = 2 sales
tag5 = 3 sales
A simple weighting formula would be 1 + number of sales, so this would be the probability of selecting each tag:
tag1 = 2/12 = 16.7%
tag2 = 1/12 = 8.3%
tag3 = 2/12 = 16.6%
tag4 = 3/12 = 25%
tag5 = 4/12 = 33.3%
You could easily change the weighting formula to get just the distribution that you want.
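A minimal sketch of that weighted pick in Java (illustrative names; the weight is 1 + sales, exactly as in the table above):

import java.util.List;
import java.util.Random;

public class WeightedPicker {
    record Tagline(String text, int sales) {}

    // Picks a tagline at random; each tagline's chance is proportional to 1 + its sales.
    static Tagline pick(List<Tagline> taglines, Random rng) {
        int totalWeight = 0;
        for (Tagline t : taglines) {
            totalWeight += 1 + t.sales();
        }
        int x = rng.nextInt(totalWeight); // 0 .. totalWeight-1
        for (Tagline t : taglines) {
            x -= 1 + t.sales();
            if (x < 0) return t; // the random number landed in this tagline's slice
        }
        return taglines.get(taglines.size() - 1); // not reached when totalWeight > 0
    }
}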
You have to come up with a weighting formula based on sales.
I don't think there's any such thing as a "statistically accurate" formula here - it's all based on your preference.
No one can say "this is the correct weighting and the other weighting is wrong" because there isn't a final outcome you are attempting to model - this isn't like trying to weigh responses to a poll about an upcoming election (where you are trying to model results to represent something that will happen in the future).
Here's an example in JavaScript. Not that I'm suggesting running this client side...
Also there is a lot of optimization that can be done.
Note: createMemberInNormalDistribution() is implemented in Converting a Uniform Distribution to a Normal Distribution.
/*
* an example set of taglines
* hits are sales
* views are times its been shown
*/
var taglines = [
{"tag":"tagline 1","hits":1,"views":234},
{"tag":"tagline 2","hits":5,"views":566},
{"tag":"tagline 3","hits":3,"views":421},
{"tag":"tagline 4","hits":1,"views":120},
{"tag":"tagline 5","hits":7,"views":200}
];
/*set up our stat model for the tags*/
var TagModel = function(set){
    var hits, views, sumOfDiff, sumOfSqDiff;
    hits = views = sumOfDiff = sumOfSqDiff = 0;
    /*find the average*/
    for (var n in set){
        hits += set[n].hits;
        views += set[n].views;
    }
    this.avg = hits/views;
    /*find the standard deviation and variance*/
    for (var n in set){
        var diff = ((set[n].hits/set[n].views) - this.avg);
        sumOfDiff += diff;
        sumOfSqDiff += diff*diff;
    }
    this.variance = sumOfSqDiff/set.length;
    this.std_dev = Math.sqrt(sumOfSqDiff/set.length);
    /*return the tag to use; fChooser determines the likelihood of each tag*/
    this.getTag = function(fChooser){
        var m = this;
        set.sort(function(a,b){
            return fChooser((a.hits/a.views),(b.hits/b.views), m);
        });
        return set[0];
    };
};
var config = {
    "uniformDistribution":function(a,b,model){
        return Math.random()*b - Math.random()*a;
    },
    "normalDistribution":function(a,b,model){
        var a1 = createMemberInNormalDistribution(model.avg,model.std_dev)*a;
        var b1 = createMemberInNormalDistribution(model.avg,model.std_dev)*b;
        return b1-a1;
    },
    //say weight = 10^n... higher n is the more even the distribution will be.
    "weight": .5,
    "weightedDistribution":function(a,b,model){
        var a1 = createMemberInNormalDistribution(model.avg,model.std_dev*config.weight)*a;
        var b1 = createMemberInNormalDistribution(model.avg,model.std_dev*config.weight)*b;
        return b1-a1;
    }
};
var model = new TagModel(taglines);
//to use
model.getTag(config.uniformDistribution).tag;
//running 10000 times: ({'tagline 4':836, 'tagline 5':7608, 'tagline 1':100, 'tagline 2':924, 'tagline 3':532})
model.getTag(config.normalDistribution).tag;
//running 10000 times: ({'tagline 4':1775, 'tagline 5':3471, 'tagline 1':1273, 'tagline 2':1857, 'tagline 3':1624})
model.getTag(config.weightedDistribution).tag;
//running 10000 times: ({'tagline 4':1514, 'tagline 5':5045, 'tagline 1':577, 'tagline 2':1627, 'tagline 3':1237})
config.weight = 2;
model.getTag(config.weightedDistribution).tag;
//running 10000 times: {'tagline 4':1941, 'tagline 5':2715, 'tagline 1':1559, 'tagline 2':1957, 'tagline 3':1828})
