Crossfilter query - d3.js

Is it possible to filter a crossfilter dataset which has an array as the value?
For example, say I have the following dataset:
var data = [
{
bookname: "the joy of clojure",
authors: ["Michael Fogus", "Chris Houser"],
tags: ["clojure", "lisp"]
},
{
bookname: "Eloquent Ruby",
authors: ["Russ Olsen"],
tags: ["ruby"]
},
{
bookname: "Design Patterns in Ruby",
authors: ["Russ Olsen"],
tags: ["design patterns", "ruby"]
}
];
Is there an easy way to access the books which are tagged by an particular tag? And also the books which have a particular author? The way I understand how to use crossfilter so far has me doing something like this:
var filtered_data = crossfilter(data);
var tags = filtered_data.dimension(function(d) {return d.tags});
var tag = tags.group();
And then when I access the grouping (like so):
tag.all()
I get this:
[{key: ["clojure", "lisp"], value: 1},
{key: ["design patterns", "ruby"], value: 1},
{key: ["ruby"], value: 1}]
When I would rather have this:
[{key: "ruby", value: 2},
{key: "clojure", value: 1},
{key: "lisp", value: 1},
{key: "design patterns", value: 1}]

I've added comments to the code below. Big picture: use reduce function.
var data = ...
var filtered_data = crossfilter(data);
var tags = filtered_data.dimension(function(d) {return d.tags});
tags.groupAll().reduce(reduceAdd, reduceRemove, reduceInitial).value()
Notice how I've used groupAll() instead of group() b/c we want our reduce functions (defined below) to operate on one group rather than 3 groups.
Now the reduce functions should look like this:
/*
v is the row in the dataset
p is {} for the first execution (passed from reduceInitial).
For every subsequent execution it is the value returned from reduceAdd of the prev row
*/
function reduceAdd(p, v) {
v.tags.forEach (function(val, idx) {
p[val] = (p[val] || 0) + 1; //increment counts
});
return p;
}
function reduceRemove(p, v) {
//omitted. not useful for demonstration
}
function reduceInitial() {
/* this is how our reduce function is seeded. similar to how inject or fold
works in functional languages. this map will contain the final counts
by the time we are done reducing our entire data set.*/
return {};
}

I've never used "crossfilter" (I'm assuming this is a JS library). Here are some pure JS methods though.
This...
data.filter(function(d) {
return d.authors.indexOf("Michael Fogus") !== -1;
})
returns this:
[{bookname:"the joy of clojure", authors:["Michael Fogus", "Chris Houser"], tags:["clojure", "lisp"]}]
This...
var res = {};
data.forEach(function(d) {
d.tags.forEach(function(tag) {
res.hasOwnProperty(tag) ? res[tag]++ : res[tag] = 1
});
})
returns this:
({clojure:1, lisp:1, ruby:2, 'design patterns':1})
To either of these, you can apply d3.entries to get your {key:"ruby", value: 2} format.

Related

Reduce number of datapoints using crossfilter

Let's say I have a 100 years worth of monthly data, total of 1200 data points, see bottom.
To plot a tiny overview line chart (e.g. just 100 data points) I have to do it manually by grouping. For instance, group the data by year, then get the average of 12 months value, iterate through every group, then finally reduced the data points to 100.
Instead of this approach, is there a convenient way using crossfilter or any other library?
[
{ date: 1900-01, value: 72000000000},
{ date: 1900-02, value: 58000000000},
{ date: 1900-03, value: 2900000000},
{ date: 1900-04, value: 31000000000},
{ date: 1900-05, value: 33000000000},
...
{ date: 1999-11, value: 30000000000},
{ date: 1999-12, value: 10000000000},
]
It's going to be the same algorithm no matter which library you use, just different ways of specifying it. In this case d3.nest is probably the easiest way to do this, but if you want quick filtering, the crossfilter way isn't too bad.
The difference between using d3.nest and crossfilter is that we're not constructing an array of values, just a single value. So we'll maintain both sum and count.
We'll also need to specify what happens when a row is removed from a bin.
var parse = d3.timeParse("%Y-%m");
data.forEach(function(d) {
// it's best to convert fields before passing to crossfilter
// because crossfilter will look at them many times
d.date = parse(d.key);
});
var cf = crossfilter(data);
var yearDim = cf.dimension(d => d3.timeYear(d.date));
var yearAvgGroup = yearDim.group().reduce(
function(p, v) { // add
p.sum += v.value;
++p.count;
p.avg = p.sum/p.count;
return p;
},
function(p, v) { // remove
p.sum -= v.value;
--p.count;
p.avg = p.count ? p.sum/p.count : 0;
return p;
},
function() { // init
return {sum: 0, count: 0, avg: 0};
}
);
Now yearAvgGroup.all() will return an array of key/value pairs, where the key is the year, and the value contains sum, count, and avg.
Crossfilter doesn't make this problem particularly convenient to solve, but reductio has a helper function for this:
var yearAvgGroup = yearDim.group();
reductio().avg(d => d.value);
Note: it doesn't matter unless you have ton of data, but it's more efficient to only compute sum and count in the group, and compute the average when it's needed.
If you're using dc.js, you can use valueAccessor for this:
// remove avg lines from the above, and
chart.dimension(yearDim)
.group(yearAvgGroup)
.valueAccessor(kv => kv.value.sum / kv.value.count);
Assuming your question is only concerned with producing the data, you can use d3-nest, without crossfilter, to average each year:
Parsing the date value, you can then format the date as a year to create a key. This groups values by key, then we rollup those values with a function to calculate the mean for a given year:
var parse = d3.timeParse("%Y-%m"); // takes: "1900-01"
var format = d3.timeFormat("%Y"); // gives: "1900"
var means = d3.nest()
.key(function(d) { return format(parse(d.date)); })
.rollup(function(values) { return d3.mean(values, function(d) {return d.value; }) })
.entries(data);
Which gives us the following structure:
[
{
"key": "1900",
"value": 39380000000
},
{
"key": "1999",
"value": 20000000000
}
]
var data = [
{ date: "1900-01", value: 72000000000},
{ date: "1900-02", value: 58000000000},
{ date: "1900-03", value: 2900000000},
{ date: "1900-04", value: 31000000000},
{ date: "1900-05", value: 33000000000},
{ date: "1999-11", value: 30000000000},
{ date: "1999-12", value: 10000000000},
];
var parse = d3.timeParse("%Y-%m");
var format = d3.timeFormat("%Y");
var means = d3.nest()
.key(function(d) { return format(parse(d.date)); })
.rollup(function(values) { return d3.mean(values, function(d) {return d.value; }) })
.entries(data);
console.log(means);
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min.js"></script>

Jasmine: how to check that array contains an object?

This check used to pass:
expect(array).toContain(value)
Array:
[
{"_t":"user","id":1073970419,"email":"email3#example.org","name":"Spectator"},
{"_t":"user","id":4464992042,"email":"email4#example.org","name":"Collaborator"},
{"_t":"user","id":1978569710,"email":"email5#example.org","name":"Manage"}
]
Value:
{"_t":"user","id":1978569710,"email":"email5#example.org","name":"Manage"}
But no longer passes. Whats the new way to write the same test?
The syntax you need is:
const obj = {"_t":"user","id":1978569710,"email":"email5#example.org","name":"Manage"};
expect(array).toContain(jasmine.objectContaining(obj));
See fiddle: https://jsfiddle.net/bblackwo/4o5u5Lmo/16/
It won't contain that object (remember, two objects with the same properties are not the same object for the purposes of equality), so toContain will never pass.
You need to use another test, like toEqual or (if you only want to check for a subset of properties), toEqual combined with jasmine.objectContaining.
Here's the toEqual example from the Jasmine documentation on that page:
describe("The 'toEqual' matcher", function() {
it("works for simple literals and variables", function() {
var a = 12;
expect(a).toEqual(12);
});
it("should work for objects", function() {
var foo = {
a: 12,
b: 34
};
var bar = {
a: 12,
b: 34
};
expect(foo).toEqual(bar);
});
});
Note now foo equals bar.
Here's their example using jasmine.objectContaining:
describe("jasmine.objectContaining", function() {
var foo;
beforeEach(function() {
foo = {
a: 1,
b: 2,
bar: "baz"
};
});
it("matches objects with the expect key/value pairs", function() {
expect(foo).toEqual(jasmine.objectContaining({
bar: "baz"
}));
expect(foo).not.toEqual(jasmine.objectContaining({
c: 37
}));
});
// ...
});
Note how the object with several properties matches the partial object supplied to jasmine.objectContaining.
#T.J.Crowder already explained precisely the problem. Just to help you a tad more, if you want to adapt your example, you'd need something like this:
var userA = {"_t":"user","id":1978569710,"email":"email5#example.org","name":"Manage"}
array =
[
{"_t":"user","id":1073970419,"email":"email3#example.org","name":"Spectator"},
{"_t":"user","id":4464992042,"email":"email4#example.org","name":"Collaborator"},
userA
]
expect(array).toContain(userA);

D3.JS and mapping buckets

I'm very new to D3 and I'm having trouble understanding how to access a 'sum' field from D3. With a basic aggregation I'm able to easily map fields with the following command:
var load_data = resp.aggregations.my_summary.buckets.map(function(d) {
return {
letter: d.key,
frequency: d.doc_count
}
});
The problem occurs when a response is formatted like this:
"buckets": [
{
"1": {
"value": 5975
},
"key": "XXXXXXXXXXXX",
"doc_count": 5376
},
Really what I need is the same "key" field from the original function but the frequency to be the value of that data above.
Thank you for any help!
-John
If I understand your question correctly, you will have many anonymous objects with the same key, but different frequencies. You want to sum each unique key to total the cumulative frequencies.
You can use the Array.reduce() method to help you solve this. I'm not sure how D3 is needed here -- were you trying to use the d3.sum() method?
var b = [
{key: 'a', freq: 45454}, {key: 'a', freq: 4545},
{key: 'b', freq: 1232}, {key: 'b', freq: 4544}
];
var counts = b.reduce(function(p, d) {
if (d.key in p)
p[d.key] += d.freq;
else
p[d.key] = d.freq;
return p;
}, {});

dc.js - min value from group per day

I have the following data structure and would like to display a lineChart with the minimum value of 'amount' (group) by Day (of date) (dimension).
var data = [
{date: "2014-01-01", amount: 10},
{date: "2014-01-01", amount: 1},
{date: "2014-01-15", amount: 0},
{date: "2014-01-15", amount: 10 },
{date: "2014-02-20", amount: 100 },
{date: "2014-02-20", amount: 10 },
];
Where as I would normally be doing something along the following lines, I'm not sure how to find the min in the group.
var dateDim = facts.dimension(function (d) {return d3.time.day(d.date);});
var dateDimGroup = dateDim.group().reduceSum(function (d) { return d.amount; })
Is this possible? I can't seem to find any examples of this so any help would be really appreciated.
Thanks!
You're going to need to keep a custom grouping. The basic idea is that you maintain an array (ordered is best) of all the values in a group. In your add function, you the current value to the array and assign the first value to a 'min' property of the group. In your remove function you remove values, you remove the current value from the array and then assign the first value to the 'min' property (and check if the list is empty, in which case set it to undefined or something along those lines).
You functions will look something like this:
function add(accessor) {
var i;
var bisect = crossfilter.bisect.by(function(d) { return d; }).left;
return function (p, v) {
// Not sure if this is more efficient than sorting.
i = bisect(p.valueList, accessor(v), 0, p.valueList.length);
p.valueList.splice(i, 0, accessor(v));
p.min = p.valueList[0];
return p;
};
};
function remove(accessor) {
var i;
var bisect = crossfilter.bisect.by(function(d) { return d; }).left;
return function (p, v) {
i = bisect(p.valueList, accessor(v), 0, p.valueList.length);
// Value already exists or something has gone terribly wrong.
p.valueList.splice(i, 1);
p.min = p.valueList[0];
return p;
};
};
function initial() {
return function () {
return {
valueList: [],
min: undefined
};
};
}
'accessor' is a function to get at the value you want the minimum of, like in reduceSum. Call each of these functions to return the function you use in the corresponding place in the .group(add, remove, initial) function.
This code is pretty much ripped straight out of reductio, so you could also use reductio and do:
var dateDim = facts.dimension(function (d) {return d3.time.day(d.date);});
var dateDimGroup = reductio().min(function (d) { return d.amount; })(dateDim.group());
Your dateDimGroup will then have a 'min' property that should give you the minimum amount for every date.

crossfilter to do word frequency

I would like a crossfilter group that gives the word frequency and average rating for each word in a series of surveys so that I can make an awesome interactive word-bubble-frequency chart.
My data looks like:
[{feedback: "This is a horrible service", rating:2},
{feedback: "I love everything about everything", rating: 10},
{feedback: "love the user interface, good service", rating:6},
{feedback: "", rating: 7} ]
I would like something like:
[ {key: love, count:2, ave: 8}, {key: horrible, count:1, rating:2 }, {key: service,
count: 2, rating: 4 } ,.... ]
So far I have:
function to break up string into tokens, returns object with frequency for each word
var wordcnt = function(bah ){
var hist = {}, words = bah.split(/[\s*\.*\,\;\+?\#\|:\-\/\\\[\]\(\)\{\}$%&0-9*]/)
for( var i in words)
if(words[i].length >1 )
hist[words[i]] ? hist[words[i]]+=1 : hist[words[i]]=1;
return hist;
};
Loading data into d3 and crossfilter
d3.csv("test.csv", function( error, data) {
data.forEach(function(d) {
d.rating= +d.rating;
d.wordCount= wordcnt(d.feedback.toLowerCase());
});
var ndx = crossfilter(data);
var all = ndx.groupAll();
var frequencyDimension=ndx.dimension(function(d){return d.wordCount; });
And one butchered group reduce function!
var frequencyGroup= frequencyDimension.group().reduce(
function (p, v) {
for (var key in v.wordCount) {
if (v.frequency.hasOwnProperty(key)) {
p.frequency[key]+= v[key];
p.frequency[key].count++;
p.frequency[key].sum+= v.rating ;
}
else{
p.frequency[key]=v[key];
p.frequency[key].count=1;
p.frequency[key].sum = v.rating;
}
}
p.frequency[key].ave = p.frequency[key].sum/p.frequency[key].count ;
return p;
},
function (p, v) {
for (var key in v.wordCount) {
if (v.frequency.hasOwnProperty(key)) {
p.frequency[key]-= v[key];
p.frequency[key].count--;
p.frequency[key].sum-= v.rating ;
}
//don't need an else statement because can't remove key if it doesn't exist
}
p.frequency[key].ave = p.frequency[key].sum/p.frequency[key].count ;
return p;
},
function (p,v) {
return { frequency: {} } ;
}
)

Resources