I'm trying to retrieve the number of students participating in each combination of sport activities. I've tried using nested terms aggregations, but I would like events with the same eventid to be excluded, so that an event is not counted towards the aggregated results when the combination criteria of e.g. 100m and 100m are not met (i.e. both 100m events should have unique eventids). Would this be achievable using Elasticsearch?
Mapping:
{
"properties": {
"events": {
"type": "nested",
"include_in_parent": true
}
}
}
Student data:
{
"name": "Alice"
"events": [
{"activity": "400m", "eventid": "4000"},
{"activity": "800m", "eventid": "8000"},
{"activity": "100m", "eventid": "1000"},
{"activity": "100m", "eventid": "1001"}
]
},
{
"name": "Bob"
"events": [
{"activity": "100m", "eventid": "1000"},
{"activity": "400m", "eventid": "4000"}
]
},
{
"name": "Cat"
"events": [
{"activity": "400m", "eventid": "4000"},
{"activity": "400m", "eventid": "4001"}
]
},
{
"name": "Dillian"
"events": [
{"activity": "100m", "eventid": "1001"},
{"activity": "800m", "eventid": "8000"}
]
}
Query:
{
"from": 0,
"size": 0,
"aggregations": {
"activity1": {
"terms": {
"field": "events.activity.keyword",
"size": 5,
"order": {
"_term": "asc"
}
},
"aggregations": {
"activity2": {
"terms": {
"field": "events.activity.keyword",
"size": 5,
"order": {
"_term": "asc"
}
}
}
}
}
}
}
Incorrect Result:
"aggregations": {
"activity1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "100m",
"doc_count": 3,
"activity2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "100m",
"doc_count": 3
},
{
"key": "400m",
"doc_count": 2
},
{
"key": "800m",
"doc_count": 2
}
]
}
},
{
"key": "400m",
"doc_count": 3,
"activity2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "100m",
"doc_count": 2
},
{
"key": "400m",
"doc_count": 3
},
{
"key": "800m",
"doc_count": 1
}
]
}
},
{
"key": "800m",
"doc_count": 2,
"activity2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "100m",
"doc_count": 2
},
{
"key": "400m",
"doc_count": 1
},
{
"key": "800m",
"doc_count": 2
}
]
}
}
]
}
}
Required Result:
"aggregations": {
"activity1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "100m",
"doc_count": 3,
"activity2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "100m",
"doc_count": 1
},
{
"key": "400m",
"doc_count": 2
},
{
"key": "800m",
"doc_count": 2
}
]
}
},
{
"key": "400m",
"doc_count": 3,
"activity2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "100m",
"doc_count": 2
},
{
"key": "400m",
"doc_count": 1
},
{
"key": "800m",
"doc_count": 1
}
]
}
},
{
"key": "800m",
"doc_count": 2,
"activity2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "100m",
"doc_count": 2
},
{
"key": "400m",
"doc_count": 1
},
{
"key": "800m",
"doc_count": 0
}
]
}
}
]
}
}
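For what it's worth, with include_in_parent set to true the nested event fields are also indexed on the parent document as flattened multi-valued fields, so a root-level terms aggregation sees all of a student's activities at once; that is why the 100m/100m bucket counts every student with any 100m event instead of only those with two distinct eventids. One possible workaround, handling a single combination at a time rather than the full matrix, is to bucket per student, count distinct eventids for the activity, and discard students below the threshold. A minimal sketch, assuming name.keyword and events.eventid.keyword sub-fields exist (the per_student, only_100m, distinct_events and matching_students names are illustrative):
{
  "size": 0,
  "aggs": {
    "per_student": {
      "terms": { "field": "name.keyword", "size": 10000 },
      "aggs": {
        "events": {
          "nested": { "path": "events" },
          "aggs": {
            "only_100m": {
              "filter": { "term": { "events.activity.keyword": "100m" } },
              "aggs": {
                "distinct_events": { "cardinality": { "field": "events.eventid.keyword" } }
              }
            }
          }
        },
        "two_distinct_eventids": {
          "bucket_selector": {
            "buckets_path": { "n": "events>only_100m>distinct_events" },
            "script": "params.n >= 2"
          }
        }
      }
    },
    "matching_students": {
      "stats_bucket": { "buckets_path": "per_student._count" }
    }
  }
}
The count field of matching_students should then be the number of students satisfying the 100m + 100m combination (1 in the sample data, namely Alice). For a mixed pair such as 100m and 400m, two filter sub-aggregations (one per activity) with a selector requiring both counts to be at least 1 would play the same role.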
Related
I cannot figure out how to query Elasticsearch for data in multiple intervals that advance in one-day increments through the end of the month.
For instance, I want to look at 7-day intervals in the month of January: 1-7, 2-8, 3-9, 4-10, etc. But for the given query I'm getting fixed buckets instead: 1-7, 8-15, 16-23, etc.
Does anyone know if this is possible in Elasticsearch, or how to write a query that produces the results I described above for consecutive days?
Here is my attempt:
{
"size": 0,
"query": {
"bool": {,
"filter": [
{
"range": {
"associated_datetime": {
"gte": "14/12/2021 19:31:56",
"lte": "14/12/2022 19:31:56",
"format": "dd/MM/yyyy HH:mm:ss"
}
}
}
]
}
},
"aggs": {
"incident": {
"date_histogram": {
"field": "associated_datetime",
"calendar_interval": "week"
},
"aggs": {
"associated_to.id": {
"terms": {
"size": 10000,
"field": "associated_to.id"
}
}
}
}
}
}
Output for the above query looks like this (aggregation object):
"aggregations": {
"incident": {
"buckets": [
{
"key_as_string": "2022-01-03T00:00:00.000Z",
"key": 1641168000000,
"doc_count": 2,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 4,
"doc_count": 2
}
]
}
},
{
"key_as_string": "2022-01-10T00:00:00.000Z",
"key": 1641772800000,
"doc_count": 1,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-01-17T00:00:00.000Z",
"key": 1642377600000,
"doc_count": 1,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-03-07T00:00:00.000Z",
"key": 1646611200000,
"doc_count": 1,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 4,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-03-21T00:00:00.000Z",
"key": 1647820800000,
"doc_count": 7,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 37,
"doc_count": 2
},
{
"key": 38,
"doc_count": 2
},
{
"key": 39,
"doc_count": 2
},
{
"key": 40,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-05-16T00:00:00.000Z",
"key": 1652659200000,
"doc_count": 1,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 4,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-11-14T00:00:00.000Z",
"key": 1668384000000,
"doc_count": 3,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 2
},
{
"key": 37,
"doc_count": 1
},
{
"key": 38,
"doc_count": 1
},
{
"key": 39,
"doc_count": 1
},
{
"key": 40,
"doc_count": 1
},
{
"key": 41,
"doc_count": 1
},
{
"key": 42,
"doc_count": 1
}
]
}
}
]
}
}
One way to do it is with a date_range aggregation (note: the "to" date of each range is exclusive):
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"range": {
"associated_datetime": {
"gte": "14/12/2021 19:31:56",
"lte": "14/12/2022 19:31:56",
"format": "dd/MM/yyyy HH:mm:ss"
}
}
}
]
}
},
"aggs": {
"incident": {
"date_range": {
"field": "associated_datetime",
"ranges": [
{
"from": "2022-01-01",
"to": "2022-01-08"
},
{
"from": "2022-01-02",
"to": "2022-01-09"
},
{
"from": "2022-01-03",
"to": "2022-01-10"
},
...
]
},
"aggs": {
"associated_to.id": {
"terms": {
"size": 10000,
"field": "associated_to.id"
}
}
}
}
}
}
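If you end up declaring many overlapping windows, it may also help to name each range with a key so the buckets are addressable in the response; keyed and per-range key are standard date_range options (the jan_* keys below are just illustrative):
{
  "size": 0,
  "aggs": {
    "incident": {
      "date_range": {
        "field": "associated_datetime",
        "keyed": true,
        "ranges": [
          { "key": "jan_01_07", "from": "2022-01-01", "to": "2022-01-08" },
          { "key": "jan_02_08", "from": "2022-01-02", "to": "2022-01-09" }
        ]
      }
    }
  }
}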
Here is my scenario: I want to find the users who have logged in to our website continuously for 3 or more days.
For example:
{"login_time":"2018-01-01T18:19:07.982Z", "user_id":123}
{"login_time":"2018-01-01T08:30:07.982Z", "user_id":456}
{"login_time":"2018-01-02T09:39:07.982Z", "user_id":123}
{"login_time":"2018-01-03T08:20:07.982Z", "user_id":123}
{"login_time":"2018-01-03T08:20:07.982Z", "user_id":456}
So user_id 123 has logged in continuously for 3 days and user_id 456 has logged in for only 1 day; I wish I could drop user_id 456 from what Elasticsearch returns.
This is my ES JSON:
GET event-tracking/_search
{
"aggs": {
"login_by_day": {
"date_histogram": {
"field": "login_time",
"interval": "day"
},
"aggs": {
"user_id": {
"terms": {
"field": "user_id",
"size": 10
}
}
}
}
}
}
And response:
"aggregations": {
"login_by_day": {
"buckets": [
{
"key_as_string": "2018-01-01T00:00:00.000Z",
"key": 1514764800000,
"doc_count": 2,
"user_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123,
"doc_count": 1
},
{
"key": 456,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2018-01-02T00:00:00.000Z",
"key": 1514851200000,
"doc_count": 1,
"user_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2018-01-03T00:00:00.000Z",
"key": 1514937600000,
"doc_count": 2,
"user_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123,
"doc_count": 1
},
{
"key": 456,
"doc_count": 1
}
]
}
}
]
}
}
Right now I have to write code to filter the result. My question is: how can I achieve this with the ES query JSON alone, without any extra code?
Thanks in advance.
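One partial approach that avoids post-processing: bucket per user, count the distinct calendar days on which the user logged in, and drop users below 3 with a bucket_selector. A minimal sketch, assuming a recent Painless version where date field values expose toLocalDate(); note that it counts distinct days rather than strictly consecutive ones, so it only enforces "consecutive" when the query is restricted to a 3-day window:
GET event-tracking/_search
{
  "size": 0,
  "aggs": {
    "per_user": {
      "terms": { "field": "user_id", "size": 10000 },
      "aggs": {
        "distinct_days": {
          "cardinality": {
            "script": { "source": "doc['login_time'].value.toLocalDate().toString()" }
          }
        },
        "min_three_days": {
          "bucket_selector": {
            "buckets_path": { "days": "distinct_days" },
            "script": "params.days >= 3"
          }
        }
      }
    }
  }
}
On the sample data this should keep user_id 123 (3 distinct days) and drop user_id 456 (2 distinct days).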
I have a problem with nested aggregations in Elasticsearch. I have a mapping with a nested field:
"Topics":{"type":"nested","properties":{
"CategoryLev1":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
"CategoryLev2":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}} }}
After indexing these documents:
"Topics": [
{
"CategoryRelevancy": "1.0",
"CategoryLev2": "Money",
"CategoryLev1": "Sales"
},
{
"CategoryRelevancy": "2.0",
"CategoryLev2": "Money",
"CategoryLev1": "Sales"
},
{
"CategoryRelevancy": "1.0",
"CategoryLev2": "Electrical",
"CategoryLev1": "Product"
}
]
"Topics": [
{
"CategoryRelevancy": "1.0",
"CategoryLev2": "Money",
"CategoryLev1": "Sales"
},
{
"CategoryRelevancy": "2.0",
"CategoryLev2": "Methods",
"CategoryLev1": "Sales"
},
{
"CategoryRelevancy": "1.0",
"CategoryLev2": "Engine",
"CategoryLev1": "Product"
}
]
As you can see, each nested array contains Topics entries with duplicate key and value fields. Then I run the following query:
{
"size": 10,
"aggregations": {
"resellers": {
"nested": {
"path": "Topics"
},
"aggregations": {
"topicGroup": {
"terms": {
"field": "Topics.CategoryLev1.keyword",
"size": 10
},
"aggregations": {
"Subtopic": {
"terms": {
"field": "Topics.CategoryLev2.keyword"
}
}
}
}
}
}
}
}
Then I get the following result, which groups by topic category:
{
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"resellers": {
"doc_count": 6,
"topicGroup": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Sales",
"doc_count": 3,
"Subtopic": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Money",
"doc_count": 3
},
{
"key": "Method",
"doc_count": 1
}
]
}
},
{
"key": "Product",
"doc_count": 2,
"Subtopic": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Electrical",
"doc_count": 1
},
{
"key": "Engine",
"doc_count": 1
}
]
}
}
]
}
}
}
}
But I want the result to be like this:
"buckets": [
{
"key": "Sales",
"doc_count": 2,
"Subtopic": {
"buckets": [
{
"key": "Money",
"doc_count": 2
},
{
"key": "Method",
"doc_count": 1
}
]
}
},
{
"key": "Product",
"doc_count": 2,
"Subtopic": {
"buckets": [
{
"key": "Electrical",
"doc_count": 1
},
{
"key": "Engine",
"doc_count": 1
}]
}
}]
Thanks in advance :)
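One way to get per-document rather than per-nested-object counts is to add a reverse_nested sub-aggregation under each terms level; it joins back to the root documents, so duplicated nested entries inside the same document are counted only once. A sketch against the mapping above (the parent_docs name is illustrative):
{
  "size": 0,
  "aggs": {
    "resellers": {
      "nested": { "path": "Topics" },
      "aggs": {
        "topicGroup": {
          "terms": { "field": "Topics.CategoryLev1.keyword", "size": 10 },
          "aggs": {
            "parent_docs": { "reverse_nested": {} },
            "Subtopic": {
              "terms": { "field": "Topics.CategoryLev2.keyword" },
              "aggs": {
                "parent_docs": { "reverse_nested": {} }
              }
            }
          }
        }
      }
    }
  }
}
The bucket doc_count values will still count nested objects, but each parent_docs.doc_count should hold the deduplicated document count, e.g. 2 for Sales and 2 for Sales > Money.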
I have Elasticsearch 2.4 and I'm trying to do an aggregation on an analyzed string field which contains multiple tokens. The field in question is an address field called mailingAddress. For example, below are a few results from a query that looks for NY in the address field.
{
"from": 0,
"size": 100,
"sort": [
{
"_score": {
"order": "desc"
}
}
],
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"match": {
"customerprofile.mailingAddress": {
"query": "NY",
"fuzziness": 0,
"operator": "or"
}
}
},
{
"match": {
"customerprofile.companyId": {
"query": "999",
"fuzziness": 0,
"operator": "or"
}
}
}
]
}
}
]
}
}
}
returns
"hits":[
{
"_index":"wht_index_prod_v33_es24",
"_type":"customerprofile",
"_id":"2044",
"_score":2.9787974,
"_source":{
"customerId":2044,
"companyId":2007,
"fullName":"John Doe",
"email":"jon#aol.com",
"pictureURL":"john.png",
"profilePictureContentType":"image/png",
"phone":"(703) 999-8888",
"mailingAddress":"100 Lake Braddock Drive\nBurke, NY 22015",
"gender":"Male",
"emergencyContactsIds":[
],
"wantCorrespondence":false
}
},
{
"_index":"wht_index_prod_v33_es24",
"_type":"customerprofile",
"_id":"2045",
"_score":2.9787974,
"_source":{
"customerId":2045,
"companyId":2007,
"fullName":"Jane Anderson",
"email":"janea#touchva.net",
"pictureURL":"JAnderson.png",
"profilePictureContentType":"image/png",
"phone":"(434) 111-2345",
"mailingAddress":"PO Box 333, Boydton, NY 23917",
"gender":"Male",
"emergencyContactsIds":[
],
"wantCorrespondence":false
}
},
..
..
]
The question
When I do the aggregation on mailingAddress I expect to see buckets for each word in the text field. From the results above I would also expect to find a bucket key named 'NY', but there isn't one. Can anyone explain why? My guess is that it has too few entries.
The aggregation:
{
"size": 0,
"aggs": {
"group_by_age": {
"terms": {
"field": "mailingAddress"
},
"aggs": {
"group_by_gender": {
"terms": {
"field": "gender"
}
}
}
}
}
}
Aggregation results:
{
"took": 16,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 401,
"max_score": 0,
"hits": [
]
},
"aggregations": {
"group_by_age": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 1041,
"buckets": [
{
"key": "st",
"doc_count": 30,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 17
},
{
"key": "male",
"doc_count": 13
}
]
}
},
{
"key": "ca",
"doc_count": 28,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 21
},
{
"key": "male",
"doc_count": 7
}
]
}
},
{
"key": "dr",
"doc_count": 16,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 13
},
{
"key": "male",
"doc_count": 3
}
]
}
},
{
"key": "street",
"doc_count": 15,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 11
},
{
"key": "male",
"doc_count": 4
}
]
}
},
{
"key": "ave",
"doc_count": 14,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 7
},
{
"key": "male",
"doc_count": 7
}
]
}
},
{
"key": "box",
"doc_count": 11,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 9
},
{
"key": "male",
"doc_count": 2
}
]
}
},
{
"key": "fl",
"doc_count": 11,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 9
},
{
"key": "male",
"doc_count": 2
}
]
}
},
{
"key": "va",
"doc_count": 11,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "male",
"doc_count": 6
},
{
"key": "female",
"doc_count": 5
}
]
}
},
{
"key": "n",
"doc_count": 10,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 7
},
{
"key": "male",
"doc_count": 3
}
]
}
},
{
"key": "az",
"doc_count": 9,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 7
},
{
"key": "male",
"doc_count": 2
}
]
}
}
]
}
}
}
By default, the terms aggregation returns the top 10 terms, but you can decide to return more by specifying a size in your aggregation, like this:
{
"size": 0,
"aggs": {
"group_by_age": {
"terms": {
"field": "mailingAddress",
"size": 50 <---- add this
},
"aggs": {
"group_by_gender": {
"terms": {
"field": "gender"
}
}
}
}
}
}
Your mileage may vary and you might need to increase the size in order to really see NY.
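You can also check whether the token was indexed at all by restricting the terms aggregation with an include filter. Keep in mind the field is analyzed, so the standard analyzer lowercases tokens and the bucket key would be ny rather than NY:
{
  "size": 0,
  "aggs": {
    "ny_only": {
      "terms": {
        "field": "mailingAddress",
        "include": "ny"
      }
    }
  }
}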
I have JSON data in the below format:
{
"ID": { "Color": "Black", "Product": "Car" },
"ID": { "Color": "Black", "Product": "Car" },
"ID": { "Color": "Black", "Product": "Van" },
"ID": { "Color": "Black", "Product": "Van" },
"ID": { "Color": "Ash", "Product": "Bike" }
}
I want to calculate the count of each product and the corresponding color. I am using an Elasticsearch facet to do this.
My query:
$http.post('http://localhost:9200/product/productinfoinfo/_search?size=5', {
  "aggregations": { "ProductInfo": { "terms": { "field": "product" } } },
  "facets": { "ProductColor": { "terms": { "field": "Color", "size": 10 } } }
})
I am getting the output like below
"facets": { "ProductColor": { "_type": "terms", "missing": 0, "total": 7115, "other": 1448, "terms": [ { "term": "Black", "count": 4 }, { "term": "Ash","count":1} },
"aggregations": { "ProductInfo": { "doc_count_error_upper_bound": 94, "sum_other_doc_count": 11414, "buckets": [ { "key": "Car", "doc_count": 2 }, { "key": "Van", "doc_count": 2 }, { "key": "Bike", "doc_count": 1 } ] } } }
What I actually want is:
[
  { "key": "Car", "doc_count": 2, "Color": "Black", "count": 2 },
  { "key": "Van", "doc_count": 2, "Color": "Black", "count": 2 },
  { "key": "Bike", "doc_count": 1, "Color": "Ash", "count": 1 }
]
I would like to group the results this way. Is it possible to do this in an Elasticsearch query?
Thanks in advance
This is because you're using both aggregations and facets, which, although they are similar, are not meant to be used together.
Facets are deprecated and will soon be removed from Elasticsearch.
Aggregations are the way to go to make "group by"-like queries.
You just have to nest another terms aggregation in the first one, like this:
{
"aggs": {
"By_type": {
"terms": {
"field": "Product"
},
"aggs": {
"By_color": {
"terms": {
"field": "Color"
}
}
}
}
}
}
And the result will be close to what you want:
"aggregations": {
"By_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bike",
"doc_count": 2,
"By_color": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "ash",
"doc_count": 1
},
{
"key": "black",
"doc_count": 1
}
]
}
},
{
"key": "car",
"doc_count": 2,
"By_color": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "black",
"doc_count": 2
}
]
}
},
{
"key": "van",
"doc_count": 1,
"By_color": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "black",
"doc_count": 1
}
]
}
}
]
}
}
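If you only want the single most frequent color per product, which is close to the flat shape asked for in the question, you can cap the inner aggregation at one bucket:
{
  "size": 0,
  "aggs": {
    "By_type": {
      "terms": { "field": "Product" },
      "aggs": {
        "Top_color": {
          "terms": { "field": "Color", "size": 1 }
        }
      }
    }
  }
}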