How to search and group in ElasticSearch? - elasticsearch

I have ElasticSearch index with records like:
{
"project" : "A",
"updated" : <date>,
"cost" : 123
},
{
"project" : "A",
"updated" : <date>,
"cost" : 1
},
{
"project" : "B",
"updated" : <date>,
"cost" : 3
},
{
"project" : "B",
"updated" : <date>,
"cost" : 4
},
{
"project" : "C",
"updated" : <date>,
"cost" : 5
}
I'm trying to draw a "cost" chart for selected projects.
Can anyone help me build a query that returns the sum of cost, grouped by project?
For example, I want to select data for projects "A" and "B" and get something like:
date1 ->
projectA -> sum(cost)
projectB -> sum(cost)
date2 ->
projectA -> sum(cost)
projectB -> sum(cost)
I have no idea how to modify this query, which extracts data for a single project:
"query": {
"bool": {
"must": [
{
"match": {
"project": {
"query": <project>,
"type": "phrase"
}
}
},
{
"range": {
"updated": {
"gte": <startDate>,
"format": "epoch_millis"
}
}
}
]
}
},
"aggs": {
"3": {
"date_histogram": {
"field": "End_Time",
"interval": "1M",
"time_zone": "CST6CDT",
"min_doc_count": 1
},
"aggs": {
"2": {
"sum": {
"field": "cost"
}
}
}
}
}
Upd: Thanks guys! With your help I wrote the query:
{
"query": {
"bool": {
"must": [
{
"range": {
"End_Time": {
"gte": 1485892800000,
"format": "epoch_millis"
}
}
}
],
"should": [
{
"match": {
"Project_Name": {
"query": "A",
"type": "phrase"
}
}
},
{
"match": {
"Project_Name": {
"query": "B",
"type": "phrase"
}
}
}
]
}
},
"aggs": {
"3": {
"date_histogram": {
"field": "End_Time",
"interval": "1M",
"time_zone": "CST6CDT",
"min_doc_count": 1
},
"aggs": {
"project_agg": {
"terms": {
"field": "Project_ID"
},
"aggs": {
"2": {
"sum": {
"field": "Cost"
}
}
}
}
}
}
}
}
But it returns something strange:
"aggregations": {
"3": {
"buckets": [
{
"key_as_string": "2017-02-01T00:00:00.000-06:00",
"key": 1485928800000,
"doc_count": 17095,
"project_agg": {
"doc_count_error_upper_bound": 36,
"sum_other_doc_count": 3503,
"buckets": [
{
"2": {
"value": 2536.8616891294323
},
"key": 834879987748,
"doc_count": 2243
},
{
"2": {
"value": 3438.766646153458
},
"key": 497952557271,
"doc_count": 1785
},
{
"2": {
"value": 13066.367076588496
},
"key": 1057394416300,
"doc_count": 1736
},
...
There are 10 buckets for each month. I expected to see only 2 buckets per month, one for each selected project. What's wrong?

The query you wrote gives you the total cost (irrespective of project) per month; you need another aggregation that groups by project between aggregation 3 and aggregation 2.
If you only want projects A and B, then use a filter aggregation.
"size": 0,
"aggs": {
"project": {
"filter": {
"bool": {
"must": [
{
"terms": {
"project": [
"a",
"b"
]
}
}
]
}
},
"aggs": {
"3": {
"date_histogram": {
"field": "End_Time",
"interval": "1M",
"time_zone": "CST6CDT",
"min_doc_count": 1
},
"aggs": {
"project_agg": {
"terms": {
"field": "project"
},
"aggs": {
"2": {
"sum": {
"field": "cost"
}
}
}
}
}
}
}
}
}

You need to aggregate on project before you aggregate cost:
{
"aggs": {
"3": {
"date_histogram": {
"field": "End_Time",
"interval": "1M",
"time_zone": "CST6CDT",
"min_doc_count": 1
},
"aggs": {
"2": {
"terms": {
"field": "project"
},
"aggs": {
"1": {
"sum": {
"field": "cost"
}
}
}
}
}
}
}
}
For the filtering it depends on how you want to do the search. For a list of projects you could use:
"query": {
"bool": {
"must": [
{ "terms": { "project": [ "a", "b" ] } } //Assuming field is mapped as "analyzed"
]
}
}
In case your mapping contains a .keyword variety, you would format the terms filter like this:
{ "terms": { "project.keyword": [ "A", "B" ] } } //Assuming field is mapped as "not_analyzed" or has a keyword field.
Here is an example of how a field is mapped in ES 5.5 as "text" with a "keyword" sub-field:
"ShortTextContent" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
In this case I can access the analyzed version using "ShortTextContent" and the not_analyzed version using "ShortTextContent.keyword"

Related

Elasticsearch query with time range and cardinality

I need help with ES query for both Time Range and Cardinality. For now, my query for Time Range is as follow:
query={
"query": {
"bool": {
"must": [
{
"query_string": {
"query": querystr_var,
"analyze_wildcard": "true"
}
}
]
}
},
"size": 0,
"_source": {
"excludes": []
},
"aggs": {
"range": {
"date_range": {
"field": timeField_var,
"format" : "yyyy-MM-dd HH:mm:ss.SSS",
"ranges": [
{
"from": startDateTime_var,
"to": endDateTime_var,
"key": "CurrentCount"
},
{
"from": prev1WeekStartDateTime_var,
"to": prev1WeekEndDateTime_var,
"key": "Prev1WeekCount"
}
],
"keyed": "true"
}
}
}
}
The above query works fine, but now I also need to count unique "CustomerID" values using cardinality. I tried the query below, but the result is the same as before, with no effect:
query={
"query": {
"bool": {
"must": [
{
"query_string": {
"query": querystr_var,
"analyze_wildcard": "true"
}
}
]
}
},
"size": 0,
"_source": {
"excludes": []
},
"aggs": {
"session_count": {
"cardinality": {
"field": "CustomerID"
}
},
"range": {
"date_range": {
"field": timeField_var,
"format" : "yyyy-MM-dd HH:mm:ss.SSS",
"ranges": [
{
"from": startDateTime_var,
"to": endDateTime_var,
"key": "CurrentCount"
},
{
"from": prevWeekStartDateTime_var,
"to": prevWeekEndDateTime_var,
"key": "PrevWeekCount"
}
],
"keyed": "true"
}
}
}
}
Can you please help with this query. Thanks so much!
Your query seems to be correct. I tried a similar query (with aggregation), with some sample data, and the result is as expected.
Index Data:
{
"date": "2015-01-01",
"customerId": 1
}
{
"date": "2015-02-01",
"customerId": 2
}
{
"date": "2015-03-01",
"customerId": 3
}
{
"date": "2015-04-01",
"customerId": 3
}
Search Query:
{
"size":0,
"aggs": {
"session_count": {
"cardinality": {
"field": "customerId"
}
},
"range": {
"date_range": {
"field": "date",
"ranges": [
{
"from": "2015-01-01",
"to": "2015-05-01"
}
],
"keyed": "true"
}
}
}
}
Search Result:
"aggregations": {
"session_count": {
"value": 3
},
"range": {
"buckets": {
"2015-01-01T00:00:00.000Z-2015-05-01T00:00:00.000Z": {
"from": 1.4200704E12,
"from_as_string": "2015-01-01T00:00:00.000Z",
"to": 1.4304384E12,
"to_as_string": "2015-05-01T00:00:00.000Z",
"doc_count": 4
}
}
}
}
OK, so after losing lots of hair, I found out that I need to put the "cardinality" aggregation under each separate date_range aggregation, something like this:
...
"aggs": {
"currentCount": {
"date_range": {
"field": timeField_var,
"format" : "yyyy-MM-dd HH:mm:ss.SSS",
"ranges": [
{
"from": startDateTime_var,
"to": endDateTime_var,
"key": "CurrentCount"
}
],
"keyed": "true"
},
"aggs": {
"currentUnique": {
"cardinality": {
"field": "CustomerID"
}
}
}
},
"previousCount": {
"date_range": {
"field": timeField_var,
"format" : "yyyy-MM-dd HH:mm:ss.SSS",
"ranges": [
{
"from": prevWeekStartDateTime_var,
"to": prevWeekEndDateTime_var,
"key": "previousUnique"
}
],
"keyed": "true"
},
"aggs": {
"previousUnique": {
"cardinality": {
"field": "CustomerID"
}
}
}
}

How to shift elastic graph by 1 hr?

I have a visualization on an hourly basis. Data from 1 to 2 o'clock is displayed at 1 o'clock; I want it to be displayed at 2 o'clock. How can I shift the graph by 1 hour?
This is the query that I'm using-
Query -
{
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "*",
"analyze_wildcard": true
}
},
{
"match": {
"server-status.name.keyword": {
"query": "https-x509",
"type": "phrase"
}
}
},
{
"range": {
"server-status.meta.current-time": {
"gte": 1550660541174,
"lte": 1550674941175,
"format": "epoch_millis"
}
}
}
],
"must_not": []
}
},
"size": 0,
"_source": {
"excludes": []
},
"aggs": {
"2": {
"date_histogram": {
"field": "server-status.meta.current-time",
"interval": "1h",
"time_zone": "CST6CDT",
"min_doc_count": 1
},
"aggs": {
"4": {
"terms": {
"field": "server-status.type.keyword",
"include": "http-server",
"size": 500,
"order": {
"1": "desc"
}
},
"aggs": {
"1": {
"sum": {
"field": "server-status.status-properties.request-rate.value",
"script": "_value/60"
}
},
"3": {
"terms": {
"field": "server-status.name.keyword",
"size": 5,
"order": {
"1": "desc"
}
},
"aggs": {
"1": {
"sum": {
"field": "server-status.status-properties.request-rate.value",
"script": "_value/60"
}
}
}
}
}
}
}
}
}
}
I would like to shift the values by 1 hr. For example if the value is 2.0 at 2019-02-20T05:00:00.000-06:00 I want it to be displayed for 2019-02-20T06:00:00.000-06:00
Just a possible workaround:
Kibana displays time based on the browser's timezone. You could set the timezone in the Kibana configuration to a timezone of your choice.
Update:
You could use a date_range aggregation and choose the key for each of those buckets. You will need to generate the aggregation based on your time_range and interval.
For example:
"aggs": {
"range": {
"date_range": {
"field": "date",
"ranges": [
{
"key": "bucket1",
"to": "2016/02/01"
},
{
"key": "bucket2",
"from": "2016/02/01",
"to" : "now/d"
}
]
}
}
}
Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-daterange-aggregation.html

Elastic Search: Bool Query in nested properties

Let's assume I have data structured like this:
{ "id": "120400871755634330808993320",
"name": "Metaalschroef binnenzeskant, DIN 912 RVS A4-80",
"description": "m16x70 cilinderschroef bzk a4-80 din912 klasse 80",
"fullDescription": "Metaalschroef met een binnenzeskant cilinderkop",
"synonyms": [],
"properties": [
{
"name": "draad",
"value": "16",
"sort": 99
},
{
"name": "lengte",
"value": "70",
"sort": 99
},
{
"name": "materiaal",
"value": "roestvaststaal",
"sort": 99
},
{
"name": "kwaliteit (materiaal)",
"value": "A4",
"sort": 99
},
{
"name": "DIN",
"value": "912",
"sort": 99
},
{
"name": "AISI",
"value": "316",
"sort": 99
},
{
"name": "draadsoort",
"value": "metrisch",
"sort": 99
},
{
"name": "Merk",
"value": "Elcee Holland",
"sort": 1
}
]
}
How do I write a boolean query that selects all documents that have a property with name "draad" and value "16" and a property with name "lengte" and value "70"?
Right now I have this but it returns 0 results:
"query" : {
"nested" : {
"path" : "properties",
"query" : {
"bool" : {
"must" : [{
"bool" : {
"must" : [{
"term" : {
"properties.name" : "Merk"
}
}, {
"term" : {
"properties.value" : "Facom"
}
}
]
}
}, {
"bool" : {
"must" : [{
"term" : {
"properties.name" : "materiaal"
}
}, {
"term" : {
"properties.value" : "kunststof"
}
}
]
}
}
]
}
}
}
}
Replacing the highest level "must" with "should" returns too many results, which makes sense as it translates to an "or".
When using must, the engine searches for a nested document with name:Merk and value:Facom that also has name:materiaal and value:kunststof - which cannot happen in the same nested document at once.
When using should, as you mentioned, it translates to an "or" - which is indeed possible.
The problem is that you also get the entire parent document with all its nested documents.
In my own answer I'm showing the steps to create an index with nested documents (you should map the field properties as the nested type).
After completing those steps, you'll be able to get results with the following query:
{
"_source": [
"id",
"name",
"description"
],
"query": {
"bool": {
"must": [
{
"nested": {
"path": "properties",
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"term": {
"properties.name": "Merk"
}
},
{
"term": {
"properties.value": "Facom"
}
}
]
}
},
{
"bool": {
"must": [
{
"term": {
"properties.name": "materiaal"
}
},
{
"term": {
"properties.value": "kunststof"
}
}
]
}
}
]
}
},
"inner_hits":{
"size": 10
}
}
}
]
}
}
}
I found a solution that is working very well!
My property object now looks like this:
{
"name": "breedte(mm)",
"value": "1000",
"unit": "mm",
"sort": 99,
"nameSlug": "breedte-mm",
"slug": "breedte-mm-1000"
},
I added a slug (containing a normalized string for key + value) and a nameSlug, which is a normalized string for the name.
My index is mapped like this:
"properties": {
"type": "nested",
"include_in_parent": true,
"properties": {
"name": {
"type": "keyword"
},
"nameSlug": {
"type": "keyword"
},
"slug": {
"type": "keyword"
},
"sort": {
"type": "long"
},
"unit": {
"type": "text",
"index": false
},
"value": {
"type": "keyword"
}
}
}
The "include_in_parent" is important here. It allows me to do the query below:
"query": {
"bool": {
"must": [
{
"terms": {
"properties.slug": [
"merk-orbis",
"merk-bahco"
]
}
},
{
"terms": {
"properties.slug": [
"materiaal-staal",
"materiaal-kunststof"
]
}
}
]
}
},
This query searches for all documents where "merk" is "Orbis" or "Bahco" and where "materiaal" is "staal" or "kunststof".
My aggregations look like this:
"merk_query": {
"filter": {
"bool": {
"must": [
{
"terms": {
"properties.slug": [
"materiaal-staal",
"materiaal-kunststof"
]
}
}
]
}
},
"aggs": {
"merk_facets": {
"nested": {
"path": "properties"
},
"aggs": {
"merk_only": {
"filter": {
"term": {
"properties.nameSlug": {
"value": "merk"
}
}
},
"aggs": {
"facets": {
"terms": {
"field": "properties.name",
"size": 1
},
"aggs": {
"facetvalues": {
"terms": {
"field": "properties.value",
"size": 10
}
}
}
}
}
}
}
}
}
},
I run a filter aggregation, which filters all documents that match a facet (but not the one I am currently building).
The result of this aggregation is something like this:
"merk_query": {
"doc_count": 7686,
"merk_facets": {
"doc_count": 68658,
"merk_only": {
"doc_count": 7659,
"facets": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Merk",
"doc_count": 7659,
"facetvalues": {
"doc_count_error_upper_bound": 10,
"sum_other_doc_count": 438,
"buckets": [
{
"key": "Orbis",
"doc_count": 6295
},
{
"key": "DX",
"doc_count": 344
},
{
"key": "AXA",
"doc_count": 176
},
{
"key": "Talen Tools",
"doc_count": 127
},
{
"key": "Nemef",
"doc_count": 73
},
{
"key": "bonfix",
"doc_count": 67
},
{
"key": "Bahco",
"doc_count": 64
},
{
"key": "Henderson",
"doc_count": 27
},
{
"key": "Maasland Groep",
"doc_count": 25
},
{
"key": "SYSTEC",
"doc_count": 23
}
]
}
}
]
}
}
}
}
},
And this is the end result in the browser:

Elasticsearch missing aggs

I am doing an Elasticsearch query and having problems with the aggs going missing.
If I do the query below I get the aggs back without issue:
{
"query": {
"filtered": {
"query": {
"query_string": {
"query": "*wet*",
"fields": [
"Name",
"Summary",
"Description",
"Location",
"Features",
"TypeName",
"CategoryName"
]
},
"filter": {
"term": {
"TypeID": "13"
}
}
}
}
},
"aggs": {
"type": {
"terms": {
"field": "TypeID"
}
},
"category": {
"terms": {
"field": "CategoryID"
}
},
"max_price": {
"max": {
"field": "Price"
}
},
"min_price": {
"min": {
"field": "Price"
}
},
"filter-type": {
"term": {
"TypeID": "13"
}
}
},
"from": 0,
"size": 50,
"sort": {
"_score": {
"order": "desc"
}
},
"explain": false
}
However, as soon as I add a filter, the aggs are no longer returned. I can't see what I am doing wrong so any help would be really appreciated.
The one that doesn't return aggs looks like this:
{
"query": {
"filtered": {
"query": {
"query_string": {
"query": "*wet*",
"fields": [
"Name",
"Summary",
"Description",
"Location",
"Features",
"TypeName",
"CategoryName"
]
},
"filter": {
"term": {
"TypeID": "13"
}
}
}
}
},
"aggs": {
"type": {
"terms": {
"field": "TypeID"
}
},
"category": {
"terms": {
"field": "CategoryID"
}
},
"max_price": {
"max": {
"field": "Price"
}
},
"min_price": {
"min": {
"field": "Price"
}
},
"filter-type": {
"term": {
"TypeID": "13"
}
}
},
"from": 0,
"size": 50,
"sort": {
"_score": {
"order": "desc"
}
},
"explain": false
}
This is the correct syntax for a filter aggregation. You have to wrap all your aggregations inside the filter, like this:
{
"size": 0,
"query": {
"filtered": {
"query": {
"query_string": {
"query": "*wet*",
"fields": [
"Name",
"Summary",
"Description",
"Location",
"Features",
"TypeName",
"CategoryName"
]
}
},
"filter": {
"term": {
"TypeID": "13"
}
}
}
},
"aggs": {
"filter-type": {
"filter": {
"term": {
"TypeID": "13"
}
},
"aggs": {
"type": {
"terms": {
"field": "TypeID"
}
},
"category": {
"terms": {
"field": "CategoryID"
}
},
"max_price": {
"max": {
"field": "Price"
}
},
"min_price": {
"min": {
"field": "Price"
}
}
}
}
}
}
Also, in this case the filter in aggs is redundant, as you are already using the same filter in your query.

ElasticSearch Nested Filter and Aggregation

So I have this mapping:
"employee": {
"properties": {
"DaysOff": {
"type": "nested",
"properties": {
"Date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"Days": {
"type": "double"
},
"ID": {
"type": "long"
}
}
}
}
}
So basically an employee can have days off. Each day off they have is stored in an array under the property DaysOff. Days can be a fraction of a day, so if an employee took half a day off then it would be 0.5.
So I have this search:
{
"size": 45,
"filter": {
"nested": {
"path": "DaysOff",
"filter": {
"range": {
"DaysOff.Date": {
"from": "now-2M",
"to": "now"
}
}
}
}
}
}
This brings me back 45 documents, which is correct. I just can't figure out how to now apply an aggregation to these documents in order to get back the sum of all the days that have been taken.
Using this resource I tried this aggregation, but it didn't give me the correct result:
{
"size": 45,
"filter": {
"nested": {
"path": "DaysOff",
"filter": {
"range": {
"DaysOff.Date": {
"from": "now-2M",
"to": "now"
}
}
}
}
},
"aggs": {
"sum_docs": {
"nested": {
"path": "DaysOff"
},
"aggs": {
"stepped_down": {
"sum": {
"field": "DaysOff.Days"
}
}
}
}
}
}
You need to filter on those nested documents to get the correct results. From the docs:
Because nested documents are indexed as separate documents, they can only be accessed within the scope of the nested query,
I created an index like this:
POST employee
{
"mappings": {
"emp_map": {
"properties": {
"DaysOff": {
"type": "nested",
"properties": {
"Date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"Days": {
"type": "double"
},
"ID": {
"type": "long"
}
}
},
"name": {
"type": "string"
}
}
}
}
},
Then I indexed a few documents like this:
PUT employee/emp_map/1
{
"name" : "messi",
"DaysOff" : [
{
"Date" : "2015-11-01",
"Days" : 1,
"ID" : 11
},
{
"Date" : "2014-11-01",
"Days" : 2,
"ID" : 11
},
{
"Date" : "2015-12-01",
"Days" : 0.5,
"ID" : 11
}
]
}
PUT employee/emp_map/2
{
"name" : "ronaldo",
"DaysOff" : [
{
"Date" : "2015-10-01",
"Days" : 3,
"ID" : 12
},
{
"Date" : "2014-11-01",
"Days" : 2,
"ID" : 12
},
{
"Date" : "2015-12-01",
"Days" : 0.5,
"ID" : 12
}
]
}
PUT employee/emp_map/3
{
"name" : "suarez",
"DaysOff" : [
{
"Date" : "2015-11-01",
"Days" : 4,
"ID" : 13
},
{
"Date" : "2015-11-09",
"Days" : 2,
"ID" : 13
},
{
"Date" : "2015-12-01",
"Days" : 1.5,
"ID" : 13
}
]
}
This is my query. Notice the filter aggregation inside the nested aggregation; without it, ES will give you the sum of all the days taken off.
GET employee/_search
{
"query": {
"bool": {
"filter": {
"nested": {
"path": "DaysOff",
"query": {
"range": {
"DaysOff.Date": {
"from": "now-2M",
"to": "now"
}
}
}
}
}
}
},
"aggs": {
"emp_name": {
"terms": {
"field": "name",
"size": 10
},
"aggs": {
"nesting": {
"nested": {
"path": "DaysOff"
},
"aggs": {
"filter_date": {
"filter": {
"range": {
"DaysOff.Date": {
"from": "now-2M",
"to": "now"
}
}
},
"aggs": {
"sum_taken_off_days": {
"sum": {
"field": "DaysOff.Days"
}
}
}
}
}
}
}
}
},
"size": 0
}
This is the result I get,
"aggregations": {
"emp_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "messi",
"doc_count": 1,
"nesting": {
"doc_count": 3,
"filter_date": {
"doc_count": 2,
"sum_taken_off_days": {
"value": 1.5
}
}
}
},
{
"key": "ronaldo",
"doc_count": 1,
"nesting": {
"doc_count": 3,
"filter_date": {
"doc_count": 1,
"sum_taken_off_days": {
"value": 0.5
}
}
}
},
{
"key": "suarez",
"doc_count": 1,
"nesting": {
"doc_count": 3,
"filter_date": {
"doc_count": 3,
"sum_taken_off_days": {
"value": 7.5
}
}
}
}
]
}
}
P.S.: This is per employee; you can remove the emp_name terms aggregation to get the sum for all employees.

Resources