Elasticsearch: Is it possible to run a not-analysed aggregation query on an analysed field? - elasticsearch

I have documents that store brand names in analysed form, for example: {"name":"Sam-sung"} {"name":"Motion:Systems"}. There are cases where I want to aggregate these brands within a timestamp range.
My query is as follows:
{
"size": 0,
"aggs": {
"filtered_aggs": {
"filter": {
"range": {
"#timestamp":{
"gte":"2016-07-18T14:23:41.459Z",
"lte":"2016-07-18T14:53:10.017Z"
}
}
},
"aggs": {
"execute_time": {
"terms": {
"field": "brands",
"size": 0
}
}
}
}
}
}
But the returned results are:
{
...
"aggregations": {
"states": {
"buckets": [
{
"key": "Sam",
"doc_count": 5
},
{
"key": "sung",
"doc_count": 5
},
{
"key": "Motion",
"doc_count": 1
},
{
"key": "Systems",
"doc_count": 1
}
]
}
}
}
But I want the results to be:
{
...
"aggregations": {
"states": {
"buckets": [
{
"key": "Sam-sung",
"doc_count": 5
},
{
"key": "Motion:Systems",
"doc_count": 1
}
]
}
}
}
Is there any way to run a not-analysed aggregation query on an analysed field in Elasticsearch?

You need to add a not_analyzed sub-field to your brands field and then aggregate on that sub-field.
PUT /index/_mapping/type
{
"properties": {
"brands": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
Then you need to fully reindex your data in order to populate the new sub-field brands.raw.
Finally, you can change your query to this:
POST index/_search
{
"size": 0,
"aggs": {
"filtered_aggs": {
"filter": {
"range": {
"#timestamp":{
"gte":"2016-07-18T14:23:41.459Z",
"lte":"2016-07-18T14:53:10.017Z"
}
}
},
"aggs": {
"execute_time": {
"terms": {
"field": "brands.raw",
"size": 0
}
}
}
}
}
}

Related

ElasticSearch aggregation query with List in documents

I have following records of car sales of different brands in different cities.
Document -1
{
"city": "Delhi",
"cars":[{
"name":"Toyota",
"purchase":100,
"sold":80
},{
"name":"Honda",
"purchase":200,
"sold":150
}]
}
Document -2
{
"city": "Delhi",
"cars":[{
"name":"Toyota",
"purchase":50,
"sold":40
},{
"name":"Honda",
"purchase":150,
"sold":120
}]
}
I am trying to come up with a query to aggregate car statistics for a given city, but I can't get the query right.
Required result:
{
"city": "Delhi",
"cars":[{
"name":"Toyota",
"purchase":150,
"sold":120
},{
"name":"Honda",
"purchase":350,
"sold":270
}]
}
First, you need to map your array as a nested field (a script would be complicated and not performant). Nested fields are indexed, so the aggregation will be pretty fast.
Remove your index or create a new one. Please note that I use test as the type.
{
"mappings": {
"test": {
"properties": {
"city": {
"type": "keyword"
},
"cars": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"purchase": {
"type": "integer"
},
"sold": {
"type": "integer"
}
}
}
}
}
}
}
Index your documents (the same way you did before).
For the aggregation:
{
"size": 0,
"aggs": {
"avg_grade": {
"terms": {
"field": "city"
},
"aggs": {
"resellers": {
"nested": {
"path": "cars"
},
"aggs": {
"agg_name": {
"terms": {
"field": "cars.name"
},
"aggs": {
"avg_pur": {
"sum": {
"field": "cars.purchase"
}
},
"avg_sold": {
"sum": {
"field": "cars.sold"
}
}
}
}
}
}
}
}
}
}
result:
"buckets": [
{
"key": "Honda",
"doc_count": 2,
"avg_pur": {
"value": 350
},
"avg_sold": {
"value": 270
}
}
,
{
"key": "Toyota",
"doc_count": 2,
"avg_pur": {
"value": 150
},
"avg_sold": {
"value": 120
}
}
]
If you have indexed the name / city fields as text (first ask yourself whether that is necessary), use .keyword in the terms aggregation ("cars.name.keyword").

Elasticsearch Date Histogram with a Point in Time count of documents

I am attempting to create a date histogram showing the number of employees on a monthly basis.
Employee mapping looks something like this:
{
"number": 1234,
"firstName": "Chris",
"lastName": "Smith",
"employmentDates": [
{
"startDate": "2014-10-03T06:00:00Z",
"endDate": "2017-11-04T06:00:00Z"
}
],
"lastPaidOnDate": "2017-11-10T06:00:00Z",
....
}
Given a start end scenario like this (for three employees):
|----------------|
|-----------------------------|
|---| |---------------------|
^ ^ ^ ^ ^ ^
I would expect the histogram to be similar to this:
"aggregations": {
"employees_per_month": {
"buckets": [
{
"key_as_string": "2017-01-01",
"doc_count": 1
},
{
"key_as_string": "2017-02-01",
"doc_count": 2
},
{
"key_as_string": "2017-03-01",
"doc_count": 2
},
{
"key_as_string": "2017-04-01",
"doc_count": 3
},
{
"key_as_string": "2017-05-01",
"doc_count": 3
},
{
"key_as_string": "2017-06-01",
"doc_count": 2
}
]
}
}
It seems like I need to have a sub-aggregation on a scripted field, but I'm not sure where to start.
Your assistance is greatly appreciated.
I believe it can be done using a date histogram, but I'm suggesting a simpler approach. You will have to run the query once for each specific month:
{
"size": 0,
"aggregations": {
"bool_agg": {
"filter": {
"bool": {
"must": [
{
"range": {
"employmentDates.startDate": {
"lt": "2017-12-01T00:00:00Z"
}
}
},
{
"range": {
"employmentDates.endDate": {
"gte": "2017-11-01T00:00:00Z"
}
}
}
]
}
},
"aggregations": {
"distinct_agg": {
"cardinality": {
"field": "number"
}
}
}
}
}
}
bool_agg: using Filter Aggregation to filter only employment in November
distinct_agg: using Cardinality Aggregation to count, by unique field number, the total employees
Note that if employmentDates contains more than one record, e.g.:
"employmentDates": [
{
"startDate": "2014-10-03T06:00:00Z",
"endDate": "2017-11-04T06:00:00Z"
},
{
"startDate": "2018-03-03T06:00:00Z",
"endDate": "2018-07-04T06:00:00Z"
}
]
You must use the Nested datatype (an example can be found here).
And update the query to:
{
"size": 0,
"aggregations": {
"nested_agg": {
"nested": {
"path": "employmentDates"
},
"aggregations": {
"bool_agg": {
"filter": {
"bool": {
"must": [
{
"range": {
"employmentDates.startDate": {
"lt": "2017-12-01T00:00:00Z"
}
}
},
{
"range": {
"employmentDates.endDate": {
"gte": "2017-11-01T00:00:00Z"
}
}
}
]
}
},
"aggregations": {
"comment_to_issue": {
"reverse_nested": {},
"aggregations": {
"distinct_agg": {
"cardinality": {
"field": "number"
}
}
}
}
}
}
}
}
}
}

Elastic Search query return terms within array of a specific type

I have an index with the following mapping:
{"tagged_index":{"mappings":{"tagged":{"properties":{"tags":{"properties":{"resources":{"properties":{"tagName":{"type":"string"},"type":{"type":"string"}}}}},"content":{"type":"string"}}}}}}
Here resources is an array that can contain multiple tags. For example:
{"_id":"82906194","_source":{"tags":{"resources":[{"type":"Person","tagName":"Kim_Kardashian",},{"type":"Person","tagName":"Kanye_West",},{"type":"City","tagName":"New_York",},...},"content":" Popular NEWS ..."}}
,
{"_id":"82906195","_source":{"tags":{"resources":[{"type":"City","tagName":"London",},{"type":"Country","tagName":"USA",},{"type":"Music","tagName":"Hello",},...},"content":" Adele's Hello..."}},
...
I know how to extract the important terms [tagName] with the query below, but I do not want the terms [tagName] of all types.
How can I extract only the terms of a specific type, for example cities only [type:City]? (I would like to get a list of tagName values where the type is City, i.e. London, New_York, Berlin, ...)
{"size":0,"query":{"filtered":{"query":{"query_string":{"query":"*","analyze_wildcard":true}}}},"aggs":{"Cities":{"terms":{"field":"tags.resources.tagName","size":10,"order":{"_count":"desc"}}}}}
The required output should look like this:
{"took":1200,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":5179261,"max_score":0.0,"hits":[]},"aggregations":{"Cities":{"doc_count_error_upper_bound":46737,"sum_other_doc_count":36037440,"buckets":[{"key":"London","doc_count":332820},{"key":"New_York","doc_count":211274},{"key":"Berlin","doc_count":156954},{"key":"Amsterdam","doc_count":132173},...
Can you try this:
{
"_source" : ["tags.resources.tagName"],
"query": {
"term": {
"tags.resources.type": {
"value": "City"
}
}
}
}
The above query will fetch those resources which are of type City, provided that resources is mapped as an object type.
After Edit
The problem is to group by tag name for tags of type City. That cannot be achieved with your current mapping; you will have to change the resources field to the nested type.
The mapping would look like this:
"mappings": {
"resource": {
"properties": {
"tags": {
"properties": {
"content": {
"type": "string"
},
"resources": {
"type": "nested",
"properties": {
"tagName": {
"type": "string"
},
"type": {
"type": "string"
}
}
}
}
}
}
}
}
The final query would be:
{
"size": 0,
"query": {
"nested": {
"path": "tags.resources",
"query": {
"match": {
"tags.resources.type": "city"
}
}
}
},
"aggs": {
"resources Nested path": {
"nested": {
"path": "tags.resources"
},
"aggs": {
"city type": {
"filter": {
"term": {
"tags.resources.type": "city"
}
},
"aggs": {
"group By tagName": {
"terms": {
"field": "tags.resources.tagName"
}
}
}
}
}
}
}
}
Output would be:
"aggregations": {
"resources Nested path": {
"doc_count": 6,
"city type": {
"doc_count": 2,
"group By tagName": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "london",
"doc_count": 1
},
{
"key": "new_york",
"doc_count": 1
}
]
}
}
}
}

Elasticsearch nested cardinality aggregation

I have a mapping with a nested schema. I am trying to aggregate on a nested field and order by the distinct docid count.
select name, count(distinct docid) as uniqueid from table
group by name
order by uniqueid desc
The above is what I am trying to do.
{
"size": 0,
"aggs": {
"samples": {
"nested": {
"path": "sample"
},
"aggs": {
"sample": {
"terms": {
"field": "sample.name",
"order": {
"DocCounts": "desc"
}
},
"aggs": {
"DocCounts": {
"cardinality": {
"field": "docid"
}
}
}
}
}
}
}
}
But I am not getting the expected output in the result.
result:
"buckets": [
{
"key": "xxxxx",
"doc_count": 173256,
"DocCounts": {
"value": 0
}
},
{
"key": "yyyyy",
"doc_count": 63,
"DocCounts": {
"value": 0
}
}
]
I am getting DocCounts = 0, which is not expected. What is wrong with my query?
I think your last nested aggregation is too much. Try to get rid of it:
{
"size": 0,
"aggs": {
"samples": {
"nested": {
"path": "sample"
},
"aggs": {
"sample": {
"terms": {
"field": "sample.name",
"order": {
"DocCounts": "desc"
}
},
"DocCounts": {
"cardinality": {
"field": "docid"
}
}
}
}
}
}
}
In general, when aggregating on a nested type by a value from the parent (upper) scope, we observed that the value needs to be copied from the parent scope into the nested objects when storing the document.
In that case, your aggregation would look like:
"aggs": {
"DocCounts": {
"cardinality": {
"field": "sample.docid"
}
}
}
This works, at least on version 1.7 of Elasticsearch.
You can wrap the DocCounts cardinality aggregation in a reverse_nested aggregation. This is because, once a nested aggregation is applied, the query runs against the nested documents. So, to access any field of the parent document from inside a nested document, a reverse_nested aggregation can be used. Check the ES reference for more info on this.
Your cardinality query will look like:
"aggs": {
"internal_DocCounts": {
"reverse_nested": { },
"DocCounts": {
"cardinality": {
"field": "docid"
}
}
}
}
The response will look like:
"buckets": [
{
"key": "xxxxx",
"doc_count": 173256,
"internal_DocCounts": {
"doc_count": 173256,
"DocCounts": {
"value": <some_value>
}
}
},
{
"key": "yyyyy",
"doc_count": 63,
"internal_DocCounts": {
"doc_count": 63,
"DocCounts": {
"value": <some_value>
}
}
},
.....
Check this similar thread

elasticsearch: using nested agg after reverse_nested shows higher count than expected

Using Elasticsearch 2.2.0, I am doing this:
Grouping by a nested field: nested_path.nested_field
Using a reverse_nested agg so I can apply this filter: non_nested_field == "yay"
Using a nested agg so I can then get a count of the nested field I am grouping by: nested_path.nested_field
Problem: By using the reverse_nested agg I am getting a higher doc_count than I would expect.
Here is the mapping and docs I am indexing:
PUT /my_index
{
"mappings": {
"my_type": {
"properties": {
"nested_path": {
"type": "nested",
"properties": {
"nested_field": {
"type": "string"
}
}
},
"non_nested_field": {
"type": "string"
}
}
}
}
}
POST /my_index/my_type/1
{
"non_nested_field": "whoray",
"nested_path": [
{
"nested_field": "yes"
},
{
"nested_field": "yes"
},
{
"nested_field": "no"
}
]
}
POST /my_index/my_type/2
{
"non_nested_field": "yay",
"nested_path": [
{
"nested_field": "maybe"
},
{
"nested_field": "no"
}
]
}
Request body:
POST my_index/my_type/_search
{
"aggs": {
"nested_option": {
"nested": {
"path": "nested_path"
},
"aggs": {
"group_list": {
"terms": {
"field": "nested_path.nested_field",
"size": 100
},
"aggs": {
"level_1": {
"reverse_nested": {},
"aggs": {
"level_2": {
"filter": {
"term": {
"non_nested_field": "yay"
}
},
"aggs": {
"level_3": {
"nested": {
"path": "nested_path"
},
"aggs": {
"stat": {
"value_count": {
"field": "nested_path.nested_field"
}
}
}
}
}
}
}
}
}
}
}
}
},
"size": 0
}
Part of the response I get is this:
{
"aggregations": {
"nested_option": {
"doc_count": 5,
"group_list": {
"buckets": [
{
"key": "no",
"doc_count": 2,
"level_1": {
"doc_count": 2,
"level_2": {
"doc_count": 1,
"level_3": {
"doc_count": 2,
"stat": {
"value": 2
}
}
}
}
}
//....
]
}
}
}
}
In the first element of the buckets array in the response, level_1.level_2.doc_count is 1, and this is correct, because there's only one of the two docs indexed where nested_path.nested_field == "no" and non_nested_field == "yay". But level_1.level_2.level_3.doc_count in the response is 2. It should only be 1. This seems like a bug to me.

Resources