Aggregation, Query Context and filter Context not working in Elasticsearch 5.1 - elasticsearch

I am facing issue in migrating from elastic search 1.5 to 5.1.
Following is my elastic search - 1.5 Query:
{
"_source":["_id","spotlight"],
"query":{
"filtered":{
"filter":{
"and":[
{"term":{"gender":"female"}},
{"range":{"lastlogindate":{"gte":"2016-10-19 12:39:57"}}}
]
}
}
},
"filter":{
"and":[
{"term":{"maritalstatus":"1"}}
]
},
"sort":[{"member2_dummy7":{"order":"desc"}}],
"size":"0",
"aggs": {
"maritalstatus": {
"filter": {},
"aggs" : {
"filtered_maritalstatus": {"terms":{"field":"maritalstatus","size":5000}}
}
}
}
}
This query is giving me correct doc_count in aggregations. This doc_count is calculated over result set returned by query context and it ignores filter context.
I have written same query in elastic search 5.1:
{
"_source":["_id","spotlight"],
"query":{
"bool":{
"must":[
{"term":{"gender":"female"}},
{"range":{"lastlogindate":{"gte":"2016-10-19 12:39:57"}}}
],
"filter":{
"bool":{
"must":[
{"term":{"maritalstatus":"1"}}
]
}
}
}
},
"sort":[{"member2_dummy7":{"order":"DESC"}}],
"size":"0",
"aggs": {
"maritalstatus": {
"filter": {},
"aggs" : {
"filtered_maritalstatus": {"terms":{"field":"maritalstatus","size":5000}}
}
}
}
}
But in elastic search 5.1, it is returning me wrong doc_count in aggregation. I think it is taking filter in query context and hence, it is returning wrong doc_cout. Can someone tell me correct way to separate query and filter in elastic search 5.1?

Your 1.5 query uses post_filter which you have removed in your 5.1 query.
The equivalent query in ES 5.1 is the following (filtered/filter simply gets replaced as bool/filter and the top-level filter renamed to post_filter):
{
"_source": [
"_id",
"spotlight"
],
"query": {
"bool": {
"filter": [
{
"term": {
"gender": "female"
}
},
{
"range": {
"lastlogindate": {
"gte": "2016-10-19 12:39:57"
}
}
}
]
}
},
"post_filter": {
"term": {
"maritalstatus": "1"
}
},
"sort": [
{
"member2_dummy7": {
"order": "desc"
}
}
],
"size": "0",
"aggs": {
"maritalstatus": {
"filter": {},
"aggs": {
"filtered_maritalstatus": {
"terms": {
"field": "maritalstatus",
"size": 5000
}
}
}
}
}
}

Related

ElasticSearch aggs with function_score

I'm trying to exclude duplicated documents which have the same slug parameters to do it I use aggs in ElasticSearch (version 2.4). I use - this query:
{
"fields":[
"id",
"score"],
"size":0,
"query":{
"function_score":{
"query":{
"bool":{
"should":[
{
"match":{
"main_headline.en":{
"query":"headline_for_search"
}
}
},
{
"match":{
"body.en":"body for search"
}
}],
"must_not":{
"term":{
"id":75333
}
},
"filter":[
{
"term":{
"status":3
}
},
[
{
"term":{
"sites":6
}
}]]
}
},
"functions":[
{
"gauss":{
"published_at":{
"scale":"140w",
"decay":0.3
}
}
}
]
},
"aggs":{
"postslug":{
"terms":{
"field":"slug",
"order":{
"top_score":"desc"
}
},
"aggs":{
"grouppost":{
"top_hits": {
"_source": {
"include": [
"id",
"slug",
]
},
"size" : 10
}
}
}
}
}
}
}
When I run it I get error
failed to parse search source. expected field name but got [START_OBJECT]
I can`t figure out where is a mistake.
Without section aggs all works fine (except present duplicates)
I see one issue which relates to the fact that in the source filtering section include should read includes. Also, the aggs section is not at the right location, you have it in the query section, and it should be at the top-level:
{
"fields": [
"id",
"score"
],
"size": 0,
"query": {
"function_score": {
"query": {
"bool": {
"should": [
{
"match": {
"main_headline.en": {
"query": "headline_for_search"
}
}
},
{
"match": {
"body.en": "body for search"
}
}
],
"must_not": {
"term": {
"id": 75333
}
},
"filter": [
{
"term": {
"status": 3
}
},
[
{
"term": {
"sites": 6
}
}
]
]
}
},
"functions": [
{
"gauss": {
"published_at": {
"scale": "140w",
"decay": 0.3
}
}
}
]
}
},
"aggs": {
"postslug": {
"terms": {
"field": "slug",
"order": {
"top_score": "desc"
}
},
"aggs": {
"grouppost": {
"top_hits": {
"_source": {
"includes": [
"id",
"slug"
]
},
"size": 10
}
}
}
}
}
}

Elasticsearch aggregations on nested inner hits

I got a large amount of data in Elasticsearch. My douments have a nested field called "records" that contains a list of objects with several fields.
I want to be able to query specific objects from the records list, and therefore I use the inner_hits field in my query, but It doesn't help because aggregation uses size 0 so no results are returned.
I didn't succeed to make an aggregation work only for inner_hits, as aggregation returns results for all the objects inside records no matter the query.
This is the query I am using:
(Each document has first_timestamp and last_timestamp fields, and each object in the records list has a timestamp field)
curl -XPOST 'localhost:9200/_msearch?pretty' -H 'Content-Type: application/json' -d'
{
"index":[
"my_index"
],
"search_type":"count",
"ignore_unavailable":true
}
{
"size":0,
"query":{
"filtered":{
"query":{
"nested":{
"path":"records",
"query":{
"term":{
"records.data.field1":"value1"
}
},
"inner_hits":{}
}
},
"filter":{
"bool":{
"must":[
{
"range":{
"first_timestamp":{
"gte":1504548296273,
"lte":1504549196273,
"format":"epoch_millis"
}
}
}
],
}
}
}
},
"aggs":{
"nested_2":{
"nested":{
"path":"records"
},
"aggs":{
"2":{
"date_histogram":{
"field":"records.timestamp",
"interval":"1s",
"min_doc_count":1,
"extended_bounds":{
"min":1504548296273,
"max":1504549196273
}
}
}
}
}
}
}'
Your query is pretty complex.
To be short, here is your requested query:
{
"size": 0,
"aggregations": {
"nested_A": {
"nested": {
"path": "records"
},
"aggregations": {
"bool_aggregation_A": {
"filter": {
"bool": {
"must": [
{
"term": {
"records.data.field1": "value1"
}
}
]
}
},
"aggregations": {
"reverse_aggregation": {
"reverse_nested": {},
"aggregations": {
"bool_aggregation_B": {
"filter": {
"bool": {
"must": [
{
"range": {
"first_timestamp": {
"gte": 1504548296273,
"lte": 1504549196273,
"format": "epoch_millis"
}
}
}
]
}
},
"aggregations": {
"nested_B": {
"nested": {
"path": "records"
},
"aggregations": {
"my_histogram": {
"date_histogram": {
"field": "records.timestamp",
"interval": "1s",
"min_doc_count": 1,
"extended_bounds": {
"min": 1504548296273,
"max": 1504549196273
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
Now, let me explain every step by aggregations' names:
size: 0 -> we are not interested in hits, only aggregations
nested_A -> data.field1 is under records so we dive our scope to records
bool_aggregation_A -> filter by data.field1: value1
reverse_aggregation -> first_timestamp is not in nested document, we need to scope out from records
bool_aggregation_B -> filter by first_timestamp range
nested_B -> now, we scope again into records for timestamp field (located under records)
my_histogram -> finally, aggregate date histogram by timestamp field
Inner_hits aggregation is not supported by elasticsearch. The reason behind it is that inner_hits is a very expensive operation and applying aggregation on inner_hits is like exponential increase in complexity of operation.
Here is the github link of the issue.
If you want aggregation on inner_hits you can probably use the following approach:
Make flexible query where you only get the required hit from elastic and aggregate over it. Repeat it multiple time to get all the hits and aggregate simultaneously. This approach may lead you with multiple search query which is not advisable.
You can make your application layer handle the aggregation logic by writing smart aggregation parser and run those parser on response from elasticsearch. This approach is a little better but you have an overhead of maintaining the parser according to changing needs.
I would personally recommend you to change your data-mapping style in elasticsearch so that you are able to run aggregation on it.
You can also check the code like this
PUT records
{
"mappings": {
"properties": {
"records": {
"type": "nested"
}
}
}
}
POST records/_doc
{
"records": [
{
"data": "test1",
"value": 1
},
{
"data": "test2",
"value": 2
}
]
}
GET records/_search
{
"size": 0,
"aggs": {
"all_nested_count": {
"nested": {
"path": "records"
},
"aggs": {
"bool_aggs": {
"filter": {
"bool": {
"must": [
{
"term": {
"records.data": "test2"
}
}
]
}
},
"aggs": {
"filtered_aggs": {
"sum": {
"field": "records.value"
}
}
}
}
}
}
}
}
Ref: https://www.elastic.co/guide/en/elasticsearch/reference/current/inner-hits.html

Get unique values of a field in elasticsearch and get all records

I am using elasticsearch 2x version
I want distinct values in a field. I am getting only 10 values in the query.
How do I change this to view all distinct records?
Following is the query I am using:
GET messages-2017.04*/_search
{
"fields": ["_index"],
"query": {
"bool": {
"must":{
"bool":{
"should": [
{
"match": {
"RouteData": {
"query": "Q25B",
"type": "phrase"
}
}
}
]
}
}
}
}
}
I need to get all distinct _index names from the DB.
You need to use a terms aggregation instead, like this:
POST messages-2017.04*/_search
{
"size": 0,
"query": {
"bool": {
"must":{
"bool":{
"should": [
{
"match": {
"RouteData": {
"query": "Q25B",
"type": "phrase"
}
}
}
]
}
}
}
},
"aggs": {
"all_indexes": {
"terms": {
"field": "_index",
"size": 100
}
}
}
}

Must not query elasticsearch

I have my request:
{
"size": 10,
"query": {
"filtered": {
"query": {
"bool": {
"must": [
{"term": {"event": "matchmaking_done"}}
]
}
},
"filter": {
"range": {
"#timestamp": {
"gt" : "2016-06-01T00:00:00.000Z",
"lte" : "2016-06-01T00:05:00.000Z"
}
}
}
}
},
"aggs" : {
"user-ids" : {
"terms" : { "field" : "user_id",
"size": 0
}
}
}
}
And I need to add into this request parameter - does not contain field pvp_league! I tried add must_not but can't understand how to do this correct.
Help please!
You answered it yourself, but the ES 2.x way to do this is to not use the filtered query because it has been deprecated and it will be removed in ES 5.0. ES 2.x introduces the concept of the "filter" context rather than every query being either just a query or a filter; now every query is both a filter or a query (scored), just depending on the context it's used in.
For your query, this therefore becomes a little simpler because of the simplified bool / filter syntax:
{
"size":10,
"query":{
"bool":{
"must":[
{
"term":{
"event":"matchmaking_done"
}
}
],
"must_not":[
{
"exists":{
"field":"pvp_league"
}
}
],
"filter":[
{
"range":{
"#timestamp":{
"gt":"2016-06-01T00:00:00.000Z",
"lte":"2016-06-01T00:05:00.000Z"
}
}
}
]
}
},
"aggs":{
"user-ids":{
"terms":{
"field":"user_id",
"size":0
}
}
}
}
As a very big aside, specifying "size" : 0 for the terms aggregation, you are requesting all unique terms, up to INT_MAX. That is not a scalable request (works great with 10 user_ids, or even 100, but not 10000 users).
As a not-so-bad aside, your request doesn't need a query context at all because nothing about the search side of it cares about relevance. Your term query ("event" : "matchmaking_done") either matches or it doesn't. Since you either want it to match or not, but you don't really care about order inherently, you should use this in the filter context. This changes the request to:
{
"size": 10,
"query": {
"bool": {
"must_not": [
{
"exists": {
"field": "pvp_league"
}
}
],
"filter":[
{
"range": {
"#timestamp": {
"gt":"2016-06-01T00:00:00.000Z",
"lte":"2016-06-01T00:05:00.000Z"
}
}
},
{
"term": {
"event": "matchmaking_done"
}
}
]
}
},
"aggs": {
"user-ids": {
"terms": {
"field": "user_id",
"size": 0
}
}
}
}
I've found solution! It looks like this:
{
"size": 10,
"query": {
"filtered": {
"query": {
"bool": {
"must": [
{"term": {"event": "matchmaking_done"}}
],
"must_not": [
{"filtered": {
"filter": {
"exists": {
"field": "pvp_league"
}
}
}
}
]
}
},
"filter": {
"range": {
"#timestamp": {
"gt" : "2016-06-01T00:00:00.000Z",
"lte" : "2016-06-01T00:05:00.000Z"
}
}
}
}
},
"aggs" : {
"user-ids" : {
"terms" : { "field" : "user_id",
"size": 0
}
}
}
}

Elastic search filtered query, query part being ignored?

I'm building up the following search in code, the idea being that it filters down the set of matches then queries this so I can add score based on certain fields. For some reason the filter part works but whatever I put in the query (i.e. in the below I have no index sdfsdfsdf) it still returns anything matching the filter.
Is the syntax wrong?
{
"query":{
"filtered":{
"query":{
"bool":{
"must":{
"match":{
"sdfsdfsdf":{
"query":"4",
"boost":2.0
}
}
}
},
"filter":{
"bool":{
"must":[
{
"terms":{
"_id":[
"55f93ead5df34f1900abc20b",
"55f8ab0226ec4bb216d7c938",
"55dc4e949dcf833308c63d6b"
]
}
},
{
"range":{
"published_date":{
"lte":"now"
}
}
}
],
"must_not":{
"terms":{
"_id":[
"55f0a799acccc28204a5058c"
]
}
}
}
}
}
}
}
}
Your filter is not at the right level. It should not be inside query but at the same level as query like this:
{
"query": {
"filtered": {
"query": { <--- query and filter at the same level
"bool": {
"must": {
"match": {
"sdfsdfsdf": {
"query": "4",
"boost": 2
}
}
}
}
},
"filter": { <--- query and filter at the same level
"bool": {
"must": [
{
"terms": {
"_id": [
"55f93ead5df34f1900abc20b",
"55f8ab0226ec4bb216d7c938",
"55dc4e949dcf833308c63d6b"
]
}
},
{
"range": {
"published_date": {
"lte": "now"
}
}
}
],
"must_not": {
"terms": {
"_id": [
"55f0a799acccc28204a5058c"
]
}
}
}
}
}
}
}
You need to replace sdfsdfsdf with your existing field name in your type, e.g. title, otherwise I think it will fallback to match_all query.
"match":{
"title":{
"query": "some text here",
"boost":2.0
}
}

Resources