elastic search query for aggregating on sub-fields of array - elasticsearch

My Elasticsearch index contains data in the following format: each document has a term and a data array of objects, each of which holds the date, visits, ranking and orders for that term on that date:
{
"term": "ふるさと納税",
"data": [
{
"date": "2018-01-25",
"visits": 17670,
"ranking": 1,
"orders": 154
},
{
"date": "2018-02-14",
"visits": 13758,
"ranking": 1,
"orders": 116
},
{
"date": "2017-12-24",
"visits": 142578,
"ranking": 1,
"orders": 2565
},
{
"date": "2018-03-08",
"visits": 21799,
"ranking": 1,
"orders": 312
}
]
},{
"term": "帯 中古 振袖",
"data": [
{
"date": "2018-01-30",
"ranking": 2966,
"orders": 0,
"visits": 345
}
]
}
I would like to sum all the visits and orders for each term within a defined date range.
I have created this query:
{
"_source": [],
"query": {
"bool": {
"filter": [
{"range": {"data.date": {"gte" : "2018-03-21"}}},
{"range": {"data.date": {"lte" : "2018-03-21"}}}
]
}
},
"aggs" : {
"by_term": {
"terms": {
"field": "term",
"order":{"sum_ranking":"desc"},
"size":100
},"aggs": {
"sum_ranking": {
"sum": {
"field" : "data.visits"
}
}
}
}
},
"from" : 0,
"size" : 0
}
It seems the filter is not working.
Can anyone help?
The mapping is:
{
"settings" : {
"number_of_shards" : 1
},
"mappings" : {
"keyword" : {
"properties" : {
"term" : { "type" : "keyword" }
}
}
}
}

Related

Elasticsearch Normalised Score with Boost Documents

I am building a query that takes a set of codes and geo_point locations. The result should be a list of documents ordered by distance to origin. However, I would like the score to be normalised, with, let's say, a score of 10 for the document at the origin location, decreasing with distance from the origin. I have actually managed to build this search, but I would also like to increase the score of a document if it includes an additional variable in its list of codes.
These are the requirements:
The output should be a list of documents whose score is normalised according to distance from origin.
Documents returned should contain at least one yvar (i.e. yvar1 OR yvar2 OR yvar3 OR yvar...).
Only documents after a certain date should be returned
Only documents containing all the xvars passed to the query must be returned.
If a document has an additional x variable (e.g. xvar4), the score for this document should be increased by 0.1. This is the bit I am struggling with.
This is my mapping:
{
"mappings": {
"properties": {
"codes": {
"type": "keyword"
},
"date": {
"type": "date",
"format": "dd/MM/yyyy"
},
"coordinates": {"type": "geo_point"}
}
}
}
Some example documents (NB: The distanceToOrigin is for analysing the output only):
{ "create" : { "_index": "my-index", "_id" : "1" } }
{ "id": 1, "coordinates": { "lat": 51.5132, "lon": -0.1362}, "available capacity": 5, "last updated": "01/11/2021", "ResponseCodes": ["xvar1", "xvar2", "xvar3", "yvar1", "yvar2", "yvar3" ] ,"distanceTOorigin": 0 }
{ "create" : { "_index": "my-index", "_id" : "2" } }
{ "id": 2, "coordinates": { "lat": 52.9114, "lon": 0.5580}, "available capacity": 5, "last updated": "01/11/2021", "ResponseCodes": ["xvar1", "xvar2", "xvar3", "xvar4", "yvar1", "yvar2", "yvar3" ] ,"distanceTOorigin": 114 }
{ "create" : { "_index": "my-index", "_id" : "3" } }
{ "id": 3, "coordinates": { "lat": 51.4890, "lon": -0.6029}, "available capacity": 5, "last updated": "01/11/2021", "ResponseCodes": ["xvar1", "xvar2", "xvar3", "yvar1", "yvar2", "yvar3" ] ,"distanceTOorigin": 22 }
{ "create" : { "_index": "my-index", "_id" : "4" } }
{ "id": 4, "coordinates": { "lat": 57.2555, "lon": -3.2692}, "available capacity": 5, "last updated": "01/11/2021", "ResponseCodes": ["xvar1", "xvar2", "xvar3", "yvar1", "yvar2", "yvar3" ] ,"distanceTOorigin": 530 }
My query which produces a normalised list of documents:
{
"query": {
"function_score": {
"query": { "match_all": {} },
"boost": "1",
"functions": [
{
"filter": [
{ "range": { "date":{ "gte": "01/11/2000" }}},
{ "terms_set": { "codes" : { "terms" : ["yvar1", "yvar2", "yvar3" ],
"minimum_should_match_script": { "source": "1" }}}}
],
"random_score": {},
"weight": 1
},
{
"filter": [
{ "terms_set": { "codes" : { "terms" : ["xvar1", "xvar2", "xvar3" ],
"minimum_should_match_script": { "source": "params.num_terms" }}}}
],
"weight": 1
},
{
"exp": {
"coordinates": {
"origin": "51.5132, -0.1362",
"offset": "0km",
"decay": 0.5,
"scale":"350km"}
},
"weight": 10
}
],
"max_boost": 10,
"score_mode": "max",
"boost_mode": "multiply"
}
}
}
This is what I tried as a query (substituting the match_all query), but it does not work, as I end up with a non-normalised list:
"query": {
"bool": {
"should": [
{
"terms_set": { "codes" : { "terms" : ["xvar4"],
"minimum_should_match_script": { "source": "0" }, "boost" : 0.1}}
},
{
"match_all": {}
}
]
}
}
Any help for this Elasticsearch beginner will be greatly appreciated.
I found the solution by accessing the _score in a script_score query:
{
"query": {
"script_score": {
"query": {
"match": { "codes": "xvar4" }
},
"script": {
"source": "_score +0.1"
}
}
}
}

Nested Query Elastic Search

Currently I am trying to search/filter a nested document in Elasticsearch using Spring Data.
The Current Document Structure is:
{
"id": 1,
"customername": "Cust#123",
"policydetails": {
"address": {
"city": "Irvine",
"state": "CA",
"address2": "23994384, Out OF World",
"post_code": "92617"
},
"policy_data": [
{
"id": 1,
"status": true,
"issue": "Variation Issue"
},
{
"id": 32,
"status": false,
"issue": "NoiseIssue"
}
]
}
}
Now we need to filter the policy_data down to the entries which have a Noise Issue; if there is no policy_data entry with a Noise Issue, policy_data should be null inside the parent document.
I have tried to use this Query
{
"query": {
"bool": {
"must": [
{
"match": {
"customername": "Cust#345"
}
},
{
"nested": {
"path": "policiesDetails.policy_data",
"query": {
"bool": {
"must": {
"terms": {
"policiesDetails.policy_data.issue": [
"Noise Issue"
]
}
}
}
}
}
}
]
}
}
}
This works fine for filtering the nested document. But if no nested document matches, it removes the entire document from the results.
What I want, if the nested filter does not match, is:
{
"id": 1,
"customername": "Cust#123",
"policydetails": {
"address": {
"city": "Irvine",
"state": "CA",
"address2": "23994384, Out OF World",
"post_code": "92617"
},
"policy_data": null
}
With the nested query inside the must clause, the parent document is not returned when no nested document matches.
You can use a should clause for policy_data instead. If a nested document matches, it will be returned under inner_hits; otherwise the parent document is still returned, with empty inner_hits.
{
"query": {
"bool": {
"must": [
{
"match": {
"customername": "Cust#345"
}
}
],
"should": [
{
"nested": {
"path": "policydetails.policy_data",
"inner_hits": {}, --> to return matched policy_data
"query": {
"bool": {
"must": {
"terms": {
"policydetails.policy_data.issue": [
"Noise Issue"
]
}
}
}
}
}
}
]
}
},
"_source": ["id","customername","policydetails.address"] --> selected fields
}
Result:
{
"_index" : "index116",
"_type" : "_doc",
"_id" : "f1SxGHoB5tcHqHDtAkTC",
"_score" : 0.2876821,
"_source" : {
"policydetails" : {
"address" : {
"city" : "Irvine",
"address2" : "23994384, Out OF World",
"post_code" : "92617",
"state" : "CA"
}
},
"id" : 1,
"customername" : "Cust#123"
},
"inner_hits" : {
"policydetails.policy_data" : {
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ] --> nested query result , matched document returned
}
}
}
}

Should and Filter combination in ElasticSearch

I have this query, which returns the correct result:
GET /person/_search
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"nameDetails.name.nameValue.surname": {
"value": "Pibba",
"fuzziness": "AUTO"
}
}
},
{
"fuzzy": {
"nameDetails.nameValue.firstName": {
"value": "Fawsu",
"fuzziness": "AUTO"
}
}
}
]
}
}
}
and the result is below:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 3.6012557,
"hits" : [
{
"_index" : "person",
"_type" : "_doc",
"_id" : "70002",
"_score" : 3.6012557,
"_source" : {
"gender" : "Male",
"activeStatus" : "Inactive",
"deceased" : "No",
"nameDetails" : {
"name" : [
{
"nameValue" : {
"firstName" : "Fawsu",
"middleName" : "L.",
"surname" : "Pibba"
},
"nameType" : "Primary Name"
},
{
"nameValue" : {
"firstName" : "Fausu",
"middleName" : "L.",
"surname" : "Pibba"
},
"nameType" : "Spelling Variation"
}
]
}
}
}
]
}
But when I add the filter for gender, it returns no result:
GET /person/_search
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"nameDetails.name.nameValue.surname": {
"value": "Pibba",
"fuzziness": "AUTO"
}
}
},
{
"fuzzy": {
"nameDetails.nameValue.firstName": {
"value": "Fawsu",
"fuzziness": "AUTO"
}
}
}
],
"filter": [
{
"term": {
"gender": "Male"
}
}
]
}
}
}
Even if I use only the filter, it returns no result:
GET /person/_search
{
"query": {
"bool": {
"filter": [
{
"term": {
"gender": "Male"
}
}
]
}
}
}
You are not getting any search result, because you are using the term query (in the filter clause). Term query will return the document only if it has an exact match.
A standard analyzer is used when no analyzer is specified, which will tokenize Male to male. So either you can search for male instead of Male or use any of the below solutions.
If you have not defined any explicit index mapping, you need to add .keyword to the gender field. The gender.keyword sub-field created by the default dynamic mapping is of type keyword, so the value is indexed as-is instead of being run through the standard analyzer (notice the ".keyword" after the gender field). Try the query below -
{
"query": {
"bool": {
"filter": [
{
"term": {
"gender.keyword": "Male"
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "66879128",
"_type": "_doc",
"_id": "1",
"_score": 0.0,
"_source": {
"gender": "Male",
"activeStatus": "Inactive",
"deceased": "No",
"nameDetails": {
"name": [
{
"nameValue": {
"firstName": "Fawsu",
"middleName": "L.",
"surname": "Pibba"
},
"nameType": "Primary Name"
},
{
"nameValue": {
"firstName": "Fausu",
"middleName": "L.",
"surname": "Pibba"
},
"nameType": "Spelling Variation"
}
]
}
}
}
]
If you have defined index mapping, then modify the mapping for gender field as shown below
{
"mappings": {
"properties": {
"gender": {
"type": "keyword"
}
}
}
}

Elasticsearch: aggregation and select docs only having max value of field

I am using elastic search 6.5.
Basically, based on my query, my index can return multiple documents; I need only those documents which have the max value for a particular field.
E.g.
{
"query": {
"bool": {
"must": [
{
"match": { "header.date" : "2019-07-02" }
},
{
"match": { "header.field" : "ABC" }
},
{
"bool": {
"should": [
{
"regexp": { "body.meta.field": "myregex1" }
},
{
"regexp": { "body.meta.field": "myregex2" }
}
]
}
}
]
}
},
"size" : 10000
}
The above query will return lots of documents/messages as per the query. The sample data returned is:
"header" : {
"id" : "Text_20190702101200123_111",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex1",
"timestamp": "2019-07-02T10:12:00.123Z",
}
}
-----------------
"header" : {
"id" : "Text_20190702151200123_121",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex2",
"timestamp": "2019-07-02T15:12:00.123Z",
}
}
-----------------
"header" : {
"id" : "Text_20190702081200133_124",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex1",
"timestamp": "2019-07-02T08:12:00.133Z",
}
}
So based on the above 3 documents, I only want the max timestamp one to be shown i.e. "timestamp": "2019-07-02T15:12:00.123Z"
I only want one document in above example.
I tried doing it as below:
{
"query": {
"bool": {
"must": [
{
"match": { "header.date" : "2019-07-02" }
},
{
"match": { "header.field" : "ABC" }
},
{
"bool": {
"should": [
{
"regexp": { "body.meta.field": "myregex1" }
},
{
"regexp": { "body.meta.field": "myregex2" }
}
]
}
}
]
}
},
"aggs": {
"group": {
"terms": {
"field": "header.id",
"order": { "group_docs" : "desc" }
},
"aggs" : {
"group_docs": { "max" : { "field": "body.meta.tiemstamp" } }
}
}
},
"size": "10000"
}
Executing the above, I am still getting all 3 documents instead of only one.
I do get the buckets, but I need only one of them, not all of them.
The output, in addition to all the records, is:
"aggregations": {
"group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Text_20190702151200123_121",
"doc_count": 29,
"group_docs": {
"value": 1564551683867,
"value_as_string": "2019-07-02T15:12:00.123Z"
}
},
{
"key": "Text_20190702101200123_111",
"doc_count": 29,
"group_docs": {
"value": 1564551633912,
"value_as_string": "2019-07-02T10:12:00.123Z"
}
},
{
"key": "Text_20190702081200133_124",
"doc_count": 29,
"group_docs": {
"value": 1564510566971,
"value_as_string": "2019-07-02T08:12:00.133Z"
}
}
]
}
}
What am I missing here?
Please note that I can have more than one message for the same timestamp. So I want them all, i.e. all the messages/documents belonging to the max timestamp.
In the above example there are 29 messages for the same timestamp (it can be any number). So there are 29 * 3 messages being retrieved by my query after using the above aggregation.
Basically I am able to group correctly; I am looking for something like HAVING in SQL.

How to aggregate until a certain value is reached in ElasticSearch?

I would like to aggregate a list of documents (each of them has two fields - timestamp and amount) by the "amount" field until a certain value is reached. For example, I would like to get a list of documents sorted by timestamp whose total amount is equal to 100. Is it possible to do this in one query?
Here is my query, which returns the total amount - I would like to add a condition here to stop the aggregation when a certain value is reached.
{
"query": {
"bool": {
"filter": [
{
"range": {
"timestamp": {
"gte": 1525168583
}
}
}
]
}
},
"aggs": {
"total_amount": {
"sum": {
"field": "amount"
}
}
},
"sort": [
"timestamp"
],
"size": 10000
}
Thank You
It's perfectly possible using a combination of function_score scripting for mimicking sorting, filter aggs for the range gte query and a healthy amount of scripted_metric aggs to limit the summation up to a certain amount.
Let's first set up a mapping and ingest some docs:
PUT summation
{
"mappings": {
"properties": {
"timestamp": {
"type": "date",
"format": "epoch_second"
}
}
}
}
POST summation/_doc
{
"context": "newest",
"timestamp": 1587049128,
"amount": 20
}
POST summation/_doc
{
"context": "2nd newest",
"timestamp": 1586049128,
"amount": 30
}
POST summation/_doc
{
"context": "3rd newest",
"timestamp": 1585049128,
"amount": 40
}
POST summation/_doc
{
"context": "4th newest",
"timestamp": 1585049128,
"amount": 30
}
Then perform the query:
GET summation/_search
{
"size": 0,
"aggs": {
"filtered_agg": {
"filter": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1585049128
}
}
},
{
"function_score": {
"query": {
"match_all": {}
},
"script_score": {
"script": {
"source": "return (params['now'] - doc['timestamp'].date.toMillis())",
"params": {
"now": 1587049676
}
}
}
}
}
]
}
},
"aggs": {
"limited_sum": {
"scripted_metric": {
"init_script": """
state['my_hash'] = new HashMap();
state['my_hash'].put('sum', 0);
state['my_hash'].put('docs', new ArrayList());
""",
"map_script": """
if (state['my_hash']['sum'] <= 100) {
state['my_hash']['sum'] += doc['amount'].value;
state['my_hash']['docs'].add(doc['context.keyword'].value);
}
""",
"combine_script": "return state['my_hash']",
"reduce_script": "return states[0]"
}
}
}
}
}
}
yielding
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"filtered_agg" : {
"meta" : { },
"doc_count" : 4,
"limited_sum" : {
"value" : {
"docs" : [
"newest",
"2nd newest",
"3rd newest",
"4th newest"
],
"sum" : 120
}
}
}
}
}
I've chosen here to only return the doc.contexts but you can adjust it to retrieve whatever you like -- be it IDs, amounts etc.

Resources