Related
I have an elasticsearch index containing user locations.
I need to perform aggregate query with geo bounding box using geohash grid, and for buckets that have documents count less than some value, i need to return all documents.
How can i do this?
Since you have not given any relevant information about the index which you have created and the user locations.
I am considering the below data:
index Def
{
"mappings": {
"properties": {
"location": {
"type": "geo_point"
}
}
}
}
Index Sample Doc
POST _bulk
{"index":{"_id":1}}
{"location":"52.37408,4.912350","name":"The golden dragon"}
{"index":{"_id":2}}
{"location":"52.369219,4.901618","name":"Burger King"}
{"index":{"_id":3}}
{"location":"52.371667,4.914722","name":"Wendys"}
{"index":{"_id":4}}
{"location":"51.222900,4.405200","name":"Taco Bell"}
{"index":{"_id":5}}
{"location":"48.861111,2.336389","name":"McDonalds"}
{"index":{"_id":6}}
{"location":"48.860000,2.327000","name":"KFC"}
According to your question:
When requesting detailed buckets a filter like geo_bounding_box
should be applied to narrow the subject area
To know more about this, you can refer to this official ES doc
Now, in order to filter data based on doc_count with aggregations, we can use bucket_selector pipeline aggregation.
From documentation
Pipeline aggregations work on the outputs produced from other
aggregations rather than from document sets, adding information to the
output tree.
So, the amount of work that need to be done to calculate doc_count will be the same.
Query
{
"aggs": {
"location": {
"filter": {
"geo_bounding_box": {
"location": {
"top_left": {
"lat": 52.5225,
"lon": 4.5552
},
"bottom_right": {
"lat": 52.2291,
"lon": 5.2322
}
}
}
},
"aggs": {
"around_amsterdam": {
"geohash_grid": {
"field": "location",
"precision": 8
},
"aggs": {
"the_filter": {
"bucket_selector": {
"buckets_path": {
"the_doc_count": "_count"
},
"script": "params.the_doc_count < 2"
}
}
}
}
}
}
}
}
Search Result
"hits": {
"total": {
"value": 6,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "restaurant",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"location": "52.37408,4.912350",
"name": "The golden dragon"
}
},
{
"_index": "restaurant",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"location": "52.369219,4.901618",
"name": "Burger King"
}
},
{
"_index": "restaurant",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"location": "52.371667,4.914722",
"name": "Wendys"
}
},
{
"_index": "restaurant",
"_type": "_doc",
"_id": "4",
"_score": 1.0,
"_source": {
"location": "51.222900,4.405200",
"name": "Taco Bell"
}
},
{
"_index": "restaurant",
"_type": "_doc",
"_id": "5",
"_score": 1.0,
"_source": {
"location": "48.861111,2.336389",
"name": "McDonalds"
}
},
{
"_index": "restaurant",
"_type": "_doc",
"_id": "6",
"_score": 1.0,
"_source": {
"location": "48.860000,2.327000",
"name": "KFC"
}
}
]
},
"aggregations": {
"location": {
"doc_count": 3,
"around_amsterdam": {
"buckets": [
{
"key": "u173zy3j",
"doc_count": 1
},
{
"key": "u173zvfz",
"doc_count": 1
},
{
"key": "u173zt90",
"doc_count": 1
}
]
}
}
}
}
It will filter out all the documents, whose count is less than 2 based on "params.the_doc_count < 2"
I have a document with a tags field contain "john smith"
This query returns it:
{
"query": {
"bool": {
"filter": {
"term": {
"tags": "john"
}
}
}
}
}
But this not:
{
"query": {
"bool": {
"filter": {
"term": {
"tags": "john smith"
}
}
}
}
}
Why? How can I reach filter matches multiple words?
You need to use the terms query if I understand your requirements properly, which is you want to search for multiple values ie john or smith.
index def
{
"mappings": {
"properties": {
"tags": {
"type": "text"
}
}
}
}
Index sample docs
{
"tags" : "john Lay"
}
{
"tags" : "john opster"
}
{
"tags" : "john smith"
}
Search query for john or lay
{
"query" : {
"terms" : {
"tags" : ["john", "lay"],
"boost" : 1.0
}
}
}
And search result
"hits": [
{
"_index": "so_auto",
"_type": "_doc",
"_id": "9",
"_score": 1.0,
"_source": {
"model_name": "john smith"
}
},
{
"_index": "so_auto",
"_type": "_doc",
"_id": "10",
"_score": 1.0,
"_source": {
"model_name": "john opster"
}
},
{
"_index": "so_auto",
"_type": "_doc",
"_id": "12",
"_score": 1.0,
"_source": {
"model_name": "john Lay"
}
}
I have create sample index with provided fields and it gave correct answer.
Mapping of the tags field that I have used to create index is :
"mappings": {
"properties": {
"tags": {
"type": "keyword"
}
}
}
We are using keyword field since in term query you will require exact match.
I have created three documents in this index with following tags field :
{
"_index": "secesindex",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"tags": "John Smith"
}
},
{
"_index": "secesindex",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"tags": "John Farraday"
}
},
{
"_index": "secesindex",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"tags": "John"
}
}
Now when I am running query as mentioned above :
{
"query": {
"bool": {
"filter": {
"term": {
"tags": "John Smith"
}
}
}
}
}
It gives exact match to the document which has tags field value equal to "John Smith".
"hits": [
{
"_index": "secesindex",
"_type": "_doc",
"_id": "1",
"_score": 0.0,
"_source": {
"tags": "John Smith"
}
}
]
I have written the following lucene query in elasticsearch for getting documents with Id field as mentioned:
GET requirements_v3/_search
{
"from": 0,
"size": 10,
"query": {
"bool": {
"filter": {
"bool": {
"should": [
{"match": {
"Id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b"
}},
{
"match": {
"Id": "048b7907-2b5a-438a-ace9-f1e1fd67ca69"
}
},
{
"match": {
"Id": "3b385896-1207-4f6d-8ae9-f3ced84cf1fa"
}
},
{
"match": {
"Id": "0aa1db52-c0fb-4bf6-9223-00edccc32703"
}
},
{
"match": {
"Id": "8c399993-f273-4ee0-a1ab-3a85c6848113"
}
},
{
"match": {
"Id": "4461eb37-487e-4899-a7be-914640fab0e0"
}
},
{
"match": {
"Id": "07052261-b904-4bfc-a6fd-3acd28114c6a"
}
},
{
"match": {
"Id": "95816ff0-9eae-4196-99fc-86c6f43395fd"
}
},
{
"match": {
"Id": "ea8a59a6-2b2f-467a-9beb-e281b1581a0a"
}
},
{
"match": {
"Id": "33f87d98-024f-4893-aa1c-8d438a98cd1f"
}
}
]
}
}
}
}
The response for the above query is:
{
"took": 14,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 18,
"max_score": 0,
"hits": [
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "9d8060da-c3e2-4f6d-b4e2-17e65b266c76",
"_score": 0,
"_source": {
"Id": "9d8060da-c3e2-4f6d-b4e2-17e65b266c76",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "4461eb37-487e-4899-a7be-914640fab0e0",
"_score": 0,
"_source": {
"Id": "4461eb37-487e-4899-a7be-914640fab0e0",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "33f87d98-024f-4893-aa1c-8d438a98cd1f",
"_score": 0,
"_source": {
"Id": "33f87d98-024f-4893-aa1c-8d438a98cd1f",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "d75d9a7c-e145-487e-922f-102c16d0026f",
"_score": 0,
"_source": {
"Id": "d75d9a7c-e145-487e-922f-102c16d0026f",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "007eadb7-adda-487e-b7fe-6f6b5648de2e",
"_score": 0,
"_source": {
"Id": "007eadb7-adda-487e-b7fe-6f6b5648de2e",
"Name": "Detail Page - Build"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "95816ff0-9eae-4196-99fc-86c6f43395fd",
"_score": 0,
"_source": {
"Id": "95816ff0-9eae-4196-99fc-86c6f43395fd",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "07052261-b904-4bfc-a6fd-3acd28114c6a",
"_score": 0,
"_source": {
"Id": "07052261-b904-4bfc-a6fd-3acd28114c6a",
"Name": "HUC"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "d60daf3a-4681-4bfc-a3a9-b04b5b005f73",
"_score": 0,
"_source": {
"Id": "d60daf3a-4681-4bfc-a3a9-b04b5b005f73",
"Name": "DAMS UpsertUnenrollPrice" }
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "c1b367f2-a57a-487e-994c-84470e0f9db4",
"_score": 0,
"_source": {
"Id": "c1b367f2-a57a-487e-994c-84470e0f9db4",
"Name": "Item Setup"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b",
"_score": 0,
"_source": {
"Id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b",
"Name": "Installments"
}
}
]
}
}
This mentions totalHits as '18'. Why is it returning more items than 10? I believe match query should be used for 'exact' matches, so why more documents are returned here?
P.S.: I know I can use the Ids query for this, but I want to know why is this not returning the correct response
Update: Setting the size to 20 returns the following response:
{
"took": 195,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 18,
"max_score": 0,
"hits": [
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "9d8060da-c3e2-4f6d-b4e2-17e65b266c76",
"_score": 0,
"_source": {
"Id": "9d8060da-c3e2-4f6d-b4e2-17e65b266c76",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "4461eb37-487e-4899-a7be-914640fab0e0",
"_score": 0,
"_source": {
"Id": "4461eb37-487e-4899-a7be-914640fab0e0",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "33f87d98-024f-4893-aa1c-8d438a98cd1f",
"_score": 0,
"_source": {
"Id": "33f87d98-024f-4893-aa1c-8d438a98cd1f",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "d75d9a7c-e145-487e-922f-102c16d0026f",
"_score": 0,
"_source": {
"Id": "d75d9a7c-e145-487e-922f-102c16d0026f",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "007eadb7-adda-487e-b7fe-6f6b5648de2e",
"_score": 0,
"_source": {
"Id": "007eadb7-adda-487e-b7fe-6f6b5648de2e",
"Name": "Detail Page - Build"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "95816ff0-9eae-4196-99fc-86c6f43395fd",
"_score": 0,
"_source": {
"Id": "95816ff0-9eae-4196-99fc-86c6f43395fd",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "07052261-b904-4bfc-a6fd-3acd28114c6a",
"_score": 0,
"_source": {
"Id": "07052261-b904-4bfc-a6fd-3acd28114c6a",
"Name": "HUC"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "d60daf3a-4681-4bfc-a3a9-b04b5b005f73",
"_score": 0,
"_source": {
"Id": "d60daf3a-4681-4bfc-a3a9-b04b5b005f73",
"Name": "DAMS UpsertUnenrollPrice"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "c1b367f2-a57a-487e-994c-84470e0f9db4",
"_score": 0,
"_source": {
"Id": "c1b367f2-a57a-487e-994c-84470e0f9db4",
"Name": "Item Setup"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b",
"_score": 0,
"_source": {
"Id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b",
"Name": "Installments"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "b9437079-47c4-487e-abf0-1ff076f69e0f",
"_score": 0,
"_source": {
"Id": "b9437079-47c4-487e-abf0-1ff076f69e0f",
"Name": "Detail Page - Strings "
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "0aa1db52-c0fb-4bf6-9223-00edccc32703",
"_score": 0,
"_source": {
"Id": "0aa1db52-c0fb-4bf6-9223-00edccc32703",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "ea8a59a6-2b2f-467a-9beb-e281b1581a0a",
"_score": 0,
"_source": {
"Id": "ea8a59a6-2b2f-467a-9beb-e281b1581a0a",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "fd259359-4f6d-4530-ac29-fcebe00d66a6",
"_score": 0,
"_source": {
"Id": "fd259359-4f6d-4530-ac29-fcebe00d66a6",
"Name": "Invite Platform"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "1b2ba0bb-3e7f-46fb-b904-07460b84848b",
"_score": 0,
"_source": {
"Id": "1b2ba0bb-3e7f-46fb-b904-07460b84848b",
"Name": "Training"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "8c399993-f273-4ee0-a1ab-3a85c6848113",
"_score": 0,
"_source": {
"Id": "8c399993-f273-4ee0-a1ab-3a85c6848113",
"Name": "Configure ASIN for Reporting"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "3b385896-1207-4f6d-8ae9-f3ced84cf1fa",
"_score": 0,
"_source": {
"Id": "3b385896-1207-4f6d-8ae9-f3ced84cf1fa",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "048b7907-2b5a-438a-ace9-f1e1fd67ca69",
"_score": 0,
"_source": {
"Id": "048b7907-2b5a-438a-ace9-f1e1fd67ca69",
"Name": "Invite Platform"
}
}
]
}
}
Lets understand this by the following mapping e.g:
{
"_doc": {
"properties": {
"Id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
The above mapping is created dynamically by elasticsearch. Lets us now focus on Id field. Its type is text. By default the analyzer for text datatype is standard analyzer. When this analyzer is applied on the input for this field it get tokenized into terms. So for example if you input value for Id is 33f87d98-024f-4893-aa1c-8d438a98cd1f following tokens get generated:
33f87d98
024f
4893
aa1c
8d438a98cd1f
As you can see the input value is splitted by - being used as delimiter. This is because standard analyzer is applied on it.
There is another sub-field under Id which is keyword and its type is keyword. For type keyword the input is indexed as it is without applying any modification.
Now lets understand why more documents get matched and result count is more than expected. In your query you used match query on Id field as below:
{
"match": {
"Id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b"
}
}
By default match query uses the same analyzer that is applied on the field in mapping. So on the Id value in the query again the same analyzer is applied and the input is splitted into tokens in a similar way as above. The default operator that is applied between tokens of match query input string is OR and hence your query actually becomes:
b8bf49a4 OR 960b OR 4fa8 OR 8c5f OR a3fce4b4d07b
There if any of the above tokens match to any of the indexed terms stored in Id field, the document is considered a match.
Solution for the above based on above mapping:
Use the keyword field instead. So the query becomes:
{
"match": {
"Id.keyword": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b"
}
}
More on how match works see here.
Also as mention by #Curious_MInd in his answer its better to use terms than using multiple match in should.
As you said that your Id is text as well as keyword so you should use Id.keyword for matching exact values like
GET requirements_v3/_search
{
"from": 0,
"size": 10,
"query": {
"bool": {
"filter": {
"bool": {
"should": [
{"match": {
"Id.keyword": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b"
}},
{
"match": {
"Id.keyword": "048b7907-2b5a-438a-ace9-f1e1fd67ca69"
}
}
]
}
}
}
}
But I guess you should use terms if you wants to match multiple exact values. Have a look here. For an example:
{
"terms" : {
"Id" : ["b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b", "048b7907-2b5a-438a-ace9-f1e1fd67ca69"]
}
}
I have the following search query:
{
"query": {
"match": {
"name": "testlib"
}
}
}
When I do this query I get the three results below. What I want to do now is only return one result: the newest #timestamp that doesn't contain version_pre. So in this case, only return AV6qvDXDyHw9vNh6Wlpl.
[
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDXDyHw9vNh6Wlpl",
"_score": 0.2876821,
"_source": {
"#timestamp": "2017-09-21T11:02:15-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 1
}
},
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDF5MtcMTuGknsVs",
"_score": 0.18232156,
"_source": {
"#timestamp": "2017-09-20T17:21:35-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 0
}
},
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDnVyHw9vNh6Wlpn",
"_score": 0.18232156,
"_source": {
"#timestamp": "2017-09-22T13:56:55-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 2,
"version_pre": 0
}
}
]
Use sort (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-sort.html) and https://www.elastic.co/guide/en/elasticsearch/reference/2.3/query-dsl-exists-query.html:
{
"size" : 1,
"sort" : [{ "#timestamp" : {"order" : "asc"}}],
"query" : {
"bool": {
"must_not": {
"exists": {
"field": "version_pre"
}
}
}
Or even, via query string:
/_search?sort=#timestamp:desc&size=1&q=_missing_:version_pre
I'm looking into switching from solr to elasticsearch and have indexed a bunch of documents into it without providing a schema/mapping and a lot of the fields that i would have previously set as indexed strings in solr have been set as both text and keyword fields using multi-fields.
Is there any benifit to having a keyword field also as a text field using multi-fields? in my case most values in fields are single words so i'd imagine it wouldn't matter if they are sent to the analyzer but the es docs seem to imply that keyword fields are not considered when searching or at least treated differently?
Just to expand on that a little further if i search for the term "ipad" would a document score higher if it had "ipad" in a keyword field as well as some other text field vs the same document without the keyword field? and if say "ipad" was only in a keyword field would the document still match?
To answer my own question i created a quick test, pretty much keyword and text fields are equivalent when searching and multi-fields seem to get the same score as their primary type so i guess the second field has no effect on search scoring
Weirdly a multi word value in both keyword and text fields got the same score which i would have expecting the keyword field to score lower or not at all but for my purposes that is fine so i'm not going to investigate it further.
Index Creation
PUT test_index
{
"settings" : {
"number_of_shards" : 1
},
"mappings" : {
"test_type" : {
"properties" : {
"multifield": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"keywordfield": {
"type": "keyword"
},
"textfield": {
"type": "text"
}
}
}
}
}
Data Insert
POST /_bulk
{ "update": { "_index": "test_index", "_type": "test_type", "_id": 1 }
{ "doc" : { "multifield" : "ipad" }, "doc_as_upsert" : true }
{ "update": { "_index": "test_index", "_type": "test_type", "_id": 2 }
{ "doc" : { "keywordfield" : "ipad" }, "doc_as_upsert" : true }
{ "update": { "_index": "test_index", "_type": "test_type", "_id": 3 }
{ "doc" : { "keywordfield" : "a green ipad" }, "doc_as_upsert" : true }
{ "update": { "_index": "test_index", "_type": "test_type", "_id": 4 }
{ "doc" : { "textfield" : "a yellow ipad" }, "doc_as_upsert" : true }
{ "update": { "_index": "test_index", "_type": "test_type", "_id": 5 }
{ "doc" : { "keywordfield" : "ipad", "textfield" : "ipad" }, "doc_as_upsert" : true }
{ "update": { "_index": "test_index", "_type": "test_type", "_id": 6 }
{ "doc" : { "keywordfield" : "unrelated", "textfield" : "hopefully this wont show up" }, "doc_as_upsert" : true }
{ "update": { "_index": "test_index", "_type": "test_type", "_id": 7 }
{ "doc" : { "textfield" : "ipad" }, "doc_as_upsert" : true }
Results
GET /test_index/_search?q=ipad
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0.28122374,
"hits": [
{
"_index": "test_index",
"_type": "test_type",
"_id": "5",
"_score": 0.28122374,
"_source": {
"keywordfield": "ipad",
"textfield": "ipad"
}
},
{
"_index": "test_index",
"_type": "test_type",
"_id": "1",
"_score": 0.2734406,
"_source": {
"multifield": "ipad"
}
},
{
"_index": "test_index",
"_type": "test_type",
"_id": "2",
"_score": 0.2734406,
"_source": {
"keywordfield": "ipad"
}
},
{
"_index": "test_index",
"_type": "test_type",
"_id": "7",
"_score": 0.2734406,
"_source": {
"textfield": "ipad"
}
},
{
"_index": "test_index",
"_type": "test_type",
"_id": "3",
"_score": 0.16417998,
"_source": {
"keywordfield": "a green ipad"
}
},
{
"_index": "test_index",
"_type": "test_type",
"_id": "4",
"_score": 0.16417998,
"_source": {
"textfield": "a yellow ipad"
}
}
]
}
}