Combine inner hits in elasticsearch? - elasticsearch

I currently have a dataset in which each vendor document holds its products in a nested field. I have various queries that check for search terms within the nested products array. Ideally, I want to be able to combine all the inner hits so that I can sort on things such as score ranking and price. At the moment the search results come back on a per-document basis. Is it possible to combine inner hits in Elasticsearch so that I get just a single list of all the matching products?
Example Query
{
"_source": {
"includes": [ "*" ],
"excludes": [ "products" ]
},
"query": {
"nested": {
"path": "products",
"inner_hits": {
"size": 10,
"_source": [
"title"
]
},
"query": {
"bool": {
"must": [
{
"match": {
"products.title" : {
"query": "Dress",
"fuzziness" : 0
}
}
}
]
}
}
}
}
}
Example output
{
"took": 477,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 2.9072125,
"hits": [
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a806c7af36d28314de953ff",
"_score": 2.9072125,
"_source": {
"name": "Argos",
"locations": [
{
"lon": -2.242797,
"lat": 53.482952
}
]
},
"inner_hits": {
"products": {
"hits": {
"total": 3,
"max_score": 3.0782251,
"hits": [
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a806c7af36d28314de953ff",
"_nested": {
"field": "products",
"offset": 3348
},
"_score": 3.0782251,
"_source": {
"title": "HOME Set of 2 Dress Covers - White"
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a806c7af36d28314de953ff",
"_nested": {
"field": "products",
"offset": 2599
},
"_score": 3.0782251,
"_source": {
"title": "Chad Valley Designabear Spotty Dress Outfit"
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a806c7af36d28314de953ff",
"_nested": {
"field": "products",
"offset": 771
},
"_score": 2.5651875,
"_source": {
"title": "Melissa and Doug Abby & Emma Magnetic Wooden Dress Up"
}
}
]
}
}
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_score": 2.3227787,
"_source": {
"name": "Superdry",
"locations": [
{
"lon": -2.241703,
"lat": 53.483469
}
]
},
"inner_hits": {
"products": {
"hits": {
"total": 186,
"max_score": 2.378731,
"hits": [
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6420
},
"_score": 2.378731,
"_source": {
"title": "Alexia Off Shoulder Dress"
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6417
},
"_score": 2.378731,
"_source": {
"title": "Erin Festival Skater Dress "
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6416
},
"_score": 2.378731,
"_source": {
"title": "Erin Racer Dress "
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6415
},
"_score": 2.378731,
"_source": {
"title": "Alice Knot Dress"
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6412
},
"_score": 2.378731,
"_source": {
"title": "Alice Knot Dress"
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6389
},
"_score": 2.378731,
"_source": {
"title": "Lagoon Logo Midi Dress"
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6388
},
"_score": 2.378731,
"_source": {
"title": "50's Boardwalk Dress "
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6386
},
"_score": 2.378731,
"_source": {
"title": "50's Boardwalk Dress "
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6385
},
"_score": 2.378731,
"_source": {
"title": "Graphic Sweat Dress"
}
},
{
"_index": "shopit",
"_type": "businesses",
"_id": "5a5c3beb734d1d3471839b1d",
"_nested": {
"field": "products",
"offset": 6382
},
"_score": 2.378731,
"_source": {
"title": "Breton Bardot Stripe Dress"
}
}
]
}
}
}
}
]
}
}

Never mind, I should have paid closer attention to the Elasticsearch documentation, which states:
Search requests return the whole document, not just the matching
nested documents. Although there are plans afoot to support returning
the best-matching nested documents with the root document, this is
not yet supported.
I think parent-child relationships are probably the way to go with this.

Related

Wrong score in elastic search result

Not getting the correct score for the elastic search query result.
ES Query -
{
"from": 0,
"size": 10,
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "(emergency) OR (emergency*) OR (*emergency) OR (*emergency*)",
"fields": [
"MDMGlobalData.Name1"
]
}
}
]
}
}
}
ES result -
{
"took": 29,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 798,
"relation": "eq"
},
"max_score": 9.169065,
"hits": [
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551037160",
"_score": 9.169065,
"_source": {
"MDMGlobalData": {
"Name1": "PARAGON EMERGENCY"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551040507",
"_score": 9.169065,
"_source": {
"MDMGlobalData": {
"Name1": "EMERGENCY MD"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551076447",
"_score": 9.169065,
"_source": {
"MDMGlobalData": {
"Name1": "COASTAL EMERGENCY"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551100746",
"_score": 9.169065,
"_source": {
"MDMGlobalData": {
"Name1": "EMERGENCY MD"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551090880",
"_score": 9.169065,
"_source": {
"MDMGlobalData": {
"Name1": "PAFFORD EMERGENCY"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551106787",
"_score": 9.169065,
"_source": {
"MDMGlobalData": {
"Name1": "CAPROCK EMERGENCY"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551021568",
"_score": 9.121077,
"_source": {
"MDMGlobalData": {
"Name1": "WILTON EMERGENCY"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551124137",
"_score": 9.121077,
"_source": {
"MDMGlobalData": {
"Name1": "EMERGENCY ONE"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551125549",
"_score": 9.121077,
"_source": {
"MDMGlobalData": {
"Name1": "EMERGENCY ONE"
}
}
},
{
"_index": "customermasterdata",
"_type": "_doc",
"_id": "MDMCM551133066",
"_score": 9.121077,
"_source": {
"MDMGlobalData": {
"Name1": "EMERGENCY MD"
}
}
}
]
}
}
Ideally, the first results should be the documents whose Name1 value is exactly "emergency" or starts with the word "emergency".
Also, how can the first five or so results all have the same score when their Name1 values are different?
Because of the wrong scoring, the results are in the wrong order.
How can I correct the scores in the result?
No, that need not be the case, because ES follows the Lucene scoring function.
Reason for the same score:
You have only two terms in each document - emergency and one more word
The word "emergency" matches as-is, and the field length is the same.
The number of occurrences is one, i.e. the term frequencies are the same.
The inverse document frequency (idf) of the matched term is the same for every document.
Coord is the same, as each doc contains only one occurrence of "emergency".
But if you have a document with Emergency X Y Z, then its score will be lower than that of the other documents you have, because its field is longer (the term frequency is still one, but the field-length normalization is lower).
And if you have a document containing only Emergency, its score will be higher than all the others.
It is perfectly normal to have the same score in your scenario, since nothing in the query indicates which "emergency" the user meant.
Update:
{
"query":{
"bool":{
"must":{
"term":{
"MDMGlobalData.Name1":"emergency"
}
}
}
}
}
With the sample data, output:
"hits": [
{
"_index": "emerge",
"_type": "_doc",
"_id": "iN1hKnMBojxRtp6HNI7d",
"_score": 0.10938574,
"_source": {
"MDMGlobalData": {
"Name1": "EMERGENCY"
}
}
},
{
"_index": "emerge",
"_type": "_doc",
"_id": "g91TKnMBojxRtp6Hto4q",
"_score": 0.08701137,
"_source": {
"MDMGlobalData": {
"Name1": "PARAGON EMERGENCY"
}
}
},
{
"_index": "emerge",
"_type": "_doc",
"_id": "hN1TKnMBojxRtp6H2I6A",
"_score": 0.08701137,
"_source": {
"MDMGlobalData": {
"Name1": "EMERGENCY MD"
}
}
},
{
"_index": "emerge",
"_type": "_doc",
"_id": "hd1TKnMBojxRtp6H_I6_",
"_score": 0.08701137,
"_source": {
"MDMGlobalData": {
"Name1": "COASTAL EMERGENCY"
}
}
},
{
"_index": "emerge",
"_type": "_doc",
"_id": "h91VKnMBojxRtp6HYI4e",
"_score": 0.07223585,
"_source": {
"MDMGlobalData": {
"Name1": "EMERGENCY MD X"
}
}
}
]

"match" query along with "should" clause giving more than required match results in Elasticsearch

I have written the following lucene query in elasticsearch for getting documents with Id field as mentioned:
GET requirements_v3/_search
{
"from": 0,
"size": 10,
"query": {
"bool": {
"filter": {
"bool": {
"should": [
{"match": {
"Id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b"
}},
{
"match": {
"Id": "048b7907-2b5a-438a-ace9-f1e1fd67ca69"
}
},
{
"match": {
"Id": "3b385896-1207-4f6d-8ae9-f3ced84cf1fa"
}
},
{
"match": {
"Id": "0aa1db52-c0fb-4bf6-9223-00edccc32703"
}
},
{
"match": {
"Id": "8c399993-f273-4ee0-a1ab-3a85c6848113"
}
},
{
"match": {
"Id": "4461eb37-487e-4899-a7be-914640fab0e0"
}
},
{
"match": {
"Id": "07052261-b904-4bfc-a6fd-3acd28114c6a"
}
},
{
"match": {
"Id": "95816ff0-9eae-4196-99fc-86c6f43395fd"
}
},
{
"match": {
"Id": "ea8a59a6-2b2f-467a-9beb-e281b1581a0a"
}
},
{
"match": {
"Id": "33f87d98-024f-4893-aa1c-8d438a98cd1f"
}
}
]
}
}
}
}
The response for the above query is:
{
"took": 14,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 18,
"max_score": 0,
"hits": [
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "9d8060da-c3e2-4f6d-b4e2-17e65b266c76",
"_score": 0,
"_source": {
"Id": "9d8060da-c3e2-4f6d-b4e2-17e65b266c76",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "4461eb37-487e-4899-a7be-914640fab0e0",
"_score": 0,
"_source": {
"Id": "4461eb37-487e-4899-a7be-914640fab0e0",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "33f87d98-024f-4893-aa1c-8d438a98cd1f",
"_score": 0,
"_source": {
"Id": "33f87d98-024f-4893-aa1c-8d438a98cd1f",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "d75d9a7c-e145-487e-922f-102c16d0026f",
"_score": 0,
"_source": {
"Id": "d75d9a7c-e145-487e-922f-102c16d0026f",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "007eadb7-adda-487e-b7fe-6f6b5648de2e",
"_score": 0,
"_source": {
"Id": "007eadb7-adda-487e-b7fe-6f6b5648de2e",
"Name": "Detail Page - Build"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "95816ff0-9eae-4196-99fc-86c6f43395fd",
"_score": 0,
"_source": {
"Id": "95816ff0-9eae-4196-99fc-86c6f43395fd",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "07052261-b904-4bfc-a6fd-3acd28114c6a",
"_score": 0,
"_source": {
"Id": "07052261-b904-4bfc-a6fd-3acd28114c6a",
"Name": "HUC"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "d60daf3a-4681-4bfc-a3a9-b04b5b005f73",
"_score": 0,
"_source": {
"Id": "d60daf3a-4681-4bfc-a3a9-b04b5b005f73",
"Name": "DAMS UpsertUnenrollPrice" }
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "c1b367f2-a57a-487e-994c-84470e0f9db4",
"_score": 0,
"_source": {
"Id": "c1b367f2-a57a-487e-994c-84470e0f9db4",
"Name": "Item Setup"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b",
"_score": 0,
"_source": {
"Id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b",
"Name": "Installments"
}
}
]
}
}
This reports the total hits as 18. Why is it matching more than the 10 documents I asked for? I believed the match query should be used for 'exact' matches, so why are more documents returned here?
P.S.: I know I can use the Ids query for this, but I want to know why this is not returning the correct response.
Update: Setting the size to 20 returns the following response:
{
"took": 195,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 18,
"max_score": 0,
"hits": [
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "9d8060da-c3e2-4f6d-b4e2-17e65b266c76",
"_score": 0,
"_source": {
"Id": "9d8060da-c3e2-4f6d-b4e2-17e65b266c76",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "4461eb37-487e-4899-a7be-914640fab0e0",
"_score": 0,
"_source": {
"Id": "4461eb37-487e-4899-a7be-914640fab0e0",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "33f87d98-024f-4893-aa1c-8d438a98cd1f",
"_score": 0,
"_source": {
"Id": "33f87d98-024f-4893-aa1c-8d438a98cd1f",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "d75d9a7c-e145-487e-922f-102c16d0026f",
"_score": 0,
"_source": {
"Id": "d75d9a7c-e145-487e-922f-102c16d0026f",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "007eadb7-adda-487e-b7fe-6f6b5648de2e",
"_score": 0,
"_source": {
"Id": "007eadb7-adda-487e-b7fe-6f6b5648de2e",
"Name": "Detail Page - Build"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "95816ff0-9eae-4196-99fc-86c6f43395fd",
"_score": 0,
"_source": {
"Id": "95816ff0-9eae-4196-99fc-86c6f43395fd",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "07052261-b904-4bfc-a6fd-3acd28114c6a",
"_score": 0,
"_source": {
"Id": "07052261-b904-4bfc-a6fd-3acd28114c6a",
"Name": "HUC"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "d60daf3a-4681-4bfc-a3a9-b04b5b005f73",
"_score": 0,
"_source": {
"Id": "d60daf3a-4681-4bfc-a3a9-b04b5b005f73",
"Name": "DAMS UpsertUnenrollPrice"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "c1b367f2-a57a-487e-994c-84470e0f9db4",
"_score": 0,
"_source": {
"Id": "c1b367f2-a57a-487e-994c-84470e0f9db4",
"Name": "Item Setup"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b",
"_score": 0,
"_source": {
"Id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b",
"Name": "Installments"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "b9437079-47c4-487e-abf0-1ff076f69e0f",
"_score": 0,
"_source": {
"Id": "b9437079-47c4-487e-abf0-1ff076f69e0f",
"Name": "Detail Page - Strings "
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "0aa1db52-c0fb-4bf6-9223-00edccc32703",
"_score": 0,
"_source": {
"Id": "0aa1db52-c0fb-4bf6-9223-00edccc32703",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "ea8a59a6-2b2f-467a-9beb-e281b1581a0a",
"_score": 0,
"_source": {
"Id": "ea8a59a6-2b2f-467a-9beb-e281b1581a0a",
"Name": "Create Configurator"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "fd259359-4f6d-4530-ac29-fcebe00d66a6",
"_score": 0,
"_source": {
"Id": "fd259359-4f6d-4530-ac29-fcebe00d66a6",
"Name": "Invite Platform"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "1b2ba0bb-3e7f-46fb-b904-07460b84848b",
"_score": 0,
"_source": {
"Id": "1b2ba0bb-3e7f-46fb-b904-07460b84848b",
"Name": "Training"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "8c399993-f273-4ee0-a1ab-3a85c6848113",
"_score": 0,
"_source": {
"Id": "8c399993-f273-4ee0-a1ab-3a85c6848113",
"Name": "Configure ASIN for Reporting"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "3b385896-1207-4f6d-8ae9-f3ced84cf1fa",
"_score": 0,
"_source": {
"Id": "3b385896-1207-4f6d-8ae9-f3ced84cf1fa",
"Name": "Create Extended/Limited Warranty Configuration"
}
},
{
"_index": "requirements_v3",
"_type": "_doc",
"_id": "048b7907-2b5a-438a-ace9-f1e1fd67ca69",
"_score": 0,
"_source": {
"Id": "048b7907-2b5a-438a-ace9-f1e1fd67ca69",
"Name": "Invite Platform"
}
}
]
}
}
Let's understand this using the following example mapping:
{
"_doc": {
"properties": {
"Id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"Name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
The above mapping is created dynamically by Elasticsearch. Let us now focus on the Id field. Its type is text. By default, the analyzer for the text datatype is the standard analyzer. When this analyzer is applied to the input for this field, the input gets tokenized into terms. So, for example, if your input value for Id is 33f87d98-024f-4893-aa1c-8d438a98cd1f, the following tokens get generated:
33f87d98
024f
4893
aa1c
8d438a98cd1f
As you can see, the input value is split, with - used as the delimiter. This is because the standard analyzer is applied to it.
There is another sub-field under Id, named keyword, whose type is keyword. For the keyword type, the input is indexed as-is, without any modification.
Now let's understand why more documents get matched and the result count is higher than expected. In your query you used a match query on the Id field, as below:
{
"match": {
"Id": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b"
}
}
By default, the match query uses the same analyzer that is applied to the field in the mapping. So the same analyzer is applied again to the Id value in the query, and the input is split into tokens in the same way as above. The default operator applied between the tokens of a match query's input string is OR, and hence your query actually becomes:
b8bf49a4 OR 960b OR 4fa8 OR 8c5f OR a3fce4b4d07b
Therefore, if any of the above tokens matches any of the indexed terms stored in the Id field, the document is considered a match.
Solution for the above based on above mapping:
Use the keyword field instead. So the query becomes:
{
"match": {
"Id.keyword": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b"
}
}
For more on how match works, see here.
Also, as mentioned by @Curious_MInd in his answer, it's better to use terms than multiple match clauses in should.
As you said, your Id is mapped as both text and keyword, so you should use Id.keyword to match exact values, like this:
GET requirements_v3/_search
{
"from": 0,
"size": 10,
"query": {
"bool": {
"filter": {
"bool": {
"should": [
{"match": {
"Id.keyword": "b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b"
}},
{
"match": {
"Id.keyword": "048b7907-2b5a-438a-ace9-f1e1fd67ca69"
}
}
]
}
}
}
}
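But I guess you should use terms if you want to match multiple exact values. Have a look here. For example (note that terms does not analyze its input, so on a text-mapped Id it must target the Id.keyword sub-field rather than Id):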
{
"terms" : {
"Id" : ["b8bf49a4-960b-4fa8-8c5f-a3fce4b4d07b", "048b7907-2b5a-438a-ace9-f1e1fd67ca69"]
}
}

How to sort by match prioritising the most left words matched

How to sort by match prioritising the most left words matched
Explanation
Sort the results of a prefix query by the word that matched, prioritising matches in the leftmost words.
Tests I've made
Data
DELETE /test
PUT /test
PUT /test/person/_mapping
{
"properties": {
"name": {
"type": "multi_field",
"fields": {
"name": {"type": "string"},
"original": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
PUT /test/person/1
{"name": "Berta Kassulke"}
PUT /test/person/2
{"name": "Kaley Bartoletti"}
PUT /test/person/3
{"name": "Kali Hahn"}
PUT /test/person/4
{"name": "Karolann Klein"}
PUT /test/person/5
{"name": "Sofia Mandez Kaloo"}
The mapping was added for the 'sort on original value' test.
Simple query
Query
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
}
}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": 1,
"_source": {
"name": "Karolann Klein"
}
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": 1,
"_source": {
"name": "Sofia Mandez Kaloo"
}
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": 1,
"_source": {
"name": "Berta Kassulke"
}
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": 1,
"_source": {
"name": "Kaley Bartoletti"
}
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": 1,
"_source": {
"name": "Kali Hahn"
}
}
]
}
}
With sorting
Request
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
},
"sort": {"name": {"order": "asc"}}
}
Result
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"berta"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"kaloo"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"karolann"
]
}
]
}
}
With sort on original value
Query
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
},
"sort": {"name.original": {"order": "asc"}}
}
Result
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"Berta Kassulke"
]
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"Kaley Bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"Kali Hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"Karolann Klein"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"Sofia Mandez Kaloo"
]
}
]
}
}
Intended result
Sorted by name ASC but prioritising the matches on the most left words
Kaley Bartoletti
Kali Hahn
Karolann Klein
Berta Kassulke
Sofia Mandez Kaloo
Good question. One way to achieve this would be with a combination of the edge ngram filter and the span_first query.
This is my setting
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"tokenizer": "standard",
"filter": ["lowercase",
"edge_filter",
"asciifolding"
]
}
},
"filter": {
"edge_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 8
}
}
}
},
"mappings": {
"person": {
"properties": {
"name": {
"type": "string",
"analyzer": "my_custom_analyzer",
"search_analyzer": "standard",
"fields": {
"standard": {
"type": "string"
}
}
}
}
}
}
}
After that I inserted your sample documents. Then I wrote the following query with dis_max. Notice that the end parameter for the first span_first query is 1, so it gives a higher score to a match in the leftmost word. I am sorting first by score and then by name.
{
"query": {
"dis_max": {
"tie_breaker": 0.7,
"boost": 1.2,
"queries": [
{
"match": {
"name": "ka"
}
},
{
"span_first": {
"match": {
"span_term": {
"name": "ka"
}
},
"end": 1
}
},
{
"span_first": {
"match": {
"span_term": {
"name": "ka"
}
},
"end": 2
}
}
]
}
},
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"name.standard": {
"order": "asc"
}
}
]
}
The result I get
"hits": [
{
"_index": "esedge",
"_type": "policy_data",
"_id": "2",
"_score": 0.72272325,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
0.72272325,
"bartoletti"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "3",
"_score": 0.72272325,
"_source": {
"name": "Kali Hahn"
},
"sort": [
0.72272325,
"hahn"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "4",
"_score": 0.72272325,
"_source": {
"name": "Karolann Klein"
},
"sort": [
0.72272325,
"karolann"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "1",
"_score": 0.54295504,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
0.54295504,
"berta"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "5",
"_score": 0.2905494,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
0.2905494,
"kaloo"
]
}
]
I hope this helps.

ElasticSearch Order By String Length

I am using ElasticSearch via NEST in C#. I have a large list of information about people
{
firstName: 'Frank',
lastName: 'Jones',
City: 'New York'
}
I'd like to be able to filter and sort this list of items by lastName, as well as order by its length, so that people who have only 5 characters in their name will be at the beginning of the result set, followed by people with 10 characters.
So with some pseudo code I'd like to do something like
list.wildcard("j*").sort(m => lastName.length)
You can do the sorting with script-based sorting.
As a toy example, I set up a trivial index with a few documents:
PUT /test_index
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"name":"Bob"}
{"index":{"_id":2}}
{"name":"Jeff"}
{"index":{"_id":3}}
{"name":"Darlene"}
{"index":{"_id":4}}
{"name":"Jose"}
Then I can order search results like this:
POST /test_index/_search
{
"query": {
"match_all": {}
},
"sort": {
"_script": {
"script": "doc['name'].value.length()",
"type": "number",
"order": "asc"
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": null,
"_source": {
"name": "Bob"
},
"sort": [
3
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
To filter by length, I can use a script filter in a similar way:
POST /test_index/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"script": {
"script": "doc['name'].value.length() > 3",
"params": {}
}
}
}
},
"sort": {
"_script": {
"script": "doc['name'].value.length()",
"type": "number",
"order": "asc"
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/22fef6dc5453eaaae3be5fb7609663cc77c43dab
P.S.: If any of the last names will contain spaces, you might want to use "index": "not_analyzed" on that field.

Elasticsearch aggregation with date_histogram gives wrong result for buckets

I have data with timestamp. I want to do date_histogram on that.
When I run the query it returns total as 13, which is correct, but it shows one record in 2014-10-10, and I can't find that record in the data I have.
curl http://localhost:9200/test/test/_search -X POST -d '{"fields":
["creation_time"],
"query" :
{"filtered":
{"query":
{"match":
{"type": "test.type"}
}
}
},
"aggs":
{"group_by_created_by":
{"date_histogram":
{"field":"creation_time", "interval": "1d"}
}
}
}' | python -m json.tool
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 2083 100 1733 100 350 234k 48590 --:--:-- --:--:-- --:--:-- 241k
{
"_shards": {
"failed": 0,
"successful": 5,
"total": 5
},
"aggregations": {
"group_by_created_at": {
"buckets": [
{
"doc_count": 12,
"key": 1412812800000,
"key_as_string": "2014-10-09T00:00:00.000Z"
},
{
"doc_count": 1,
"key": 1412899200000,
"key_as_string": "2014-10-10T00:00:00.000Z"
}
]
}
},
"hits": {
"hits": [
{
"_id": "qk5EGDqUSoW-ckZU9bnSsA",
"_index": "test",
"_score": 3.730029,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T16:35:39.535389"
]
}
},
{
"_id": "GnglI_3xRYii_oE5q91FUg",
"_index": "test",
"_score": 3.6149597,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T17:16:55.677919"
]
}
},
{
"_id": "ELP1f_-IS8SJiT4i4Vh6_g",
"_index": "test",
"_score": 2.974081,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T01:21:21.691270"
]
}
},
{
"_id": "ySlIV4vWRvm_q0-9p87dEQ",
"_index": "test",
"_score": 2.974081,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T01:33:51.291644"
]
}
},
{
"_id": "swXVnMmJSsmNW30zeJvCoQ",
"_index": "test",
"_score": 2.974081,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T17:08:45.738821"
]
}
},
{
"_id": "h0j6L-VGTnyChSIevtt2og",
"_index": "test",
"_score": 2.974081,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T22:35:16.908080"
]
}
},
{
"_id": "ANoTEXIgRgml6gLD4YKtIg",
"_index": "test",
"_score": 2.9459102,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T01:25:18.869175"
]
}
},
{
"_id": "FSCPBsogT5OXghBUmKXidQ",
"_index": "test",
"_score": 2.9459102,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T01:42:49.000599"
]
}
},
{
"_id": "VEw6XbIySvW7h7GF7h4ynA",
"_index": "test",
"_score": 2.9459102,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T16:45:51.563595"
]
}
},
{
"_id": "J9NfffAvRPmFxtOBZ6IsCA",
"_index": "test",
"_score": 2.9169223,
"_type": "test",
"fields": {
"creation_time": [
"2014-10-09T01:23:30.546353"
]
}
}
],
"max_score": 3.730029,
"total": 13
},
"timed_out": false,
"took": 4
}
As you can see in the output above, there is no record dated 10-10 among the hits, but the aggregation shows one record in that bucket.
Aggregations are done on all matching documents.
You do not set the size, which means you get the default 10 documents under hits. Change the size to 13 (or more) and your 2014-10-10 document should show.
When you have more results, which makes it impractical to manually check all of them, you can also use top_hits as a sub-aggregator to get a peek at what is in the bucket (there's a size option there as well).
If you count your hits, you will see there are only 10 objects. This is because, by default, Elasticsearch will return only the top ten result hits.
However, even if not present in the hits, all the documents matching the query are taken into account when computing your aggregations.
Try to update your query to :
{
"size": 13,
"fields": ["creation_time"],
"query" :
{"filtered":
{"query":
{"match":
{"type": "test.type"}
}
}
},
"aggs":
{"group_by_created_by":
{"date_histogram":
{"field":"creation_time", "interval": "1d"}
}
}
}
And you will see the document which has been created on the 10-10.

Resources