I'm trying to build a food search engine on Elasticsearch that should meet the following use cases:
If the user searches for 'coff', it should return all documents with the phrase 'coffee' in their name, and priority should go to food items whose name starts with 'coffee'.
If the user searches for 'green tea', it should give priority to documents that contain the whole phrase 'green tea' rather than matching 'green' and 'tea' separately.
If the phrase does not exist in the 'name' field, it should also search the 'alias' field.
To handle the first case, I've used an edge n-gram analyzer.
Mapping:
{
"settings": {
"index": {
"analysis": {
"filter": {},
"analyzer": {
"analyzer_keyword": {
"tokenizer": "standard",
"filter": "lowercase"
},
"edge_ngram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "edge_ngram_tokenizer"
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 5,
"token_chars": [
"letter"
]
}
}
}
}
},
"mappings": {
"doc": {
"properties": {
"alias": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"name": {
"type": "text",
"search_analyzer": "analyzer_keyword",
"analyzer": "edge_ngram_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
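As a sanity check, it helps to look at what the edge_ngram_tokenizer actually emits, which can be done with the _analyze API:
POST food-master/_analyze
{
  "tokenizer": "edge_ngram_tokenizer",
  "text": "coffee"
}
With min_gram 2 and max_gram 5 this returns the tokens co, cof, coff, and coffe; because max_gram is 5, the full token 'coffee' is never indexed.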
This is the search query that I'm using, but it's not returning the most relevant results:
{
"query": {
"multi_match": {
"query": "coffee",
"fields": ["name^2", "alias"]
}
}
}
There are over 1,500 food items with 'coffee' in their name, but the above query returns only 2:
{
"took": 745,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 8.657346,
"hits": [
{
"_index": "food-master",
"_type": "doc",
"_id": "a9uzinABb4g7LgmgoI1I",
"_score": 8.657346,
"_source": {
"id": 17463,
"name": "Rotiboy, coffee bun",
"alias": [
"Mexican Coffee Bun (Rotiboy)",
"Mexican coffee bun"
]
}
},
{
"_index": "food-master",
"_type": "doc",
"_id": "TNuzinABb4g7LgmgoFVI",
"_score": 7.0164866,
"_source": {
"id": 1344,
"name": "Coffee with sugar",
"alias": [
"Heart Friendly",
"Coffee With Sugar",
"Coffee With Milk and Sugar",
"Gluten Free",
"Hypertension Friendly"
]
}
}
]
}
}
If I remove analyzer_keyword from the mapping, it returns relevant results, but documents that start with 'coffee' are not prioritized:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1323,
"max_score": 57.561867,
"hits": [
{
"_index": "food-master-new",
"_type": "doc",
"_id": "nduzinABb4g7LgmgoINI",
"_score": 57.561867,
"_source": {
"name": "Egg Coffee",
"alias": [],
"id": 12609
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "dNuzinABb4g7LgmgoFVI",
"_score": 55.811295,
"_source": {
"name": "Coffee (Black)",
"alias": [
"Weight Loss",
"Diabetes Friendly",
"Gluten Free",
"Lactose Free",
"Heart Friendly",
"Hypertension Friendly"
],
"id": 1341
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "NduzinABb4g7LgmgoHxI",
"_score": 54.303185,
"_source": {
"name": "Brewed Coffee",
"alias": [
"StarBucks"
],
"id": 15679
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "ltuzinABb4g7LgmgoJJI",
"_score": 54.303185,
"_source": {
"name": "Coffee - Masala",
"alias": [],
"id": 11329
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "oduzinABb4g7LgmgoGpI",
"_score": 53.171227,
"_source": {
"name": "Coffee, German",
"alias": [],
"id": 12257
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "YNuzinABb4g7LgmgoFRI",
"_score": 52.929176,
"_source": {
"name": "Soy Milk Coffee",
"alias": [
"Gluten Free",
"Lactose Free",
"Weight Loss",
"Diabetes Friendly",
"Heart Friendly",
"Hypertension Friendly"
],
"id": 978
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "8duzinABb4g7LgmgoFRI",
"_score": 52.068523,
"_source": {
"name": "Cold Coffee (Soy Milk)",
"alias": [
"Soy Milk"
],
"id": 1097
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "tNuzinABb4g7LgmgoF9I",
"_score": 50.956154,
"_source": {
"name": "Coffee Frappe",
"alias": [],
"id": 3142
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "ZduzinABb4g7LgmgoF5I",
"_score": 49.810112,
"_source": {
"name": "Big Apple Coffee",
"alias": [],
"id": 3130
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "eduzinABb4g7LgmgoHtI",
"_score": 49.62197,
"_source": {
"name": "Mexican Coffee",
"alias": [],
"id": 13604
}
}
]
}
}
If I change the tokenizer from 'standard' to 'keyword', I face the same problem, and phrases are still split into individual words: 'green tea' becomes 'green' and 'tea'.
Any suggestions on what I might be getting wrong with the analyzers? I've tried every combination I can think of, but meeting all three scenarios with high accuracy is proving difficult.
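For reference, a minimal sketch of one direction for the phrase-priority and alias-fallback cases: a bool query whose should clauses boost a whole-phrase match over the individual term matches (boost values are illustrative, not tuned):
GET food-master/_search
{
  "query": {
    "bool": {
      "should": [
        { "match_phrase": { "name": { "query": "green tea", "boost": 3 } } },
        { "match": { "name": "green tea" } },
        { "match": { "alias": "green tea" } }
      ],
      "minimum_should_match": 1
    }
  }
}
A starts-with boost for the first case could then be layered on top, for example with a span_first query or a prefix query against name.keyword (keyword prefixes are case-sensitive, so the input may need normalizing first).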
The mapping of the Elasticsearch documents is as follows:
{
"mappings": {
"properties": {
"vote_id": {
"type": "keyword"
},
"user_id": {
"type": "text"
},
"song_id": {
"type": "text"
},
"type": {
"type": "byte"
},
"timestamp": {
"type": "date"
}
}
}
}
I want to aggregate these votes so that the query returns songs that you AND your friends like.
So far I have buckets of songs that you and your friends like, but some buckets may contain songs that only your friends like.
{
"query": {
"bool": {
"must": {
"terms": {
"user_id": ["you and your friend ids"]
}
}
}
},
"aggs": {
"songs": {
"terms": {
"field": "song_id"
},
"aggs": {
"docs": {
"top_hits": {
"size": "length of you and your friends",
"_source": ["vote_id", "song_id", "user_id", "type", "timestamp"]
}
},
"more_than_one": {
"bucket_selector": {
"buckets_path": {
"count": "_count"
},
"script": "params.count > 1"
}
}
}
}
}
}
I want to filter the buckets so that at least one of the documents in the top hits has your user id.
This is the current response:
"aggregations": {
"songs": {
"buckets": [
{
"doc_count": 5,
"docs": {
"hits": {
"hits": [
{
"_id": "ivotNtFBb9TCEfpk3S54q6gcMbjZB82Xc1_ZCgA6kYsUmvk",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:56.118207343Z",
"type": 1,
"user_id": "iusr8keSbPjX9ZqFhX4Dei4G",
"vote_id": "ivotNtFBb9TCEfpk3S54q6gcMbjZB82Xc1_ZCgA6kYsUmvk"
},
"_type": "_doc"
},
{
"_id": "ivotEFcqOlCL5htJZJ43NslAP555DaPj0Dgkcay_Ml2jAT4",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:52.143988883Z",
"type": 1,
"user_id": "iusr18wnuxsy8oVVK3Xic4Sy",
"vote_id": "ivotEFcqOlCL5htJZJ43NslAP555DaPj0Dgkcay_Ml2jAT4"
},
"_type": "_doc"
},
{
"_id": "ivotToZ-0iBiM_zF5TP1Shj5C29WV3U0ibedlxvcQccimeo",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:50.178450007Z",
"type": 1,
"user_id": "iusrKDxnm75fADEpusbmx5JM",
"vote_id": "ivotToZ-0iBiM_zF5TP1Shj5C29WV3U0ibedlxvcQccimeo"
},
"_type": "_doc"
},
{
"_id": "ivotAHBPual232E12ggibhr6GfQ5E3f9Ryov0gYKGrIRB0Y",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:52.886305925Z",
"type": 1,
"user_id": "iusrJG4GwkWa6Y70LPkuNCPg",
"vote_id": "ivotAHBPual232E12ggibhr6GfQ5E3f9Ryov0gYKGrIRB0Y"
},
"_type": "_doc"
},
{
"_id": "ivot7a8rWunlFu_q5St44PYDeNelLq4bsxr9wzYP9D80wxE",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:49.031694548Z",
"type": 1,
"user_id": "iusrxunBXT1UD0IrvjqjgWaj",
"vote_id": "ivot7a8rWunlFu_q5St44PYDeNelLq4bsxr9wzYP9D80wxE"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 5
}
}
},
"key": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8"
},
{
"doc_count": 4,
"docs": {
"hits": {
"hits": [
{
"_id": "ivot9_2eQ_3eqU7SXQBnLWGQwFI5DE99Naf8wYbFNFrj1lk",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6",
"timestamp": "2022-07-08T19:41:55.761587927Z",
"type": 1,
"user_id": "iusr8keSbPjX9ZqFhX4Dei4G",
"vote_id": "ivot9_2eQ_3eqU7SXQBnLWGQwFI5DE99Naf8wYbFNFrj1lk"
},
"_type": "_doc"
},
{
"_id": "ivotUZRVSKzGbmlP4LlmBkMwMM8xcR4nGTE9KNpysVR0vXQ",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6",
"timestamp": "2022-07-08T19:41:52.555377592Z",
"type": 1,
"user_id": "iusr18wnuxsy8oVVK3Xic4Sy",
"vote_id": "ivotUZRVSKzGbmlP4LlmBkMwMM8xcR4nGTE9KNpysVR0vXQ"
},
"_type": "_doc"
},
{
"_id": "ivot5Wj8pIkbO0JOV_5s2PqEvZU3sy0WSYYUSlgs2Qizfo8",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6",
"timestamp": "2022-07-08T19:41:49.756332674Z",
"type": 1,
"user_id": "iusrKDxnm75fADEpusbmx5JM",
"vote_id": "ivot5Wj8pIkbO0JOV_5s2PqEvZU3sy0WSYYUSlgs2Qizfo8"
},
"_type": "_doc"
},
{
"_id": "ivot8QNCJGsNtRiZYa-QMTUHEh5MHHHr4EKJsXm4UTAwJkg",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6",
"timestamp": "2022-07-08T19:41:53.26319105Z",
"type": 1,
"user_id": "iusrJG4GwkWa6Y70LPkuNCPg",
"vote_id": "ivot8QNCJGsNtRiZYa-QMTUHEh5MHHHr4EKJsXm4UTAwJkg"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 4
}
}
},
"key": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6"
},
{
"doc_count": 3,
"docs": {
"hits": {
"hits": [
{
"_id": "ivotoCfJc_q5vuY27KmvZUo8s4tilI57_xJoPXqfSeJTikg",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngHAVCux40BgjGVZuAwZlTiEjVQFxuxurc",
"timestamp": "2022-07-08T19:41:50.527352591Z",
"type": 1,
"user_id": "iusrL8FCabxg1YCeaakcVXG5",
"vote_id": "ivotoCfJc_q5vuY27KmvZUo8s4tilI57_xJoPXqfSeJTikg"
},
"_type": "_doc"
},
{
"_id": "ivotStjKDIRy6vfaO5dNws4wGPELywPRgg7D7uSavatfIEo",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngHAVCux40BgjGVZuAwZlTiEjVQFxuxurc",
"timestamp": "2022-07-08T19:41:51.733375716Z",
"type": 1,
"user_id": "iusr18wnuxsy8oVVK3Xic4Sy",
"vote_id": "ivotStjKDIRy6vfaO5dNws4wGPELywPRgg7D7uSavatfIEo"
},
"_type": "_doc"
},
{
"_id": "ivotUHj_Ebh-xIqqPJNEdWAuc_JO_mcVVG8F9wM67bJ7_6A",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngHAVCux40BgjGVZuAwZlTiEjVQFxuxurc",
"timestamp": "2022-07-08T19:41:48.60900159Z",
"type": 1,
"user_id": "iusrxunBXT1UD0IrvjqjgWaj",
"vote_id": "ivotUHj_Ebh-xIqqPJNEdWAuc_JO_mcVVG8F9wM67bJ7_6A"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 3
}
}
},
"key": "isngHAVCux40BgjGVZuAwZlTiEjVQFxuxurc"
},
{
"doc_count": 3,
"docs": {
"hits": {
"hits": [
{
"_id": "ivotE5xO9hZGLhrS2sL1mgf5UbcHtf_5qAwbrp3QwEtf4zk",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngOMnUKFVT1cKsH6Q9JfpF3WEZ4H4iU75W",
"timestamp": "2022-07-08T19:41:54.99023451Z",
"type": 1,
"user_id": "iusre80pxIMFB XfF61SHlCiz",
"vote_id": "ivotE5xO9hZGLhrS2sL1mgf5UbcHtf_5qAwbrp3QwEtf4zk"
},
"_type": "_doc"
},
{
"_id": "ivotI0OCI6gz6oEV94hgvjZmGB-qA4n-EigirDRJpgeZt68",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngOMnUKFVT1cKsH6Q9JfpF3WEZ4H4iU75W",
"timestamp": "2022-07-08T19:41:50.952366924Z",
"type": 1,
"user_id": "iusr3oWy3mxsBWu6CU4mlw5L",
"vote_id": "ivotI0OCI6gz6oEV94hgvjZmGB-qA4n-EigirDRJpgeZt68"
},
"_type": "_doc"
},
{
"_id": "ivotm7GrIeyWRHamPXF9klzTZ0La8H4evCgWkCTIpx8rLl4",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngOMnUKFVT1cKsH6Q9JfpF3WEZ4H4iU75W",
"timestamp": "2022-07-08T19:41:48.234881506Z",
"type": 1,
"user_id": "iusrCbCltg4nzv0b2JfUbyhj",
"vote_id": "ivotm7GrIeyWRHamPXF9klzTZ0La8H4evCgWkCTIpx8rLl4"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 3
}
}
},
"key": "isngOMnUKFVT1cKsH6Q9JfpF3WEZ4H4iU75W"
},
{
"doc_count": 2,
"docs": {
"hits": {
"hits": [
{
"_id": "ivotGWwrgPehs9s7ZwZACzkVNp4-_SUaUu3noUKyBH8IBnw",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngfMv4IemhtjXX78LTqxKFBc1VMeUz6And",
"timestamp": "2022-07-08T19:41:53.785333717Z",
"type": 1,
"user_id": "iusrYvNFTaTg4RBNBxG63nkY",
"vote_id": "ivotGWwrgPehs9s7ZwZACzkVNp4-_SUaUu3noUKyBH8IBnw"
},
"_type": "_doc"
},
{
"_id": "ivotrWG5cy6vEbe0N4JO4IKzHZyahOlkyPctCdrBnBu-v9Q",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngfMv4IemhtjXX78LTqxKFBc1VMeUz6And",
"timestamp": "2022-07-08T19:41:51.303745591Z",
"type": 1,
"user_id": "iusr18wnuxsy8oVVK3Xic4Sy",
"vote_id": "ivotrWG5cy6vEbe0N4JO4IKzHZyahOlkyPctCdrBnBu-v9Q"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 2
}
}
},
"key": "isngfMv4IemhtjXX78LTqxKFBc1VMeUz6And"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
}
To restate: I want to keep only the aggregation buckets whose top hits include at least one document with your own user id.
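A minimal sketch of one possible approach, assuming 'your-own-user-id' stands in for the caller's id and that user_id can be matched as an exact value (the text mapping above may need a keyword subfield for that): add a filter sub-aggregation that counts your own votes per song, then keep only the buckets where that count is positive.
{
  "query": {
    "bool": {
      "must": {
        "terms": {
          "user_id": ["you and your friend ids"]
        }
      }
    }
  },
  "aggs": {
    "songs": {
      "terms": { "field": "song_id" },
      "aggs": {
        "my_votes": {
          "filter": { "term": { "user_id": "your-own-user-id" } }
        },
        "has_my_vote": {
          "bucket_selector": {
            "buckets_path": { "mine": "my_votes>_count" },
            "script": "params.mine > 0"
          }
        }
      }
    }
  }
}
The existing more_than_one selector can be kept alongside has_my_vote if each bucket should still contain more than one vote overall.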
To simplify:
PUT /test/vendors/1
{
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
PUT /test/vendors/2
{
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
PUT /test/vendors/3
{
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
Then search:
GET /test/_search
{
"query": {
"multi_match" : {
"query": "doctor in Boston",
"fields": [ "type", "place" ]
}
}
}
I understand why I get Jack, who works in San Fran: it's because he's a doctor too. However, I can't figure out why the match score is the SAME for him. The other two matched on the place as well, didn't they? Why aren't Ron and Tom scored higher?
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.9245277,
"hits": [
{
"_index": "test",
"_type": "vendors",
"_id": "2",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "1",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "3",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
}
]
}
}
Is there a way to force a lower score when fewer search keywords are found? Also, if I'm going the wrong way about this kind of search and there's a better pattern/way to do it, I'd appreciate being pointed in the right direction.
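One way to inspect the tie is to re-run the search with explain enabled, which breaks each document's score down per matching clause:
GET /test/_search
{
  "explain": true,
  "query": {
    "multi_match" : {
      "query": "doctor in Boston",
      "fields": [ "type", "place" ]
    }
  }
}
The per-hit _explanation then shows which fields actually contributed to each score.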
Your search structure is incorrect. The search query above ignores the place property, which is why you get the same score for all documents (only the type property is taken into account). The reason is that works_at is a nested mapping, which has to be treated differently when searching.
First, you should define works_at as a nested mapping (read more here). Then you'll have to adjust your query to work with that nested mapping; see an example here.
GET /test/_search
{
"query": {
"multi_match" : {
"query": "doctor in Boston",
"fields": [ "type", "place" ],
"type": "most_fields" . <---- I WAS MISSING THIS
}
}
}
Once in, that gave the correct results, with the 'San Fran' guy scored lower. (The default multi_match type, best_fields, scores a document by its single best-matching field, which is why all three documents tied; most_fields combines the scores of all matching fields.)
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.2122098,
"hits": [
{
"_index": "test",
"_type": "vendors",
"_id": "2",
"_score": 1.2122098,
"_source": {
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "1",
"_score": 1.2122098,
"_source": {
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "3",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
}
]
}
}
I am planning to build an Elasticsearch-based autocomplete module for an e-commerce website. I am using edge_ngram for suggestions and am trying out the following configuration.
**My index creation:**
PUT my_index
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
"lowercase"
]
},
"autocomplete_search": {
"tokenizer": "lowercase"
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 10,
"token_chars": [
"letter","digit"
]
}
}
}
},
"mappings": {
"doc": {
"properties": {
"title": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
}
**Inserting Data**
PUT my_index/doc/1
{
"title": "iphone s"
}
PUT my_index/doc/9
{
"title": "iphone ka"
}
PUT my_index/doc/11
{
"title": "iphone ka t"
}
PUT my_index/doc/15
{
"title": "iphone 6"
}
PUT my_index/doc/14
{
"title": "iphone 6 16GB"
}
PUT my_index/doc/3
{
"title": "iphone k"
}
POST my_index/_refresh
POST my_index/_analyze
{
"tokenizer": "autocomplete",
"text": "iphone 6"
}
POST my_index/_analyze
{
"analyzer": "pattern",
"text": "iphone 6"
}
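For reference, the first _analyze call (using the autocomplete tokenizer) should emit roughly the following tokens for 'iphone 6', since min_gram is 1 and digits are kept as token characters:
["i", "ip", "iph", "ipho", "iphon", "iphone", "6"]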
**Autocomplete suggestions**
When I try to find the closest match to 'iphone 6', it does not return the correct result.
GET my_index/_search
{
"query": {
"match": {
"title": {
"query": "iphone 6",
"operator": "and"
}
}
}
}
**The above query yields:**
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0.28582606,
"hits": [
{
"_index": "my_index",
"_type": "doc",
"_id": "1",
"_score": 0.28582606,
"_source": {
"title": "iphone s"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "9",
"_score": 0.25811607,
"_source": {
"title": "iphone ka"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "14",
"_score": 0.24257512,
"_source": {
"title": "iphone 6 16GB"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "3",
"_score": 0.19100356,
"_source": {
"title": "iphone k"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "15",
"_score": 0.1862728,
"_source": {
"title": "iphone 6"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "11",
"_score": 0.16358379,
"_source": {
"title": "iphone ka t"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "2",
"_score": 0.15861572,
"_source": {
"title": "iphone 5 s"
}
}
]
}
}
But the result should be:
{
"_index": "my_index",
"_type": "doc",
"_id": "15",
"_score": 1,
"_source": {
"title": "iphone 6"
}
}
Please let me know if I am missing something here. I am new to this, so I am not aware of other methods that might yield better results.
You are using autocomplete_search as your search_analyzer. Look at how your text is analyzed by the search analyzer you specified:
POST my_index/_analyze
{
"analyzer": "autocomplete_search",
"text": "iphone 6"
}
You will get:
{
"tokens": [
{
"token": "iphone", ===> Only one token
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 0
}
]
}
Since all the documents have this token ('iphone') in the inverted index, all of them are returned.
To get the desired results, you can use the same analyzer at search time that was used while indexing:
{
"query": {
"match": {
"title": {
"query": "iphone 6",
"operator": "and",
"analyzer" : "autocomplete"
}
}
}
}
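As an aside, the underlying cause is that the lowercase tokenizer behaves like the letter tokenizer plus lowercasing: it splits on anything that is not a letter, so the digit '6' is dropped from the query. A sketch of an alternative search analyzer that keeps digits, assuming the index is recreated with it:
"autocomplete_search": {
  "tokenizer": "standard",
  "filter": [ "lowercase" ]
}
With this, 'iphone 6' is analyzed at search time into the two tokens 'iphone' and '6', and the and operator then requires both to match.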
How to sort matches, prioritising the leftmost matched words
Explanation
Sort the prefix query results by the word matched, but prioritise matches on words further to the left.
Tests I've made
Data
DELETE /test
PUT /test
PUT /test/person/_mapping
{
"properties": {
"name": {
"type": "multi_field",
"fields": {
"name": {"type": "string"},
"original": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
PUT /test/person/1
{"name": "Berta Kassulke"}
PUT /test/person/2
{"name": "Kaley Bartoletti"}
PUT /test/person/3
{"name": "Kali Hahn"}
PUT /test/person/4
{"name": "Karolann Klein"}
PUT /test/person/5
{"name": "Sofia Mandez Kaloo"}
The mapping was added for the 'sort on original value' test.
Simple query
Query
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
}
}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": 1,
"_source": {
"name": "Karolann Klein"
}
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": 1,
"_source": {
"name": "Sofia Mandez Kaloo"
}
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": 1,
"_source": {
"name": "Berta Kassulke"
}
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": 1,
"_source": {
"name": "Kaley Bartoletti"
}
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": 1,
"_source": {
"name": "Kali Hahn"
}
}
]
}
}
With sorting
Request
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
},
"sort": {"name": {"order": "asc"}}
}
Result
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"berta"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"kaloo"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"karolann"
]
}
]
}
}
With sort on original value
Query
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
},
"sort": {"name.original": {"order": "asc"}}
}
Result
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"Berta Kassulke"
]
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"Kaley Bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"Kali Hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"Karolann Klein"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"Sofia Mandez Kaloo"
]
}
]
}
}
Intended result
Sorted by name ASC, but prioritising matches on the leftmost words:
Kaley Bartoletti
Kali Hahn
Karolann Klein
Berta Kassulke
Sofia Mandez Kaloo
Good question. One way to achieve this is with a combination of an edge n-gram filter and a span_first query.
These are my settings:
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"tokenizer": "standard",
"filter": ["lowercase",
"edge_filter",
"asciifolding"
]
}
},
"filter": {
"edge_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 8
}
}
}
},
"mappings": {
"person": {
"properties": {
"name": {
"type": "string",
"analyzer": "my_custom_analyzer",
"search_analyzer": "standard",
"fields": {
"standard": {
"type": "string"
}
}
}
}
}
}
}
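For reference, my_custom_analyzer expands every word into its 2-to-8-character prefixes; this can be checked with the _analyze API (on older Elasticsearch versions these may need to be passed as query-string parameters instead):
POST esedge/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "Kaley Bartoletti"
}
For this input it yields ka, kal, kale, kaley and ba, bar, bart, barto, bartol, bartole, bartolet.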
After that I inserted your sample documents and wrote the following query with dis_max. Notice that the end parameter of the first span query is 1, so the leftmost match is prioritized (scored higher). I sort first by score and then by name.
{
"query": {
"dis_max": {
"tie_breaker": 0.7,
"boost": 1.2,
"queries": [
{
"match": {
"name": "ka"
}
},
{
"span_first": {
"match": {
"span_term": {
"name": "ka"
}
},
"end": 1
}
},
{
"span_first": {
"match": {
"span_term": {
"name": "ka"
}
},
"end": 2
}
}
]
}
},
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"name.standard": {
"order": "asc"
}
}
]
}
The result I get:
"hits": [
{
"_index": "esedge",
"_type": "policy_data",
"_id": "2",
"_score": 0.72272325,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
0.72272325,
"bartoletti"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "3",
"_score": 0.72272325,
"_source": {
"name": "Kali Hahn"
},
"sort": [
0.72272325,
"hahn"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "4",
"_score": 0.72272325,
"_source": {
"name": "Karolann Klein"
},
"sort": [
0.72272325,
"karolann"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "1",
"_score": 0.54295504,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
0.54295504,
"berta"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "5",
"_score": 0.2905494,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
0.2905494,
"kaloo"
]
}
]
I hope this helps.
I am using Elasticsearch via NEST (C#). I have a large list of information about people:
{
firstName: 'Frank',
lastName: 'Jones',
City: 'New York'
}
I'd like to be able to filter and sort this list by lastName, and also to order by name length, so that people with only 5 characters in their name appear at the beginning of the result set, followed by people with 10 characters.
So, in pseudocode, I'd like to do something like:
list.wildcard("j*").sort(m => lastName.length)
You can do the sorting with script-based sorting.
As a toy example, I set up a trivial index with a few documents:
PUT /test_index
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"name":"Bob"}
{"index":{"_id":2}}
{"name":"Jeff"}
{"index":{"_id":3}}
{"name":"Darlene"}
{"index":{"_id":4}}
{"name":"Jose"}
Then I can order search results like this:
POST /test_index/_search
{
"query": {
"match_all": {}
},
"sort": {
"_script": {
"script": "doc['name'].value.length()",
"type": "number",
"order": "asc"
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": null,
"_source": {
"name": "Bob"
},
"sort": [
3
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
To filter by length, I can use a script filter in a similar way:
POST /test_index/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"script": {
"script": "doc['name'].value.length() > 3",
"params": {}
}
}
}
},
"sort": {
"_script": {
"script": "doc['name'].value.length()",
"type": "number",
"order": "asc"
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/22fef6dc5453eaaae3be5fb7609663cc77c43dab
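To also cover the wildcard part of the pseudocode in the question, the same script sort composes with a wildcard query; a minimal sketch against the toy index (the pattern is lowercase because wildcard queries run against the analyzed terms):
POST /test_index/_search
{
  "query": {
    "wildcard": {
      "name": "j*"
    }
  },
  "sort": {
    "_script": {
      "script": "doc['name'].value.length()",
      "type": "number",
      "order": "asc"
    }
  }
}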
P.S.: If any of the last names contain spaces, you might want to use "index": "not_analyzed" on that field.