I'm trying to run a query like this in Elasticsearch:
Return me all the devices of an app that had some logs between two dates and for each device return me the total number of logs
For this I have a parent-child relationship: a parent device type that holds the device information, and a child entity device_logs that holds the number of logs for each day.
I tried to run the following query with a custom score function. I do get the right devices, but the score is the sum of all the device_logs entries instead of only the entries within the date range.
Any idea if it's possible to do this kind of query?
{
"query": {
"bool": {
"filter" :
[
{
"term": {"app": 347}
}
],
"must" :
[
{
"has_child": {
"type": "device_logs",
"inner_hits" : {},
"query": {
"bool": {
"filter": {
"range": {
"date": {
"from": "2017-01-15T00:00:00Z",
"include_lower": true,
"include_upper": true,
"to": "2017-01-17T23:59:59Z"
}
}
}
}
}
}
},
{
"has_child": {
"type": "device_logs",
"score_mode": "sum",
"query" : {
"function_score" : {
"script_score": {
"script": "_score * doc['logs'].value"
}
}
}
}
}
]
}
}
}
EDIT: Adding mappings and some docs
Here you have the mappings:
"mappings": {
"device": {
"properties": {
"app": {
"type": "long",
"include_in_all": false
},
"created_at": {
"type": "date",
"include_in_all": false
},
"id": {
"type": "long",
"include_in_all": false
},
"language": {
"type": "keyword",
"include_in_all": false,
"ignore_above": 256
},
"last_log_at": {
"type": "date",
"include_in_all": false
},
"last_ping_at": {
"type": "date",
"include_in_all": false
},
"last_seen_at": {
"type": "date"
},
"log_enabled": {
"type": "boolean"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
},
"lowercase": {
"type": "text",
"analyzer": "case_insensitive_sort"
}
}
},
"os_version": {
"type": "keyword",
"include_in_all": false,
"ignore_above": 256
},
"timezone": {
"type": "keyword",
"include_in_all": false,
"ignore_above": 256
},
"type": {
"type": "keyword",
"ignore_above": 256
},
"udid": {
"type": "keyword",
"ignore_above": 256
},
"version": {
"properties": {
"build": {
"type": "keyword",
"include_in_all": false,
"ignore_above": 256
},
"id": {
"type": "long",
"include_in_all": false
},
"version": {
"type": "keyword",
"include_in_all": false,
"ignore_above": 256
}
}
}
}
},
"device_logs": {
"_parent": {
"type": "device"
},
"_routing": {
"required": true
},
"properties": {
"_": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"app": {
"type": "long",
"include_in_all": false
},
"date": {
"type": "date",
"include_in_all": false
},
"errors": {
"type": "long",
"include_in_all": false
},
"logs": {
"type": "long",
"include_in_all": false
},
"warnings": {
"type": "long",
"include_in_all": false
}
}
}
}
And some documents:
{
"_index": "devices",
"_type": "device_logs",
"_id": "22466_2017.01.17",
"_score": 1,
"_routing": "22466",
"_parent": "22466",
"_source": {
"_": "22466_2017.01.17",
"app": 200,
"date": "2017-01-17T00:00:00Z",
"logs": 660,
"warnings": 238,
"errors": 217
}
}
{
"_index": "devices",
"_type": "device",
"_id": "22466",
"_score": 1,
"_source": {
"id": 22466,
"udid": "770CA14ED7FE861EC452",
"name": "Edward's iPhone",
"type": "iPhone7,2",
"app": 200,
"log_enabled": false,
"created_at": "2016-12-21T10:55:02Z",
"last_seen_at": "2017-01-19T10:07:33Z",
"last_log_at": "2017-01-19T11:07:40.756275026+01:00",
"language": "en-US",
"os_version": "9.2",
"timezone": "GMT+1",
"version.id": 7305,
"version.version": "1",
"version.build": "100"
}
}
I have solved your query.
At first glance, I suspected the problem was that one of the must clauses does not filter the child documents before the function score is applied to them.
I have used the following set of documents for this query
parent doc
{
"id": 22466,
"udid": "770CA14ED7FE861EC452",
"name": "Edward's iPhone",
"type": "iPhone7,2",
"app": 347,
"log_enabled": false,
"created_at": "2016-12-21T10:55:02Z",
"last_seen_at": "2017-01-19T10:07:33Z",
"last_log_at": "2017-01-19T11:07:40.756275026+01:00",
"language": "en-US",
"os_version": "9.2",
"timezone": "GMT+1",
"version.id": 7305,
"version.version": "1",
"version.build": "100"
}
child docs
{
"_type": "device_logs",
"_id": "22466_2017.01.17",
"_score": 0,
"_routing": "22466",
"_parent": "22466",
"_source": {
"_": "22466_2017.01.17",
"app": 200,
"date": "2017-01-17T00:00:00Z",
"logs": 660,
"warnings": 238,
"errors": 217
}
},
{
"_type": "device_logs",
"_id": "22466_2017.02.17",
"_score": 0,
"_routing": "22466",
"_parent": "22466",
"_source": {
"_": "22466_2017.02.17",
"app": 200,
"date": "2017-01-17T00:00:00Z",
"logs": 200,
"warnings": 238,
"errors": 217
}
},
{
"_type": "device_logs",
"_id": "22466_2017.02.20",
"_score": 0,
"_routing": "22466",
"_parent": "22466",
"_source": {
"_": "22466_2017.02.20",
"app": 200,
"date": "2017-01-20T00:00:00Z",
"logs": 200,
"warnings": 238,
"errors": 217
}
}
Note - The first must clause only filters the documents for inner_hits.
Please use the following query:
{
"query": {
"bool": {
"filter": [{
"term": {
"app": 347
}
}],
"must": [{
"has_child": {
"type": "device_logs",
"inner_hits": {},
"query": {
"bool": {
"filter": {
"range": {
"date": {
"from": "2017-01-15T00:00:00Z",
"include_lower": true,
"include_upper": true,
"to": "2017-01-17T23:59:59Z"
}
}
}
}
}
}
}, {
"has_child": {
"type": "device_logs",
"score_mode": "sum",
"query": {
"function_score": {
"query": {
"bool": {
"filter": {
"range": {
"date": {
"from": "2017-01-15T00:00:00Z",
"include_lower": true,
"include_upper": true,
"to": "2017-01-17T23:59:59Z"
}
}
}
}
},
"score_mode": "sum",
"boost_mode": "sum",
"script_score": {
"script": "_score + doc['logs'].value"
}
}
}
}
}]
}
}
}
Few references https://github.com/elastic/elasticsearch/issues/10051
The following is the response I get with the explain flag set to true:
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 861,
"hits": [
{
"_shard": "[array_index1][0]",
"_node": "nnauJDrIS8-QCqicOMF23g",
"_index": "array_index1",
"_type": "device",
"_id": "22466",
"_score": 861,
"_source": {
"id": 22466,
"udid": "770CA14ED7FE861EC452",
"name": "Edward's iPhone",
"type": "iPhone7,2",
"app": 347,
"log_enabled": false,
"created_at": "2016-12-21T10:55:02Z",
"last_seen_at": "2017-01-19T10:07:33Z",
"last_log_at": "2017-01-19T11:07:40.756275026+01:00",
"language": "en-US",
"os_version": "9.2",
"timezone": "GMT+1",
"version.id": 7305,
"version.version": "1",
"version.build": "100"
},
"_explanation": {
"value": 861,
"description": "sum of:",
"details": [
{
"value": 1,
"description": "A match, join value 22466",
"details": []
},
{
"value": 860,
"description": "A match, join value 22466",
"details": []
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": "app:[347 TO 347], product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
},
"inner_hits": {
"device_logs": {
"hits": {
"total": 2,
"max_score": 0,
"hits": [
{
"_type": "device_logs",
"_id": "22466_2017.01.17",
"_score": 0,
"_routing": "22466",
"_parent": "22466",
"_source": {
"_": "22466_2017.01.17",
"app": 200,
"date": "2017-01-17T00:00:00Z",
"logs": 660,
"warnings": 238,
"errors": 217
}
},
{
"_type": "device_logs",
"_id": "22466_2017.02.17",
"_score": 0,
"_routing": "22466",
"_parent": "22466",
"_source": {
"_": "22466_2017.02.17",
"app": 200,
"date": "2017-01-17T00:00:00Z",
"logs": 200,
"warnings": 238,
"errors": 217
}
}
]
}
}
}
}
]
}
}
Please verify your results.
Related
I have addresses, along with their address components, stored in Elasticsearch. Each address looks like the following in my ES index:
{
"_index": "properties",
"_type": "_doc",
"_id": "property_5235354",
"_score": 32.839436,
"_source": {
"id": 5235354,
"branchid": 1,
"suburb": "Lyons",
"postcode": "2606",
"state": "ACT",
"#timestamp": "2021-09-27T08:56:08.827Z",
"agencycode": "X",
"address": "54-5 Burnie St Lyons ACT 2606 AUS",
"streetnumber": "5",
"branchcode": "X_ACT",
"unitnumber": "54",
"agencyid": 1,
"streetname": "Burnie St",
"#version": "1"
}
}
To search for a specific address on the basis of its components, I am considering the following points:
There could be abbreviation of street names like "James Street" -> "James St"
Matching on address components should be exact but case-insensitive
Please let me know if you think I should consider something else
To do this I tried following :
{
"query": {
"bool": {
"should": [
{
"match": {
"streetname.keyword": "Burnie Street"
}
},
{
"match": {
"streetname.keyword": "Burnie St"
}
}
],
"must": [
{
"match": {
"unitnumber.keyword": "54"
}
},
{
"match": {
"streetnumber.keyword": "5"
}
},
{
"match": {
"suburb.keyword": "Lyons"
}
},
{
"match": {
"state": "ACT"
}
},
{
"match": {
"postcode.keyword": "2606"
}
}
]
}
},
"size": 1000
}
I need your help solving these:
The above query also returns invalid results, such as the address 54-5 Burnie Avenue Lyons ACT 2606 AUS, which is Burnie Avenue, not Burnie Street.
If I give burnie street instead of Burnie Street, it is unable to find the data.
More information :
This is the full result of the _search API with the above request body, where the addresses 54-5 Burnie St Lyons ACT 2606 AUS & 54/5 Burnie Street Lyons ACT 2606 are correct matches but 54-5 Burnie Avenue Lyons ACT 2606 AUS is an invalid match:
{
"took": 1476,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 32.839436,
"hits": [
{
"_index": "properties",
"_type": "_doc",
"_id": "property_5235354",
"_score": 32.839436,
"_source": {
"id": 5235354,
"branchid": 1,
"suburb": "Lyons",
"postcode": "2606",
"state": "ACT",
"#timestamp": "2021-09-27T08:56:08.827Z",
"agencycode": "X",
"address": "54-5 Burnie St Lyons ACT 2606 AUS",
"streetnumber": "5",
"branchcode": "X_ACT",
"unitnumber": "54",
"agencyid": 1,
"streetname": "Burnie St",
"#version": "1"
}
},
{
"_index": "properties",
"_type": "_doc",
"_id": "property_11081",
"_score": 28.954222,
"_source": {
"id": 11081,
"branchid": 1,
"suburb": "Lyons",
"postcode": "2606",
"state": "ACT",
"#timestamp": "2021-09-27T08:56:08.163Z",
"agencycode": "X",
"address": "54/5 Burnie Street Lyons ACT 2606",
"streetnumber": "5",
"branchcode": "X_ACT",
"unitnumber": "54",
"agencyid": 1,
"streetname": "Burnie Street",
"#version": "1"
}
},
{
"_index": "properties",
"_type": "_doc",
"_id": "property_5235356",
"_score": 22.677355,
"_source": {
"id": 5235356,
"branchid": 1,
"suburb": "Lyons",
"postcode": "2606",
"state": "ACT",
"#timestamp": "2021-09-27T08:56:08.847Z",
"agencycode": "X",
"address": "54-5 Burnie Avenue Lyons ACT 2606 AUS",
"streetnumber": "5",
"branchcode": "X_ACT",
"unitnumber": "54",
"agencyid": 1,
"streetname": "Burnie Avenue",
"#version": "1"
}
}
]
}
}
You need to use a combination of bool/must/should query clauses, term queries (for exact matches that ignore case), and a match_phrase_prefix query.
Index Mapping:
{
"mappings": {
"properties": {
"#timestamp": {
"type": "date"
},
"#version": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"address": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"agencycode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"agencyid": {
"type": "long"
},
"branchcode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"branchid": {
"type": "long"
},
"id": {
"type": "long"
},
"postcode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"state": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"streetname": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"streetnumber": {
"type": "integer"
},
"suburb": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"unitnumber": {
"type": "integer"
}
}
}
}
Search Query:
{
"query": {
"bool": {
"must": [
{
"term": {
"streetnumber": "5"
}
},
{
"term": {
"unitnumber": "54"
}
},
{
"bool": {
"should": [
{
"term": {
"streetname.keyword": {
"value": "Burnie Street",
"case_insensitive": "true"
}
}
},
{
"match_phrase_prefix": {
"streetname": "Burnie St"
}
}
]
}
},
{
"term": {
"suburb.keyword": {
"value": "Lyons",
"case_insensitive": "true"
}
}
},
{
"term": {
"postcode.keyword": "2606"
}
},
{
"term": {
"state.keyword": {
"value": "ACT",
"case_insensitive": "true"
}
}
}
]
}
}
}
I want to get documents that have different values for the same field name in an array of objects
I have this data, in which the stringFacets array of objects contains criteria, in Elasticsearch 7.9
{
"took": 2238,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": [{
"stringFacets": [{
"name": "criterion",
"value": "Accès Wifi"
},
{
"name": "criterion",
"value": "Piscine"
}
]
}]
}
In my search I want to get the documents where stringFacets.name = "criterion" and stringFacets.value = "Piscine" and stringFacets.value = "Accès Wifi"
I tried this, but it returns no results:
{
"query": {
"bool": {
"must": [
{
"nested": {
"path": "stringFacets",
"query": {
"bool": {
"must": [
{
"term": {
"stringFacets.name": "criterion"
}
},
{
"term": {
"stringFacets.value": "Piscine"
}
},
{
"term": {
"stringFacets.value": "Accès Wifi"
}
}
]
}
}
}
}
]
}
}
My mapping
{
"settings": {
"number_of_shards": "1"
},
"mappings": {
"dynamic": false,
"dynamic_templates": [{
"results": {
"mapping": {
"type": "text",
"index": false
},
"path_match": "results.*"
}
}
],
"properties": {
"#version" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"#timestamp" : {
"type" : "date"
},
"booleanFacets": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "boolean"
}
}
},
"stringFacets": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"locationFacets": {
"type": "geo_point"
},
"integerFacets": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "long"
}
}
},
"decimalFacets": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "double"
}
}
},
"datetimeFacets": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "date"
}
}
},
"availabilities": {
"type": "nested",
"properties": {
"start": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSSSSS"
},
"end": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSSSSS"
},
"price": {
"type": "double"
},
"duration": {
"type": "long"
}
}
}
}
}
}
Thanks
The nested type is a specialized version of the object data type that
allows arrays of objects to be indexed in a way that they can be
queried independently of each other.
You are getting 0 results because there is no single object in your sample data that matches all three conditions.
You can use inner_hits so that the nested query also returns the nested objects that matched at the relevant nesting level, rather than only the root document.
Modify your query as
{
"query": {
"nested": {
"path": "stringFacets",
"query": {
"bool": {
"should": [
{
"term": {
"stringFacets.name": "criterion"
}
},
{
"term": {
"stringFacets.value": "Piscine"
}
},
{
"term": {
"stringFacets.value": "Accès Wifi"
}
}
],
"minimum_should_match":2
}
},
"inner_hits": {}
}
}
}
Search Result will be
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0.8754687,
"hits": [
{
"_index": "66170374",
"_type": "_doc",
"_id": "1",
"_score": 0.8754687,
"_source": {
"stringFacets": [
{
"name": "criterion",
"value": "Accès Wifi"
},
{
"name": "criterion",
"value": "Piscine"
}
]
},
"inner_hits": {
"stringFacets": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.8754687,
"hits": [
{
"_index": "66170374",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "stringFacets",
"offset": 0
},
"_score": 0.8754687,
"_source": {
"name": "criterion", // note this
"value": "Accès Wifi"
}
},
{
"_index": "66170374",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "stringFacets",
"offset": 1
},
"_score": 0.8754687,
"_source": {
"name": "criterion", // note this
"value": "Piscine"
}
}
]
}
}
}
}
]
}
How do I always return the documents with the lowest value in the "url_length" field, regardless of the "from" value that I send in the search?
In the query below, I request the results that contain the word "netflix", where the field pgrk is between 9 and 10 and the field url_length is less than 4. But when I use ("from": 1, "size": 1), it does not return the doc with _id 15, which has the word "netflix", pgrk = 10 and url_length = 2. Instead it returns the doc with _id 14, which has the word "netflix", pgrk = 10 and url_length = 3.
It only returns the doc with _id 15, which has url_length = 2, if I set from to zero in the query ("from": 0, "size": 1).
That is, when I searched with ("from": 1, "size": 1), it did not bring back the record with _id 15, which has "url_length" = 2; it brought back the record with _id 14, which has "url_length" = 3.
{
"from": 1,
"size": 1,
"sort": [
{
"pgrk": {
"order": "desc"
}
},
{
"url_length": {
"order": "asc"
}
}
],
"query": {
"bool": {
"must": {
"multi_match": {
"query": "netflix",
"type": "cross_fields",
"fields": [
"tittle",
"description",
"url"
],
"operator": "and"
}
},
"filter": [
{
"range": {
"pgrk": {
"gte": 9,
"lte" : 10
}
}
},
{
"range": {
"url_length": {
"lt" : 4
}
}
}
]
}
}
}
If I use ("from": 1, "size": 1), it does not return the record with _id 15, which has url_length = 2; it returns the doc with _id 14, which has url_length = 3, as shown below:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "teste",
"_type": "_doc",
"_id": "14",
"_score": null,
"_source": {
"url": "www.333.com",
"title": "netflix netflix netflix netflix netflix netflix netflix netflix netflix netflix",
"description": "tudo sobre netflix netflix netflix netflix netflix netflix",
"pgrk": "10",
"url_length": "3"
},
"sort": [
10,
3
]
}
]
}
}
If I use ("from": 0, "size": 1), then it returns the record with _id 15, which has url_length = 2:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "teste",
"_type": "_doc",
"_id": "15",
"_score": null,
"_source": {
"url": "www.netflix.yahoo.com",
"title": "melhor filme",
"description": "tudo sobre series",
"pgrk": "10",
"url_length": "2"
},
"sort": [
10,
2
]
}
]
}
}
How do I always return the documents with the lowest value in the "url_length" field, regardless of the "from" value that I send in the search?
My mapping follows:
{
"settings": {
"index": {
"number_of_shards": "5",
"number_of_replicas": "0",
"analysis": {
"filter": {
"stemmer_plural_portugues": {
"name": "minimal_portuguese",
"stopwords" : ["http", "https", "ftp", "www"],
"type": "stemmer"
}
},
"analyzer": {
"analyzer_customizado": {
"filter": [
"lowercase",
"stemmer_plural_portugues",
"asciifolding"
],
"tokenizer": "lowercase"
}
}
}
}
},
"mappings": {
"properties": {
"q": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"id": {
"type": "long"
},
"#timestamp": {
"type": "date"
},
"data": {
"type": "date"
},
"#version": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"quebrado": {
"type": "byte"
},
"pgrk": {
"type": "integer"
},
"url_length": {
"type": "integer"
},
"term": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"titulo": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"descricao": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"url": {
"analyzer": "analyzer_customizado",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
}
}
}
}
I have a weird problem with Elasticsearch 6.0.
I have an index with the following mapping:
{
"cities": {
"mappings": {
"cities": {
"properties": {
"city": {
"properties": {
"id": {
"type": "long"
},
"name": {
"properties": {
"en": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"it": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"slug": {
"properties": {
"en": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"it": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
},
"doctype": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"suggest": {
"type": "completion",
"analyzer": "accents",
"search_analyzer": "simple",
"preserve_separators": true,
"preserve_position_increments": false,
"max_input_length": 50
},
"weight": {
"type": "long"
}
}
}
}
}
}
I have these documents in my index:
{
"_index": "cities",
"_type": "cities",
"_id": "991-city",
"_version": 128,
"found": true,
"_source": {
"doctype": "city",
"suggest": {
"input": [
"nazaré",
"nazare",
"나자레",
"najare",
"najale",
"ナザレ",
"Ναζαρέ"
],
"weight": 1807
},
"weight": 3012,
"city": {
"id": 991,
"name": {
"en": "Nazaré",
"it": "Nazaré"
},
"slug": {
"en": "nazare",
"it": "nazare"
}
}
}
}
{
"_index": "cities",
"_type": "cities",
"_id": "1085-city",
"_version": 128,
"found": true,
"_source": {
"doctype": "city",
"suggest": {
"input": [
"nazareth",
"nazaret",
"拿撒勒",
"na sa le",
"sa le",
"le",
"na-sa-lei",
"나사렛",
"nasares",
"nasales",
"ナザレス",
"nazaresu",
"नज़ारेथ",
"nj'aareth",
"aareth",
"najaratha",
"Назарет",
"Ναζαρέτ",
"názáret",
"nazaretas"
],
"weight": 1809
},
"weight": 3015,
"city": {
"id": 1085,
"name": {
"en": "Nazareth",
"it": "Nazareth"
},
"slug": {
"en": "nazareth",
"it": "nazareth"
}
}
}
}
Now, when I search using the suggester, with the following query:
POST /cities/_search
{
"suggest":{
"suggest":{
"prefix":"nazare",
"completion":{
"field":"suggest"
}
}
}
}
I expect to have both documents in my results, but I only get the second one (nazareth) back:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"max_score": 0.0,
"hits": []
},
"suggest": {
"suggest": [
{
"text": "nazare",
"offset": 0,
"length": 6,
"options": [
{
"text": "nazaresu",
"_index": "cities",
"_type": "cities",
"_id": "1085-city",
"_score": 1809.0,
"_source": {
"doctype": "city",
"suggest": {
"input": [
"nazareth",
"nazaret",
"拿撒勒",
"na sa le",
"sa le",
"le",
"na-sa-lei",
"나사렛",
"nasares",
"nasales",
"ナザレス",
"nazaresu",
"नज़ारेथ",
"nj'aareth",
"aareth",
"najaratha",
"Назарет",
"Ναζαρέτ",
"názáret",
"nazaretas"
],
"weight": 1809
},
"weight": 3015,
"city": {
"id": 1085,
"name": {
"en": "Nazareth",
"it": "Nazareth"
},
"slug": {
"en": "nazareth",
"it": "nazareth"
}
}
}
}
]
}
]
}
}
This is unexpected, because in the suggester input for the first document, the term that I searched "nazare" appears exactly as I input it.
Another fun fact is that if I search for "najare" instead of "nazare" I get the correct results.
Any hint will be really appreciated!
For a quick solution, use the size parameter in the completion object of your query.
GET /cities/_search
{
"suggest":{
"suggest":{
"prefix":"nazare",
"completion":{
"field":"suggest",
"size": 100 <- HERE
}
}
}
}
The size parameter defaults to 5, so once Elasticsearch has found 5 terms (not documents) with the correct prefix, it stops looking for more terms (and consequently for more documents).
This limit is per term, not per document. So if one document contains 5 terms with the correct prefix and you use the default value of 5, then the other documents may not be returned.
I strongly believe that this is what's happening in your case. The returned document has at least 5 suggest terms with the prefix nazare, so only this one is returned.
For your fun fact, when you are searching najare, there is only one term having the correct prefix, so you have the correct result.
The tricky thing is that the results depend on the order in which Elasticsearch retrieves the documents. If the first document had been retrieved first, it would not have reached the size threshold (only 2 or 3 prefix occurrences), so the next document would also have been retrieved and you would have gotten the correct result.
Also, unless necessary, avoid using a very high value (e.g. > 1000) for the size parameter. It might impact performance, particularly for short or common prefixes.
I have an issue regarding ElasticSearch and More like this query.
Having mapping:
{
"directory.v1": {
"mappings": {
"profile.event": {
"properties": {
"event": {
"properties": {
"naics": {
"type": "nested",
"properties": {
"type": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
}
}
},
"user_id": {
"type": "long"
}
}
}
}
}
}
and document (A) as a source and document (B) to be found with more like this query (for A)
Profile A (used as source):
{
"_index": "directory.v1",
"_type": "profile.event",
"_id": "83731111.559",
"_score": 1,
"_source": {
"user_id": 8373,
"event": {
"naics": [
{
"value": 331,
"type": "naics"
},
{
"value": 74,
"type": "naics"
},
{
"value": 938,
"type": "naics"
},
{
"value": 2048,
"type": "naics"
},
{
"value": 939,
"type": "naics"
},
{
"value": 2049,
"type": "naics"
},
{
"value": 940,
"type": "naics"
},
{
"value": 2050,
"type": "naics"
},
{
"value": 941,
"type": "naics"
},
{
"value": 2051,
"type": "naics"
},
{
"value": 942,
"type": "naics"
},
{
"value": 2052,
"type": "naics"
},
{
"value": 943,
"type": "naics"
},
{
"value": 2053,
"type": "naics"
},
{
"value": 944,
"type": "naics"
},
{
"value": 2054,
"type": "naics"
},
{
"value": 945,
"type": "naics"
},
{
"value": 2055,
"type": "naics"
},
{
"value": 473,
"type": "naics"
},
{
"value": 128,
"type": "naics"
},
{
"value": 10,
"type": "naics"
},
{
"value": 1242,
"type": "naics"
},
{
"value": 472,
"type": "naics"
},
{
"value": 1241,
"type": "naics"
}
]
}
}
}
Profile B:
{
"_index": "directory.v1",
"_type": "profile.event",
"_id": "46124111.559",
"_score": 1,
"_source": {
"user_id": 46124,
"event": {
"naics": [
{
"value": 331,
"type": "naics"
},
{
"value": 74,
"type": "naics"
},
{
"value": 938,
"type": "naics"
},
{
"value": 2048,
"type": "naics"
},
{
"value": 939,
"type": "naics"
},
{
"value": 2049,
"type": "naics"
},
{
"value": 940,
"type": "naics"
},
{
"value": 2050,
"type": "naics"
},
{
"value": 941,
"type": "naics"
},
{
"value": 2051,
"type": "naics"
},
{
"value": 942,
"type": "naics"
},
{
"value": 2052,
"type": "naics"
},
{
"value": 943,
"type": "naics"
},
{
"value": 2053,
"type": "naics"
},
{
"value": 944,
"type": "naics"
},
{
"value": 2054,
"type": "naics"
},
{
"value": 945,
"type": "naics"
},
{
"value": 2055,
"type": "naics"
}
]
}
}
}
where B doc has all elements (naics) included in A document.
So that I really do not understand why for query:
{
"query": {
"nested": {
"path": "event.naics",
"query": {
"more_like_this": {
"like": [
{
"_id": "83731111.559",
"_type": "profile.event"
}
],
"fields": [
"event.naics.value"
],
"min_term_freq": 1,
"min_doc_freq": 1,
"minimum_should_match": "8%"
}
}
}
}
}
I have results!!
but when I increase minimum_should_match to 9% or more, it does not match at all and I get no results.
Also tried to do something like this which gets me some results up to 11%
{
"query": {
"nested": {
"path": "event.naics",
"query": {
"more_like_this": {
"like": [
{
"_id": "83731111.559",
"_type": "profile.event"
}
],
"fields": [
"event.naics.*"
],
"min_term_freq": 1,
"min_doc_freq": 1,
"minimum_should_match": "11%"
}
}
}
}
}
And the term vector for the source document is:
{
"_index": "directory.v1",
"_type": "profile.event",
"_id": "83731111.559",
"_version": 5,
"found": true,
"took": 0,
"term_vectors": {}
}
If you get the term vector for document "A" for field event.naics.value you will see you have 24 terms in total each with term frequency 1.
So when you ask for an 8% match, that is rounded down to 1 of the 24 generated should clauses (8% of 24 = 1.92), so you get a match. But 9% of 24 (2.16) rounds down to 2 clauses that must match, which cannot work because each of your nested documents has only one value.
For calculation details you can see the bottom of this page
https://github.com/elastic/elasticsearch/blob/99f88f15c5febbca2d13b5b5fda27b844153bf1a/server/src/main/java/org/elasticsearch/common/lucene/search/Queries.java
And morelikethis source is here
https://github.com/elastic/elasticsearch/blob/46a79127edfb0cc93b7580624010ff81ca0cb2f4/server/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
Term vector
POST /directory.v1/profile.event/83731111.559/_termvectors
{
"fields":["event.naics.value"],
"offsets" : false,
"payloads" : false,
"positions" : false,
"term_statistics" : true,
"field_statistics" : true
}