elasticsearch filter on nested array - elasticsearch

lets say records have city field as an array of city names.
records ex:
record 1:
{
cities : [
{name: city1},
{name : city2},
{name : city3}
]
}
record 2:
{
cities : [
{name: city2},
{name : city3},
{name : city4}
]
}
record 3:
{
cities : [
{name: city3},
{name : city4},
{name : city5}
]
}
requirement:
My filter criteria is to fetch the records matches with city1 or city2 or city3 but since the record 1 matches all 3 it should come first and record 2 matches 2 so it should come 2nd and record 3 matches only one so it should come last.

You don't have to use the nested data-type as you don't have the nested properties or complex object, its very simple and easy to achieve.
Working example
Index mapping
{
"mappings": {
"properties": {
"cities": {
"type": "text"
}
}
}
}
Index sample docs
{
"cities": [
"tel-aviv", "bangalore", "sf"
]
}
{
"cities": [
"tel-aviv"
]
}
{
"cities": [
"sf"
]
}
Search query
{
"query": {
"bool": {
"should": [
{
"match": {
"cities": "tel-aviv"
}
},
{
"match": {
"cities": "bangalore"
}
},
{
"match": {
"cities": "sf"
}
}
]
}
}
}
And search result with proper expected result and score
"hits": [
{
"_index": "cities",
"_type": "_doc",
"_id": "1",
"_score": 1.850198,
"_source": {
"cities": [
"tel-aviv",
"bangalore",
"sf"
]
}
},
{
"_index": "cities",
"_type": "_doc",
"_id": "2",
"_score": 0.9983525,
"_source": {
"cities": [
"tel-aviv"
]
}
},
{
"_index": "cities",
"_type": "_doc",
"_id": "3",
"_score": 0.6133945,
"_source": {
"cities": [
"sf"
]
}
}
]

Adding another answer with nested bool queries:
Index Mapping:
{
"mappings": {
"properties":{
"Cities": {
"type": "nested",
"dynamic": "true"
}
}}
}
Index Data:
{
"Cities": [
{
"id": 1,
"city": "Bangalore"
},
{
"id": 2,
"city": "Hyderabad"
},
{
"id": 3,
"city": "Delhi"
}
]
}
{
"Cities": [
{
"id": 1,
"city": "Bangalore"
},
{
"id": 2,
"city": "abc"
},
{
"id": 3,
"city": "Def"
}
]
}
Search Query:
{
"query": {
"bool": {
"should": [
{
"nested": {
"path": "Cities",
"query": {
"bool": {
"must": [
{
"match": {
"Cities.city": "Bangalore"
}
}
]
}
}
}
},
{
"nested": {
"path": "Cities",
"query": {
"bool": {
"must": [
{
"match": {
"Cities.city": "Hyderabad"
}
}
]
}
}
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "nested-63806067",
"_type": "_doc",
"_id": "1",
"_score": 3.297317, <-- note this
"_source": {
"Cities": [
{
"id": 1,
"city": "Bangalore"
},
{
"id": 2,
"city": "Hyderabad"
},
{
"id": 3,
"city": "Delhi"
}
]
}
},
{
"_index": "nested-63806067",
"_type": "_doc",
"_id": "2",
"_score": 1.6486585, <-- note this
"_source": {
"Cities": [
{
"id": 1,
"city": "Bangalore"
},
{
"id": 2,
"city": "abc"
},
{
"id": 3,
"city": "Def"
}
]
}
}
]

Related

ElasticSearch compound queries

My index data is
{
"first_name":"Kevin",
"last_name":"John",
"job": "IT"
}
{
"first_name":"John",
"last_name":"Thimothy",
"job": "Accountant"
}
{
"first_name":"Eric",
"last_name":"Villa",
"job": "Driver"
}
{
"first_name":"John",
"last_name":"Villa",
"job": "Student"
}
I am not sure if anyone could help me to build a query to get data that have first_name or last_name as John and have a job as IT or Student.
You need to use a combination of the bool/must/should clause
Search Query:
{
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"match": {
"first_name": "John"
}
},
{
"match": {
"last_name": "John"
}
}
]
}
},
{
"bool": {
"should": [
{
"match": {
"job": "IT"
}
},
{
"match": {
"job": "student"
}
}
]
}
}
]
}
}
}
Search Result will be
"hits": [
{
"_index": "66982646",
"_type": "_doc",
"_id": "1",
"_score": 2.4079456,
"_source": {
"first_name": "Kevin",
"last_name": "John",
"job": "IT"
}
},
{
"_index": "66982646",
"_type": "_doc",
"_id": "4",
"_score": 1.89712,
"_source": {
"first_name": "John",
"last_name": "Villa",
"job": "Student"
}
}
]

"should" query affect scoring, how to avoid that?

I would like to change the following ElasticSearch so the "should" array will not affect the scoring of the result. I want that the score will be calculated by the "query_string" for the name property only.
how can i achieve that with minimum chnages
GET customers/_search
{
"query": {
"bool": {
"must": [
{
"query_string": {
"default_field": "properties.name",
"query": "Joe*"
}
}
],
"should": [
{
"match": {
"properties.role": "admin"
}
},
{
"match": {
"properties.role": "sysop"
}
},
{
"match": {
"properties.role": "client"
}
},
{
"match": {
"properties.status": "public"
}
},
{
"match": {
"properties.status": "public"
}
}
],
"must_not": [
{
"match": {
"properties.status": "hide_from_search_results"
}
},
{
"match": {
"properties.status": "deleted"
}
},
{
"match": {
"properties.status": "banned"
}
},
{
"match": {
"properties.status": "hide_from_search_results"
}
},
{
"match": {
"properties.status": "deleted"
}
},
{
"match": {
"properties.status": "banned"
}
},
{
"match": {
"properties.status": "hide_from_search_results"
}
},
{
"match": {
"properties.status": "deleted"
}
},
{
"match": {
"properties.status": "banned"
}
}
]
}
},
"size": 30,
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"_script": {
"type": "string",
"order": "desc",
"script": {
"lang": "painless",
"source": "return doc['_index'][0] == 'customers' && doc.containsKey('properties.videoCount')?doc['properties.videoCount'].value:0"
}
}
},
{
"_script": {
"type": "string",
"order": "desc",
"script": {
"lang": "painless",
"source": "long timestampNow = new Date().getTime(); return doc['_index'][0] == 'customers' && doc.containsKey('properties.subscriptions.features.allow-application')?(timestampNow < doc['properties.subscriptions.features.first-on-search'].value.getMillis()):false"
}
}
},
{
"_script": {
"type": "string",
"order": "desc",
"script": {
"lang": "painless",
"source": "return doc['_index'][0] == 'customers' && doc.containsKey('properties.videoCount')?doc['properties.videoCount'].value:0"
}
}
}
]
}
You need to use a combination of bool should and filter clause to achieve your required result.
Adding a working example with index data, search query, and search result
Index Data:
{
"properties":{
"name": "Joe",
"role":"sysop"
}
}
{
"properties":{
"name": "Joe",
"role":"admin"
}
}
{
"properties":{
"name": "Joe",
"role":"student"
}
}
Search Query:
{
"query": {
"bool": {
"must": [
{
"query_string": {
"default_field": "properties.name",
"query": "Joe*"
}
}
],
"should": [
{
"bool": {
"filter": {
"bool": {
"should": [
{
"match": {
"properties.role": "student"
}
},
{
"match": {
"properties.role": "sysop"
}
}
]
}
}
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "65469210",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"properties": {
"name": "Joe",
"role": "admin"
}
}
},
{
"_index": "65469210",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"properties": {
"name": "Joe",
"role": "student"
}
}
},
{
"_index": "65469210",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"properties": {
"name": "Joe",
"role": "sysop"
}
}
}
]
You can even use the Explain API, to know how the score is calculated. Here you can see that the should clauses match have a value of 0.0. Therefore, they do not contribute in the overall scoring of the query.
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_shard": "[65469210][0]",
"_node": "g1iQ5TpzQli7sSx266LDEA",
"_index": "65469210",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"properties": {
"name": "Joe",
"role": "admin"
}
},
"_explanation": {
"value": 1.0,
"description": "sum of:",
"details": [
{
"value": 1.0,
"description": "properties.name:joe*",
"details": []
}
]
}
},
{
"_shard": "[65469210][0]",
"_node": "g1iQ5TpzQli7sSx266LDEA",
"_index": "65469210",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"properties": {
"name": "Joe",
"role": "student"
}
},
"_explanation": {
"value": 1.0,
"description": "sum of:",
"details": [
{
"value": 1.0,
"description": "properties.name:joe*",
"details": []
},
{
"value": 0.0, // note this
"description": "ConstantScore(properties.role:student properties.role:sysop)^0.0",
"details": []
}
]
}
},
{
"_shard": "[65469210][0]",
"_node": "g1iQ5TpzQli7sSx266LDEA",
"_index": "65469210",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"properties": {
"name": "Joe",
"role": "sysop"
}
},
"_explanation": {
"value": 1.0,
"description": "sum of:",
"details": [
{
"value": 1.0,
"description": "properties.name:joe*",
"details": []
},
{
"value": 0.0, // note this
"description": "ConstantScore(properties.role:student properties.role:sysop)^0.0",
"details": []
}
]
}
}
]
}
}
Use filter, filter just remove documents, and wont affect the score:
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-filter-context.html

Elasticsearch Similar Text Query

Given the following documents in an index (lets call it addresses):
{
ADDRESS: {
ID: 1,
LINE1: "steet 1",
CITY: "kuala lumpur",
COUNTRY: "MALAYSIA",
...
}
}
{
ADDRESS: {
ID: 2,
LINE1: "steet 1",
CITY: "kualalumpur city",
COUNTRY: "MALAYSIA",
...
}
}
{
ADDRESS: {
ID: 3,
LINE1: "steet 1",
CITY: "kualalumpur",
COUNTRY: "MALAYSIA",
...
}
}
{
ADDRESS: {
ID: 4,
LINE1: "steet 1",
CITY: "kuala lumpur city",
COUNTRY: "MALAYSIA",
...
}
}
At this point, I found the query to grab "kualalumpur", "kuala lumpur", "kualalumpur city" with the search text "kualalumpur".
But "kuala lumpur city" is missing from the result despite near similarity with "kualalumpur city".
Here is my query so far:
{
"query": {
"bool": {
"should": [
{"match": {"ADDRESS.STREET": {"query": "street 1", "fuzziness": 1, "operator": "AND"}}},
{
"bool": {
"should": [
{"match": {"ADDRESS.CITY": {"query": "kualalumpur", "fuzziness": 1, "operator": "OR"}}},
{"match": {"ADDRESS.CITY.keyword": {"query": "kualalumpur", "fuzziness": 1, "operator": "OR"}}}
]
}
}
],
"filter": {
"bool": {
"must": [
{"term": {"ADDRESS.COUNTRY.keyword": "MALAYSIA"}}
]
}
},
"minimum_should_match": 2
}
}
}
Given the condition, is it possible at all for Elasticsearch to return all four documents with search text "kualalumpur"?
You can use edge-n gram tokenizer on the country field to get the all four docs, tried it in my local and adding below working example.
Create custom analyzer and apply it on your field
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"type": "custom",
"filter": [
"lowercase"
],
"tokenizer": "edgeNGramTokenizer"
}
},
"tokenizer": {
"edgeNGramTokenizer": {
"token_chars": [
"letter",
"digit"
],
"min_gram": "1",
"type": "edgeNGram",
"max_gram": "40"
}
}
},
"max_ngram_diff": "50"
}
},
"mappings": {
"properties": {
"country": {
"type": "text",
"analyzer" : "ngram_analyzer"
}
}
}
}
Index your all four sample docs, like below
{
"country" : "kuala lumpur"
}
search query with term kualalumpur matches all four docs
{
"query": {
"match" : {
"country" : "kualalumpur"
}
}
}
"hits": [
{
"_index": "fuzzy",
"_type": "_doc",
"_id": "3",
"_score": 5.0003963,
"_source": {
"country": "kualalumpur"
}
},
{
"_index": "fuzzy",
"_type": "_doc",
"_id": "2",
"_score": 4.4082437,
"_source": {
"country": "kualalumpur city"
}
},
{
"_index": "fuzzy",
"_type": "_doc",
"_id": "1",
"_score": 0.5621849,
"_source": {
"country": "kuala lumpur"
}
},
{
"_index": "fuzzy",
"_type": "_doc",
"_id": "4",
"_score": 0.4956103,
"_source": {
"country": "kuala lumpur city"
}
}
]

Elasticsearch (v 5.2) tags with ranking implementation

I have a system that calculates tags of documents and index it into Elasticsearch, later server will search for those documents according to those tags. Now my problem is that I would like to add my own ranking / weight for each tag, and later search and have score of those documents according to the rankings / weights I set.
Assuming I have some documents like the below documents, how do I search and consider my_rank field per specific tag value (In this example user.first = Jhon)?
Example documents:
[
{
"_index": "ehud_test_nested",
"_type": "my_type",
"_id": "2",
"_score": 1,
"_source": {
"group": "tags",
"user": [
{
"first": "John",
"my_rank": 100
},
{
"first": "Alice",
"my_rank": 1
},
{
"first": "bob",
"my_rank": 3
}
]
}
},
{
"_index": "ehud_test_nested",
"_type": "my_type",
"_id": "1",
"_score": 1,
"_source": {
"group": "tags",
"user": [
{
"first": "John",
"my_rank": 1
},
{
"first": "Alice",
"my_rank": 10
},
{
"first": "bob",
"my_rank": 30
}
]
}
}
]
Found it.
User object must be of type nested.
Use Field value
factor in order to set rank inside scoring.
Example query:
{
"query": {
"nested": {
"path": "user",
"query": {
"function_score": {
"query": {
"bool": {
"should": [
{
"match": {
"user.first": "John"
}
},
{
"match": {
"high.tag": "Alice"
}
}
]
}
},
"boost": "1",
"functions": [
{
"field_value_factor": {
"field": "user.my_rank",
"factor": 1,
"modifier": "none",
"missing": 1
}
}
]
}
}
}
}
}

elasticsearch terms on bool field not working

I have this query that returns always null :
{
"query": {
"bool": {
"should": {
"nested": {
"query": {
"bool": {
"must": [
{
"term": {
"old": false
}
}
]
}
},
"path": "jobOffers"
}
}
}
}
}
Here's what match all returns :
{
"hits": [{
"_index": "dev",
"_type": "recruitment",
"_id": "202837r",
"_score": 1,
"_routing": "202837",
"_parent": "202837",
"_source": {
"score": 1,
"jobOffers": [{
"jobId": "jksncdjkqsnhcjkqs",
"jobCompany": "company 1",
"jobTitle": "Comptable",
"old": false
}],
"totalCount": 1
}
},
{
"_index": "dev",
"_type": "recruitment",
"_id": "202838r",
"_score": 1,
"_routing": "202838",
"_parent": "202838",
"_source": {
"score": 1,
"jobOffers": [{
"jobId": "wxjkckjwxhcmlazdkklqjkcn",
"jobCompany": "company 2",
"jobTitle": "Commercial",
"old": false
},
{
"jobId": "lxjkckazdwxctrzadjkoo",
"jobCompany": "company 2",
"jobTitle": "Chargé de développement commercial",
"old": false
}
],
"totalCount": 2
},
...
}
I made sure I'am querying the right index and the right type. Is this behavior normal? How can I make it return the expected result?
In your query, you need to write jobOffers.old instead of just old
{
"query": {
"bool": {
"should": {
"nested": {
"query": {
"bool": {
"must": [
{
"term": {
"jobOffers.old": false <--- modify this
}
}
]
}
},
"path": "jobOffers"
}
}
}
}
}

Resources