ElasticSearch apply should and range

ElasticSearch apply should and range - elasticsearch

My situation:
I'm working with an ElasticSearch database and I can't combine a couple of "ORs" with a couple of "ANDs". I've written the SQL query below to show what I want; in it I've used confirmedPlayers and pendingPlayers as if they were arrays. Of course I know we can't do that in SQL, but I just wanted to give an example.
If you want me to add my mappings, I will; I just don't want to make the post too long.
This is my query in SQL:
SELECT *
FROM match
WHERE (
"AVnJOMvXOX1s7Ny2Wu9O" in confirmedPlayers OR
"AVnJOMvXOX1s7Ny2Wu9O" in pendingPlayers OR
"AVnJOMvXOX1s7Ny2Wu9O" = creator
)
AND date >= "20/01/2016"
/* AND other filter will be added */
This is my match type info:
{
"took": 79,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 8,
"max_score": 1,
"hits": [
{
"_index": "yojuego",
"_type": "match",
"_id": "AVmak0bWIjogo0aNpbGs",
"_score": 1,
"_source": {
"title": "Mi primer match",
"date": "2016-01-13T20:31:20.000Z",
"fromTime": "19:00",
"toTime": "20:00",
"location": "casa de pablo",
"creator": "AVmabq-5Ijogo0aNpbGn",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVm0ETbT0Y26YggShbFa",
"_score": 1,
"_source": {
"title": "Mi primer match",
"date": "2016-01-13T20:31:20.000Z",
"fromTime": "19:00",
"toTime": "20:00",
"location": "casa de pablo",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [
"AVmBKi21XRKVuACJGZZZ",
"AVmabq-5Ijogo0aNpbGn"
],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVmab1G5Ijogo0aNpbGo",
"_score": 1,
"_source": {
"title": "Mi primer match",
"date": "2016-01-13T20:31:20.000Z",
"fromTime": "19:00",
"toTime": "20:00",
"location": "casa de pablo",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [
"AVmabVjUIjogo0aNpbGm",
"AVmBKi21XRKVuACJGZZZ"
],
"pendingPlayers": [
"AVmBKi21XRKVuACJGZZZ"
],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVm0EPX20Y26YggShbFZ",
"_score": 1,
"_source": {
"title": "Mi primer match",
"date": "2016-01-13T20:31:20.000Z",
"fromTime": "19:00",
"toTime": "20:00",
"location": "casa de pablo",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [
"AVmabVjUIjogo0aNpbGm",
"AVmabq-5Ijogo0aNpbGn"
],
"pendingPlayers": [
"AVmBKi21XRKVuACJGZZZ"
],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "match",
"_score": 1,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVnJOMvXOX1s7Ny2Wu9O",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-9fOJxj9yxI50RS3",
"_score": 1,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [
"AVnJOMvXOX1s7Ny2Wu9O"
],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-ykMJxj9yxI50RS1",
"_score": 1,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVnJOMvXOX1s7Ny2Wu9O",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-73OJxj9yxI50RS2",
"_score": 1,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [
"AVnJOMvXOX1s7Ny2Wu9O"
],
"pendingPlayers": [],
"comments": []
}
}
]
}
}
This query returns 4 matches, and it is OK.
http://localhost:9200/my_index/match
POST _search
{
"query": {
"bool": {
"should": [
{ "term": { "confirmedPlayers": { "value": "AVnJOMvXOX1s7Ny2Wu9O" } } },
{ "term": { "pendingPlayers": { "value": "AVnJOMvXOX1s7Ny2Wu9O" } } },
{ "term": { "creator": { "value": "AVnJOMvXOX1s7Ny2Wu9O" } } }
],
"must": [
{ "range": { "date": { "gte": "20/01/2016", "format": "dd/MM/yyyy" } } }
]
}
}
}
//RESULT
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1.6931472,
"hits": [
{
"_index": "yojuego",
"_type": "match",
"_id": "match",
"_score": 1.6931472,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVnJOMvXOX1s7Ny2Wu9O",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-73OJxj9yxI50RS2",
"_score": 1.6931472,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [
"AVnJOMvXOX1s7Ny2Wu9O"
],
"pendingPlayers": [],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-9fOJxj9yxI50RS3",
"_score": 1.287682,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [
"AVnJOMvXOX1s7Ny2Wu9O"
],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-ykMJxj9yxI50RS1",
"_score": 1.287682,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVnJOMvXOX1s7Ny2Wu9O",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [],
"comments": []
}
}
]
}
}
But this query is returning 4 matches too, and this is the case where it should not return anything.
POST _search
{
"query": {
"bool": {
"should": [
{ "term": { "confirmedPlayers": { "value": "inexistant" } } },
{ "term": { "pendingPlayers": { "value": "inexistant" } } },
{ "term": { "creator": { "value": "inexistant" } } }
],
"must": [
{ "range": { "date": { "gte": "20/01/2016", "format": "dd/MM/yyyy" } } }
]
}
}
}
//RESULT
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "yojuego",
"_type": "match",
"_id": "match",
"_score": 1,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVnJOMvXOX1s7Ny2Wu9O",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-9fOJxj9yxI50RS3",
"_score": 1,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [
"AVnJOMvXOX1s7Ny2Wu9O"
],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-ykMJxj9yxI50RS1",
"_score": 1,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVnJOMvXOX1s7Ny2Wu9O",
"matchType": "5",
"confirmedPlayers": [],
"pendingPlayers": [],
"comments": []
}
},
{
"_index": "yojuego",
"_type": "match",
"_id": "AVnm-73OJxj9yxI50RS2",
"_score": 1,
"_source": {
"title": "otro match 3",
"date": "2017-12-28T00:00:00.000Z",
"fromTime": "21:00",
"toTime": "22:00",
"location": "somewhere",
"creator": "AVmabVjUIjogo0aNpbGm",
"matchType": "5",
"confirmedPlayers": [
"AVnJOMvXOX1s7Ny2Wu9O"
],
"pendingPlayers": [],
"comments": []
}
}
]
}
}
Mappings:
{
"match": {
"properties": {
"title": { "type": "string" },
"date": { "type": "date" },
"fromTime": { "type": "string" },
"toTime": { "type": "string" },
"location": { "type": "string" },
"matchType": { "type": "integer" },
"creator": {
"type": "string",
"index": "not_analyzed"
},
"confirmedPlayers" : {
"type": "string",
"index": "not_analyzed"
},
"pendingPlayers" : {
"type": "string",
"index": "not_analyzed"
},
"comments" : {
"properties" : {
"id" : { "type" : "integer" },
"owner" : { "type" : "string" },
"text" : { "type" : "string" },
"writtenOn": { "type": "date" }
}
}
}
}
}
The problem comes up when I use should and must together. If I use should and must separately, they work fine.

Based on the result of your second example query (where you claim that 0 results should be returned), it seems you have some confusion about the way that should works in elasticsearch.
I'll quote from the documentation
should
The clause (query) should appear in the matching document. In a
boolean query with no must or filter clauses, one or more should
clauses must match a document. The minimum number of should clauses to
match can be set using the minimum_should_match parameter.
If you use a query with a should and a must, it isn't actually necessary that the should clause hits, only the must clause. If the should clauses do happen to hit, they will be ranked higher in the results.
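You have options though. One option: you can write a simple should query and set the minimum_should_match parameter, then wrap that query in a filtered clause to filter based on the date (equivalently, keep your bool query as it is and add "minimum_should_match": 1 next to the should array, so that at least one should clause is required in addition to the must clause). Second option: create a nested bool query, with the must clause inside the should clause (or, the other way round, with the should clauses wrapped in their own bool query placed inside the must clause).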

Related

How can I prioritize documents in Elasticsearch query

I have products in my index. Documents are basically structured like these:
{
"_id": "product",
"_source": {
...
"type": "product",
"id": 1,
"mainTaxon": {
"name": "T-SHIRT",
},
"attributes": [
{
"code": "name",
"name": "Name",
"value": [
"BANANA T-SHIRT"
],
"score": 50
},
]
}
},
{
"_id": "product",
"_source": {
...
"type": "product",
"id": 2,
"mainTaxon": {
"name": "JEANS",
},
"attributes": [
{
"code": "name",
"name": "Name",
"value": [
"BANANA JEANS"
],
"score": 50
},
]
}
}
}
When I search for 'BANANA' I would like to give lower priority to products whose mainTaxon is JEANS. So, every product with the mainTaxon name T-SHIRT or something else would be listed before products with mainTaxon JEANS.

You can use boosting query to prioritize documents
{
"query": {
"boosting": {
"positive": {
"match": {
"attributes.value": "banana"
}
},
"negative": {
"match": {
"mainTaxon.name": "JEANS"
}
},
"negative_boost": 0.5
}
}
}
Search Result will be
"hits": [
{
"_index": "67164768",
"_type": "_doc",
"_id": "1",
"_score": 0.5364054,
"_source": {
"type": "product",
"id": 1,
"mainTaxon": {
"name": "T-SHIRT"
},
"attributes": [
{
"code": "name",
"name": "Name",
"value": [
"BANANA T-SHIRT"
],
"score": 50
}
]
}
},
{
"_index": "67164768",
"_type": "_doc",
"_id": "2",
"_score": 0.32743764,
"_source": {
"type": "product",
"id": 2,
"mainTaxon": {
"name": "JEANS"
},
"attributes": [
{
"code": "name",
"name": "Name",
"value": [
"BANANA JEANS"
],
"score": 50
}
]
}
}
]

ES how to refactor a querying with a join like?

I've created 2 queries in elastic search:
GET poc-2020.01.09/_search
{
"size": 1000,
"query": {
"wildcard": {
"message": {
"value": "got*", <-------------
"boost": 1.0,
"rewrite": "constant_score"
}
}
}
}
returns:
{
"_index": "poc-2020.01.09",
"_type": "doc",
"_id": "YicNiG8BsW6Znkt6BLuc",
"_score": 1,
"_source": {
"offset": 618993630,
"node.tag": "taskmanager",
"logfile.name": "app-taskmanager-1-94hhg.log",
"logtype": "app",
"beat": {
"version": "6.3.2",
"hostname": "infra",
"name": "infra"
},
"@version": "1",
"type": "beats",
"tags": [
"beats_input_codec_plain_applied"
],
"@timestamp": "2020-01-09T02:05:51.251Z",
"source": "/nfsdata/ecs/log/app-taskmanager-1-94hhg.log",
"message": """2020-01-09 02:04:39,825 INFO Utils - got OOO: XYZ:912828YZ7 Metrics:[] """,
"app.tag": "flink-app-treasury-enriched-position",
"env": "DEV",
"host": {
"name": "infra-elkagent-29-xhx2x"
}
}
},
(In this example it returns the "got OOO" messages, with XYZ:912828YZ7.)
And querying
GET gsp.datasphere.flink.poc-2020.01.09/_search
{
"size": 10,
"query": {
"wildcard": {
"message": {
"value": "outputing*",
"boost": 1.0,
"rewrite": "constant_score"
}
}
}
}
returns:
"hits": {
"total": 104605,
"max_score": 1,
"hits": [
{
"_index": "poc-2020.01.09",
"_type": "doc",
"_id": "3wfYh28BsW6Znkt67Sho",
"_score": 1,
"_source": {
"offset": 617979882,
"node.tag": "taskmanager",
"logfile.name": "app-taskmanager-1-94hhg.log",
"logtype": "app",
"beat": {
"version": "6.3.2",
"hostname": "infra-elkagent-29-xhx2x",
"name": "infra-elkagent-29-xhx2x"
},
"@version": "1",
"type": "beats",
"tags": [
"beats_input_codec_plain_applied"
],
"@timestamp": "2020-01-09T01:08:56.220Z",
"source": "/nfsdata/ecs/log/app-taskmanager-1-94hhg.log",
"message": "2020-01-09 01:07:34,011 INFO Function - Outputing gotoSchool: XYZ:912828YZ7",
"app.tag": "app-trx",
"env": "DEV",
"host": {
"name": "infra"
}
}
},
(In this example it returns the "Outputing gotoSchool" messages, with XYZ:912828YZ7.)
Please note that for each gotoSchool message there are many OOO messages.
What I really need is a query that can join them by the value XYZ:912828YZ7 (the number after "XYZ:" is the join value),
so that each parent is returned together with its children.
Can you assist?
Thanks.

Elasticsearch + Kibana + Alerting (X-Pack) For Energy Monitoring System

Can somebody help me with alerting via X-Pack for an energy monitoring system project? The main problem is that I can't read the 'Value' data from the index, which I want to compare against the upper and lower thresholds.
So here is the index:
PUT /test-1
{
"mappings": {
"Test1": {
"properties": {
"Value": {
"type": "integer"
},
"date": {
"type": "date",
"format": "yyyy-MM-dd'T'HH:mm:ss.SSSZ"
},
"UpperThreshold": {
"type": "integer"
},
"LowerThreshold": {
"type": "integer"
}
}
}
}
}
Here is the example of the input:
POST /test-1/Test1
{
"Value": "500",
"date": "2017-06-13T16:20:00.000Z",
"UpperThreshold":"450",
"LowerThreshold": "380"
}
This is my alerting code
{
"trigger": {
"schedule": {
"interval": "10s"
}
},
"input": {
"search": {
"request": {
"search_type": "query_then_fetch",
"indices": [
"logs"
],
"types": [],
"body": {
"query": {
"match": {
"message": "error"
}
}
}
}
}
},
"condition": {
"compare": {
"ctx.payload.hits.total": {
"gt": 0
}
}
},
"actions": {
"send_email": {
"email": {
"profile": "standard",
"to": [
"<account@gmail.com>"
],
"subject": "Watcher Notification",
"body": {
"text": "{{ctx.payload.hits.total}} error logs found"
}
}
}
}
}
Here is the response I got from the alerting plugin
{
"watch_id": "Alerting-Test",
"state": "execution_not_needed",
"_status": {
"state": {
"active": true,
"timestamp": "2017-07-26T15:27:35.497Z"
},
"last_checked": "2017-07-26T15:27:38.625Z",
"actions": {
"logging": {
"ack": {
"timestamp": "2017-07-26T15:27:35.497Z",
"state": "awaits_successful_execution"
}
}
}
},
"trigger_event": {
"type": "schedule",
"triggered_time": "2017-07-26T15:27:38.625Z",
"schedule": {
"scheduled_time": "2017-07-26T15:27:38.175Z"
}
},
"input": {
"search": {
"request": {
"search_type": "query_then_fetch",
"indices": [
"test-1"
],
"types": [
"Test1"
],
"body": {
"query": {
"match_all": {}
}
}
}
}
},
"condition": {
"compare": {
"ctx.payload.hits.hits.0.Value": {
"gt": 450
}
}
},
"metadata": {
"name": "Alerting-Test"
},
"result": {
"execution_time": "2017-07-26T15:27:38.625Z",
"execution_duration": 0,
"input": {
"type": "search",
"status": "success",
"payload": {
"_shards": {
"total": 5,
"failed": 0,
"successful": 5
},
"hits": {
"hits": [
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-22T12:00:00.000Z",
"LowerThreshold": "380",
"Value": "350",
"UpperThreshold": "450"
},
"_id": "AV1-1P3lArbJ1tbnct4e",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-22T18:00:00.000Z",
"LowerThreshold": "380",
"Value": "4100",
"UpperThreshold": "450"
},
"_id": "AV1-1Sq0ArbJ1tbnct4v",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-24T18:00:00.000Z",
"LowerThreshold": "380",
"Value": "450",
"UpperThreshold": "450"
},
"_id": "AV1-1eLJArbJ1tbnct6G",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-23T00:00:00.000Z",
"LowerThreshold": "380",
"Value": "400",
"UpperThreshold": "450"
},
"_id": "AV1-1VUzArbJ1tbnct5A",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-23T12:00:00.000Z",
"LowerThreshold": "380",
"Value": "390",
"UpperThreshold": "450"
},
"_id": "AV1-1X4FArbJ1tbnct5R",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-23T18:00:00.000Z",
"LowerThreshold": "380",
"Value": "390",
"UpperThreshold": "450"
},
"_id": "AV1-1YySArbJ1tbnct5T",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-26T00:00:00.000Z",
"LowerThreshold": "380",
"Value": "4700",
"UpperThreshold": "450"
},
"_id": "AV1-1mflArbJ1tbnct67",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-26T06:00:00.000Z",
"LowerThreshold": "380",
"Value": "390",
"UpperThreshold": "450"
},
"_id": "AV1-1oluArbJ1tbnct7M",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-21T12:00:00.000Z",
"LowerThreshold": "380",
"Value": "400",
"UpperThreshold": "450"
},
"_id": "AV1-1IrZArbJ1tbnct3r",
"_score": 1
},
{
"_index": "test-1",
"_type": "Test1",
"_source": {
"date": "2017-07-21T18:00:00.000Z",
"LowerThreshold": "380",
"Value": "440",
"UpperThreshold": "450"
},
"_id": "AV1-1LwzArbJ1tbnct38",
"_score": 1
}
],
"total": 20,
"max_score": 1
},
"took": 1,
"timed_out": false
},
"search": {
"request": {
"search_type": "query_then_fetch",
"indices": [
"test-1"
],
"types": [
"Test1"
],
"body": {
"query": {
"match_all": {}
}
}
}
}
},
"condition": {
"type": "compare",
"status": "success",
"met": false,
"compare": {
"resolved_values": {
**"ctx.payload.hits.hits.0.Value": null**
}
}
},
"actions": []
},
"messages": []
}
I really appreciate your help!

ElasticSearch inconsistent relevance

I'm using elasticsearch to do search for movies by the actors that played in them. When I search for e.g. "leonardo dicaprio" there are 10 or so movies that I get back but they all have a different score. Since they all have the same actor I would expect them to have the same score. Is anyone able to shed some light on why this is happening and hopefully how to stop it?
Elasticsearch version 1.7.2
Mapping:
{
"programs": {
"mappings": {
"program_doc_type": {
"properties": {
"cast": {
"type": "string",
"analyzer": "keyword_analyzer",
"fields": {
"name": {
"type": "string",
"analyzer": "name_analyzer"
}
}
},
"django_id": {
"type": "integer"
},
"has_poster": {
"type": "boolean"
},
"imdb_id": {
"type": "string",
"index": "not_analyzed"
},
"kind": {
"type": "string",
"index": "not_analyzed"
},
"record_url_count": {
"type": "integer"
},
"release_date": {
"type": "date",
"format": "dateOptionalTime"
},
"release_year": {
"type": "integer"
},
"title": {
"type": "string",
"analyzer": "pattern"
},
"tms_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
Analyzers:
"analysis": {
"analyzer": {
"keyword_analyzer": {
"type": "custom",
"filter": [
"lowercase"
],
"tokenizer": "keyword"
},
"name_analyzer": {
"type": "custom",
"filter": [
"lowercase"
],
"tokenizer": "whitespace"
}
}
}
Query:
{
"query": {
"match": {"cast.name": "leonardo dicaprio"}
}
}
First Page Result:
{
"took": 12,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 62,
"max_score": 12.046804,
"hits": [
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "1077511",
"_score": 12.046804,
"_source": {
"imdb_id": "tt4007278",
"tms_id": "",
"record_url_count": 0,
"release_date": "2014-08-20",
"title": "Carbon",
"has_poster": false,
"release_year": 2014,
"django_id": 1077511,
"kind": "movie",
"cast": [
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "812919",
"_score": 11.906615,
"_source": {
"imdb_id": "tt2076929",
"tms_id": "",
"record_url_count": 0,
"title": "Satori",
"has_poster": false,
"release_year": 2014,
"django_id": 812919,
"kind": "N/A",
"cast": [
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "376792",
"_score": 11.886408,
"_source": {
"imdb_id": "tt0402538",
"tms_id": "",
"record_url_count": 0,
"title": "Titanic: The Premiere",
"has_poster": true,
"release_year": 2000,
"django_id": 376792,
"kind": "movie",
"cast": [
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "306106",
"_score": 11.69776,
"_source": {
"imdb_id": "tt0325727",
"tms_id": "",
"record_url_count": 0,
"release_date": "1998-08-16",
"title": "Leo Mania",
"has_poster": true,
"release_year": 1998,
"django_id": 306106,
"kind": "movie",
"cast": [
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "269743",
"_score": 9.637444,
"_source": {
"imdb_id": "tt0286234",
"tms_id": "",
"record_url_count": 0,
"title": "Total Eclipse",
"has_poster": false,
"release_year": 1995,
"django_id": 269743,
"kind": "movie",
"cast": [
"Leonardo DiCaprio",
"Agnieszka Holland"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "840945",
"_score": 9.358208,
"_source": {
"imdb_id": "tt2195237",
"tms_id": "",
"record_url_count": 0,
"release_date": "2004-12-01",
"title": "MovieReal: The Aviator",
"has_poster": false,
"release_year": 2004,
"django_id": 840945,
"kind": "series",
"cast": [
"Leonardo DiCaprio",
"Martin Scorsese"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "382168",
"_score": 9.358208,
"_source": {
"imdb_id": "tt0408269",
"tms_id": "",
"record_url_count": 0,
"release_date": "1998-09-29",
"title": "To Leo with Love",
"has_poster": true,
"release_year": 1998,
"django_id": 382168,
"kind": "movie",
"cast": [
"Jo Wyatt",
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "846212",
"_score": 7.2280827,
"_source": {
"imdb_id": "tt2218442",
"tms_id": "",
"record_url_count": 0,
"title": "Legacy of Secrecy",
"has_poster": false,
"release_year": 1947,
"django_id": 846212,
"kind": "N/A",
"cast": [
"Leonardo DiCaprio",
"Robert De Niro",
"D'Anthony Palms"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "595027",
"_score": 7.1439695,
"_source": {
"imdb_id": "tt1294988",
"tms_id": "",
"record_url_count": 0,
"release_date": "2006-09-27",
"title": "Emporio Armani 'Red' One Night Only",
"has_poster": false,
"release_year": 2006,
"django_id": 595027,
"kind": "movie",
"cast": [
"Kim Cattrall",
"Leonardo DiCaprio",
"Beyoncé Knowles"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "752646",
"_score": 7.1439695,
"_source": {
"imdb_id": "tt1826731",
"tms_id": "",
"record_url_count": 0,
"release_date": "2009-06-02",
"title": "Lives of Quiet Desperation: The Making of Revolutionary Road",
"has_poster": false,
"release_year": 2009,
"django_id": 752646,
"kind": "movie",
"cast": [
"Kathy Bates",
"Leonardo DiCaprio",
"Kate Winslet"
]
}
}
]
}
}
UPDATE:
I disabled the field length norm and that seems to have improved it a lot, but the scores still aren't all the same. I'm still confused. According to what I've read, there are three factors that determine relevancy:
Term frequency
Inverse document frequency
Field length norm (disabled)
Since each program only has Leonardo DiCaprio one time, it seems to me that they should have identical scores, but they don't. Maybe I'm misunderstanding. Here are the updated settings after disabling the field length norm:
Mapping:
{
"programs": {
"mappings": {
"program_doc_type": {
"properties": {
"cast": {
"type": "string",
"norms": {
"enabled": false
},
"analyzer": "keyword_analyzer",
"fields": {
"name": {
"type": "string",
"norms": {
"enabled": false
},
"analyzer": "name_analyzer"
}
}
},
"django_id": {
"type": "integer"
},
"has_poster": {
"type": "boolean"
},
"imdb_id": {
"type": "string",
"index": "not_analyzed"
},
"kind": {
"type": "string",
"index": "not_analyzed"
},
"record_url_count": {
"type": "integer"
},
"release_date": {
"type": "date",
"format": "dateOptionalTime"
},
"release_year": {
"type": "integer"
},
"title": {
"type": "string",
"analyzer": "pattern"
},
"tms_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
First Page Result:
{
"took": 20,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 836,
"max_score": 13.778852,
"hits": [
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "421026",
"_score": 13.778852,
"_source": {
"tms_id": "",
"django_id": 421026,
"imdb_id": "tt0449557",
"has_poster": false,
"release_date": "2005-05-24",
"kind": "movie",
"cast": [
"Leonardo DiCaprio",
"Jeffrey M. Schwartz",
"Donald L. Barlett",
"James B. Steele"
],
"release_year": 2005,
"record_url_count": 0,
"title": "The Affliction of Howard Hughes: Obsessive-Compulsive Disorder"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "555015",
"_score": 13.778852,
"_source": {
"tms_id": "MV002510340000",
"django_id": 555015,
"imdb_id": "tt1130884",
"has_poster": true,
"release_date": "2010-02-19",
"kind": "movie",
"cast": [
"Leonardo DiCaprio",
"Mark Ruffalo",
"Ben Kingsley",
"Max von Sydow"
],
"release_year": 2010,
"record_url_count": 2,
"title": "Shutter Island"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "104669",
"_score": 13.778852,
"_source": {
"tms_id": "",
"django_id": 104669,
"imdb_id": "tt0108330",
"has_poster": true,
"release_date": "1993-04-23",
"kind": "movie",
"cast": [
"Robert De Niro",
"Ellen Barkin",
"Leonardo DiCaprio",
"Jonah Blechman"
],
"release_year": 1993,
"record_url_count": 1,
"title": "This Boy's Life"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "846212",
"_score": 13.778852,
"_source": {
"django_id": 846212,
"title": "Legacy of Secrecy",
"imdb_id": "tt2218442",
"has_poster": false,
"kind": "N/A",
"cast": [
"Leonardo DiCaprio",
"Robert De Niro",
"D'Anthony Palms"
],
"release_year": 1947,
"record_url_count": 0,
"tms_id": ""
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "256632",
"_score": 13.778852,
"_source": {
"django_id": 256632,
"title": "The Movie Show",
"imdb_id": "tt0271918",
"has_poster": false,
"kind": "series",
"cast": [
"Ray Brady",
"Russell Crowe",
"Larry Day",
"Leonardo DiCaprio"
],
"release_year": 1986,
"record_url_count": 0,
"tms_id": ""
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "269743",
"_score": 13.778852,
"_source": {
"django_id": 269743,
"title": "Total Eclipse",
"imdb_id": "tt0286234",
"has_poster": false,
"kind": "movie",
"cast": [
"Leonardo DiCaprio",
"Agnieszka Holland"
],
"release_year": 1995,
"record_url_count": 0,
"tms_id": ""
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "1007190",
"_score": 13.778852,
"_source": {
"tms_id": "",
"django_id": 1007190,
"imdb_id": "tt3391950",
"has_poster": false,
"release_date": "2013-12-29",
"kind": "series",
"cast": [
"Leonardo DiCaprio",
"Jonah Hill",
"Martin Scorsese",
"Terence Winter"
],
"release_year": 2013,
"record_url_count": 0,
"title": "The Hollywood Reporter in Focus"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "1077511",
"_score": 13.778852,
"_source": {
"tms_id": "",
"django_id": 1077511,
"imdb_id": "tt4007278",
"has_poster": false,
"release_date": "2014-08-20",
"kind": "movie",
"cast": [
"Leonardo DiCaprio"
],
"release_year": 2014,
"record_url_count": 0,
"title": "Carbon"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "302615",
"_score": 13.57246,
"_source": {
"django_id": 302615,
"title": "Directors: James Cameron",
"imdb_id": "tt0322031",
"has_poster": true,
"kind": "movie",
"cast": [
"Michael Biehn",
"James Cameron",
"Jamie Lee Curtis",
"Leonardo DiCaprio"
],
"release_year": 1997,
"record_url_count": 0,
"tms_id": ""
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "509785",
"_score": 13.57246,
"_source": {
"tms_id": "",
"django_id": 509785,
"imdb_id": "tt0923573",
"has_poster": false,
"release_date": "2003-05-06",
"kind": "movie",
"cast": [
"Frank Abagnale Jr.",
"Amy Adams",
"Nathalie Baye",
"Leonardo DiCaprio"
],
"release_year": 2003,
"record_url_count": 0,
"title": "'Catch Me If You Can': The Casting of the Film"
}
}
]
}
}
The results are MUCH improved but still the last 2 have different scores than the rest of the results.

Elasticsearch relevancy default model is called TF/IDF. You can read more about it here.
The _score you see in your search hits is calculated by this model.
Basically, the score is a result of a calculation on three factors (more info here):
Term frequency - How often does a term appear in a specific document? TF
Inverse document frequency - How often does the term appear in all documents in the collection? IDF
Field-length norm - How long is the field?
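As you can infer from the above, documents containing leonardo dicaprio differ in how often the terms match, in the length of the field, and in how frequent the terms are across the index, so their relevancy scores differ. Note also that by default IDF is computed per shard rather than over the whole index, so documents on different shards can get slightly different scores even with norms disabled; searching with search_type=dfs_query_then_fetch uses global term statistics instead.
Nevertheless, documents that contain leonardo dicaprio get higher scores than those that don't.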
Hope it helps.

How to sort by match prioritising the most left words matched

How to sort by match prioritising the most left words matched
Explanation
Sort the results of the prefix query by name, but prioritise matches that occur in words further to the left.
Tests I've made
Data
DELETE /test
PUT /test
PUT /test/person/_mapping
{
"properties": {
"name": {
"type": "multi_field",
"fields": {
"name": {"type": "string"},
"original": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
PUT /test/person/1
{"name": "Berta Kassulke"}
PUT /test/person/2
{"name": "Kaley Bartoletti"}
PUT /test/person/3
{"name": "Kali Hahn"}
PUT /test/person/4
{"name": "Karolann Klein"}
PUT /test/person/5
{"name": "Sofia Mandez Kaloo"}
The mapping was added for the 'sort on original value' test.
Simple query
Query
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
}
}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": 1,
"_source": {
"name": "Karolann Klein"
}
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": 1,
"_source": {
"name": "Sofia Mandez Kaloo"
}
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": 1,
"_source": {
"name": "Berta Kassulke"
}
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": 1,
"_source": {
"name": "Kaley Bartoletti"
}
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": 1,
"_source": {
"name": "Kali Hahn"
}
}
]
}
}
With sorting
Request
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
},
"sort": {"name": {"order": "asc"}}
}
Result
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"berta"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"kaloo"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"karolann"
]
}
]
}
}
With sort on original value
Query
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
},
"sort": {"name.original": {"order": "asc"}}
}
Result
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"Berta Kassulke"
]
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"Kaley Bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"Kali Hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"Karolann Klein"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"Sofia Mandez Kaloo"
]
}
]
}
}
Intended result
Sorted by name ASC but prioritising the matches on the most left words
Kaley Bartoletti
Kali Hahn
Karolann Klein
Berta Kassulke
Sofia Mandez Kaloo

Good Question. One way to achieve this would be with the combination of edge ngram filter and span first query
This is my setting
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"tokenizer": "standard",
"filter": ["lowercase",
"edge_filter",
"asciifolding"
]
}
},
"filter": {
"edge_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 8
}
}
}
},
"mappings": {
"person": {
"properties": {
"name": {
"type": "string",
"analyzer": "my_custom_analyzer",
"search_analyzer": "standard",
"fields": {
"standard": {
"type": "string"
}
}
}
}
}
}
}
After that I inserted your sample documents. Then I wrote the following query with dis_max. Notice that the end parameter of the first span_first query is 1, so a match in the leftmost word gets the highest score, while the second span_first query (end: 2) also rewards a match in the second word. I sort first by score and then by name.
{
"query": {
"dis_max": {
"tie_breaker": 0.7,
"boost": 1.2,
"queries": [
{
"match": {
"name": "ka"
}
},
{
"span_first": {
"match": {
"span_term": {
"name": "ka"
}
},
"end": 1
}
},
{
"span_first": {
"match": {
"span_term": {
"name": "ka"
}
},
"end": 2
}
}
]
}
},
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"name.standard": {
"order": "asc"
}
}
]
}
The result I get
"hits": [
{
"_index": "esedge",
"_type": "policy_data",
"_id": "2",
"_score": 0.72272325,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
0.72272325,
"bartoletti"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "3",
"_score": 0.72272325,
"_source": {
"name": "Kali Hahn"
},
"sort": [
0.72272325,
"hahn"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "4",
"_score": 0.72272325,
"_source": {
"name": "Karolann Klein"
},
"sort": [
0.72272325,
"karolann"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "1",
"_score": 0.54295504,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
0.54295504,
"berta"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "5",
"_score": 0.2905494,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
0.2905494,
"kaloo"
]
}
]
I hope this helps.

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

ElasticSearch apply should and range - elasticsearch

Related

How can I prioritize documents in Elasticsearch query

ES how to refactor a querying with a join like?

Elasticsearch + Kibana + Alerting (X-Pack) For Energy Monitoring System

ElasticSearch inconsistent relevance

How to sort by match prioritising the most left words matched

Categories

Resources