Boost score of term query result with multiple matches - elasticsearch

I have several documents like the following stored in my Elasticsearch index:
PUT tests
{
  "mappings": {
    "_doc": {
      "dynamic": false,
      "properties": {
        "objects": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        },
        "text": {
          "type": "text"
        }
      }
    }
  }
}
PUT tests/_doc/1
{
  "text": "lel",
  "objects": ["A"]
}
PUT tests/_doc/2
{
  "text": "lol",
  "objects": ["B"]
}
PUT tests/_doc/3
{
  "text": "lil",
  "objects": ["C"]
}
PUT tests/_doc/4
{
  "text": "lul",
  "objects": ["A", "B", "C"]
}
I want to query for objects with the following query:
GET _search
{
  "query": {
    "terms": {
      "objects.keyword": ["A", "B", "C"]
    }
  }
}
The result includes all four sample documents shown above.
My question is simply whether I can give a higher score (boost) to a document with a full match (all keywords present in its objects array) than to documents with only a partial match, and if so, how; I could not find any information on this in the Elasticsearch documentation.
This is the result I am currently receiving:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 11,
"successful": 11,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "tests",
"_type": "_doc",
"_id": "2",
"_score": 1,
"_source": {
"text": "lol",
"objects": [
"B"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "4",
"_score": 1,
"_source": {
"text": "lul",
"objects": [
"A",
"B",
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "1",
"_score": 1,
"_source": {
"text": "lel",
"objects": [
"A"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "3",
"_score": 1,
"_source": {
"text": "lil",
"objects": [
"C"
]
}
}
]
}
}

I think your best bet is using a bool query with should and minimum_should_match: 1.
GET _search
{
  "query": {
    "bool": {
      "should": [
        {
          "term": {
            "objects.keyword": "A"
          }
        },
        {
          "term": {
            "objects.keyword": "B"
          }
        },
        {
          "term": {
            "objects.keyword": "C"
          }
        }
      ],
      "minimum_should_match": 1
    }
  }
}
Results:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 6,
"successful": 6,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1.5686159,
"hits": [
{
"_index": "tests",
"_type": "_doc",
"_id": "4",
"_score": 1.5686159,
"_source": {
"text": "lul",
"objects": [
"A",
"B",
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "1",
"_score": 0.2876821,
"_source": {
"text": "lel",
"objects": [
"A"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "3",
"_score": 0.2876821,
"_source": {
"text": "lil",
"objects": [
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "2",
"_score": 0.18232156,
"_source": {
"text": "lol",
"objects": [
"B"
]
}
}
]
}
}
EDIT: Here's why, as explained by the docs (https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html):
The bool query takes a more-matches-is-better approach, so the score from each matching must or should clause will be added together to provide the final _score for each document.
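If the natural more-matches-is-better effect isn't a strong enough signal, one option (a sketch; the boost value of 2 is an arbitrary choice) is to add an extra should clause that only fires when all three keywords are present, so full matches get an additional boosted score on top:
GET _search
{
  "query": {
    "bool": {
      "should": [
        { "term": { "objects.keyword": "A" } },
        { "term": { "objects.keyword": "B" } },
        { "term": { "objects.keyword": "C" } },
        {
          "bool": {
            "must": [
              { "term": { "objects.keyword": "A" } },
              { "term": { "objects.keyword": "B" } },
              { "term": { "objects.keyword": "C" } }
            ],
            "boost": 2
          }
        }
      ],
      "minimum_should_match": 1
    }
  }
}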

Related

Elasticsearch - Return nested values in format

How can I make Elasticsearch return nested values in the format hits {value1: ..., value2: ..., value3: ..., etc.}?
This is my request:
{
  "_source": 0,
  "query": {
    "bool": {
      "must": [
        {
          "nested": {
            "path": "photo",
            "query": {
              "bool": {
                "must": [
                  {
                    "match": {
                      "photo.hello": "true"
                    }
                  }
                ]
              }
            },
            "inner_hits": {}
          }
        }
      ]
    }
  }
}
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.2231436,
"hits": [
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "6UAaCls5iSgavEtFE2qMX902Xmb2",
"_score": 1.2231436,
"inner_hits": {
"photo": {
"hits": {
"total": 1,
"max_score": 1.2231436,
"hits": [
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "6UAaCls5iSgavEtFE2qMX902Xmb2",
"_nested": {
"field": "photo",
"offset": 0
},
"_score": 1.2231436,
"_source": {
"hello": "true",
"i_am_superCOOL": "true",
"xoxox": "true",
"id": "-KSDRx5BN54JHitoq7Wb"
}
}
]
}
}
}
},
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "KDFbeXrOedf7b6NVRGMO0HDIFgx1",
"_score": 1.2231436,
"inner_hits": {
"photo": {
"hits": {
"total": 2,
"max_score": 1.2231436,
"hits": [
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "KDFbeXrOedf7b6NVRGMO0HDIFgx1",
"_nested": {
"field": "photo",
"offset": 1
},
"_score": 1.2231436,
"_source": {
"alahu": "true",
"hello": "true",
"same": "true",
"smukais": "true",
"id": "-KSDJzyUC_N5je-cR2aT"
}
},
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "KDFbeXrOedf7b6NVRGMO0HDIFgx1",
"_nested": {
"field": "photo",
"offset": 0
},
"_score": 1.2231436,
"_source": {
"hello": "true",
"same": "true",
"selfyyy": "true",
"superSexy": "true",
"id": "-KPn4p7spS8NO7IVSLdF"
}
}
]
}
}
}
}
]
}
}
I am using a two-dimensional dynamic attribute search. The problem with this approach is that the results can include 20 hits from a single user, but I need to make it priority-based. I have just stuck to the same format.
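For what it's worth, Elasticsearch can't reshape hits into an arbitrary {value1: ..., value2: ...} object server-side, but the filter_path response-filtering parameter (available since Elasticsearch 1.4) can at least strip the response down to just the nested sources. A sketch against the request above (inner bool wrapper collapsed for brevity):
POST /_search?filter_path=hits.hits.inner_hits.photo.hits.hits._source
{
  "_source": 0,
  "query": {
    "bool": {
      "must": [
        {
          "nested": {
            "path": "photo",
            "query": { "match": { "photo.hello": "true" } },
            "inner_hits": {}
          }
        }
      ]
    }
  }
}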

How to correctly aggregate when a field is a list in Elasticsearch

Currently the ES logs are indexed in a way that some fields have a list instead of a single value.
For example:
_source: {
  "field1": ["item1", "item2", "item3"],
  "field2": "something",
  "field3": "something_else"
}
Of course, the length of the list is not always the same. I'm trying to find a way to count, for each item, the number of logs containing it (so some logs will be counted multiple times).
I know I have to use aggs, but how can I form the right query (after -d)?
You can use the query below, which uses a terms aggregation driven by a script, plus a top_hits sub-aggregation.
{
  "size": 0,
  "aggs": {
    "group": {
      "terms": {
        "script": "_source.field1.each{}"
      },
      "aggs": {
        "top_hits_log": {
          "top_hits": {}
        }
      }
    }
  }
}
Output will be:
"buckets": [
{
"key": "item1",
"doc_count": 3,
"top_hits_log": {
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
},
{
"_index": "so",
"_type": "test",
"_id": "2",
"_score": 1,
"_source": {
"field1": [
"item1"
],
"field2": "something2"
}
},
{
"_index": "so",
"_type": "test",
"_id": "3",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2"
],
"field2": "something3"
}
}
]
}
}
},
{
"key": "item2",
"doc_count": 2,
"top_hits_log": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
},
{
"_index": "so",
"_type": "test",
"_id": "3",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2"
],
"field2": "something3"
}
}
]
}
}
},
{
"key": "item3",
"doc_count": 1,
"top_hits_log": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
}
]
}
}
}
]
Make sure to enable dynamic scripting by setting script.disable_dynamic: false in elasticsearch.yml (dynamic scripting is disabled by default in ES 1.x for security reasons).
Hope this helps.
There is no need to use scripting here; it will be slow, especially the _source parsing. You also need to make sure field1 is not_analyzed, or you will get strange results, because the terms aggregation works on the unique tokens in the inverted index.
{
  "size": 0,
  "aggs": {
    "unique_items": {
      "terms": {
        "field": "field1",
        "size": 100
      },
      "aggs": {
        "documents": {
          "top_hits": {
            "size": 10
          }
        }
      }
    }
  }
}
Here size is 100 inside the terms aggregation; change it according to how many unique values you expect (the default is 10).
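For reference, a minimal sketch of such a not_analyzed mapping (pre-5.x string syntax, with the index and type names taken from the sample output above):
PUT /so
{
  "mappings": {
    "test": {
      "properties": {
        "field1": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}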
Hope this helps!

How to sort by match prioritising the most left words matched

Explanation
Sort the prefix-query results by the word they match, but prioritise matches in words further to the left.
Tests I've made
Data
DELETE /test
PUT /test
PUT /test/person/_mapping
{
  "properties": {
    "name": {
      "type": "multi_field",
      "fields": {
        "name": {"type": "string"},
        "original": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}
PUT /test/person/1
{"name": "Berta Kassulke"}
PUT /test/person/2
{"name": "Kaley Bartoletti"}
PUT /test/person/3
{"name": "Kali Hahn"}
PUT /test/person/4
{"name": "Karolann Klein"}
PUT /test/person/5
{"name": "Sofia Mandez Kaloo"}
The mapping was added for the 'sort on original value' test.
Simple query
Query
POST /test/person/_search
{
  "query": {
    "prefix": {"name": {"value": "ka"}}
  }
}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": 1,
"_source": {
"name": "Karolann Klein"
}
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": 1,
"_source": {
"name": "Sofia Mandez Kaloo"
}
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": 1,
"_source": {
"name": "Berta Kassulke"
}
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": 1,
"_source": {
"name": "Kaley Bartoletti"
}
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": 1,
"_source": {
"name": "Kali Hahn"
}
}
]
}
}
With sorting
Request
POST /test/person/_search
{
  "query": {
    "prefix": {"name": {"value": "ka"}}
  },
  "sort": {"name": {"order": "asc"}}
}
Result
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"berta"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"kaloo"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"karolann"
]
}
]
}
}
With sort on original value
Query
POST /test/person/_search
{
  "query": {
    "prefix": {"name": {"value": "ka"}}
  },
  "sort": {"name.original": {"order": "asc"}}
}
Result
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"Berta Kassulke"
]
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"Kaley Bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"Kali Hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"Karolann Klein"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"Sofia Mandez Kaloo"
]
}
]
}
}
Intended result
Sorted by name ascending, but prioritising matches in the leftmost words:
Kaley Bartoletti
Kali Hahn
Karolann Klein
Berta Kassulke
Sofia Mandez Kaloo
Good question. One way to achieve this is with a combination of an edge n-gram filter and a span_first query.
These are my index settings:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "edge_filter",
            "asciifolding"
          ]
        }
      },
      "filter": {
        "edge_filter": {
          "type": "edgeNGram",
          "min_gram": 2,
          "max_gram": 8
        }
      }
    }
  },
  "mappings": {
    "person": {
      "properties": {
        "name": {
          "type": "string",
          "analyzer": "my_custom_analyzer",
          "search_analyzer": "standard",
          "fields": {
            "standard": {
              "type": "string"
            }
          }
        }
      }
    }
  }
}
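To sanity-check what the custom analyzer emits, you can run a term through the _analyze API (assuming the index is called test, as in the question); with min_gram 2 and max_gram 8, "Kaley" should produce the edge n-grams ka, kal, kale, kaley:
GET /test/_analyze?analyzer=my_custom_analyzer&text=Kaley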
After that I indexed your sample documents, then wrote the following query using dis_max. Notice that the end parameter of the first span_first query is 1, so it gives a higher score to a match in the leftmost word. I sort first by score and then by name.
{
  "query": {
    "dis_max": {
      "tie_breaker": 0.7,
      "boost": 1.2,
      "queries": [
        {
          "match": {
            "name": "ka"
          }
        },
        {
          "span_first": {
            "match": {
              "span_term": {
                "name": "ka"
              }
            },
            "end": 1
          }
        },
        {
          "span_first": {
            "match": {
              "span_term": {
                "name": "ka"
              }
            },
            "end": 2
          }
        }
      ]
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    },
    {
      "name.standard": {
        "order": "asc"
      }
    }
  ]
}
The result I get
"hits": [
{
"_index": "esedge",
"_type": "policy_data",
"_id": "2",
"_score": 0.72272325,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
0.72272325,
"bartoletti"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "3",
"_score": 0.72272325,
"_source": {
"name": "Kali Hahn"
},
"sort": [
0.72272325,
"hahn"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "4",
"_score": 0.72272325,
"_source": {
"name": "Karolann Klein"
},
"sort": [
0.72272325,
"karolann"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "1",
"_score": 0.54295504,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
0.54295504,
"berta"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "5",
"_score": 0.2905494,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
0.2905494,
"kaloo"
]
}
]
I hope this helps.

Get every Nth result in Elasticsearch

I have this large set of data and I want a sample that I can use in a graph. For this I don't need all of the data, I need every Nth item.
For instance, if I have 4000 results and only need 800, I want to be able to get every 5th result.
So something like: get, skip, skip, skip, skip, get, skip, skip, skip, ...
I was wondering if such a thing is possible in Elasticsearch?
You're better off using a script filter; otherwise you're needlessly computing scores. Filters are just like queries, except they don't use scoring.
POST /test_index/_search
{
  "query": {
    "filtered": {
      "filter": {
        "script": {
          "script": "doc['unique_counter'].value % n == 0",
          "params": {
            "n": 5
          }
        }
      }
    }
  }
}
You're also better off not using dynamic scripting in real-world usage. That said, you probably want to look at aggregations for graphing analytical information about your data, rather than taking an arbitrary sample.
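For example, if the graph is over a numeric field (here I'm assuming a field like doc_id, matching the test data used below), a histogram aggregation with a top_hits sub-aggregation returns one representative document per bucket of n, which is close in spirit to "every nth item":
POST /test_index/_search
{
  "size": 0,
  "aggs": {
    "every_n": {
      "histogram": {
        "field": "doc_id",
        "interval": 5
      },
      "aggs": {
        "sample_doc": {
          "top_hits": { "size": 1 }
        }
      }
    }
  }
}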
One way you could do it is with random scoring. It won't give you precisely every nth item according to a rigid ordering, but if you can relax that requirement this trick should do nicely.
To test it I set up a simple index (I mapped "doc_id" to "_id" just so the documents would have some contents, so that part isn't required, in case that's not obvious):
PUT /test_index
{
  "mappings": {
    "doc": {
      "_id": {
        "path": "doc_id"
      }
    }
  }
}
Then I indexed ten simple documents:
POST /test_index/doc/_bulk
{"index":{}}
{"doc_id":1}
{"index":{}}
{"doc_id":2}
{"index":{}}
{"doc_id":3}
{"index":{}}
{"doc_id":4}
{"index":{}}
{"doc_id":5}
{"index":{}}
{"doc_id":6}
{"index":{}}
{"doc_id":7}
{"index":{}}
{"doc_id":8}
{"index":{}}
{"doc_id":9}
{"index":{}}
{"doc_id":10}
Now I can pull back three random documents like this:
POST /test_index/_search
{
  "size": 3,
  "query": {
    "function_score": {
      "functions": [
        {
          "random_score": {
            "seed": "some seed"
          }
        }
      ]
    }
  }
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 0.93746644,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 0.93746644,
"_source": {
"doc_id": 1
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "10",
"_score": 0.926947,
"_source": {
"doc_id": 10
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "5",
"_score": 0.79400194,
"_source": {
"doc_id": 5
}
}
]
}
}
Or a different random three like this:
POST /test_index/_search
{
  "size": 3,
  "query": {
    "function_score": {
      "functions": [
        {
          "random_score": {
            "seed": "some other seed"
          }
        }
      ]
    }
  }
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 0.817295,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": 0.817295,
"_source": {
"doc_id": 4
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "8",
"_score": 0.469319,
"_source": {
"doc_id": 8
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 0.4374538,
"_source": {
"doc_id": 3
}
}
]
}
}
Hopefully it's clear how to generalize this method to what you need. Just pull out however many documents you want, in however many chunks it takes to keep things performant.
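Scaled to the numbers in the question (a sketch, keeping the same seed trick), pulling 800 of the 4000 documents would just mean raising size:
POST /test_index/_search
{
  "size": 800,
  "query": {
    "function_score": {
      "functions": [
        {
          "random_score": {
            "seed": "graph sample"
          }
        }
      ]
    }
  }
}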
Here is all the code I used to test:
http://sense.qbox.io/gist/a02d4da458365915f5e9cf6ea80546d2dfabc75d
EDIT: Actually now that I think about it, you could also use scripted scoring to get precisely every nth item, if you set it up right. Maybe something like,
POST /test_index/_search
{
  "size": 3,
  "query": {
    "function_score": {
      "functions": [
        {
          "script_score": {
            "script": "if(doc['doc_id'].value % 3 == 0){ return 1 }; return 0;"
          }
        }
      ]
    }
  }
}
...
{
"took": 13,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"doc_id": 3
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "6",
"_score": 1,
"_source": {
"doc_id": 6
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "9",
"_score": 1,
"_source": {
"doc_id": 9
}
}
]
}
}
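One caveat with the script_score version: documents that fail the modulo check still match, just with a score of 0, so they can leak into larger result pages. Adding a min_score threshold (a sketch) keeps only the scored documents:
POST /test_index/_search
{
  "size": 3,
  "min_score": 1,
  "query": {
    "function_score": {
      "functions": [
        {
          "script_score": {
            "script": "if(doc['doc_id'].value % 3 == 0){ return 1 }; return 0;"
          }
        }
      ]
    }
  }
}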

ElasticSearch Order By String Length

I am using Elasticsearch via NEST (C#). I have a large list of information about people:
{
  "firstName": "Frank",
  "lastName": "Jones",
  "City": "New York"
}
I'd like to filter and sort this list by lastName, ordering by length so that people with only 5 characters in their name appear at the beginning of the result set, before people with 10 characters.
So with some pseudo code I'd like to do something like
list.wildcard("j*").sort(m => lastName.length)
You can do the sorting with script-based sorting.
As a toy example, I set up a trivial index with a few documents:
PUT /test_index
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"name":"Bob"}
{"index":{"_id":2}}
{"name":"Jeff"}
{"index":{"_id":3}}
{"name":"Darlene"}
{"index":{"_id":4}}
{"name":"Jose"}
Then I can order search results like this:
POST /test_index/_search
{
  "query": {
    "match_all": {}
  },
  "sort": {
    "_script": {
      "script": "doc['name'].value.length()",
      "type": "number",
      "order": "asc"
    }
  }
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": null,
"_source": {
"name": "Bob"
},
"sort": [
3
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
To filter by length, I can use a script filter in a similar way:
POST /test_index/_search
{
  "query": {
    "filtered": {
      "query": {
        "match_all": {}
      },
      "filter": {
        "script": {
          "script": "doc['name'].value.length() > 3",
          "params": {}
        }
      }
    }
  },
  "sort": {
    "_script": {
      "script": "doc['name'].value.length()",
      "type": "number",
      "order": "asc"
    }
  }
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/22fef6dc5453eaaae3be5fb7609663cc77c43dab
P.S.: If any of the last names contain spaces, you might want to use "index": "not_analyzed" on that field.
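A minimal sketch of that, using the same 1.x-era string syntax as the rest of this answer:
PUT /test_index
{
  "mappings": {
    "doc": {
      "properties": {
        "name": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}
With not_analyzed, doc['name'].value is the whole name (spaces included), so the length script measures the full string rather than a single token.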
