Boolean similarity - is there a way to remove duplicates - elasticsearch

Given the following index
PUT /test_index
{
"mappings": {
"properties": {
"field1": {
"type": "text",
"analyzer": "whitespace",
"similarity": "boolean"
},
"field2": {
"type": "text",
"analyzer": "whitespace",
"similarity": "boolean"
}
}
}
}
and the following data
POST /test_index/_bulk?refresh=true
{ "index" : {} }
{ "field1": "foo", "field2": "bar"}
{ "index" : {} }
{ "field1": "foo1 foo2", "field2": "bar1 bar2"}
{ "index" : {} }
{ "field1": "foo1 foo2 foo3", "field2": "bar1 bar2 bar3"}
for the given Boolean similarity query
POST /test_index/_search
{
"size": 10,
"min_score": 0.4,
"query": {
"function_score": {
"query": {
"bool": {
"should": [
{
"fuzzy":{
"field1":{
"value":"foo",
"fuzziness":"AUTO",
"boost": 1
}
}
},
{
"fuzzy":{
"field2":{
"value":"bar",
"fuzziness":"AUTO",
"boost": 1
}
}
}
]
}
}
}
}
}
I always get ["foo1 foo2 foo3", "bar1 bar2 bar3"] as the top hit, even though the index contains an exact match (the first document):
{
"took": 114,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 3.9999998,
"hits": [
{
"_index": "test_index",
"_type": "_doc",
"_id": "bXw8eXUBCTtfNv84bNPr",
"_score": 3.9999998,
"_source": {
"field1": "foo1 foo2 foo3",
"field2": "bar1 bar2 bar3"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "bHw8eXUBCTtfNv84bNPr",
"_score": 2.6666665,
"_source": {
"field1": "foo1 foo2",
"field2": "bar1 bar2"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "a3w8eXUBCTtfNv84bNPr",
"_score": 2.0,
"_source": {
"field1": "foo",
"field2": "bar"
}
}
]
}
}
I'm aware that Boolean similarity works this way (every matching term adds to the score), and I know I could use rescoring here, but that is not an option because I don't know how many top-N results to fetch.
Are there any other options here? Perhaps I could create my own similarity plugin, based on Boolean similarity, that removes duplicates and keeps only the best-matching token, but I don't know where to start; I can only find samples for script and rescore.

Update: Based on the clarification provided in the comments on my earlier answer, I have updated the answer.
Below query returns the expected results
{
"min_score": 0.4,
"size":10,
"query": {
"function_score": {
"query": {
"bool": {
"should": [
{
"fuzzy": {
"field1": {
"value": "foo",
"fuzziness": "AUTO",
"boost": 0.5
}
}
},
{
"term": { --> used for boosting the exact terms
"field1": {
"value": "foo",
"boost": 1.5 --> further boosting the exact match.
}
}
}
]
}
}
}
}
}
And search results
"hits": [
{
"_index": "test_index",
"_type": "_doc",
"_id": "zdMEvHUBlo4-1mHbtvNH",
"_score": 2.0,
"_source": {
"field1": "foo",
"field2": "bar"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "z9MEvHUBlo4-1mHbtvNH",
"_score": 0.99999994,
"_source": {
"field1": "foo1 foo2 foo3",
"field2": "bar1 bar2 bar3"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "ztMEvHUBlo4-1mHbtvNH",
"_score": 0.6666666,
"_source": {
"field1": "foo1 foo2",
"field2": "bar1 bar2"
}
}
]
Another query without the explicit boost of the exact term also returns the expected results
{
"min_score": 0.4,
"query": {
"function_score": {
"query": {
"bool": {
"should": [
{
"fuzzy": {
"field1": {
"value": "foo",
"fuzziness": "AUTO",
"boost": 0.5
}
}
},
{
"term": {
"field1": {
"value": "foo" --> notice there is no boost
}
}
}
]
}
}
}
}
}
And search result
"hits": [
{
"_index": "test_index",
"_type": "_doc",
"_id": "zdMEvHUBlo4-1mHbtvNH",
"_score": 1.5,
"_source": {
"field1": "foo",
"field2": "bar"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "z9MEvHUBlo4-1mHbtvNH",
"_score": 0.99999994,
"_source": {
"field1": "foo1 foo2 foo3",
"field2": "bar1 bar2 bar3"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "ztMEvHUBlo4-1mHbtvNH",
"_score": 0.6666666,
"_source": {
"field1": "foo1 foo2",
"field2": "bar1 bar2"
}
}
]

Related

How to make flattened sub-field in the nested field in elastic search?

Here, I have an indexed document like:
doc = {
"id": 1,
"content": [
{
"txt": I,
"time": 0,
},
{
"txt": have,
"time": 1,
},
{
"txt": a book,
"time": 2,
},
{
"txt": do not match this block,
"time": 3,
},
]
}
And I want to match "I have a book", and return the matched time: 0,1,2. Is there anyone who knows how to build the index and the query for this situation?
I think the "content.txt" should be flattened but "content.time" should be nested?
want to match "I have a book", and return the matched time: 0,1,2.
Adding a working example with index mapping, search query, and search result
Index Mapping:
{
"mappings": {
"properties": {
"content": {
"type": "nested"
}
}
}
}
Search Query:
{
"query": {
"nested": {
"path": "content",
"query": {
"bool": {
"must": [
{
"match": {
"content.txt": "I have a book"
}
}
]
}
},
"inner_hits": {}
}
}
}
Search Result:
"inner_hits": {
"content": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 2.5226097,
"hits": [
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 2
},
"_score": 2.5226097,
"_source": {
"txt": "a book",
"time": 2
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 0
},
"_score": 1.5580825,
"_source": {
"txt": "I",
"time": 0
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 1
},
"_score": 1.5580825,
"_source": {
"txt": "have",
"time": 1
}
}
]
}
}
}
}

Elastic search query for name / value pair columns pull

We have one document in Elasticsearch with multiple sections of name/value pairs, and we want to fetch only the values, based on the value of the name column.
"envelopeData": {
"envelopeName": "Bills",
"details": {
"detail": [
{
"name": "UC_CORP",
"value": "76483"
},
{
"name": "UC_CYCLE",
"value": "V"
}
We expect only 76483 as the result, based on name being equal to UC_CORP.
If the field envelopeData.details.detail is nested type then you can perform a match query for the desired name on the nested path and can use inner_hits to get just the value.
Map the field envelopeData.details.detail as nested (if it is not already nested):
PUT stackoverflow
{
"mappings": {
"_doc": {
"properties": {
"envelopeData.details.detail": {
"type": "nested"
}
}
}
}
}
then you can perform the following query to get value using inner_hits:
GET stackoverflow/_search
{
"_source": "false",
"query": {
"nested": {
"path": "envelopeData.details.detail",
"query": {
"match": {
"envelopeData.details.detail.name.keyword": "UC_CORP"
}
},
"inner_hits": {
"_source": "envelopeData.details.detail.value"
}
}
}
}
which outputs:
{
"_index": "stackoverflow",
"_type": "_doc",
"_id": "W5GUW2gB3GnGVyg-Sf4T",
"_score": 0.6931472,
"_source": {},
"inner_hits": {
"envelopeData.details.detail": {
"hits": {
"total": 1,
"max_score": 0.6931472,
"hits": [
{
"_index": "stackoverflow",
"_type": "_doc",
"_id": "W5GUW2gB3GnGVyg-Sf4T",
"_nested": {
"field": "envelopeData.details.detail",
"offset": 0
},
"_score": 0.6931472,
"_source": {
"value": "76483" -> Outputs value only
}
}
]
}
}
}
}

Elasticsearch: Query the most recent that doesn't contain the field 'X'

I have the following search query:
{
"query": {
"match": {
"name": "testlib"
}
}
}
When I do this query I get the three results below. What I want to do now is only return one result: the newest #timestamp that doesn't contain version_pre. So in this case, only return AV6qvDXDyHw9vNh6Wlpl.
[
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDXDyHw9vNh6Wlpl",
"_score": 0.2876821,
"_source": {
"#timestamp": "2017-09-21T11:02:15-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 1
}
},
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDF5MtcMTuGknsVs",
"_score": 0.18232156,
"_source": {
"#timestamp": "2017-09-20T17:21:35-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 0
}
},
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDnVyHw9vNh6Wlpn",
"_score": 0.18232156,
"_source": {
"#timestamp": "2017-09-22T13:56:55-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 2,
"version_pre": 0
}
}
]
Use sort (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-sort.html) together with an exists query (https://www.elastic.co/guide/en/elasticsearch/reference/2.3/query-dsl-exists-query.html) inside a bool must_not clause, sorting newest first:
{
"size" : 1,
"sort" : [{ "#timestamp" : {"order" : "desc"}}],
"query" : {
"bool": {
"must_not": {
"exists": {
"field": "version_pre"
}
}
}
}
}
Or even, via query string:
/_search?sort=#timestamp:desc&size=1&q=_missing_:version_pre

Elasticsearch search for Turkish characters

I have some documents that I am indexing with Elasticsearch. However, some of the documents are written in upper case and their Turkish characters have been replaced with plain ASCII letters. For example, "kürşat" is written as "KURSAT".
I want to find this document by searching for "kürşat". How can I do that?
Thanks
Take a look at the asciifolding token filter.
Here is a small example for you to try out in Sense:
Index:
DELETE test
PUT test
{
"settings": {
"analysis": {
"filter": {
"my_ascii_folding": {
"type": "asciifolding",
"preserve_original": true
}
},
"analyzer": {
"turkish_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_ascii_folding"
]
}
}
}
},
"mappings": {
"test": {
"properties": {
"name": {
"type": "string",
"analyzer": "turkish_analyzer"
}
}
}
}
}
POST test/test/1
{
"name": "kürşat"
}
POST test/test/2
{
"name": "KURSAT"
}
Query:
GET test/_search
{
"query": {
"match": {
"name": "kursat"
}
}
}
Response:
"hits": {
"total": 2,
"max_score": 0.30685282,
"hits": [
{
"_index": "test",
"_type": "test",
"_id": "2",
"_score": 0.30685282,
"_source": {
"name": "KURSAT"
}
},
{
"_index": "test",
"_type": "test",
"_id": "1",
"_score": 0.30685282,
"_source": {
"name": "kürşat"
}
}
]
}
Query:
GET test/_search
{
"query": {
"match": {
"name": "kürşat"
}
}
}
Response:
"hits": {
"total": 2,
"max_score": 0.4339554,
"hits": [
{
"_index": "test",
"_type": "test",
"_id": "1",
"_score": 0.4339554,
"_source": {
"name": "kürşat"
}
},
{
"_index": "test",
"_type": "test",
"_id": "2",
"_score": 0.09001608,
"_source": {
"name": "KURSAT"
}
}
]
}
Now the 'preserve_original' flag will make sure that if a user types: 'kürşat', documents with that exact match will be ranked higher than documents that have 'kursat' (Notice the difference in scores for both query responses).
If you want the scores to be equal, you can set the flag to false.
Hope I got your problem right!

elasticsearch, documents with data in field1 OR field2

How do I instruct elasticsearch to return all documents which have data in one of the following fields: ['field1','field2']?
I have tried:
{
'query': {
'bool':{
'must':[
'multi_match':{
'fields':['field1','field2'],
'operator':'AND',
'tie_breaker':1.0,
'query': '*',
'type':'cross_fields'
}
]
}
}
}
I also tried:
{
"query":{
"wildcard":
{
"field1":"*"
}
}
}
which works, but:
{
"query":{
"wildcard":
{
"field*":"*"
}
}
}
does not
You can do it with two exists filters combined in the should clause of a bool filter.
As an example, I set up a simple index and gave it some data:
PUT /test_index
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"field1":"foo","field2":"bar"}
{"index":{"_id":2}}
{"field2":"foo","field3":"bar"}
{"index":{"_id":3}}
{"field3":"foo","field4":"bar"}
{"index":{"_id":4}}
{"field4":"foo","field5":"bar"}
If I want to find all documents that have "field1" or "field3", I can do this:
POST /test_index/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"bool": {
"should": [
{ "exists": { "field": "field1" } },
{ "exists": { "field": "field3" } }
]
}
}
}
}
}
It returns what I expect:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"field1": "foo",
"field2": "bar"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 1,
"_source": {
"field2": "foo",
"field3": "bar"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"field3": "foo",
"field4": "bar"
}
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/991b828de250e5125fd372bf7e6b066acec55fcd

Resources