Elasticsearch: documents with data in field1 OR field2

How do I instruct Elasticsearch to return all documents that have data in at least one of the following fields: ['field1', 'field2']?
I have tried:
{
  "query": {
    "bool": {
      "must": [
        {
          "multi_match": {
            "fields": ["field1", "field2"],
            "operator": "AND",
            "tie_breaker": 1.0,
            "query": "*",
            "type": "cross_fields"
          }
        }
      ]
    }
  }
}
I also tried:
{
  "query": {
    "wildcard": {
      "field1": "*"
    }
  }
}
which works, but:
{
  "query": {
    "wildcard": {
      "field*": "*"
    }
  }
}
does not

You can do it with two exists filters inside a bool filter.
As an example, I set up a simple index and gave it some data:
PUT /test_index
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"field1":"foo","field2":"bar"}
{"index":{"_id":2}}
{"field2":"foo","field3":"bar"}
{"index":{"_id":3}}
{"field3":"foo","field4":"bar"}
{"index":{"_id":4}}
{"field4":"foo","field5":"bar"}
If I want to find all documents that have "field1" or "field3", I can do this:
POST /test_index/_search
{
  "query": {
    "filtered": {
      "query": {
        "match_all": {}
      },
      "filter": {
        "bool": {
          "should": [
            { "exists": { "field": "field1" } },
            { "exists": { "field": "field3" } }
          ]
        }
      }
    }
  }
}
It returns what I expect:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"field1": "foo",
"field2": "bar"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 1,
"_source": {
"field2": "foo",
"field3": "bar"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"field3": "foo",
"field4": "bar"
}
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/991b828de250e5125fd372bf7e6b066acec55fcd
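Note that the filtered query used above was removed in Elasticsearch 5.x. On current versions the same idea can be expressed as a bool query with the two exists queries in a filter context; a minimal sketch against the same test_index:
POST /test_index/_search
{
  "query": {
    "bool": {
      "filter": {
        "bool": {
          "should": [
            { "exists": { "field": "field1" } },
            { "exists": { "field": "field3" } }
          ],
          "minimum_should_match": 1
        }
      }
    }
  }
}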

Related

How to make a flattened sub-field in a nested field in Elasticsearch?

Here, I have an indexed document like:
doc = {
  "id": 1,
  "content": [
    { "txt": "I", "time": 0 },
    { "txt": "have", "time": 1 },
    { "txt": "a book", "time": 2 },
    { "txt": "do not match this block", "time": 3 }
  ]
}
And I want to match "I have a book" and return the matched times: 0, 1, 2. Does anyone know how to build the index and the query for this situation?
I think the "content.txt" should be flattened but "content.time" should be nested?
To match "I have a book" and return the matched times 0, 1, 2, here is a working example with index mapping, search query, and search result.
Index Mapping:
{
  "mappings": {
    "properties": {
      "content": {
        "type": "nested"
      }
    }
  }
}
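After creating the index with the mapping above, the sample document from the question can be indexed as follows (the index name 64752029 and the _doc endpoint are simply taken from the _index and _type values shown in the search result below):
PUT /64752029/_doc/1
{
  "id": 1,
  "content": [
    { "txt": "I", "time": 0 },
    { "txt": "have", "time": 1 },
    { "txt": "a book", "time": 2 },
    { "txt": "do not match this block", "time": 3 }
  ]
}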
Search Query:
{
  "query": {
    "nested": {
      "path": "content",
      "query": {
        "bool": {
          "must": [
            {
              "match": {
                "content.txt": "I have a book"
              }
            }
          ]
        }
      },
      "inner_hits": {}
    }
  }
}
Search Result:
"inner_hits": {
"content": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 2.5226097,
"hits": [
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 2
},
"_score": 2.5226097,
"_source": {
"txt": "a book",
"time": 2
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 0
},
"_score": 1.5580825,
"_source": {
"txt": "I",
"time": 0
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 1
},
"_score": 1.5580825,
"_source": {
"txt": "have",
"time": 1
}
}
]
}
}
}
}

Boolean similarity - is there a way to remove duplicates?

Given the following index
PUT /test_index
{
  "mappings": {
    "properties": {
      "field1": {
        "type": "text",
        "analyzer": "whitespace",
        "similarity": "boolean"
      },
      "field2": {
        "type": "text",
        "analyzer": "whitespace",
        "similarity": "boolean"
      }
    }
  }
}
and the following data
POST /test_index/_bulk?refresh=true
{ "index" : {} }
{ "field1": "foo", "field2": "bar"}
{ "index" : {} }
{ "field1": "foo1 foo2", "field2": "bar1 bar2"}
{ "index" : {} }
{ "field1": "foo1 foo2 foo3", "field2": "bar1 bar2 bar3"}
for the given Boolean similarity query
POST /test_index/_search
{
  "size": 10,
  "min_score": 0.4,
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "should": [
            {
              "fuzzy": {
                "field1": {
                  "value": "foo",
                  "fuzziness": "AUTO",
                  "boost": 1
                }
              }
            },
            {
              "fuzzy": {
                "field2": {
                  "value": "bar",
                  "fuzziness": "AUTO",
                  "boost": 1
                }
              }
            }
          ]
        }
      }
    }
  }
}
I always receive the document with ["foo1 foo2 foo3", "bar1 bar2 bar3"] as the top hit, despite the fact that there is an exact match in the index (the first document):
{
"took": 114,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 3.9999998,
"hits": [
{
"_index": "test_index",
"_type": "_doc",
"_id": "bXw8eXUBCTtfNv84bNPr",
"_score": 3.9999998,
"_source": {
"field1": "foo1 foo2 foo3",
"field2": "bar1 bar2 bar3"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "bHw8eXUBCTtfNv84bNPr",
"_score": 2.6666665,
"_source": {
"field1": "foo1 foo2",
"field2": "bar1 bar2"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "a3w8eXUBCTtfNv84bNPr",
"_score": 2.0,
"_source": {
"field1": "foo",
"field2": "bar"
}
}
]
}
}
I'm aware that Boolean similarity works this way, rewarding documents that match as many terms as possible, and I know I could use rescoring here, but that is not an option since I don't know how many top N results to fetch.
Are there any other options? Maybe creating my own similarity plugin based on Boolean similarity that removes duplicates and keeps only the best-matched token, but I don't know where to start; I only see samples for script and rescore similarities.
Update: based on the clarification provided in the comments on my earlier answer, I'm updating the answer.
The query below returns the expected results:
{
  "min_score": 0.4,
  "size": 10,
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "should": [
            {
              "fuzzy": {
                "field1": {
                  "value": "foo",
                  "fuzziness": "AUTO",
                  "boost": 0.5
                }
              }
            },
            {
              "term": {
                "field1": {
                  "value": "foo",
                  "boost": 1.5
                }
              }
            }
          ]
        }
      }
    }
  }
}
The term clause is there to boost exact terms, and its boost of 1.5 pushes the exact match further up.
And search results
"hits": [
{
"_index": "test_index",
"_type": "_doc",
"_id": "zdMEvHUBlo4-1mHbtvNH",
"_score": 2.0,
"_source": {
"field1": "foo",
"field2": "bar"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "z9MEvHUBlo4-1mHbtvNH",
"_score": 0.99999994,
"_source": {
"field1": "foo1 foo2 foo3",
"field2": "bar1 bar2 bar3"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "ztMEvHUBlo4-1mHbtvNH",
"_score": 0.6666666,
"_source": {
"field1": "foo1 foo2",
"field2": "bar1 bar2"
}
}
]
Another query, without the explicit boost on the exact term, also returns the expected results:
{
  "min_score": 0.4,
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "should": [
            {
              "fuzzy": {
                "field1": {
                  "value": "foo",
                  "fuzziness": "AUTO",
                  "boost": 0.5
                }
              }
            },
            {
              "term": {
                "field1": {
                  "value": "foo"
                }
              }
            }
          ]
        }
      }
    }
  }
}
Notice there is no explicit boost on the term clause this time.
And search result
"hits": [
{
"_index": "test_index",
"_type": "_doc",
"_id": "zdMEvHUBlo4-1mHbtvNH",
"_score": 1.5,
"_source": {
"field1": "foo",
"field2": "bar"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "z9MEvHUBlo4-1mHbtvNH",
"_score": 0.99999994,
"_source": {
"field1": "foo1 foo2 foo3",
"field2": "bar1 bar2 bar3"
}
},
{
"_index": "test_index",
"_type": "_doc",
"_id": "ztMEvHUBlo4-1mHbtvNH",
"_score": 0.6666666,
"_source": {
"field1": "foo1 foo2",
"field2": "bar1 bar2"
}
}
]
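The original query searched both field1 and field2, so if both fields should contribute, the same fuzzy-plus-term pattern can simply be repeated for field2. A sketch along the same lines (not verified against the sample index):
{
  "min_score": 0.4,
  "size": 10,
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "should": [
            { "fuzzy": { "field1": { "value": "foo", "fuzziness": "AUTO", "boost": 0.5 } } },
            { "term": { "field1": { "value": "foo" } } },
            { "fuzzy": { "field2": { "value": "bar", "fuzziness": "AUTO", "boost": 0.5 } } },
            { "term": { "field2": { "value": "bar" } } }
          ]
        }
      }
    }
  }
}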

Sorting on an aggregate of values in a given field in Elasticsearch

I have the following field in my index
field1:{key:value}
Is it possible to sort my query results on the sum of the values in field1?
Thanks
Here's one way you could do this, assuming you know the fields ahead of time. It should be possible with some minor refinements if you need to wildcard the fields. This assumes the sibling fields on the nested type are numeric.
Example mapping:
"test": {
"mappings": {
"type1": {
"properties": {
"field1": {
"properties": {
"key1": {
"type": "integer"
},
"key2": {
"type": "integer"
}
}
}
}
}
}
}
Default results:
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "type1",
"_id": "AV8O7956gIcGI2d5A_5g",
"_score": 1,
"_source": {
"field1": {
"key1": 11,
"key2": 17
}
}
},
{
"_index": "test",
"_type": "type1",
"_id": "AV8O78FqgIcGI2d5A_5f",
"_score": 1,
"_source": {
"field1": {
"key1": 5,
"key2": 6
}
}
}
]
}
Query with script:
GET /test/_search
{
  "query": {
    "function_score": {
      "query": {
        "match_all": {}
      },
      "functions": [
        {
          "script_score": {
            "script": "return (doc['field1.key1'].value + doc['field1.key2'].value) * -1"
          }
        }
      ]
    }
  }
}
This logic treats the lowest sum as the best score: since Elasticsearch sorts by score descending, multiplying by -1 puts the document with the smallest sum first (the least negative score in this case):
{
"took": 18,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": -11,
"hits": [
{
"_index": "test",
"_type": "type1",
"_id": "AV8O78FqgIcGI2d5A_5f",
"_score": -11,
"_source": {
"field1": {
"key1": 5,
"key2": 6
}
}
},
{
"_index": "test",
"_type": "type1",
"_id": "AV8O7956gIcGI2d5A_5g",
"_score": -28,
"_source": {
"field1": {
"key1": 11,
"key2": 17
}
}
}
]
}
}
Hopefully this gives you the gist of whatever specific scoring logic you need.
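On more recent Elasticsearch versions, a script-based sort is an alternative that leaves _score untouched instead of encoding the sum into a negative score. A minimal sketch, assuming the same field1.key1/field1.key2 mapping and Painless as the script language (flip "order" to "desc" for the opposite direction):
GET /test/_search
{
  "query": {
    "match_all": {}
  },
  "sort": {
    "_script": {
      "type": "number",
      "script": {
        "lang": "painless",
        "source": "doc['field1.key1'].value + doc['field1.key2'].value"
      },
      "order": "asc"
    }
  }
}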

Elasticsearch search for Turkish characters

I have some documents that I am indexing with Elasticsearch, but some of them are written in upper case with the Turkish characters replaced. For example, "kürşat" is written as "KURSAT".
I want to find such a document by searching for "kürşat". How can I do that?
Thanks
Take a look at the asciifolding token filter.
Here is a small example for you to try out in Sense:
Index:
DELETE test
PUT test
{
  "settings": {
    "analysis": {
      "filter": {
        "my_ascii_folding": {
          "type": "asciifolding",
          "preserve_original": true
        }
      },
      "analyzer": {
        "turkish_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "my_ascii_folding"
          ]
        }
      }
    }
  },
  "mappings": {
    "test": {
      "properties": {
        "name": {
          "type": "string",
          "analyzer": "turkish_analyzer"
        }
      }
    }
  }
}
POST test/test/1
{
  "name": "kürşat"
}
POST test/test/2
{
  "name": "KURSAT"
}
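Before querying, you can sanity-check what the analyzer produces with the _analyze API (on recent versions the parameters go in the request body as shown below; older versions take the same parameters on the URL instead). With preserve_original enabled it should emit both kursat and kürşat as tokens:
GET test/_analyze
{
  "analyzer": "turkish_analyzer",
  "text": "kürşat"
}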
Query:
GET test/_search
{
  "query": {
    "match": {
      "name": "kursat"
    }
  }
}
Response:
"hits": {
"total": 2,
"max_score": 0.30685282,
"hits": [
{
"_index": "test",
"_type": "test",
"_id": "2",
"_score": 0.30685282,
"_source": {
"name": "KURSAT"
}
},
{
"_index": "test",
"_type": "test",
"_id": "1",
"_score": 0.30685282,
"_source": {
"name": "kürşat"
}
}
]
}
Query:
GET test/_search
{
  "query": {
    "match": {
      "name": "kürşat"
    }
  }
}
Response:
"hits": {
"total": 2,
"max_score": 0.4339554,
"hits": [
{
"_index": "test",
"_type": "test",
"_id": "1",
"_score": 0.4339554,
"_source": {
"name": "kürşat"
}
},
{
"_index": "test",
"_type": "test",
"_id": "2",
"_score": 0.09001608,
"_source": {
"name": "KURSAT"
}
}
]
}
Now the 'preserve_original' flag makes sure that if a user types 'kürşat', documents with that exact form are ranked higher than documents that only contain 'kursat' (notice the difference in scores between the two query responses).
If you want the scores to be equal, you can set the flag to false.
Hope I got your problem right!

Elasticsearch aggregation with custom query parser

I cannot seem to aggregate my query results when using my custom query parser. I get a result set, but the results are not aggregated. When using a standard query such as match, everything turns out fine.
What works:
GET pages/_search
{
  "query": {
    "match": {
      "text": "binomial"
    }
  },
  "aggs": {
    "docs": {
      "terms": {
        "field": "rooturl"
      }
    }
  }
}
returns a nice aggregated result:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 11.11176,
"hits": [
...
{
"_index": "pages",
"_type": "doc",
"_id": "AVcq6z6lzDazctHi91RE",
"_score": 3.3503218,
"_source": {
"rooturl": "document",
"type": "equation",
"url": "document:poly",
"text": "coefficient"
}
},
{
"_index": "pages",
"_type": "doc",
"_id": "AVcq6z6xzDazctHi91RF",
"_score": 3.3503218,
"_source": {
"rooturl": document",
"type": "equation",
"url": "document:poly",
"text": "dot"
}
}
...
]
},
"aggregations": {
"docs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "document",
"doc_count": 10
}
]
}
}
}
But when using my custom query parser, the result is not aggregated.
Query:
GET pages/_search
{
  "query": {
    "my_custom_query_parser": {
      "query": "binomial"
    }
  },
  "aggs": {
    "docs": {
      "terms": {
        "field": "rooturl"
      }
    }
  }
}
Can anyone point me in the right direction?
