Get every Nth result in Elasticsearch

I have this large set of data and I want a sample that I can use in a graph. For this I don't need all of the data, I need every Nth item.
For instance if I have 4000 results, and I only need 800 results, I want to be able to get every 5th result.
So something like: get, skip, skip, skip, skip, get, skip, skip, skip, ...
I was wondering if such a thing is possible in Elasticsearch?

You're better off using a scripted filter; otherwise you're needlessly computing a score. Filters work just like queries, but they skip scoring.
POST /test_index/_search
{
  "query": {
    "filtered": {
      "filter": {
        "script": {
          "script": "doc['unique_counter'].value % n == 0",
          "params": {
            "n": 5
          }
        }
      }
    }
  }
}
You're also better off not using dynamic (inline) scripting in real-world usage.
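On that era of Elasticsearch, one commonly used alternative was to ship the script as a file under config/scripts on each node and reference it by name. A minimal sketch (the file name every_nth is made up; it would contain the same modulo expression as above):
POST /test_index/_search
{
  "query": {
    "filtered": {
      "filter": {
        "script": {
          "script_file": "every_nth",
          "params": {
            "n": 5
          }
        }
      }
    }
  }
}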
That said, you probably want to take a look at aggregations for graphing analytical information about your data rather than taking an arbitrary sample.
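For instance, if the goal is a graph of some metric over the whole data set, a bucketing aggregation along these lines avoids sampling entirely (a sketch: unique_counter is the counter field from the filter above, and some_metric is a hypothetical numeric field you'd be plotting):
POST /test_index/_search
{
  "size": 0,
  "aggs": {
    "counter_buckets": {
      "histogram": {
        "field": "unique_counter",
        "interval": 100
      },
      "aggs": {
        "avg_metric": {
          "avg": {
            "field": "some_metric"
          }
        }
      }
    }
  }
}
Each bucket then contributes one point to the graph instead of one point per document.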

One way you could do it is with random scoring. It won't give you precisely every nth item according to a rigid ordering, but if you can relax that requirement this trick should do nicely.
To test it I set up a simple index (I mapped "doc_id" to "_id" just so the documents would have some contents; that part isn't required, in case that's not obvious):
PUT /test_index
{
  "mappings": {
    "doc": {
      "_id": {
        "path": "doc_id"
      }
    }
  }
}
Then I indexed ten simple documents:
POST /test_index/doc/_bulk
{"index":{}}
{"doc_id":1}
{"index":{}}
{"doc_id":2}
{"index":{}}
{"doc_id":3}
{"index":{}}
{"doc_id":4}
{"index":{}}
{"doc_id":5}
{"index":{}}
{"doc_id":6}
{"index":{}}
{"doc_id":7}
{"index":{}}
{"doc_id":8}
{"index":{}}
{"doc_id":9}
{"index":{}}
{"doc_id":10}
Now I can pull back three random documents like this:
POST /test_index/_search
{
  "size": 3,
  "query": {
    "function_score": {
      "functions": [
        {
          "random_score": {
            "seed": "some seed"
          }
        }
      ]
    }
  }
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 0.93746644,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 0.93746644,
"_source": {
"doc_id": 1
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "10",
"_score": 0.926947,
"_source": {
"doc_id": 10
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "5",
"_score": 0.79400194,
"_source": {
"doc_id": 5
}
}
]
}
}
Or a different random three like this:
POST /test_index/_search
{
  "size": 3,
  "query": {
    "function_score": {
      "functions": [
        {
          "random_score": {
            "seed": "some other seed"
          }
        }
      ]
    }
  }
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 0.817295,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": 0.817295,
"_source": {
"doc_id": 4
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "8",
"_score": 0.469319,
"_source": {
"doc_id": 8
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 0.4374538,
"_source": {
"doc_id": 3
}
}
]
}
}
Hopefully it's clear how to generalize this method to what you need: just pull back however many documents you want, in whatever chunk sizes keep it performant.
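For the numbers in the question, a single request along these lines (same setup as above, arbitrary seed, not included in the gist below) would pull a random 800 of the 4000 documents:
POST /test_index/_search
{
  "size": 800,
  "query": {
    "function_score": {
      "functions": [
        {
          "random_score": {
            "seed": "graph sample"
          }
        }
      ]
    }
  }
}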
Here is all the code I used to test:
http://sense.qbox.io/gist/a02d4da458365915f5e9cf6ea80546d2dfabc75d
EDIT: Actually now that I think about it, you could also use scripted scoring to get precisely every nth item, if you set it up right. Maybe something like,
POST /test_index/_search
{
  "size": 3,
  "query": {
    "function_score": {
      "functions": [
        {
          "script_score": {
            "script": "if(doc['doc_id'].value % 3 == 0){ return 1 }; return 0;"
          }
        }
      ]
    }
  }
}
...
{
"took": 13,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"doc_id": 3
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "6",
"_score": 1,
"_source": {
"doc_id": 6
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "9",
"_score": 1,
"_source": {
"doc_id": 9
}
}
]
}
}

Related

Boost score of term query result with multiple matches

I have several documents that look like the following stored in my Elasticsearch index:
PUT tests
{
  "mappings": {
    "_doc": {
      "dynamic": false,
      "properties": {
        "objects": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        },
        "text": {
          "type": "text"
        }
      }
    }
  }
}
PUT tests/_doc/1
{
"text": "lel",
"objects": ["A"]
}
PUT tests/_doc/2
{
"text": "lol",
"objects": ["B"]
}
PUT tests/_doc/3
{
"text": "lil",
"objects": ["C"]
}
PUT tests/_doc/4
{
"text": "lul",
"objects": ["A", "B", "C"]
}
I want to query for objects with the following query:
GET _search
{
  "query": {
    "terms": {
      "objects.keyword": ["A", "B", "C"]
    }
  }
}
The result includes all three sample objects I provided.
My question is simply whether I can boost a document that has a full match (all keywords present in its objects array) so it ranks above documents with only a partial match, and if so, how. I could not find anything about this in the Elasticsearch documentation.
This is the result I am currently receiving:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 11,
"successful": 11,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "tests",
"_type": "_doc",
"_id": "2",
"_score": 1,
"_source": {
"text": "lol",
"objects": [
"B"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "4",
"_score": 1,
"_source": {
"text": "lul",
"objects": [
"A",
"B",
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "1",
"_score": 1,
"_source": {
"text": "lel",
"objects": [
"A"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "3",
"_score": 1,
"_source": {
"text": "lil",
"objects": [
"C"
]
}
}
]
}
}
I think your best bet is using a bool query with should and minimum_should_match: 1.
GET _search
{
  "query": {
    "bool": {
      "should": [
        {
          "term": {
            "objects.keyword": "A"
          }
        },
        {
          "term": {
            "objects.keyword": "B"
          }
        },
        {
          "term": {
            "objects.keyword": "C"
          }
        }
      ],
      "minimum_should_match": 1
    }
  }
}
Results:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 6,
"successful": 6,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1.5686159,
"hits": [
{
"_index": "tests",
"_type": "_doc",
"_id": "4",
"_score": 1.5686159,
"_source": {
"text": "lul",
"objects": [
"A",
"B",
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "1",
"_score": 0.2876821,
"_source": {
"text": "lel",
"objects": [
"A"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "3",
"_score": 0.2876821,
"_source": {
"text": "lil",
"objects": [
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "2",
"_score": 0.18232156,
"_source": {
"text": "lol",
"objects": [
"B"
]
}
}
]
}
}
EDIT: Here's why, as explained by the docs (https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html):
The bool query takes a more-matches-is-better approach, so the score from each matching must or should clause will be added together to provide the final _score for each document.
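If you want full matches pushed even further ahead of partial ones, one variation (my sketch, not part of the original answer) is to add an extra should clause that only fires when all three terms are present and give it a boost:
GET _search
{
  "query": {
    "bool": {
      "should": [
        { "term": { "objects.keyword": "A" } },
        { "term": { "objects.keyword": "B" } },
        { "term": { "objects.keyword": "C" } },
        {
          "bool": {
            "must": [
              { "term": { "objects.keyword": "A" } },
              { "term": { "objects.keyword": "B" } },
              { "term": { "objects.keyword": "C" } }
            ],
            "boost": 2
          }
        }
      ],
      "minimum_should_match": 1
    }
  }
}
Documents matching only one or two of the terms still match, but the document matching all three gets the extra boosted contribution on top of its per-term scores.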

Aggregation in Elasticsearch by field value data

I have the set of data below and I want an aggregation by status. I'm not sure how to compare the value of status with "Rejected" or "Success" and get a count for each.
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2874,
"max_score": 1,
"hits": [
{
"_index": "testfiles",
"_type": "testfiles",
"_id": "testfile.one",
"_score": 1,
"_source": {
"businessDate": 20171013,
"status": "Success"
}
},
{
"_index": "testfiles",
"_type": "testfiles",
"_id": "testfile.two",
"_score": 1,
"_source": {
"businessDate": 20171013,
"status": "Success"
}
},
{
"_index": "testfiles",
"_type": "testfiles",
"_id": "testfile.three",
"_score": 1,
"_source": {
"businessDate": 20171013,
"status": "Rejected"
}
},
{
"_index": "testfiles",
"_type": "testfiles",
"_id": "testfile.four",
"_score": 1,
"_source": {
"businessDate": 20171013,
"status": "Rejected"
}
}
]
}
}
Can someone help with how to achieve this using Elasticsearch aggregations?
The expected response would be something like:
"aggregations": {
"success_records": 2,
"rejected_records": 2
}
Assuming the status field is of type text, you'll need to update it to a multi-field with a keyword sub-field, which aggregations require (a mapping sketch follows the query below). Then query using:
GET my_index/_search
{
  "size": 0,
  "aggs": {
    "statuses": {
      "terms": {
        "field": "status.raw"
      }
    }
  }
}
If you already have status as a keyword field, then change status.raw to status in the above query.
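For reference, the multi-field mapping update might look like this (a sketch: the index and type names are taken from the response above, and the raw sub-field name is an assumption):
PUT testfiles/_mapping/testfiles
{
  "properties": {
    "status": {
      "type": "text",
      "fields": {
        "raw": {
          "type": "keyword"
        }
      }
    }
  }
}
Existing documents would then need to be reindexed (or updated in place, e.g. with _update_by_query) before the new sub-field is populated.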

Elasticsearch exact matching of a string that includes dots

I have the following indexed documents:
curl -XGET "http://127.0.0.1:8200/logstash-test/1/_search"
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "logstash-test",
"_type": "1",
"_id": "AVthzksHqNe69jLmmCEp",
"_score": 1,
"_source": {
"foo": "bar2"
}
},
{
"_index": "logstash-test",
"_type": "1",
"_id": "AVthzlbfqNe69jLmmCSr",
"_score": 1,
"_source": {
"foo": "bar3"
}
},
{
"_index": "logstash-test",
"_type": "1",
"_id": "AVthwg4_qNe69jLmlStd",
"_score": 1,
"_source": {
"foo": "bar"
}
},
{
"_index": "logstash-test",
"_type": "1",
"_id": "AVth0IS1qNe69jLmmMpZ",
"_score": 1,
"_source": {
"foo": "bar4.foo_bar.foo"
}
}
]
}
}
I want to search for foo=bar2 or foo=bar3 or foo=bar4.foo_bar.foo:
curl -XPOST "http://127.0.0.1:8200/logstash-test/1/_search" -d
'{"query":{"bool":{"filter":[{"terms":{"foo":["bar3","bar2","bar4.foo_bar.foo"]}}]}}}'
But bar4.foo_bar.foo does not match.
Thank you.
Since you are searching on exact terms, use the keyword sub-field available on the foo field, as shown below:
curl -XPOST "http://127.0.0.1:8200/logstash-test/1/_search" -d
'{
  "query": {
    "bool": {
      "filter": [
        {
          "terms": {
            "foo.keyword": [
              "bar3",
              "bar2",
              "bar4.foo_bar.foo"
            ]
          }
        }
      ]
    }
  }
}'
You can read more about multi-fields in the Elasticsearch documentation.
Method #2
Alternatively, you can use a different analyzer (e.g. the whitespace analyzer) for the foo field when defining its mapping:
PUT logstash-test
{
  "mappings": {
    "1": {
      "properties": {
        "foo": {
          "type": "text",
          "analyzer": "whitespace"
        }
      }
    }
  }
}
But as you are searching on exact terms, method #1 is preferred over method #2.

ElasticSearch Order By String Length

I am using ElasticSearch via NEST c#. I have a large list of information about people:
{
firstName: 'Frank',
lastName: 'Jones',
City: 'New York'
}
I'd like to be able to filter and sort this list by lastName, and also order by its length, so that people with 5-character last names appear at the beginning of the result set before people with 10-character ones.
So, in pseudo-code, I'd like to do something like:
list.wildcard("j*").sort(m => lastName.length)
You can do this with script-based sorting.
As a toy example, I set up a trivial index with a few documents:
PUT /test_index
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"name":"Bob"}
{"index":{"_id":2}}
{"name":"Jeff"}
{"index":{"_id":3}}
{"name":"Darlene"}
{"index":{"_id":4}}
{"name":"Jose"}
Then I can order search results like this:
POST /test_index/_search
{
  "query": {
    "match_all": {}
  },
  "sort": {
    "_script": {
      "script": "doc['name'].value.length()",
      "type": "number",
      "order": "asc"
    }
  }
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": null,
"_source": {
"name": "Bob"
},
"sort": [
3
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
To filter by length, I can use a script filter in a similar way:
POST /test_index/_search
{
  "query": {
    "filtered": {
      "query": {
        "match_all": {}
      },
      "filter": {
        "script": {
          "script": "doc['name'].value.length() > 3",
          "params": {}
        }
      }
    }
  },
  "sort": {
    "_script": {
      "script": "doc['name'].value.length()",
      "type": "number",
      "order": "asc"
    }
  }
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/22fef6dc5453eaaae3be5fb7609663cc77c43dab
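To get closer to the pseudo-code in the question (a wildcard on the name plus length ordering), a combined request might look like this. This is my sketch rather than part of the tested gist above; note the name field is analyzed with the standard analyzer, so its terms are lowercased and the wildcard pattern should be lowercase too:
POST /test_index/_search
{
  "query": {
    "wildcard": {
      "name": "j*"
    }
  },
  "sort": {
    "_script": {
      "script": "doc['name'].value.length()",
      "type": "number",
      "order": "asc"
    }
  }
}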
P.S.: If any of the last names contain spaces, you might want to use "index": "not_analyzed" on that field.
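For reference, a sketch of that mapping for the toy index above (1.x-era string field syntax; the field name is taken from the example):
PUT /test_index
{
  "mappings": {
    "doc": {
      "properties": {
        "name": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}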

Elasticsearch update id of each document to a value of another field in the document

In elasticsearch how can I replace the id of every document with the value of another field in the document?
I don't think you can change the ids of existing documents in the index, but you can reindex them using the path parameter in your mapping. Here is a trivial example.
I set up a simple index, using the path parameter in the _id definition in the mapping, and added a few docs:
PUT /test_index
{
  "mappings": {
    "doc": {
      "_id": {
        "path": "number"
      },
      "properties": {
        "text_field": {
          "type": "string"
        },
        "number": {
          "type": "integer"
        }
      }
    }
  }
}
POST /test_index/doc/_bulk
{"index":{}}
{"text_field": "Apple TV","number":3}
{"index":{}}
{"text_field": "Apple iPhone","number":2}
{"index":{}}
{"text_field": "Apple MacBook","number":1}
Then if I search, I can see that the ids were set as I asked:
POST /test_index/_search
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"text_field": "Apple MacBook",
"number": 1
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 1,
"_source": {
"text_field": "Apple iPhone",
"number": 2
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"text_field": "Apple TV",
"number": 3
}
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/933bd839b2d524889e483f50c59c37ffaab2270a
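Note that the path parameter on _id was removed in Elasticsearch 2.0, so this mapping trick no longer works on recent versions. One alternative on a modern cluster (my sketch, not from the original answer; the destination index name is made up) is to copy the data into a new index with the _reindex API and set the id from the field in a script:
POST _reindex
{
  "source": {
    "index": "test_index"
  },
  "dest": {
    "index": "test_index_v2"
  },
  "script": {
    "source": "ctx._id = ctx._source.number.toString()"
  }
}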
You can do this by defining a mapping for _id as described in the id-mapping documentation.
