Elasticsearch query to process log data

I have an event log of an e-commerce website in Elasticsearch.
Each event is stored as a document in ES:
{
  "_index": "event_log",
  "_type": "log_type",
  "_id": "3ud-kmoBazYRVz7KCgIy",
  "_score": 1,
  "_source": {
    "user_id": 123,
    "event": "click",
    "category": "abc",
    "product_id": 1112
  }
},
{
  "_index": "event_log",
  "_type": "log_type",
  "_id": "4Od-kmoBazYRVz7KCgLr",
  "_score": 1,
  "_source": {
    "user_id": 123,
    "event": "click",
    "category": "abc",
    "product_id": 1118
  }
},
{
  "_index": "event_log",
  "_type": "log_type",
  "_id": "4ud-kmoBazYRVz7KkwL2",
  "_score": 1,
  "_source": {
    "user_id": 123,
    "event": "cart",
    "category": "xyz",
    "product_id": 1
  }
},
{
  "_index": "event_log",
  "_type": "log_type",
  "_id": "2ud-kmoBazYRVz7KCALB",
  "_score": 1,
  "_source": {
    "user_id": 123,
    "event": "cart",
    "category": "xyz",
    "product_id": 11
  }
}
I want a list of all the product_ids grouped by event, category, and user.
Expected output:
{
  "click": {
    "abc": {
      "123": {
        "product_id": [1112, 1118]
      }
    }
  },
  "cart": {
    "xyz": {
      "123": {
        "product_id": [1, 11]
      }
    }
  }
}
I will have millions of records in the index. Querying all the records and processing them client-side is time-consuming. Is there a way to produce this output with a single query? I'm sure it is not possible to generate it in exactly this format; something close to it would be very useful.

Here is my suggestion (first try): nest one terms aggregation per grouping level (event → category → user → product_id). The grouping fields must be aggregatable, hence the .keyword sub-fields on the text fields:
GET event_log/_search
{
  "size": 0,
  "aggs": {
    "event": {
      "terms": {
        "field": "event.keyword"
      },
      "aggs": {
        "category": {
          "terms": {
            "field": "category.keyword"
          },
          "aggs": {
            "user": {
              "terms": {
                "field": "user_id"
              },
              "aggs": {
                "product_id": {
                  "terms": {
                    "field": "product_id"
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
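A caveat for millions of records: each terms level returns only its top buckets (10 by default), so deep combinations can be silently cut off. If you need every event/category/user combination, a composite aggregation pages through all of them deterministically. A minimal sketch, assuming the same .keyword sub-fields exist:
GET event_log/_search
{
  "size": 0,
  "aggs": {
    "groups": {
      "composite": {
        "size": 1000,
        "sources": [
          { "event": { "terms": { "field": "event.keyword" } } },
          { "category": { "terms": { "field": "category.keyword" } } },
          { "user": { "terms": { "field": "user_id" } } }
        ]
      },
      "aggs": {
        "product_ids": {
          "terms": { "field": "product_id" }
        }
      }
    }
  }
}
Each response includes an after_key; feed it back via the composite aggregation's after parameter to fetch the next page of buckets.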

Related

How to filter an aggregation based on the values of top hits?

The mapping of the Elasticsearch documents is as follows:
{
  "mappings": {
    "properties": {
      "vote_id": {
        "type": "keyword"
      },
      "user_id": {
        "type": "text"
      },
      "song_id": {
        "type": "text"
      },
      "type": {
        "type": "byte"
      },
      "timestamp": {
        "type": "date"
      }
    }
  }
}
I want to aggregate these votes such that it returns songs that you AND your friends like.
So far, I have buckets of songs that you and your friends like, but some buckets may be songs that only your friends like.
{
  "query": {
    "bool": {
      "must": {
        "terms": {
          "user_id": ["you and your friend ids"]
        }
      }
    }
  },
  "aggs": {
    "songs": {
      "terms": {
        "field": "song_id"
      },
      "aggs": {
        "docs": {
          "top_hits": {
            "size": "length of you and your friends",
            "_source": ["vote_id", "song_id", "user_id", "type", "timestamp"]
          }
        },
        "more_than_one": {
          "bucket_selector": {
            "buckets_path": {
              "count": "_count"
            },
            "script": "params.count > 1"
          }
        }
      }
    }
  }
}
I want to filter the buckets such that at least one of the documents in the top hits has your user id.
This is the current response
"aggregations": {
"songs": {
"buckets": [
{
"doc_count": 5,
"docs": {
"hits": {
"hits": [
{
"_id": "ivotNtFBb9TCEfpk3S54q6gcMbjZB82Xc1_ZCgA6kYsUmvk",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:56.118207343Z",
"type": 1,
"user_id": "iusr8keSbPjX9ZqFhX4Dei4G",
"vote_id": "ivotNtFBb9TCEfpk3S54q6gcMbjZB82Xc1_ZCgA6kYsUmvk"
},
"_type": "_doc"
},
{
"_id": "ivotEFcqOlCL5htJZJ43NslAP555DaPj0Dgkcay_Ml2jAT4",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:52.143988883Z",
"type": 1,
"user_id": "iusr18wnuxsy8oVVK3Xic4Sy",
"vote_id": "ivotEFcqOlCL5htJZJ43NslAP555DaPj0Dgkcay_Ml2jAT4"
},
"_type": "_doc"
},
{
"_id": "ivotToZ-0iBiM_zF5TP1Shj5C29WV3U0ibedlxvcQccimeo",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:50.178450007Z",
"type": 1,
"user_id": "iusrKDxnm75fADEpusbmx5JM",
"vote_id": "ivotToZ-0iBiM_zF5TP1Shj5C29WV3U0ibedlxvcQccimeo"
},
"_type": "_doc"
},
{
"_id": "ivotAHBPual232E12ggibhr6GfQ5E3f9Ryov0gYKGrIRB0Y",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:52.886305925Z",
"type": 1,
"user_id": "iusrJG4GwkWa6Y70LPkuNCPg",
"vote_id": "ivotAHBPual232E12ggibhr6GfQ5E3f9Ryov0gYKGrIRB0Y"
},
"_type": "_doc"
},
{
"_id": "ivot7a8rWunlFu_q5St44PYDeNelLq4bsxr9wzYP9D80wxE",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8",
"timestamp": "2022-07-08T19:41:49.031694548Z",
"type": 1,
"user_id": "iusrxunBXT1UD0IrvjqjgWaj",
"vote_id": "ivot7a8rWunlFu_q5St44PYDeNelLq4bsxr9wzYP9D80wxE"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 5
}
}
},
"key": "isngVOkuaMqJTu6eQDj73gDYGObUus3g5Qp8"
},
{
"doc_count": 4,
"docs": {
"hits": {
"hits": [
{
"_id": "ivot9_2eQ_3eqU7SXQBnLWGQwFI5DE99Naf8wYbFNFrj1lk",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6",
"timestamp": "2022-07-08T19:41:55.761587927Z",
"type": 1,
"user_id": "iusr8keSbPjX9ZqFhX4Dei4G",
"vote_id": "ivot9_2eQ_3eqU7SXQBnLWGQwFI5DE99Naf8wYbFNFrj1lk"
},
"_type": "_doc"
},
{
"_id": "ivotUZRVSKzGbmlP4LlmBkMwMM8xcR4nGTE9KNpysVR0vXQ",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6",
"timestamp": "2022-07-08T19:41:52.555377592Z",
"type": 1,
"user_id": "iusr18wnuxsy8oVVK3Xic4Sy",
"vote_id": "ivotUZRVSKzGbmlP4LlmBkMwMM8xcR4nGTE9KNpysVR0vXQ"
},
"_type": "_doc"
},
{
"_id": "ivot5Wj8pIkbO0JOV_5s2PqEvZU3sy0WSYYUSlgs2Qizfo8",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6",
"timestamp": "2022-07-08T19:41:49.756332674Z",
"type": 1,
"user_id": "iusrKDxnm75fADEpusbmx5JM",
"vote_id": "ivot5Wj8pIkbO0JOV_5s2PqEvZU3sy0WSYYUSlgs2Qizfo8"
},
"_type": "_doc"
},
{
"_id": "ivot8QNCJGsNtRiZYa-QMTUHEh5MHHHr4EKJsXm4UTAwJkg",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6",
"timestamp": "2022-07-08T19:41:53.26319105Z",
"type": 1,
"user_id": "iusrJG4GwkWa6Y70LPkuNCPg",
"vote_id": "ivot8QNCJGsNtRiZYa-QMTUHEh5MHHHr4EKJsXm4UTAwJkg"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 4
}
}
},
"key": "isng4hKgRPQvH0YhtBy5GaUqgCdwDoVhJuf6"
},
{
"doc_count": 3,
"docs": {
"hits": {
"hits": [
{
"_id": "ivotoCfJc_q5vuY27KmvZUo8s4tilI57_xJoPXqfSeJTikg",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngHAVCux40BgjGVZuAwZlTiEjVQFxuxurc",
"timestamp": "2022-07-08T19:41:50.527352591Z",
"type": 1,
"user_id": "iusrL8FCabxg1YCeaakcVXG5",
"vote_id": "ivotoCfJc_q5vuY27KmvZUo8s4tilI57_xJoPXqfSeJTikg"
},
"_type": "_doc"
},
{
"_id": "ivotStjKDIRy6vfaO5dNws4wGPELywPRgg7D7uSavatfIEo",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngHAVCux40BgjGVZuAwZlTiEjVQFxuxurc",
"timestamp": "2022-07-08T19:41:51.733375716Z",
"type": 1,
"user_id": "iusr18wnuxsy8oVVK3Xic4Sy",
"vote_id": "ivotStjKDIRy6vfaO5dNws4wGPELywPRgg7D7uSavatfIEo"
},
"_type": "_doc"
},
{
"_id": "ivotUHj_Ebh-xIqqPJNEdWAuc_JO_mcVVG8F9wM67bJ7_6A",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngHAVCux40BgjGVZuAwZlTiEjVQFxuxurc",
"timestamp": "2022-07-08T19:41:48.60900159Z",
"type": 1,
"user_id": "iusrxunBXT1UD0IrvjqjgWaj",
"vote_id": "ivotUHj_Ebh-xIqqPJNEdWAuc_JO_mcVVG8F9wM67bJ7_6A"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 3
}
}
},
"key": "isngHAVCux40BgjGVZuAwZlTiEjVQFxuxurc"
},
{
"doc_count": 3,
"docs": {
"hits": {
"hits": [
{
"_id": "ivotE5xO9hZGLhrS2sL1mgf5UbcHtf_5qAwbrp3QwEtf4zk",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngOMnUKFVT1cKsH6Q9JfpF3WEZ4H4iU75W",
"timestamp": "2022-07-08T19:41:54.99023451Z",
"type": 1,
"user_id": "iusre80pxIMFB XfF61SHlCiz",
"vote_id": "ivotE5xO9hZGLhrS2sL1mgf5UbcHtf_5qAwbrp3QwEtf4zk"
},
"_type": "_doc"
},
{
"_id": "ivotI0OCI6gz6oEV94hgvjZmGB-qA4n-EigirDRJpgeZt68",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngOMnUKFVT1cKsH6Q9JfpF3WEZ4H4iU75W",
"timestamp": "2022-07-08T19:41:50.952366924Z",
"type": 1,
"user_id": "iusr3oWy3mxsBWu6CU4mlw5L",
"vote_id": "ivotI0OCI6gz6oEV94hgvjZmGB-qA4n-EigirDRJpgeZt68"
},
"_type": "_doc"
},
{
"_id": "ivotm7GrIeyWRHamPXF9klzTZ0La8H4evCgWkCTIpx8rLl4",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngOMnUKFVT1cKsH6Q9JfpF3WEZ4H4iU75W",
"timestamp": "2022-07-08T19:41:48.234881506Z",
"type": 1,
"user_id": "iusrCbCltg4nzv0b2JfUbyhj",
"vote_id": "ivotm7GrIeyWRHamPXF9klzTZ0La8H4evCgWkCTIpx8rLl4"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 3
}
}
},
"key": "isngOMnUKFVT1cKsH6Q9JfpF3WEZ4H4iU75W"
},
{
"doc_count": 2,
"docs": {
"hits": {
"hits": [
{
"_id": "ivotGWwrgPehs9s7ZwZACzkVNp4-_SUaUu3noUKyBH8IBnw",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngfMv4IemhtjXX78LTqxKFBc1VMeUz6And",
"timestamp": "2022-07-08T19:41:53.785333717Z",
"type": 1,
"user_id": "iusrYvNFTaTg4RBNBxG63nkY",
"vote_id": "ivotGWwrgPehs9s7ZwZACzkVNp4-_SUaUu3noUKyBH8IBnw"
},
"_type": "_doc"
},
{
"_id": "ivotrWG5cy6vEbe0N4JO4IKzHZyahOlkyPctCdrBnBu-v9Q",
"_index": "votes-index",
"_score": 1,
"_source": {
"song_id": "isngfMv4IemhtjXX78LTqxKFBc1VMeUz6And",
"timestamp": "2022-07-08T19:41:51.303745591Z",
"type": 1,
"user_id": "iusr18wnuxsy8oVVK3Xic4Sy",
"vote_id": "ivotrWG5cy6vEbe0N4JO4IKzHZyahOlkyPctCdrBnBu-v9Q"
},
"_type": "_doc"
}
],
"max_score": 1,
"total": {
"relation": "eq",
"value": 2
}
}
},
"key": "isngfMv4IemhtjXX78LTqxKFBc1VMeUz6And"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
}
In other words, I want to keep only the buckets where at least one of the documents carries my own user id.
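A sketch of one way to do that (not from the original thread): add a filter sub-aggregation that counts only your own votes inside each song bucket, and reference its _count from the bucket_selector. Note the mapping above indexes user_id as text, so for reliable exact term matching it should really be a keyword field; "your-user-id" is a placeholder:
{
  "query": {
    "bool": {
      "must": {
        "terms": {
          "user_id": ["you and your friend ids"]
        }
      }
    }
  },
  "aggs": {
    "songs": {
      "terms": {
        "field": "song_id"
      },
      "aggs": {
        "my_votes": {
          "filter": {
            "term": { "user_id": "your-user-id" }
          }
        },
        "me_and_friends": {
          "bucket_selector": {
            "buckets_path": {
              "total": "_count",
              "mine": "my_votes._count"
            },
            "script": "params.total > 1 && params.mine > 0"
          }
        }
      }
    }
  }
}
Buckets in which my_votes matches zero documents are dropped, leaving only songs that you and at least one friend voted for.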

How to make a flattened sub-field in a nested field in Elasticsearch?

Here, I have an indexed document like:
doc = {
  "id": 1,
  "content": [
    {
      "txt": "I",
      "time": 0
    },
    {
      "txt": "have",
      "time": 1
    },
    {
      "txt": "a book",
      "time": 2
    },
    {
      "txt": "do not match this block",
      "time": 3
    }
  ]
}
I want to match "I have a book" and return the matched times 0, 1, 2. Does anyone know how to build the index and the query for this situation?
I think "content.txt" should be flattened but "content.time" should be nested?
want to match "I have a book", and return the matched time: 0,1,2.
Adding a working example with index mapping, search query, and search result.
Index Mapping:
{
  "mappings": {
    "properties": {
      "content": {
        "type": "nested"
      }
    }
  }
}
Search Query:
{
  "query": {
    "nested": {
      "path": "content",
      "query": {
        "bool": {
          "must": [
            {
              "match": {
                "content.txt": "I have a book"
              }
            }
          ]
        }
      },
      "inner_hits": {}
    }
  }
}
Search Result:
"inner_hits": {
"content": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 2.5226097,
"hits": [
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 2
},
"_score": 2.5226097,
"_source": {
"txt": "a book",
"time": 2
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 0
},
"_score": 1.5580825,
"_source": {
"txt": "I",
"time": 0
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 1
},
"_score": 1.5580825,
"_source": {
"txt": "have",
"time": 1
}
}
]
}
}
}
}
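Each inner hit is the matching array element itself, so the times can be read directly from the inner hits' _source.time values; here they are 2, 0 and 1 (ordered by score rather than by position), i.e. exactly the 0, 1, 2 asked for.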

Multiple aggregation in single query on Elasticsearch

I have log data in the Elasticsearch index.
"hits": [
{
"_index": "event_log",
"_type": "log_type",
"_id": "2-d-kmoBazYRVz7KCQIj",
"_score": 1,
"_source": {
"user_id": 123,
"event": "click",
"category": "abc",
"product_id": 1112,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "3ed-kmoBazYRVz7KCQLX",
"_score": 1,
"_source": {
"user_id": 456,
"event": "click",
"category": "abc",
"product_id": 112,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "3ud-kmoBazYRVz7KCgIy",
"_score": 1,
"_source": {
"user_id": 1234,
"event": "click",
"category": "abc",
"product_id": 1112,
"bkt": "B"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "4Od-kmoBazYRVz7KCgLr",
"_score": 1,
"_source": {
"user_id": 4567,
"event": "click",
"category": "xyz",
"product_id": 1118,
"bkt": "B"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "4ud-kmoBazYRVz7KkwL2",
"_score": 1,
"_source": {
"user_id": 123,
"event": "cart",
"category": "xyz",
"product_id": 1,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "2ud-kmoBazYRVz7KCALB",
"_score": 1,
"_source": {
"user_id": 123,
"event": "cart",
"category": "xyz",
"product_id": 11,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "3-d-kmoBazYRVz7KCgKP",
"_score": 1,
"_source": {
"user_id": 4567,
"event": "click",
"category": "abc",
"product_id": 111,
"bkt": "B"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "3Od-kmoBazYRVz7KCQJ8",
"_score": 1,
"_source": {
"user_id": 456,
"event": "click",
"category": "abc",
"product_id": 111,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "4ed-kmoBazYRVz7KCwJH",
"_score": 1,
"_source": {
"user_id": 4567,
"event": "click",
"category": "xyz",
"product_id": 1128,
"bkt": "B"
}
}
]}
I want counts aggregated by category, bkt, and event, and I also want unique user_id counts aggregated by category and bkt. I have two separate queries for that.
Count of records aggregated by category, bkt, and event:
GET event_log/_search
{
  "size": 0,
  "aggs": {
    "category_id": {
      "terms": { "field": "category.keyword" },
      "aggs": {
        "ab_bucket": {
          "terms": { "field": "bkt.keyword" },
          "aggs": {
            "event_type": {
              "terms": { "field": "event.keyword" }
            }
          }
        }
      }
    }
  }
}
The result is
"aggregations": {
"category_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "abc",
"doc_count": 5,
"ab_bucket": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 3,
"event_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "click",
"doc_count": 3
}
]
}
},
{
"key": "B",
"doc_count": 2,
"event_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "click",
"doc_count": 2
}
]
}
}
]
}
},
Unique users aggregated by category and bkt:
GET event_log/_search
{
  "size": 0,
  "aggs": {
    "category_id": {
      "terms": { "field": "category.keyword" },
      "aggs": {
        "ab_bucket": {
          "terms": { "field": "bkt.keyword" },
          "aggs": {
            "total_uniq_users": {
              "cardinality": {
                "field": "user_id"
              }
            }
          }
        }
      }
    }
  }
}
The result is
"aggregations": {
"category_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "abc",
"doc_count": 5,
"ab_bucket": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 3,
"total_uniq_users": {
"value": 2
}
},
{
"key": "B",
"doc_count": 2,
"total_uniq_users": {
"value": 2
}
}
]
}
},
Is there a way to combine both queries and obtain both results in a single query?
Yes, you can do it like this:
GET event_log/_search
{
  "size": 0,
  "aggs": {
    "category_id": {
      "terms": {
        "field": "category.keyword"
      },
      "aggs": {
        "ab_bucket": {
          "terms": {
            "field": "bkt.keyword"
          },
          "aggs": {
            "total_uniq_users": {
              "cardinality": {
                "field": "user_id"
              }
            },
            "event_type": {
              "terms": {
                "field": "event.keyword"
              }
            }
          }
        }
      }
    }
  }
}
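Each ab_bucket now carries both sub-aggregations at once. For the sample data, the first category bucket should look roughly like this (values taken from the two separate results shown above):
{
  "key": "abc",
  "doc_count": 5,
  "ab_bucket": {
    "buckets": [
      {
        "key": "A",
        "doc_count": 3,
        "total_uniq_users": { "value": 2 },
        "event_type": {
          "buckets": [ { "key": "click", "doc_count": 3 } ]
        }
      },
      ...
    ]
  }
}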

How to sort by match prioritising the most left words matched

Explanation
Sort the prefix-query results by the matched word, but prioritise matches in words further to the left.
Tests I've made
Data
DELETE /test
PUT /test
PUT /test/person/_mapping
{
  "properties": {
    "name": {
      "type": "multi_field",
      "fields": {
        "name": { "type": "string" },
        "original": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}
PUT /test/person/1
{"name": "Berta Kassulke"}
PUT /test/person/2
{"name": "Kaley Bartoletti"}
PUT /test/person/3
{"name": "Kali Hahn"}
PUT /test/person/4
{"name": "Karolann Klein"}
PUT /test/person/5
{"name": "Sofia Mandez Kaloo"}
The mapping was added for the 'sort on original value' test.
Simple query
Query
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
}
}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": 1,
"_source": {
"name": "Karolann Klein"
}
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": 1,
"_source": {
"name": "Sofia Mandez Kaloo"
}
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": 1,
"_source": {
"name": "Berta Kassulke"
}
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": 1,
"_source": {
"name": "Kaley Bartoletti"
}
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": 1,
"_source": {
"name": "Kali Hahn"
}
}
]
}
}
With sorting
Request
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
},
"sort": {"name": {"order": "asc"}}
}
Result
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"berta"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"kaloo"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"karolann"
]
}
]
}
}
With sort on original value
Query
POST /test/person/_search
{
"query": {
"prefix": {"name": {"value": "ka"}}
},
"sort": {"name.original": {"order": "asc"}}
}
Result
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test",
"_type": "person",
"_id": "1",
"_score": null,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
"Berta Kassulke"
]
},
{
"_index": "test",
"_type": "person",
"_id": "2",
"_score": null,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
"Kaley Bartoletti"
]
},
{
"_index": "test",
"_type": "person",
"_id": "3",
"_score": null,
"_source": {
"name": "Kali Hahn"
},
"sort": [
"Kali Hahn"
]
},
{
"_index": "test",
"_type": "person",
"_id": "4",
"_score": null,
"_source": {
"name": "Karolann Klein"
},
"sort": [
"Karolann Klein"
]
},
{
"_index": "test",
"_type": "person",
"_id": "5",
"_score": null,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
"Sofia Mandez Kaloo"
]
}
]
}
}
Intended result
Sorted by name ASC, but prioritising matches in the leftmost words:
Kaley Bartoletti
Kali Hahn
Karolann Klein
Berta Kassulke
Sofia Mandez Kaloo
Good question. One way to achieve this is a combination of an edge n-gram filter and a span_first query.
These are my index settings:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "edge_filter",
            "asciifolding"
          ]
        }
      },
      "filter": {
        "edge_filter": {
          "type": "edgeNGram",
          "min_gram": 2,
          "max_gram": 8
        }
      }
    }
  },
  "mappings": {
    "person": {
      "properties": {
        "name": {
          "type": "string",
          "analyzer": "my_custom_analyzer",
          "search_analyzer": "standard",
          "fields": {
            "standard": {
              "type": "string"
            }
          }
        }
      }
    }
  }
}
After that I inserted your sample documents and wrote the following query with dis_max. Notice that the end parameter of the first span_first query is 1, so a match on the very first token gets a higher score, which prioritises the leftmost match. I sort first by score and then by name.
{
  "query": {
    "dis_max": {
      "tie_breaker": 0.7,
      "boost": 1.2,
      "queries": [
        {
          "match": {
            "name": "ka"
          }
        },
        {
          "span_first": {
            "match": {
              "span_term": {
                "name": "ka"
              }
            },
            "end": 1
          }
        },
        {
          "span_first": {
            "match": {
              "span_term": {
                "name": "ka"
              }
            },
            "end": 2
          }
        }
      ]
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    },
    {
      "name.standard": {
        "order": "asc"
      }
    }
  ]
}
The result I get
"hits": [
{
"_index": "esedge",
"_type": "policy_data",
"_id": "2",
"_score": 0.72272325,
"_source": {
"name": "Kaley Bartoletti"
},
"sort": [
0.72272325,
"bartoletti"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "3",
"_score": 0.72272325,
"_source": {
"name": "Kali Hahn"
},
"sort": [
0.72272325,
"hahn"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "4",
"_score": 0.72272325,
"_source": {
"name": "Karolann Klein"
},
"sort": [
0.72272325,
"karolann"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "1",
"_score": 0.54295504,
"_source": {
"name": "Berta Kassulke"
},
"sort": [
0.54295504,
"berta"
]
},
{
"_index": "esedge",
"_type": "policy_data",
"_id": "5",
"_score": 0.2905494,
"_source": {
"name": "Sofia Mandez Kaloo"
},
"sort": [
0.2905494,
"kaloo"
]
}
]
I hope this helps.
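A note if you try this on a recent Elasticsearch: the edgeNGram token filter has since been renamed edge_ngram, and the string type (with its index: not_analyzed option) was replaced by text/keyword in 5.x, so the settings above would need those renames on modern versions.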

Select TOP + GROUP BY + SORT in Elasticsearch?

Assume the following stockInWarehouse schema:
{
  "product_db": {
    "mappings": {
      "stockInWarehouse": {
        "properties": {
          "sku": {
            "type": "string"
          },
          "arrivalTime": {
            "type": "date",
            "format": "dateOptionalTime"
          }
        }
      }
    }
  }
}
The data in stockInWarehouse looks like:
{
"hits": {
"total": 5,
"hits": [
{
"_index": "product_db",
"_type": "stockInWarehouse",
"_id": "1",
"_source": {
"sku": "item 1",
"arrivalTime": "2015-11-11T19:00:10.231Z"
}
},
{
"_index": "product_db",
"_type": "stockInWarehouse",
"_id": "2",
"_source": {
"sku": "item 2",
"arrivalTime": "2015-11-12T19:00:10.231Z"
}
},
{
"_index": "product_db",
"_type": "stockInWarehouse",
"_id": "3",
"_source": {
"sku": "item 1",
"arrivalTime": "2015-11-12T19:35:10.231Z"
}
},
{
"_index": "product_db",
"_type": "stockInWarehouse",
"_id": "4",
"_source": {
"sku": "item 1",
"arrivalTime": "2015-11-13T19:56:10.231Z"
}
},
{
"_index": "product_db",
"_type": "stockInWarehouse",
"_id": "5",
"_source": {
"sku": "item 3",
"arrivalTime": "2015-11-15T19:56:10.231Z"
}
}
]
}
}
What I am trying to do is fetch the TOP documents by arrivalTime (i.e. the most recent ones), but with only one document per distinct sku. The expected result would look like this:
{
"hits": {
"total": 3,
"hits": [
{
"_index": "product_db",
"_type": "stockInWarehouse",
"_id": "5",
"_source": {
"sku": "item 3",
"arrivalTime": "2015-11-15T19:56:10.231Z"
}
},
{
"_index": "product_db",
"_type": "stockInWarehouse",
"_id": "4",
"_source": {
"sku": "item 1",
"arrivalTime": "2015-11-13T19:56:10.231Z"
}
},
{
"_index": "product_db",
"_type": "stockInWarehouse",
"_id": "2",
"_source": {
"sku": "item 2",
"arrivalTime": "2015-11-12T19:00:10.231Z"
}
}
]
}
}
If I sort by arrivalTime, the resulting sku list will contain item 3, item 1, item 1, item 2, item 1 (duplicates). If I sort by sku, the result will not reflect the correct arrivalTime order.
Is this type of query possible in Elasticsearch? How can I achieve this?
How about this one?
{
  "size": 0,
  "aggs": {
    "terms_agg": {
      "terms": {
        "field": "sku",
        "size": 100,
        "order": {
          "max_date_agg": "desc"
        }
      },
      "aggs": {
        "max_date_agg": {
          "max": {
            "field": "arrivalTime"
          }
        }
      }
    }
  }
}
I have set size: 100, assuming you have a lot of products.
Note: you need to add index: not_analyzed to your mapping of sku so the terms aggregation buckets on the exact value.
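A sketch of that mapping in the pre-5.x syntax this question uses (on 5.x+ you would map sku as keyword instead):
PUT /product_db
{
  "mappings": {
    "stockInWarehouse": {
      "properties": {
        "sku": {
          "type": "string",
          "index": "not_analyzed"
        },
        "arrivalTime": {
          "type": "date",
          "format": "dateOptionalTime"
        }
      }
    }
  }
}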
This is the result of the query
"aggregations": {
"terms_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "item 3",
"doc_count": 1,
"max_date_agg": {
"value": 1447617370231,
"value_as_string": "2015-11-15T19:56:10.231Z"
}
},
{
"key": "item 1",
"doc_count": 3,
"max_date_agg": {
"value": 1447444570231,
"value_as_string": "2015-11-13T19:56:10.231Z"
}
},
{
"key": "item 2",
"doc_count": 1,
"max_date_agg": {
"value": 1447354810231,
"value_as_string": "2015-11-12T19:00:10.231Z"
}
}
]
}
}
I hope it helps!!
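If you also need the actual latest document per sku, not just its arrivalTime, a top_hits sub-aggregation (size 1, sorted by arrivalTime descending) can be added inside each bucket; a sketch building on the query above:
{
  "size": 0,
  "aggs": {
    "terms_agg": {
      "terms": {
        "field": "sku",
        "size": 100,
        "order": { "max_date_agg": "desc" }
      },
      "aggs": {
        "max_date_agg": {
          "max": { "field": "arrivalTime" }
        },
        "latest_doc": {
          "top_hits": {
            "size": 1,
            "sort": [ { "arrivalTime": { "order": "desc" } } ]
          }
        }
      }
    }
  }
}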
