Return Multi-Term Distinct Values - elasticsearch

Within an Elasticsearch index I am attempting to query by two distinct top-level field values, companyName and productName, ordered by a generatedDate field, and to include the domainModelId field.
The following SQL query returns all existing values; in its response I have highlighted the two unique document rows (in this case) by generatedDate:
{
"query": "SELECT companyName, productName, generatedDate FROM nextware_domain_metaservices_domainmodel ORDER BY generatedDate DESC"
}
The response is as follows:
I tried the following aggregation:
{
  "size": 0,
  "aggs": {
    "companies": {
      "terms": {
        "field": "companyName.keyword"
      },
      "aggs": {
        "products": {
          "terms": {
            "field": "productName.keyword"
          }
        }
      }
    }
  }
}
This returns the correct buckets, as follows:
"aggregations": {
"companies": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "NextWare",
"doc_count": 18,
"products": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "ProductPortal",
"doc_count": 16
},
{
"key": "Domain",
"doc_count": 2
}
]
}
}
]
}
}
How can I include the value of the domainModelId.Id field without a second query?

To include the value of domainModelId.Id, you need to use a top_hits aggregation.
Adding a working example with index data, search query, and search result.
Index Data:
{
"companyName":"NextWare",
"productName":"Domain",
"domainModelId.Id":"i"
}
{
"companyName":"NextWare",
"productName":"Domain",
"domainModelId.Id":"c"
}
{
"companyName":"NextWare",
"productName":"ProductPortal",
"domainModelId.Id":"a"
}
{
"companyName":"NextWare",
"productName":"ProductPortal",
"domainModelId.Id":"b"
}
{
"companyName":"NextWare",
"productName":"ProductPortal",
"domainModelId.Id":"d"
}
{
"companyName":"NextWare",
"productName":"ProductPortal",
"domainModelId.Id":"e"
}
{
"companyName":"NextWare",
"productName":"ProductPortal",
"domainModelId.Id":"f"
}
{
"companyName":"NextWare",
"productName":"ProductPortal",
"domainModelId.Id":"g"
}
{
"companyName":"NextWare",
"productName":"ProductPortal",
"domainModelId.Id":"h"
}
Search Query:
{
  "size": 0,
  "aggs": {
    "companies": {
      "terms": {
        "field": "companyName.keyword"
      },
      "aggs": {
        "products": {
          "terms": {
            "field": "productName.keyword"
          },
          "aggs": {
            "top_ids": {
              "top_hits": {
                "_source": {
                  "includes": [
                    "domainModelId.Id"
                  ]
                },
                "size": 10
              }
            }
          }
        }
      }
    }
  }
}
Search Result:
"aggregations": {
"companies": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "NextWare",
"doc_count": 9,
"products": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "ProductPortal",
"doc_count": 7,
"top_ids": {
"hits": {
"total": {
"value": 7,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "67049816",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"domainModelId.Id": "a"
}
},
{
"_index": "67049816",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"domainModelId.Id": "b"
}
},
{
"_index": "67049816",
"_type": "_doc",
"_id": "4",
"_score": 1.0,
"_source": {
"domainModelId.Id": "d"
}
},
{
"_index": "67049816",
"_type": "_doc",
"_id": "5",
"_score": 1.0,
"_source": {
"domainModelId.Id": "e"
}
},
{
"_index": "67049816",
"_type": "_doc",
"_id": "6",
"_score": 1.0,
"_source": {
"domainModelId.Id": "f"
}
},
{
"_index": "67049816",
"_type": "_doc",
"_id": "7",
"_score": 1.0,
"_source": {
"domainModelId.Id": "g"
}
},
{
"_index": "67049816",
"_type": "_doc",
"_id": "8",
"_score": 1.0,
"_source": {
"domainModelId.Id": "h"
}
}
]
}
}
},
{
"key": "Domain",
"doc_count": 2,
"top_ids": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "67049816",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"domainModelId.Id": "c"
}
},
{
"_index": "67049816",
"_type": "_doc",
"_id": "9",
"_score": 1.0,
"_source": {
"domainModelId.Id": "i"
}
}
]
}
}
}
]
}
}
]
}
}
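The original question also mentions ordering by generatedDate. The top_hits aggregation accepts a sort option, so each bucket can return only its most recent document. A minimal sketch, replacing the top_ids block in the query above and assuming generatedDate is mapped as a date (it does not appear in the sample index data):
"top_ids": {
  "top_hits": {
    "sort": [
      { "generatedDate": { "order": "desc" } }
    ],
    "_source": {
      "includes": [ "domainModelId.Id", "generatedDate" ]
    },
    "size": 1
  }
}
With "size": 1, each company/product bucket carries exactly one domainModelId.Id, taken from its newest document.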

Related

Create an Elasticsearch query to show 5 random questions by category

I have the fields Category and Questions in the table.
My requirement: for the 3 categories mentioned below, I need the questions tagged against each of them (so I want both the Category and Questions fields in the query result), using an Elasticsearch query.
Category :
OLA
BNA
DRG
GET logstash-sdc-feedback/_search?
{
  "_source": ["Category.keyword"],
  "size": 5,
  "query": {
    "bool": {
      "must": [
        { "match": { "Category.keyword"": "OLA","BNA","DRG" } }
      ],
    }
  },
  "aggs": {
    "MyBuckets": {
      "terms": {
        "field": "questions.keyword","Category.keyword"
        "order": { "_count": "asc" },
        "size": "5"
      }
    }
  }
}
You can use a terms query along with a terms aggregation and a top_hits sub-aggregation to achieve your use case.
Adding a working example
Index Data:
{
"category": "XYZ",
"question": "d"
}
{
"category": "OLA",
"question": "a"
}
{
"category": "BNA",
"question": "b"
}
{
"category": "DRG",
"question": "c"
}
Search Query:
{
  "query": {
    "bool": {
      "must": {
        "terms": {
          "category.keyword": [
            "OLA",
            "BNA",
            "DRG"
          ]
        }
      }
    }
  },
  "aggs": {
    "top_tags": {
      "terms": {
        "field": "category.keyword"
      },
      "aggs": {
        "top_faq_hits": {
          "top_hits": {
            "_source": {
              "includes": [
                "question"
              ]
            },
            "size": 1
          }
        }
      }
    }
  }
}
Search Result:
"aggregations": {
"top_tags": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "BNA", // note this
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"question": "b" // note this
}
}
]
}
}
},
{
"key": "DRG",
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"question": "c"
}
}
]
}
}
},
{
"key": "OLA",
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"question": "a"
}
}
]
}
}
}
]
}
}
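The question title asks for 5 random questions per category, while top_hits normally returns the highest-scoring documents. One way to randomise the selection (not part of the original answer, just a sketch) is to wrap the terms query in a function_score query with random_score, so every document gets a random score and top_hits, which orders by score, returns a random sample; the top_hits size is also raised to 5 and category is added to the returned _source:
{
  "size": 0,
  "query": {
    "function_score": {
      "query": {
        "terms": {
          "category.keyword": [ "OLA", "BNA", "DRG" ]
        }
      },
      "random_score": {}
    }
  },
  "aggs": {
    "top_tags": {
      "terms": { "field": "category.keyword" },
      "aggs": {
        "top_faq_hits": {
          "top_hits": {
            "_source": { "includes": [ "question", "category" ] },
            "size": 5
          }
        }
      }
    }
  }
}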

How to make a flattened sub-field in a nested field in Elasticsearch?

Here, I have an indexed document like:
doc = {
  "id": 1,
  "content": [
    {
      "txt": "I",
      "time": 0
    },
    {
      "txt": "have",
      "time": 1
    },
    {
      "txt": "a book",
      "time": 2
    },
    {
      "txt": "do not match this block",
      "time": 3
    }
  ]
}
I want to match "I have a book" and return the matched times: 0, 1, 2. Does anyone know how to build the index and the query for this situation?
I think the "content.txt" should be flattened but "content.time" should be nested?
want to match "I have a book", and return the matched time: 0,1,2.
Adding a working example with index mapping, search query, and search result.
Index Mapping:
{
  "mappings": {
    "properties": {
      "content": {
        "type": "nested"
      }
    }
  }
}
Search Query:
{
  "query": {
    "nested": {
      "path": "content",
      "query": {
        "bool": {
          "must": [
            {
              "match": {
                "content.txt": "I have a book"
              }
            }
          ]
        }
      },
      "inner_hits": {}
    }
  }
}
Search Result:
"inner_hits": {
"content": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 2.5226097,
"hits": [
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 2
},
"_score": 2.5226097,
"_source": {
"txt": "a book",
"time": 2
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 0
},
"_score": 1.5580825,
"_source": {
"txt": "I",
"time": 0
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 1
},
"_score": 1.5580825,
"_source": {
"txt": "have",
"time": 1
}
}
]
}
}
}
}
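If only the matched time values are needed, inner_hits accepts the same _source filtering as a normal search (using the full field path); a minimal variant of the query above:
{
  "query": {
    "nested": {
      "path": "content",
      "query": {
        "match": {
          "content.txt": "I have a book"
        }
      },
      "inner_hits": {
        "_source": {
          "includes": [ "content.time" ]
        }
      }
    }
  }
}
Each inner hit then carries only its time value (0, 1 and 2 for the matching blocks).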

Multiple aggregations in a single query in Elasticsearch

I have log data in the Elasticsearch index.
"hits": [
{
"_index": "event_log",
"_type": "log_type",
"_id": "2-d-kmoBazYRVz7KCQIj",
"_score": 1,
"_source": {
"user_id": 123,
"event": "click",
"category": "abc",
"product_id": 1112,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "3ed-kmoBazYRVz7KCQLX",
"_score": 1,
"_source": {
"user_id": 456,
"event": "click",
"category": "abc",
"product_id": 112,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "3ud-kmoBazYRVz7KCgIy",
"_score": 1,
"_source": {
"user_id": 1234,
"event": "click",
"category": "abc",
"product_id": 1112,
"bkt": "B"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "4Od-kmoBazYRVz7KCgLr",
"_score": 1,
"_source": {
"user_id": 4567,
"event": "click",
"category": "xyz",
"product_id": 1118,
"bkt": "B"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "4ud-kmoBazYRVz7KkwL2",
"_score": 1,
"_source": {
"user_id": 123,
"event": "cart",
"category": "xyz",
"product_id": 1,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "2ud-kmoBazYRVz7KCALB",
"_score": 1,
"_source": {
"user_id": 123,
"event": "cart",
"category": "xyz",
"product_id": 11,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "3-d-kmoBazYRVz7KCgKP",
"_score": 1,
"_source": {
"user_id": 4567,
"event": "click",
"category": "abc",
"product_id": 111,
"bkt": "B"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "3Od-kmoBazYRVz7KCQJ8",
"_score": 1,
"_source": {
"user_id": 456,
"event": "click",
"category": "abc",
"product_id": 111,
"bkt": "A"
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "4ed-kmoBazYRVz7KCwJH",
"_score": 1,
"_source": {
"user_id": 4567,
"event": "click",
"category": "xyz",
"product_id": 1128,
"bkt": "B"
}
}
]}
I want to get the aggregation by category, bkt, and event. I also want to aggregate user_id by category and bkt. I have two separate queries for that.
Count of records aggregated by category, bkt, and event:
GET event_log/_search
{
  "size": 0,
  "aggs": {
    "category_id": {
      "terms": { "field": "category.keyword" },
      "aggs": {
        "ab_bucket": {
          "terms": { "field": "bkt.keyword" },
          "aggs": {
            "event_type": {
              "terms": { "field": "event.keyword" }
            }
          }
        }
      }
    }
  }
}
The result is
"aggregations": {
"category_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "abc",
"doc_count": 5,
"ab_bucket": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 3,
"event_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "click",
"doc_count": 3
}
]
}
},
{
"key": "B",
"doc_count": 2,
"event_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "click",
"doc_count": 2
}
]
}
}
]
}
},
Users aggregated by category and bkt:
GET event_log/_search
{
  "size": 0,
  "aggs": {
    "category_id": {
      "terms": { "field": "category.keyword" },
      "aggs": {
        "ab_bucket": {
          "terms": { "field": "bkt.keyword" },
          "aggs": {
            "total_uniq_users": {
              "cardinality": {
                "field": "user_id"
              }
            }
          }
        }
      }
    }
  }
}
The result is
"aggregations": {
"category_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "abc",
"doc_count": 5,
"ab_bucket": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 3,
"total_uniq_users": {
"value": 2
}
},
{
"key": "B",
"doc_count": 2,
"total_uniq_users": {
"value": 2
}
}
]
}
},
Is there a way to combine both queries and obtain the expected results in a single response?
Yes, you can do it like this:
GET event_log/_search
{
  "size": 0,
  "aggs": {
    "category_id": {
      "terms": {
        "field": "category.keyword"
      },
      "aggs": {
        "ab_bucket": {
          "terms": {
            "field": "bkt.keyword"
          },
          "aggs": {
            "total_uniq_users": {
              "cardinality": {
                "field": "user_id"
              }
            },
            "event_type": {
              "terms": {
                "field": "event.keyword"
              }
            }
          }
        }
      }
    }
  }
}
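Every ab_bucket bucket in the response now contains both total_uniq_users and event_type. Sub-aggregations can be combined at any level, so if a per-category user count is also wanted, a sibling cardinality aggregation can sit next to ab_bucket; a sketch (the name users_per_category is made up here):
{
  "size": 0,
  "aggs": {
    "category_id": {
      "terms": { "field": "category.keyword" },
      "aggs": {
        "users_per_category": {
          "cardinality": { "field": "user_id" }
        },
        "ab_bucket": {
          "terms": { "field": "bkt.keyword" },
          "aggs": {
            "total_uniq_users": { "cardinality": { "field": "user_id" } },
            "event_type": { "terms": { "field": "event.keyword" } }
          }
        }
      }
    }
  }
}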

Elasticsearch: Query the most recent that doesn't contain the field 'X'

I have the following search query:
{
"query": {
"match": {
"name": "testlib"
}
}
}
When I do this query I get the three results below. What I want to do now is only return one result: the newest #timestamp that doesn't contain version_pre. So in this case, only return AV6qvDXDyHw9vNh6Wlpl.
[
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDXDyHw9vNh6Wlpl",
"_score": 0.2876821,
"_source": {
"#timestamp": "2017-09-21T11:02:15-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 1
}
},
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDF5MtcMTuGknsVs",
"_score": 0.18232156,
"_source": {
"#timestamp": "2017-09-20T17:21:35-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 0
}
},
{
"_index": "testsoftware",
"_type": "software",
"_id": "AV6qvDnVyHw9vNh6Wlpn",
"_score": 0.18232156,
"_source": {
"#timestamp": "2017-09-22T13:56:55-04:00",
"name": "testlib",
"version_major": 1,
"version_minor": 0,
"version_patch": 2,
"version_pre": 0
}
}
]
Use sort (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-sort.html) together with an exists query inside must_not (https://www.elastic.co/guide/en/elasticsearch/reference/2.3/query-dsl-exists-query.html):
{
  "size": 1,
  "sort": [
    { "#timestamp": { "order": "desc" } }
  ],
  "query": {
    "bool": {
      "must_not": {
        "exists": {
          "field": "version_pre"
        }
      }
    }
  }
}
Or even, via query string:
/_search?sort=#timestamp:desc&size=1&q=_missing_:version_pre
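Note that the _missing_ query string syntax was removed in later Elasticsearch versions; on those, an equivalent query string (a sketch) negates _exists_ instead:
/_search?sort=#timestamp:desc&size=1&q=NOT _exists_:version_pre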

How to correctly aggregate when the field is a list in Elasticsearch

Currently the ES logs are indexed in a way that some fields have a list instead of a single value.
For example:
"_source": {
  "field1": ["item1", "item2", "item3"],
  "field2": "something",
  "field3": "something_else"
}
Of course, the length of the list is not always the same. I'm trying to find a way to count the number of logs that contain each item (so some logs will be counted multiple times).
I know I have to use aggs, but how can I form the right query (after -d)?
You can use the query below, which uses a terms aggregation with a script and top_hits.
{
  "size": 0,
  "aggs": {
    "group": {
      "terms": {
        "script": "_source.field1.each{}"
      },
      "aggs": {
        "top_hits_log": {
          "top_hits": {}
        }
      }
    }
  }
}
Output will be:
"buckets": [
{
"key": "item1",
"doc_count": 3,
"top_hits_log": {
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
},
{
"_index": "so",
"_type": "test",
"_id": "2",
"_score": 1,
"_source": {
"field1": [
"item1"
],
"field2": "something2"
}
},
{
"_index": "so",
"_type": "test",
"_id": "3",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2"
],
"field2": "something3"
}
}
]
}
}
},
{
"key": "item2",
"doc_count": 2,
"top_hits_log": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
},
{
"_index": "so",
"_type": "test",
"_id": "3",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2"
],
"field2": "something3"
}
}
]
}
}
},
{
"key": "item3",
"doc_count": 1,
"top_hits_log": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
}
]
}
}
}
]
Make sure to enable dynamic scripting. Set script.disable_dynamic: false
Hope this helps.
There is no need to use scripting. It will be slow, especially the _source parsing. You also need to make sure your field1 is not_analyzed, or you will get weird results, as the terms aggregation is performed on the unique tokens in the inverted index.
{
  "size": 0,
  "aggs": {
    "unique_items": {
      "terms": {
        "field": "field1",
        "size": 100
      },
      "aggs": {
        "documents": {
          "top_hits": {
            "size": 10
          }
        }
      }
    }
  }
}
Here the size is 100 inside the terms aggregation; change this according to how many unique values you expect (the default is 10).
Hope this helps!
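For reference, on current Elasticsearch versions the not_analyzed equivalent is a keyword mapping; a sketch for the field1 field from the question:
{
  "mappings": {
    "properties": {
      "field1": {
        "type": "keyword"
      }
    }
  }
}
With field1 mapped as keyword, the terms aggregation buckets on the whole values ("item1", "item2", ...) rather than on analyzed tokens.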
