Elasticsearch Filtering Parents by Filtered Child Document Count - elasticsearch

I'm attempting to do some elasticsearch query fu on a set of data I have.
I have a user document that is the parent to many child page view documents. I'm looking to return all users that have viewed a specific page an arbitrary amount of times (defined by user input box). So far, I've got a has_child query that will return me all the users that have a page view with certain ids. However, this will return those parents with all their children. Next, I've tried to write an aggregation on those query results, that will essentially do the same has_child query in aggregation form. Now, I have the right document count for my filtered child documents. I need to use this document count to go back and filter the parents. To explain the query in words, "return to me all the users that have viewed a specific page more than 4 times". It's possible that I may need to restructure my data. Any thoughts?
Here is my query thus far:
curl -XGET 'http://localhost:9200/development_users/_search?pretty=true' -d '
{
"query" : {
"has_child" : {
"type" : "page_view",
"query" : {
"terms" : {
"viewed_id" : [175,180]
}
}
}
},
"aggs" : {
"to_page_view": {
"children": {
"type" : "page_view"
},
"aggs" : {
"page_views_that_match" : {
"filter" : { "terms": { "viewed_id" : [175,180] } }
}
}
}
}
}'
This returns me a response like:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [ {
"_index" : "development_users",
"_type" : "user",
"_id" : "22548",
"_score" : 1.0,
"_source":{"id":22548,"account_id":1009}
} ]
},
"aggregations" : {
"to_page_view" : {
"doc_count" : 53,
"page_views_that_match" : {
"doc_count" : 2
}
}
}
}
Associated Mappings:
{
"development_users" : {
"mappings" : {
"page_view" : {
"dynamic" : "false",
"_parent" : {
"type" : "user"
},
"_routing" : {
"required" : true
},
"properties" : {
"created_at" : {
"type" : "date",
"format" : "date_time"
},
"id" : {
"type" : "integer"
},
"viewed_id" : {
"type" : "integer"
},
"time_on_page" : {
"type" : "integer"
},
"title" : {
"type" : "string"
},
"type" : {
"type" : "string"
},
"updated_at" : {
"type" : "date",
"format" : "date_time"
},
"url" : {
"type" : "string"
}
}
},
"user" : {
"dynamic" : "false",
"properties" : {
"account_id" : {
"type" : "integer"
},
"id" : {
"type" : "integer"
}
}
}
}
}
}

Okay, so this is kind of involved. I made a few simplifications to keep it straight in my head. First, I used this mapping:
PUT /test_index
{
"mappings": {
"page_view": {
"_parent": {
"type": "development_user"
},
"properties": {
"viewed_id": {
"type": "string"
}
}
},
"development_user": {
"properties": {
"id": {
"type": "string"
}
}
}
}
}
Then I added some data. In this little universe, I have three users and two pages. I want to find users who have viewed "page_a" at least twice, so if I construct the correct query only user 3 will be returned.
POST /test_index/development_user/_bulk
{"index":{"_type":"development_user","_id":1}}
{"id":"user_1"}
{"index":{"_type":"page_view","_parent":1}}
{"viewed_id":"page_a"}
{"index":{"_type":"development_user","_id":2}}
{"id":"user_2"}
{"index":{"_type":"page_view","_parent":2}}
{"viewed_id":"page_b"}
{"index":{"_type":"development_user","_id":3}}
{"id":"user_3"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_a"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_a"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_b"}
To get that answer we'll use aggregations. Notice that I don't want documents returned (the normal way), but I do want to filter down the documents we analyze, because it will make things more efficient. So I use the same basic filter you had before.
The aggregation tree starts with terms_parent_id, which just separates the parent documents into buckets. Inside that I have children_page_view, which switches to the child documents, and its sub-aggregation filter_page_ids, which filters them down to the ones I want ("page_a"). Next to it in the hierarchy is bucket_selector_page_id_term_count, which uses a bucket selector (you'll need ES 2.x) to keep only the parent buckets that meet the criterion, and finally a top hits aggregation, which shows us the documents that match the requirements.
POST /test_index/development_user/_search
{
"size": 0,
"query": {
"has_child": {
"type": "page_view",
"query": {
"terms": {
"viewed_id": [
"page_a"
]
}
}
}
},
"aggs": {
"terms_parent_id": {
"terms": {
"field": "id"
},
"aggs": {
"children_page_view": {
"children": {
"type": "page_view"
},
"aggs": {
"filter_page_ids": {
"filter": {
"terms": {
"viewed_id": [
"page_a"
]
}
}
}
}
},
"bucket_selector_page_id_term_count": {
"bucket_selector": {
"buckets_path": {
"children_count": "children_page_view>filter_page_ids._count"
},
"script": "children_count >= 2"
}
},
"top_hits_users": {
"top_hits": {
"_source": {
"include": [
"id"
]
}
}
}
}
}
}
}
which returns:
{
"took": 14,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"terms_parent_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "user_3",
"doc_count": 1,
"children_page_view": {
"doc_count": 3,
"filter_page_ids": {
"doc_count": 2
}
},
"top_hits_users": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "development_user",
"_id": "3",
"_score": 1,
"_source": {
"id": "user_3"
}
}
]
}
}
}
]
}
}
}
Here's all the code I used:
http://sense.qbox.io/gist/43f24461448519dc884039db40ebd8e2f5b7304f

Related

ElasticSearch Query fields based on conditions on another field

Mapping
PUT /employee
{
"mappings": {
"post": {
"properties": {
"name": {
"type": "keyword"
},
"email_ids": {
"properties":{
"id" : { "type" : "integer"},
"value" : { "type" : "keyword"}
}
},
"primary_email_id":{
"type": "integer"
}
}
}
}
}
Data
POST employee/post/1
{
"name": "John",
"email_ids": [
{
"id" : 1,
"value" : "1#email.com"
},
{
"id" : 2,
"value" : "2#email.com"
}
],
"primary_email_id": 2 // Here 2 refers to the id field of email_ids.id (2#email.com).
}
I need help to form a query to check if an email id is already taken as a primary email?
eg: If I query for 1#email.com I should get result as No as 1#email.com is not a primary email id.
If I query for 2#email.com I should get result as Yes as 2#email.com is a primary email id for John.
As far as I know, you cannot achieve what you are expecting with this mapping.
However, you can define the email_ids field as a nested type and add one more field, such as isPrimary, setting it to true whenever that email is the primary email.
Index Mapping
PUT employee
{
"mappings": {
"properties": {
"name": {
"type": "keyword"
},
"email_ids": {
"type": "nested",
"properties": {
"id": {
"type": "integer"
},
"value": {
"type": "keyword"
},
"isPrimary":{
"type": "boolean"
}
}
},
"primary_email_id": {
"type": "integer"
}
}
}
}
Sample Document
POST employee/_doc/1
{
"name": "John",
"email_ids": [
{
"id": 1,
"value": "1#email.com"
},
{
"id": 2,
"value": "2#email.com",
"isPrimary": true
}
],
"primary_email_id": 2
}
Query
Keep the query below as it is, and change only the email address whenever you want to check whether a given email is a primary email.
POST employee/_search
{
"_source": false,
"query": {
"nested": {
"path": "email_ids",
"query": {
"bool": {
"must": [
{
"term": {
"email_ids.value": {
"value": "2#email.com"
}
}
},
{
"term": {
"email_ids.isPrimary": {
"value": "true"
}
}
}
]
}
}
}
}
}
Result
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.98082924,
"hits" : [
{
"_index" : "employee",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.98082924
}
]
}
}
Interpret Result:
Elasticsearch will not return a boolean result such as true or false, but you can implement that at the application level. Look at the value of hits.total.value in the result: if it is 0, treat the answer as false; otherwise, treat it as true.
PS: Answer is based on ES version 7.10.

How to get per term statistics in Elasticsearch

I need to implement the following (on the backend): a user types a query and gets back hits as well as statistics for the hits. Below is a simplified example.
Suppose the query is Grif, then the user gets back (random words just for example)
Griffith
Griffin
Grif
Grift
Griffins
And frequency + number of documents a certain term occurs in, for example:
Griffith (freq 10, 3 docs)
Griffin (freq 17, 9 docs)
Grif (freq 6, 3 docs)
Grift (freq 9, 5 docs)
Griffins (freq 11, 4 docs)
I'm relatively new to Elasticsearch, so I'm not sure where to start to implement something like this. What type of query is the most suitable for this? What can I use to get that kind of statistics? Any other advice will be appreciated too.
There are multiple layers to this. You'd need:
n-gram / partial / search-as-you-type matching
a way to group the matched keywords by their original form
a mechanism to reversely look up the document & term frequencies.
I'm not aware of any way to achieve this in one go, but here's my take on it.
You could start off with a special, n-gram-powered analyzer, as explained in my other answer. There's the original content field, plus a multi-field mapping for the said analyzer, plus a keyword field to aggregate on down the line:
PUT my-index
{
"settings": {
"index": {
"max_ngram_diff": 20
},
"analysis": {
"tokenizer": {
"my_ngrams": {
"type": "ngram",
"min_gram": 3,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"my_ngrams_analyzer": {
"tokenizer": "my_ngrams",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"fields": {
"analyzed": {
"type": "text",
"analyzer": "my_ngrams_analyzer"
},
"keyword": {
"type": "keyword"
}
}
}
}
}
}
Next, bulk-insert some sample docs containing text inside the content field. Note that each doc has an _id too — you'll need those later on.
POST _bulk
{"index":{"_index":"my-index", "_id":1}}
{"content":"Griffith"}
{"index":{"_index":"my-index", "_id":2}}
{"content":"Griffin"}
{"index":{"_index":"my-index", "_id":3}}
{"content":"Grif"}
{"index":{"_index":"my-index", "_id":4}}
{"content":"Grift"}
{"index":{"_index":"my-index", "_id":5}}
{"content":"Griffins"}
{"index":{"_index":"my-index", "_id":6}}
{"content":"Griffith"}
{"index":{"_index":"my-index", "_id":7}}
{"content":"Griffins"}
Search for n-grams in the .analyzed field and group the matched documents by the original terms through the terms aggregation. At the same time, retrieve the _id of one of the bucketed documents through the top_hits aggregation. BTW — it doesn't matter which _id is returned in a given bucket — all will have contained the same bucketed term.
POST my-index/_search?filter_path=aggregations.*.buckets.key,aggregations.*.buckets.doc_count,aggregations.*.buckets.*.hits.hits._id
{
"size": 0,
"query": {
"term": {
"content.analyzed": "grif"
}
},
"aggs": {
"full_terms": {
"terms": {
"field": "content.keyword",
"size": 10
},
"aggs": {
"top_doc": {
"top_hits": {
"size": 1,
"_source": false
}
}
}
}
}
}
Observe the response. The filter_path URL parameter from the previous request reduces the response to just those attributes that we need — the untouched, original full_terms plus one of the underlying IDs:
{
"aggregations" : {
"full_terms" : {
"buckets" : [
{
"key" : "Griffins",
"doc_count" : 2,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "5"
}
]
}
}
},
{
"key" : "Griffith",
"doc_count" : 2,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "1"
}
]
}
}
},
{
"key" : "Grif",
"doc_count" : 1,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "3"
}
]
}
}
},
{
"key" : "Griffin",
"doc_count" : 1,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "2"
}
]
}
}
},
{
"key" : "Grift",
"doc_count" : 1,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "4"
}
]
}
}
}
]
}
}
}
Time for the fun part.
There's a specialized Elasticsearch API called Term Vectors which does exactly what you're after — it retrieves field & term stats from the whole index. In order for it to hand these stats over to you, it needs the document IDs — which you'll have obtained from the above aggregation!
Finally, since you've got multiple term vectors to work with, you can use the Multi term vectors API like so — again condensing the response thru filter_path:
POST /my-index/_mtermvectors?filter_path=docs.term_vectors.*.*.*.doc_freq,docs.term_vectors.*.*.*.term_freq
{
"docs": [
{
"_id": "5", <--- guaranteeing
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
},
{
"_id": "1", <--- the response
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
},
{
"_id": "3", <--- order
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
},
{
"_id": "2",
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
},
{
"_id": "4",
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
}
]
}
The result can be post-processed in your backend to form your autocomplete response. You've got A) the full terms, B) the number of matching documents (doc_freq), and C), the term frequency:
{
"docs" : [
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Griffins" : { | term
"doc_freq" : 2, | <-- # of docs
"term_freq" : 1 | term frequency
}
}
}
}
},
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Griffith" : {
"doc_freq" : 2,
"term_freq" : 1
}
}
}
}
},
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Grif" : {
"doc_freq" : 1,
"term_freq" : 1
}
}
}
}
},
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Griffin" : {
"doc_freq" : 1,
"term_freq" : 1
}
}
}
}
},
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Grift" : {
"doc_freq" : 1,
"term_freq" : 1
}
}
}
}
}
]
}
Shameless plug: if you're new to Elasticsearch and, just like me, learn best from real-world examples, consider buying my Elasticsearch Handbook.

ELASTICSEARCH - Get a count of values from the most recent document

I can't get a count of fields with a filtered document value.
I have this json
``
{
"took" : 6,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "net",
"_type" : "_doc",
"_id" : "RTHRTH",
"_score" : 1.0,
"_source" : {
"created_at" : "2020-05-31 19:01:01",
"data" : [...]
{
"_index" : "net",
"_type" : "_doc",
"_id" : "LLLoIJBHHM",
"_score" : 1.0,
"_source" : {
"created_at" : "2020-06-23 15:11:59",
"data" : [...]
}
}
]
}
}
``
In the "data" field, there are more fields within other fields respectively.
I want to filter the most recent document, and then count a certain value in the most recent document.
This is my query:
`{
"query": {
"match": {
"name.keyword": "net"
}
},
"sort": [
{
"created_at.keyword": {
"order": "desc"
}
}
],
"size": 1,
"aggs": {
"CountValue": {
"terms": {
"field": "data.add.serv.desc.keyword",
"include": "nginx"
}
}
}
}`
And the output is:
`{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"CountValue" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "nginx",
"doc_count" : 2
}
]
}
}`
I suspect that doc_count is the number of documents the value appears in, not the number of times the value is repeated within the filtered document.
I would be very grateful for any advice!
Unless any of the fields under the path data.add.serv are of the nested type, the terms agg will produce per-whole-doc results, not per-field.
Exempli gratia:
POST example/_doc
{
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
then
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
produces doc_count==1.
When, however, specified as nested:
DELETE example
PUT example
{
"mappings": {
"properties": {
"serv": {
"type": "nested"
}
}
}
}
POST example/_doc
{"serv":[{"desc":"nginx"},{"desc":"nginx"},{"desc":"nginx"}]}
then
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"nested": {
"path": "serv"
},
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
}
}
we end up with doc_count==3.
This has to do with the way non-nested array types are flattened and de-duplicated. At the end, you may need to reindex your collections after having applied the nested mapping.
EDIT
In order to only take the latest doc, you could do the following:
PUT example
{
"mappings": {
"properties": {
"serv": {
"type": "nested"
},
"created_at": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
}
}
}
}
then
POST example/_doc
{
"created_at" : "2020-05-31 19:01:01",
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
POST example/_doc
{
"created_at" : "2020-06-23 15:11:59",
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
then use a terms agg of size 1, sorted by timestamp desc:
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "created_at",
"order": {
"_term": "desc"
},
"size": 1
},
"aggs": {
"NAME2": {
"nested": {
"path": "serv"
},
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
}
}
}
}

Elasticsearch filter by multiple fields in an object which is in an array field

The goal is to filter products with multiple prices.
The data looks like this:
{
"name":"a",
"price":[
{
"membershipLevel":"Gold",
"price":"5"
},
{
"membershipLevel":"Silver",
"price":"50"
},
{
"membershipLevel":"Bronze",
"price":"100"
}
]
}
I would like to filter by membershipLevel and price. For example, if I am a silver member and query price range 0-10, the product should not appear, but if I am a gold member, the product "a" should appear. Is this kind of query supported by Elasticsearch?
You need to make use of nested datatype for price and make use of nested query for your use case.
Please see the below mapping, sample document, query and response:
Mapping:
PUT my_price_index
{
"mappings": {
"properties": {
"name":{
"type":"text"
},
"price":{
"type":"nested",
"properties": {
"membershipLevel":{
"type":"keyword"
},
"price":{
"type":"double"
}
}
}
}
}
}
Sample Document:
POST my_price_index/_doc/1
{
"name":"a",
"price":[
{
"membershipLevel":"Gold",
"price":"5"
},
{
"membershipLevel":"Silver",
"price":"50"
},
{
"membershipLevel":"Bronze",
"price":"100"
}
]
}
Query:
POST my_price_index/_search
{
"query": {
"nested": {
"path": "price",
"query": {
"bool": {
"must": [
{
"term": {
"price.membershipLevel": "Gold"
}
},
{
"range": {
"price.price": {
"gte": 0,
"lte": 10
}
}
}
]
}
},
"inner_hits": {} <---- Do note this.
}
}
}
The above query means, I want to return all the documents having price.price range from 0 to 10 and price.membershipLevel as Gold.
Notice that I've made use of inner_hits. The reason is that, even though price is a nested field, ES returns the entire source document in the response rather than only the nested document that the query clause actually matched.
In order to find the exact nested doc that has been matched, you would need to make use of inner_hits.
Below is how the response would return.
Response:
{
"took" : 128,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.9808291,
"hits" : [
{
"_index" : "my_price_index",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.9808291,
"_source" : {
"name" : "a",
"price" : [
{
"membershipLevel" : "Gold",
"price" : "5"
},
{
"membershipLevel" : "Silver",
"price" : "50"
},
{
"membershipLevel" : "Bronze",
"price" : "100"
}
]
},
"inner_hits" : {
"price" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.9808291,
"hits" : [
{
"_index" : "my_price_index",
"_type" : "_doc",
"_id" : "1",
"_nested" : {
"field" : "price",
"offset" : 0
},
"_score" : 1.9808291,
"_source" : {
"membershipLevel" : "Gold",
"price" : "5"
}
}
]
}
}
}
}
]
}
}
Hope this helps!
Let me show you how to do it using nested fields together with the query and filter contexts. I will use your example to show you how to define the index mapping, index sample documents, and write the search query.
It's important to note the include_in_parent param in the Elasticsearch mapping, which allows us to query these nested fields without using the nested query.
Please refer to Elasticsearch documentation about it.
If true, all fields in the nested object are also added to the parent
document as standard (flat) fields. Defaults to false.
Index Def
{
"mappings": {
"properties": {
"product": {
"type": "nested",
"include_in_parent": true
}
}
}
}
Index sample docs
{
"product": {
"price" : 5,
"membershipLevel" : "Gold"
}
}
{
"product": {
"price" : 50,
"membershipLevel" : "Silver"
}
}
{
"product": {
"price" : 100,
"membershipLevel" : "Bronze"
}
}
Search query to show Gold with price range 0-10
{
"query": {
"bool": {
"must": [
{
"match": {
"product.membershipLevel": "Gold"
}
}
],
"filter": [
{
"range": {
"product.price": {
"gte": 0,
"lte" : 10
}
}
}
]
}
}
}
Result
"hits": [
{
"_index": "so-60620921-nested",
"_type": "_doc",
"_id": "1",
"_score": 1.0296195,
"_source": {
"product": {
"price": 5,
"membershipLevel": "Gold"
}
}
}
]
Search query to exclude Silver, with same price range
{
"query": {
"bool": {
"must": [
{
"match": {
"product.membershipLevel": "Silver"
}
}
],
"filter": [
{
"range": {
"product.price": {
"gte": 0,
"lte" : 10
}
}
}
]
}
}
}
Above query doesn't return any result as there isn't any matching result.
P.S :- this SO answer might help you to understand nested fields and query on them in detail.
You have to use nested fields and a nested query to achieve this: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-nested-query.html
Define your price property with the type "nested", and then you will be able to filter by every property of the nested object.

How can I get element at a particular index in elasticsearch?

I have stored three JSON objects in Elasticsearch; each object has a name and a projects array, and each project has a title.
{"name": "haris","projects": [{"title": "Splunk"},{"title": "QRadar"},{"title": "LogAnalysis"}]}
{"name": "khalid","projects": [{"title": "MS"},{"title": "Google"},{"title": "Apple"}]}
{"name": "Hamid","projects": [{"title": "Toyota"},{"title": "Honda"},{"title": "Kia"}]}
I have written a query to extract a particular object by _id and its specific property projects
curl -XGET 'localhost:9200/jsontest/_search?pretty' -d '{"query" : { "match" : {"_id":"AV1kzzZqAzHWQ2S7B8f1"} }, "_source": ["projects"]}'
As expected it returns projects object
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [
{
"_index" : "jsontest",
"_type" : "json",
"_id" : "AV1kzzZqAzHWQ2S7B8f1",
"_score" : 1.0,
"_source" : {
"projects" : [{"title" : "Splunk"},{"title" : "QRadar"},{"title" : "LogAnalysis"}
]
}
}
]
}
}
Question: is there a way to retrieve value at a particular index of projects? This is dummy data, in my real scenario projects can have a large number of elements and each element itself is a json object with a lot of properties. I only need to retrieve value at certain index of projects.
Here is what I would do.
First the mapping
PUT test/my_objects/_mapping
{
"properties": {
"name":{
"type": "string",
"index": "not_analyzed"
},
"projects": {
"type": "nested"
}
}
}
Second, the projects are indexed
PUT test/my_objects/1111
{
"name": "haris",
"projects": [
{"title": "Splunk"},
{"title": "QRadar"},
{"title": "LogAnalysis"}
]
}
Finally the aggregation query
GET test/my_objects/_search
{
"aggs": {
"by_name": {
"terms": {
"field": "name"
},
"aggs": {
"by_project": {
"nested": {
"path": "projects"
},
"aggs": {
"by_title": {
"terms": {
"field": "projects.title"
}
}
}
}
}
}
}
}
It's not tested, and it's a bit tedious because of the nested aggs, but it should work if you adapt it further to your requirements.

Resources