ES aggregation query is slow. How to improve? - elasticsearch

We have 7,552,642 documents with a total size of about 94.29 GB in our Elasticsearch production cluster.
JVM heap settings (Xms and Xmx):
/usr/bin/java -Xms20g -Xmx20g
Cluster Information:
{
"cluster_name": "production",
"status": "green",
"timed_out": false,
"number_of_nodes": 2,
"number_of_data_nodes": 2,
"active_primary_shards": 10,
"active_shards": 20,
"relocating_shards": 0,
"initializing_shards": 0,
"unassigned_shards": 0,
"delayed_unassigned_shards": 0,
"number_of_pending_tasks": 0,
"number_of_in_flight_fetch": 0,
"task_max_waiting_in_queue_millis": 0,
"active_shards_percent_as_number": 100
}
Mapping:
"posts": {
"mappings": {
"information": {
"_all": {
"enabled": true
},
"properties":{
"post_created_time": {
"type": "date"
},
"post_impressions": {
"type": "double"
},
"post_interactions": {
"type": "long"
},
"post_media_value": {
"type": "float"
},
"post_message": {
"type": "text"
},
"post_reach": {
"type": "double"
},
"post_type": {
"type": "keyword"
},
"profile_id": {
"type": "long"
},
"profile_name": {
"type": "text"
},
"platform_followed_by" :{
"type: "long"
}
}
}
}
}
Aggregation query:
{
"size":0,
"aggs":{
"total_influencers":{
"cardinality":{
"field":"profile_id"
}
},
"profiles_reach_bucket":{
"terms":{
"size":2147483639,
"field":"profile_id"
},
"aggs":{
"media_reach_bucket":{
"terms":{
"field":"_type",
"size":2147483639
},
"aggs":{
"media_reach":{
"avg":{
"field":"post_reach"
}
}
}
},
"total_media_reach":{
"sum_bucket":{
"buckets_path":"media_reach_bucket>media_reach"
}
}
}
},
"total_reach":{
"sum_bucket":{
"buckets_path":"profiles_reach_bucket>total_media_reach"
}
},
"total_impressions":{
"sum":{
"field":"post_impressions"
}
},
"total_interactions":{
"sum":{
"field":"post_interactions"
}
},
"total_followers":{
"sum":{
"field":"platform_followed_by"
}
},
"interactions_for_media_value":{
"terms":{
"size":2147483639,
"field":"_type"
},
"aggs":{
"interaction":{
"sum":{
"field":"post_interactions"
}
}
}
},
"impressions_for_media_value":{
"terms":{
"size":2147483639,
"field":"_type"
},
"aggs":{
"impression":{
"sum":{
"field":"post_impressions"
}
}
}
}
},
"query":{
"bool":{
"must":[
{
"query_string":{
"query":"post_message:*food* post_description:*food* post_title:*food* "
}
}
],
"filter":[
{
"range":{
"post_created_time":{
"gte":1533427200,
"lte":1549324800
}
}
}
]
}
}
}
Result:
The above query took about 6.7 seconds:
"took": 6685,
"timed_out": false,
"_shards": {
"total": 10,
"successful": 10,
"skipped": 0,
"failed": 0
}
Any suggestions on how to optimize this would be greatly appreciated.
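One direction worth exploring, offered purely as an illustration rather than a tested answer for this cluster: a terms aggregation with "size": 2147483639 forces Elasticsearch to build every profile_id bucket in memory in a single request. On 6.1+ the composite aggregation can page through those buckets instead; the sketch below shows only the paging skeleton with a single sub-aggregation, reusing the same bool query and date filter unchanged. The per-_type sub-aggregations from the original query would need to be added back, and since pipeline aggregations cannot run on a composite aggregation, the cross-bucket sums would have to be accumulated client-side while paging (subsequent pages are requested by passing the returned after_key back as "after").
{
  "size": 0,
  "aggs": {
    "profiles_reach_bucket": {
      "composite": {
        "size": 1000,
        "sources": [
          { "profile_id": { "terms": { "field": "profile_id" } } }
        ]
      },
      "aggs": {
        "media_reach": {
          "avg": { "field": "post_reach" }
        }
      }
    }
  }
}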

Related

Elasticsearch copy_to data needs to be copied into its own subdocument

Thanks in advance for helping.
I have created an ES mapping as follows:
{"mappings": {
"policy": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"tags": {
"properties": {
"scope": {
"type": "text",
"store": "true",
"copy_to": [
"tags.tag_scope"
]
},
"tag": {
"type": "text",
"store": "true",
"copy_to": [
"tags.tag_scope"
]
},
"tag_scope": {
"type": "text",
"store": "true"
}
}
}
}
}
}
}
When I index a policy document, all tag and scope values from the different tags entries are copied into a single tag_scope property.
For example, I added the following document to Elasticsearch:
{
"name": "policy1",
"tags": [
{
"tag": "pepsi",
"scope": "prod"
},
{
"tag": "coke",
"scope": "dev"
}
]
}
It stores all four values together in tags.tag_scope, like this:
"tags.tag_scope": [
"pepsi",
"test",
"coke",
"dev"
]
My expectation was that it would store it like this:
{
"name": "policy1",
"tags": [
{
"tag": "pepsi",
"scope": "prod",
"tag_scope" : ["pepsi","prod"]
},
{
"tag": "coke",
"scope": "dev",
"tag_scope" : ["coke","dev"]
}
]
}
Could you please help me with the correct mapping for this?
What you are looking for is the nested datatype. Change your mapping to the below:
PUT <your_index_name>
{
"mappings":{
"policy":{
"properties":{
"name":{
"type":"text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":256
}
}
},
"tags":{
"type": "nested",
"properties":{
"scope":{
"type":"text",
"store":"true",
"copy_to":[
"tags.tag_scope"
]
},
"tag":{
"type":"text",
"store":"true",
"copy_to":[
"tags.tag_scope"
]
},
"tag_scope":{
"type":"text",
"store":"true",
"fields": { <---- Added this
"keyword": {
"type": "keyword"
}
}
}
}
}
}
}
}
}
Notice how I've made tags a nested type. This allows each of the entries below to be stored as an individual document in its own right; in your case tags ends up holding two nested documents.
{
"tag":"coke",
"scope":"dev"
}
Now your tags.tag_scope should be what you are expecting it to be.
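For example, re-indexing the sample document from the question under this new mapping should now produce per-tag tag_scope values (a sketch; <your_index_name> and the policy type follow the conventions used above):
POST <your_index_name>/policy
{
  "name": "policy1",
  "tags": [
    {
      "tag": "pepsi",
      "scope": "prod"
    },
    {
      "tag": "coke",
      "scope": "dev"
    }
  ]
}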
Now, when it comes to querying for what you are looking for, the below is how a nested query should look.
Nested Query:
POST <your_index_name>/_search
{
"query": {
"bool": {
"must": [
{
"nested": {
"path": "tags",
"query": {
"bool": {
"must": [
{
"match": {
"tags.tag_scope": "pepsi"
}
},
{
"match": {
"tags.tag_scope": "prod"
}
}
]
}
}
}
}
]
}
}
}
To return the list of unique tags.tag_scope values, you need an aggregation query. Notice that I've set size: 0, which means I only want to see the aggregation result and not the normal search hits.
Aggregation Query:
POST <your_index_name>/_search
{
"size":0,
"query":{
"bool":{
"must":[
{
"nested":{
"path":"tags",
"query":{
"bool":{
"must":[
{
"match":{
"tags.tag_scope":"pepsi"
}
},
{
"match":{
"tags.tag_scope":"prod"
}
}
]
}
}
}
}
]
}
},
"aggs":{ <----- Aggregation Query Starts Here
"myscope":{
"nested":{
"path":"tags"
},
"aggs":{
"uniqui_scope":{
"terms":{
"field":"tags.tag_scope.keyword",
"size":10
}
}
}
}
}
}
Aggregation Response:
{
"took": 53,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"myscope": {
"doc_count": 2,
"uniqui_scope": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "coke",
"doc_count": 1
},
{
"key": "dev",
"doc_count": 1
},
{
"key": "pepsi",
"doc_count": 1
},
{
"key": "prod",
"doc_count": 1
}
]
}
}
}
}
Hope this helps.

Getting empty buckets array in elasticsearch aggregation

I'm using Elasticsearch version 5.4.1.
When I try to perform a group-by/bucket aggregation, I'm not getting any values in the buckets array.
This is my index:
curl -X PUT localhost:9200/urldata -d '{
"mappings" : {
"components" : {
"properties" : {
"name" : {
"type" : "keyword",
"index" : "not_analyzed"
},
"status" : {
"type" : "keyword",
"index" : "not_analyzed"
},
"timestamp":{
"type":"date",
"index":"not_analyzed"
}
}
}
}
}'
And this is the aggregate query:
curl -XGET 'localhost:9200/urldata/_search?pretty' -H 'Content-Type: application/json' -d'
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name.keyword"
}
}
}
}
'
Output:
{
"took":2,
"timed_out":false,
"_shards":{
"total":5,
"successful":5,
"failed":0
},
"hits":{
"total":3,
"max_score":0.0,
"hits":[
]
},
"aggregations":{
"components":{
"doc_count_error_upper_bound":0,
"sum_other_doc_count":0,
"buckets":[
]
}
}
}
Where am I going wrong?
Since name is already mapped as keyword, there is no name.keyword sub-field to aggregate on, which is why the buckets come back empty. Aggregate on name directly instead; try this, it should do it:
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name"
}
}
}
}
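If in doubt, you can confirm which sub-fields actually exist by inspecting the mapping (index name taken from the question):
GET urldata/_mapping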
EDIT:
Here are all the steps to replicate your use case:
PUT test
{
"settings" : {
"index" : {
"number_of_shards" : 1,
"number_of_replicas" : 0
}
}
}
PUT test/_mapping/people_name
{
"properties":{
"name":{
"type":"keyword",
"index":"not_analyzed"
},
"status":{
"type":"keyword",
"index":"not_analyzed"
},
"timestamp":{
"type":"date",
"index":"not_analyzed"
}
}
}
POST test/people_name
{
"name": "A",
"status": "success",
"created_at": "2017-08-17"
}
POST test/people_name
{
"name": "A",
"status": "success_2",
"created_at": "2017-06-15"
}
POST test/people_name
{
"name": "B",
"status": "success",
"created_at": "2017-09-15"
}
GET test/people_name/_search
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name"
}
}
}
}
The result of the aggregation is:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"components": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 2
},
{
"key": "B",
"doc_count": 1
}
]
}
}
}
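Alternatively, if you would rather keep aggregating on name.keyword as in the original request, name has to be mapped as text with a keyword sub-field, which is what dynamic mapping produces by default in 5.x. A sketch of that variant, showing only the name property:
PUT urldata
{
  "mappings": {
    "components": {
      "properties": {
        "name": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        }
      }
    }
  }
}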

elasticsearch: term query fails

I have a mapping for some documents, and term queries against them fail. I don't understand why:
"mappings":{
"timeslot":{
"properties":{
"FOB_IN":{
"type":"long"
},
"TRIGGER_CODE":{
"type":"long"
},
"FLIGHT_PHASE":{
"type":"long"
},
"REP16_TRIG":{
"type":"long"
},
"fwot":{
"type":"string"
},
"FOB_OUT":{
"type":"long"
},
"FP":{
"type":"long"
},
"FLTNB":{
"type":"string"
},
"Date":{
"format":"strict_date_optional_time||epoch_millis",
"type":"date"
}
}
}
}
I can run a term query against TRIGGER_CODE, for example, and it works fine:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 4.4446826,
"hits": [
{
"_index": "merged-2016-04",
"_type": "timeslot",
"_id": "AVRS8VnirVLwfvMnwpXb",
"_score": 4.4446826,
"_source": {
"Date": "2016-04-03T08:42:44+0000",
"FLIGHT_PHASE": 20,
"TRIGGER_CODE": 4000,
"fwot": "A6-APA"
}
}
]
}
}
Now the same query against fwot fails. What's wrong?
GET merged-2016-04/_search?size=1
{
"query" : {
"term" : { "fwot": "A6-APA"}
}
}
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
You need fwot to be "index": "not_analyzed" for that to work, and you need to reindex the data for the mapping change to take effect.
Here's the complete list of commands for the mapping change and some test data:
PUT /merged-2016-04
{
"mappings": {
"timeslot": {
"properties": {
"FOB_IN": {
"type": "long"
},
"TRIGGER_CODE": {
"type": "long"
},
"FLIGHT_PHASE": {
"type": "long"
},
"REP16_TRIG": {
"type": "long"
},
"fwot": {
"type": "string",
"index": "not_analyzed"
},
"FOB_OUT": {
"type": "long"
},
"FP": {
"type": "long"
},
"FLTNB": {
"type": "string"
},
"Date": {
"format": "strict_date_optional_time||epoch_millis",
"type": "date"
}
}
}
}
}
POST /merged-2016-04/timeslot
{
"Date": "2016-04-03T08:42:44+0000",
"FLIGHT_PHASE": 20,
"TRIGGER_CODE": 4000,
"fwot": "A6-APA"
}
GET merged-2016-04/_search?size=1
{
"query": {
"term": {
"fwot": "A6-APA"
}
}
}
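Because existing field mappings cannot be changed in place, the old documents also have to be copied into the newly created index. A sketch using the _reindex API (available from ES 2.3 onward; the source index name is a placeholder for wherever the old data currently lives):
POST _reindex
{
  "source": {
    "index": "merged-2016-04-old"
  },
  "dest": {
    "index": "merged-2016-04"
  }
}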
See the Query DSL term query documentation, in particular the note "Why doesn't the term query match my document?", for a detailed explanation.
We can use the keyword sub-field (this works when fwot is mapped as text with a keyword multi-field, which is what dynamic mapping produces by default from ES 5.x onward):
GET merged-2016-04/_search?size=1
{
"query": {
"term": {
"fwot.keyword": "A6-APA"
}
}
}

ElasticSearch: Is it possible to produce a "Temporary Field" during a search request?

Sample Document:
{
"text": "this is my text",
"categories": [
{"category": "sample category"},
{"category": "local news"}
]
}
The mapping currently is:
{
"topic": {
"properties": {
"categories": {
"properties": {
"category": {
"type": "string",
"store": "no",
"term_vector": "with_positions_offsets",
"analyzer": "ik_max_word",
"search_analyzer": "ik_max_word",
"include_in_all": "true",
"boost": 8,
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
Search query:
{
"_source": false,
"query":{
"match":{
"categories.category":"news"
}
},
"aggs": {
"match_count": {
"terms" : {"field": "categories.category.raw"}
}
}
}
The result I want it to be:
{
...
"buckets": [
{
"key": "local news",
"doc_count": 1
}
]
...
}
The result actually is (it aggregates all matching documents' categories.category):
{
...
"buckets": [
{
"key": "local news",
"doc_count": 1
},{
"key": "sample category", //THIS PART IS NOT NEEDED
"doc_count": 1
}
]
...
}
Is it possible to add a temporary field during a search? In this case, let's say we name all the matching categories.category values categories.match_category and aggregate on this temporary field categories.match_category. If so, how can I do it, and if not, what should I do instead?
You have multiple sub-documents within your document and you need to match against only some of them, so you should probably change the mapping to use nested documents, as follows:
mapping
{
"topic": {
"properties": {
"categories": {
"type":"nested",
"properties": {
"category": {
"type": "string",
"store": "no",
"term_vector": "with_positions_offsets",
"analyzer": "ik_max_word",
"search_analyzer": "ik_max_word",
"include_in_all": "true",
"boost": 8,
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
Then you can perform your query as follows:
{
"_source": false,
"query":{
"filtered":{
"query":{
"match":{
"categories.category":
{
"query" : "news",
"cutoff_frequency" : 0.001
}
}
}
}
},
"aggs": {
"categ": {
"nested" : {
"path" : "categories"
},
"aggs":{
"match_count": {
"terms" : {"field": "categories.category.raw"}
}
}
}
}
}
Try it
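Note that the filtered query used above was removed in Elasticsearch 5.0; on newer versions the same request can be written with a plain bool query (a sketch reusing the same fields and aggregation as above):
{
  "_source": false,
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "categories.category": "news"
          }
        }
      ]
    }
  },
  "aggs": {
    "categ": {
      "nested": {
        "path": "categories"
      },
      "aggs": {
        "match_count": {
          "terms": {
            "field": "categories.category.raw"
          }
        }
      }
    }
  }
}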
Another approach, with logic more specific to your needs, is the following:
mapping
{
"topic": {
"properties": {
"categories": {
"type":"nested",
"properties": {
"category": {
"type": "string",
"store": "no",
"analyzer": "simple",
"include_in_all": "true",
"boost": 8,
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
data
{
"text": "this is my text",
"categories": [
{"category": "sample category"},
{"category": "local news"}
]
}
query
{
"query":{
"nested":{
"path":"categories",
"query":{
"filtered":{
"query":{
"match":{
"categories.category":"news"
}
}
}
}
}
},
"aggs": {
"nest":{
"nested":{
"path":"categories"
},
"aggs":{
"filt":{
"filter" : {
"script": {
"script" : "doc['categories.category'].values.contains('news')"
}
},
"aggs":{
"match_count": {
"terms" : {"field": "categories.category.raw"}
}
}
}
}
}
}
}
produced result
{
"_shards": {
"failed": 0,
"successful": 5,
"total": 5
},
"aggregations": {
"nest": {
"doc_count": 2,
"filt": {
"doc_count": 1,
"match_count": {
"buckets": [
{
"doc_count": 1,
"key": "local news"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
}
}
},
"hits": {
"hits": [],
"max_score": 0.0,
"total": 1
},
"timed_out": false,
"took": 3
}
The catch here is that you have to write your own script filter in the aggregation according to your needs; the above example worked for me with the simple analyzer in the "category" mapping.
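As a variant of that script filter (my own suggestion, not part of the original answer), a plain filter aggregation with a term query on the analyzed field should select the same nested documents without scripting, since the match on "news" boils down to a lookup of that token. Only the aggs section is shown; the query part stays the same:
"aggs": {
  "nest": {
    "nested": {
      "path": "categories"
    },
    "aggs": {
      "filt": {
        "filter": {
          "term": {
            "categories.category": "news"
          }
        },
        "aggs": {
          "match_count": {
            "terms": {
              "field": "categories.category.raw"
            }
          }
        }
      }
    }
  }
}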

Elasticsearch: generating terms from array using script

Would love an explanation of why this happens and how to correct it.
Here's a snippet of the source document:
{
"created_time":1412988495000,
"tags":{
"items":[
{
"tag_type":"Placement",
"tag_id":"id1"
},
{
"tag_type":"Product",
"tag_id":"id2"
}
]
}
}
The following terms aggregation:
"aggs":{
"tags":{
"terms":{
"script":"doc['tags'].value != null ? doc['tags.items.tag_type'].value + ':' + doc['tags.items.tag_id'].value : ''",
"size":2000,
"exclude":{
"pattern":"null:null"
}
}
}
}
returns:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Placement:id2",
"doc_count":1
}
]
...when you would expect:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Product:id2",
"doc_count":1
}
]
I would probably go with a nested type. Without nested, Elasticsearch flattens the object array, so the association between each tag_type and its tag_id is lost and the script can pair values from different array entries. I don't know all the details of your setup, but here is a proof of concept, at least. I took out the "items" property because I didn't need that many layers and just used "tags" as the nested type; it could be added back in if needed, I think.
So I set up an index with a "nested" property:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"doc": {
"properties": {
"created_time": {
"type": "date"
},
"tags": {
"type": "nested",
"properties": {
"tag_type": {
"type": "string",
"index": "not_analyzed"
},
"tag_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then added a couple of docs (notice that the structure differs slightly from yours):
PUT /test_index/doc/1
{
"created_time": 1412988495000,
"tags": [
{
"tag_type": "Placement",
"tag_id": "id1"
},
{
"tag_type": "Product",
"tag_id": "id2"
}
]
}
PUT /test_index/doc/2
{
"created_time": 1412988475000,
"tags": [
{
"tag_type": "Type3",
"tag_id": "id3"
},
{
"tag_type": "Type4",
"tag_id": "id3"
}
]
}
Now a scripted terms aggregation inside a nested aggregation seems to do the trick:
POST /test_index/_search?search_type=count
{
"query": {
"match_all": {}
},
"aggs": {
"tags": {
"nested": { "path": "tags" },
"aggs":{
"tag_vals": {
"terms": {
"script": "doc['tag_type'].value+':'+doc['tag_id'].value"
}
}
}
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"tags": {
"doc_count": 4,
"tag_vals": {
"buckets": [
{
"key": "Placement:id1",
"doc_count": 1
},
{
"key": "Product:id2",
"doc_count": 1
},
{
"key": "Type3:id3",
"doc_count": 1
},
{
"key": "Type4:id3",
"doc_count": 1
}
]
}
}
}
}
Here is the code I used:
http://sense.qbox.io/gist/4ceaf8693f85ff257c2fd0639ba62295f2e5e8c5
