Elasticsearch is taking too much time. I have the following kind of mappings.
Mappings
{
"mappings": {
"properties": {
"manufacture_id": {
"type": "long"
},
"product_id": {
"type": "long"
},
"seller_id": {
"type": "long"
},
"sell": {
"type": "double"
}
}
}
}
Sample documents
[
{
"manufacture_id": 12,
"product_id": 1,
"seller_id": 1,
"sell": 70
},
{
"manufacture_id": 12,
"product_id": 1,
"seller_id": 1,
"sell": 40
},
{
"manufacture_id": 12,
"product_id": 1,
"seller_id": 2,
"sell": 10
},
{
"manufacture_id": 1234,
"product_id": 2,
"seller_id": 1,
"sell": 20
},
{
"manufacture_id": 1234,
"product_id": 2,
"seller_id": 2,
"sell": 120
},
{
"manufacture_id": 1234,
"product_id": 2,
"seller_id": 3,
"sell": 90
},
{
"manufacture_id": 1234,
"product_id": 2,
"seller_id": 3,
"sell": 20
}
]
What I want is the average sell of each product for a given manufacturer.
Query:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"manufacture_id": {
"query": 1234
}
}
}
]
}
},
"aggregations": {
"products": {
"terms": {
"field": "product_id"
},
"aggregations": {
"totalSell": {
"sum": {
"field": "sell"
}
},
"sellerCount": {
"cardinality": {
"field": "seller_id"
}
},
"avgSellOfProduct": {
"bucket_script": {
"buckets_path": {
"totalSellAmnt": "totalSell",
"totalSeller": "sellerCount"
},
"script": {
"source": "params.totalSellAmnt / params.totalSeller",
"lang": "painless"
}
}
}
}
}
}
}
I have around 15 million documents for this manufacture id 1234.
This query is taking time around 5 seconds to complete.
Is there any way to improve this search speed?
Also in this index, we have around 200 million documents (1 shard).
Any suggestions on how I should manage this large amount of data?
Should I create multiple indices?
Related
I have an Elasticsearch index structured like this
{
"mappings": {
"properties": {
"content": {
"type": "text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":20
}
}
},
"result_nums":{
"type":"integer"
}
}
}
}
and all documents in the index like this
{
"content": "this",
"result_nums": 40
},
{
"content": "this",
"result_nums": 40
},
{
"content": "that",
"result_nums": 40
},
{
"content": "what",
"result_nums": 50
},
{
"content": "what",
"result_nums": 50
},
{
"content": "but",
"result_nums": 100
},
{
"content": "like",
"result_nums": 20
}
I need to get the data, sorting by result_nums DESC and removing duplicate "content". For example, I used the query like this to get the first two data
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"size": 2
},
"aggs": {
"res_nums": {
"avg": {
"field": "result_nums"
}
},
"res_sort": {
"bucket_sort": {
"sort": [
{
"res_nums": "desc"
}
]
}
}
}
}
}
}
The data I expect to get is
{
"key": "but",
"doc_count": 1,
"res_nums": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
}
But what I actually get is
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
},
{
"key": "this",
"doc_count": 2,
"res_nums": {
"value": 40.0
}
}
so I think ES needs to sort before aggregating, because right now it only sorts after the aggregation, which is why the results did not match my expectations.
I tried to use sort before aggregation but no effect
{
"size": 0,
"sort": [
{
"result_nums": "desc"
}
],
"aggs": {
...
}
...
}
So how to do sort before aggregation?
You need to use a max aggregation together with the terms aggregation's order parameter to get the data sorted by result_nums DESC while removing duplicate "content"
Adding a working example
Search Query:
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"order": {
"max_num": "desc"
},
"size":2
},
"aggs": {
"max_num": {
"max": {
"field": "result_nums"
}
}
}
}
}
}
Search Result:
"aggregations": {
"content": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 4,
"buckets": [
{
"key": "but",
"doc_count": 1,
"max_num": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"max_num": {
"value": 50.0
}
}
]
}
I'm filtering prices dynamically with the given currency rates and sorting them with the score which is generated by script. But there is one thing I could not figure out how to do is range filter.
For example I only want to get product_platforms only match score between 10 and 100.
Index request.
PUT /test_products
{
"settings": {
"number_of_shards": 3,
"number_of_replicas": 0,
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "2",
"max_gram": "15"
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "keyword",
"doc_values": true
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
},
"raw": {
"type": "keyword"
}
},
"analyzer": "autocomplete",
"search_analyzer": "standard"
},
"product_platforms": {
"type": "nested",
"properties": {
"id": {
"type": "long"
},
"platform_id": {
"type": "long"
},
"price": {
"type": "float"
},
"currency_id": {
"type": "long"
},
"currency_code": {
"enabled": false
},
"sku": {
"type": "keyword"
},
"quantity": {
"type": "long"
}
}
}
}
}
}
Insert test documents:
POST /test_products/_bulk?pretty&refresh
{"index":{"_id": 1}}
{"id": 1, "name": "1. Product", "product_platforms": [{"id": 11, "platform_id": 3, "price": 100, "currency_id": 1, "currency_code": "TRY", "sku": "URN_1_1", "quantity": 1},{"id": 12, "platform_id": 3, "price": 75, "currency_id": 2, "currency_code": "USD", "sku": "URN_1_2", "quantity": 1},{"id": 13, "platform_id": 2, "price": 15, "currency_id": 2, "currency_code": "USD", "sku": "URN_1_3", "quantity": 1}]}
{"index":{"_id": 2}}
{"id": 2, "name": "2. Product", "product_platforms": [{"id": 21, "platform_id": 3, "price": 50, "currency_id": 1, "currency_code": "TRY", "sku": "URN_2_1", "quantity": 1},{"id": 22, "platform_id": 3, "price": 25, "currency_id": 2, "currency_code": "USD", "sku": "URN_2_2", "quantity": 1},{"id": 23, "platform_id": 3, "price": 75, "currency_id": 1, "currency_code": "TRY", "sku": "URN_2_3", "quantity": 1}, {"id": 24, "platform_id": 3, "price": 20, "currency_id": 2, "currency_code": "USD", "sku": "URN_2_4", "quantity": 1}]}
And here is the my search query:
GET /test_products/_search
{
"query": {
"nested": {
"path": "product_platforms",
"score_mode": "max",
"query": {
"function_score": {
"query": {
"bool": {
"must": [
{
"term": {
"product_platforms.platform_id": {
"value": "3"
}
}
}
]
}
},
"boost_mode": "replace",
"script_score": {
"script": {
"source": """
doc['product_platforms.price'].value * (doc['product_platforms.currency_id'].value == 2 ? params.rate_usd : (doc['product_platforms.currency_id'].value == 3 ? params.rate_eur : params.rate_try)) """,
"params": {
"rate_try": 1,
"rate_usd": 7,
"rate_eur": 8
}
}
}
}
},
"inner_hits": {
"name": "product_platforms",
"_source": true,
"size": 5,
"sort": {
"_script": {
"type": "number",
"script": {
"lang": "painless",
"source": """ doc['product_platforms.price'].value * (doc['product_platforms.currency_id'].value == 2 ? params.rate_usd : (doc['product_platforms.currency_id'].value == 3 ? params.rate_eur : params.rate_try)) """,
"params": {
"rate_try": 1,
"rate_usd": 7,
"rate_eur": 8
}
},
"order": "desc"
}
}
}
}
},
"sort": [
{
"_score": {
"order": "desc"
}
}
]
}
I'm using version 7.10 btw.
You could repeat that score calculator once again, this time in a boolean script query of its own.
Now, since your currency conversion script repeats itself one too many times, you could store it and reference it by its ID every time you need it. You'll of course keep the rates parametrized but the whole thing will be a bit more readable and maintainable.
So, let's save the script first:
POST _scripts/product-platforms-converter
{
"script": {
"source": """
def price = doc['product_platforms.price'].value;
def currency_id = doc['product_platforms.currency_id'].value;
def converted_price = price * (currency_id == 2
? params.rate_usd : (currency_id == 3
? params.rate_eur : params.rate_try));
if (params.final_range != null) {
def is_in_range = converted_price >= params.final_range.gte
&& converted_price <= params.final_range.lte;
return is_in_range;
}
return converted_price;
""",
"lang": "painless"
}
}
Notice that if final_range is provided in the params, the script returns a boolean; if not, it'll simply return the converted_price.
After that, the original query can be rewritten as:
GET /test_products/_search
{
"query": {
"nested": {
"path": "product_platforms",
"score_mode": "max",
"query": {
"function_score": {
"query": {
"bool": {
"must": [
{
"term": {
"product_platforms.platform_id": {
"value": "3"
}
}
},
{
"script": {
"script": {
"id": "product-platforms-converter",
"params": {
"rate_try": 1,
"rate_usd": 7,
"rate_eur": 8,
"final_range": { <--- the desired "range" query
"gte": 10,
"lte": 100
}
}
}
}
}
]
}
},
"boost_mode": "replace",
"script_score": {
"script": {
"id": "product-platforms-converter",
"params": {
"rate_try": 1,
"rate_usd": 7,
"rate_eur": 8
}
}
}
}
},
"inner_hits": {
"name": "product_platforms",
"_source": true,
"size": 5,
"sort": {
"_script": {
"type": "number",
"script": {
"id": "product-platforms-converter",
"params": {
"rate_try": 1,
"rate_usd": 7,
"rate_eur": 8
}
},
"order": "desc"
}
}
}
}
},
"sort": [
{
"_score": {
"order": "desc"
}
}
]
}
I have data in the following form in Elasticsearch:
[
{
"id": 1,
"name": "abc",
"score": 10,
"values": [
{
"v1": 1,
"v2": 2
}
]
},
{
"id": 2,
"name": "def",
"score": 20,
"values": [
{
"v1": 3,
"v2": 4
}
]
}
]
Currently, I am calculating average on score using the following code:
s = Search(using=es, index=index).query(Q(query))
s.aggs.bucket('average_score', 'avg', field='score') # average_score = 15
Now I wish to compute average on v1 and v2 using elasticsearch_dsl but I don't know how to do nested aggregation. In this example, v1 = 2 and v2 = 3. Please help. Thanks in advance!
Edit: Unfortunately, here values is an object instead of nested.
No idea how to do it in your DSL but here are the raw steps.
Mapping:
PUT avgs/
{
"mappings": {
"properties": {
"values": {
"type": "nested",
"properties": {
"v1": {
"type": "integer"
},
"v2": {
"type": "integer"
}
}
}
}
}
}
Indexing:
POST avgs/_doc
{
"id": 1,
"name": "abc",
"score": 10,
"values": [
{
"v1": 1,
"v2": 2
}
]
}
POST avgs/_doc
{
"id": 2,
"name": "def",
"score": 20,
"values": [
{
"v1": 3,
"v2": 4
}
]
}
Querying:
GET avgs/_search
{
"size": 0,
"aggs": {
"avg_v1": {
"nested": {
"path": "values"
},
"aggs": {
"nested_v1": {
"avg": {
"field": "values.v1"
}
}
}
},
"avg_v2": {
"nested": {
"path": "values"
},
"aggs": {
"nested_v2": {
"avg": {
"field": "values.v2"
}
}
}
}
}
}
Yielding:
...
"aggregations" : {
"avg_v2" : {
"doc_count" : 2,
"nested_v2" : {
"value" : 3.0
}
},
"avg_v1" : {
"doc_count" : 2,
"nested_v1" : {
"value" : 2.0
}
}
}
...
For example I have the following records with the columns as:(Country,City,Date,Income)
USA SF 2015-08 50
USA SF 2015-05 30
USA SF 2015-01 20
USA NY 2015-05 70
USA NY 2015-02 10
U.K LD 2015-05 90
My sql as: select country,city,max(date) as maxDate,sum(income) as sumIncome from testTable group by country,city order by maxDate desc,sumIncome desc limit 3.
So the result should be:
USA SF 2015-08 100
U.K LD 2015-05 90
USA NY 2015-05 80
I wrote the ES aggregates as following, but it's wrong:
"aggs":{"sub1": {"terms":{"field":"country"},
"aggs":{"sub2":{"terms":{"field":"city",
"order":[{"submax":"DESC"},{"subsum":"DESC"}]},
"aggs":{"submax":{"max":{"field":"date"}},"subsum":{"sum":{"field":"income"}}}}}}}
By my above script, it got the wrong result as following:
USA SF 2015-08 100
USA NY 2015-05 80
U.K LD 2015-05 90
You, actually, have two options, now that I understood the requirement.
Option 1
Use a script to "concatenate" country field and city field. Using the regular aggregations per field to do what you want is not possible in Elasticsearch.
Instead you need to do something like this:
GET /test/test/_search?search_type=count
{
"aggs": {
"sub1": {
"terms": {
"script": "doc['country'].value + ' ' + doc['city'].value",
"size": 3,
"order": [
{
"submax": "DESC"
},
{
"subsum": "DESC"
}
]
},
"aggs": {
"submax": {
"max": {
"field": "date"
}
},
"subsum": {
"sum": {
"field": "income"
}
}
}
}
}
}
With curl:
curl -XPOST "http://localhost:9200/livebox/type1/_search?search_type=count" -d'
{
"aggs": {
"sub1": {
"terms": {
"script": "doc[\"boxname\"].value + \" \" + doc[\"app\"].value",
"size": 3,
"order": [
{
"submax": "DESC"
},
{
"subsum": "DESC"
}
]
},
"aggs": {
"submax": {
"max": {
"field": "date"
}
},
"subsum": {
"sum": {
"field": "count"
}
}
}
}
}
}'
And the result of the aggregation will generate terms of the following form: country + ' ' + city.
"buckets": [
{
"key": "usa sf",
"doc_count": 3,
"subsum": {
"value": 100
},
"submax": {
"value": 1438387200000,
"value_as_string": "2015-08"
}
},
{
"key": "uk ld",
"doc_count": 1,
"subsum": {
"value": 90
},
"submax": {
"value": 1430438400000,
"value_as_string": "2015-05"
}
},
{
"key": "usa ny",
"doc_count": 2,
"subsum": {
"value": 80
},
"submax": {
"value": 1430438400000,
"value_as_string": "2015-05"
}
}
]
Option 2
Use _source transformation that will build a new field at indexing time, which will "move" the performance impact of running a script at aggregation time.
The mapping of the index, as it needs some changes, whatever you have now:
PUT /test
{
"mappings": {
"test": {
"transform": {
"script": "ctx._source['country_and_city'] = ctx._source['country'] + ' ' + ctx._source['city']"
},
"properties": {
"country": {
"type": "string"
},
"city": {
"type": "string"
},
"income": {
"type": "integer"
},
"date": {
"type": "date",
"format": "yyyy-MM"
},
"country_and_city": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
The query:
GET /test/test/_search?search_type=count
{
"aggs": {
"sub1": {
"terms": {
"field": "country_and_city",
"order": [
{
"submax": "DESC"
},
{
"subsum": "DESC"
}
]
},
"aggs": {
"submax": {
"max": {
"field": "date"
}
},
"subsum": {
"sum": {
"field": "income"
}
}
}
}
}
}
And the result:
"buckets": [
{
"key": "usa sf",
"doc_count": 3,
"subsum": {
"value": 100
},
"submax": {
"value": 1438387200000,
"value_as_string": "2015-08"
}
},
{
"key": "uk ld",
"doc_count": 1,
"subsum": {
"value": 90
},
"submax": {
"value": 1430438400000,
"value_as_string": "2015-05"
}
},
{
"key": "usa ny",
"doc_count": 2,
"subsum": {
"value": 80
},
"submax": {
"value": 1430438400000,
"value_as_string": "2015-05"
}
}
]
I would like to find the minimum value of a field in a nested array object after aggregation.
Data example:
[
{
"id": "i1",
"version": 1,
"entries": [
{
"name": "n1",
"position": 1
}, {
"name": "n2",
"position": 2
}
]
}, {
"id": "i1",
"version": 2,
"entries": [
{
"name": "n2",
"position": 3
}, {
"name": "n3",
"position": 4
}
]
},
{
"id": "i2",
"version": 1,
"entries": [
{
"name": "n1",
"position": 8
}, {
"name": "n2",
"position": 7
}
]
}, {
"id": "i2",
"version": 2,
"entries": [
{
"name": "n2",
"position": 6
}, {
"name": "n3",
"position": 5
}
]
}
]
Pseudo Query:
SELECT min(entries["n2"].position) WHERE entries.name="n2" GROUP BY id;
Expected Result:
[
{
"id": "i1",
"min(position)": 2
}, {
"id": "i2",
"min(position)": 6
}
]
I can do this in code, but it's not performant, as I need to return the document sources which can be quite large.
I am thinking of denormalizing the data, but would like to first know if this request is not possible at all.
You can do it by nesting several aggregations like this:
terms agg -> nested agg -> filter agg -> min agg
To test it I set up an index:
PUT /test_index
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"doc": {
"properties": {
"entries": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"position": {
"type": "long"
}
}
},
"id": {
"type": "string"
},
"version": {
"type": "long"
}
}
}
}
}
And indexed your docs:
PUT /test_index/doc/_bulk
{"index":{"_id":1}}
{"id":"i1","version":1,"entries":[{"name":"n1","position":1},{"name":"n2","position":2}]}
{"index":{"_id":2}}
{"id":"i1","version":2,"entries":[{"name":"n2","position":3},{"name":"n3","position":4}]}
{"index":{"_id":3}}
{"id":"i2","version":1,"entries":[{"name":"n1","position":8},{"name":"n2","position":7}]}
{"index":{"_id":4}}
{"id":"i2","version":2,"entries":[{"name":"n2","position":6},{"name":"n3","position":5}]}
Here is the query:
POST /test_index/_search?search_type=count
{
"aggs": {
"id_terms": {
"terms": {
"field": "id"
},
"aggs": {
"nested_entries": {
"nested": {
"path": "entries"
},
"aggs": {
"filter_name": {
"filter": {
"term": {
"entries.name": "n2"
}
},
"aggs": {
"min_position": {
"min": {
"field": "entries.position"
}
}
}
}
}
}
}
}
}
}
and the result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"id_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "i1",
"doc_count": 2,
"nested_entries": {
"doc_count": 4,
"filter_name": {
"doc_count": 2,
"min_position": {
"value": 2,
"value_as_string": "2.0"
}
}
}
},
{
"key": "i2",
"doc_count": 2,
"nested_entries": {
"doc_count": 4,
"filter_name": {
"doc_count": 2,
"min_position": {
"value": 6,
"value_as_string": "6.0"
}
}
}
}
]
}
}
}
Here is the code I used all together:
http://sense.qbox.io/gist/34a013099ef07fb527d9d7cf8490ad1bbafa718b