Aggregation in elasticsearch with specific parameter - elasticsearch

I have bulk documents in Elasticsearch; as an example, I have taken the bank accounts dataset from the Elasticsearch documentation:
{
"_index": "bank",
"_type": "account",
"_id": "25",
"_score": 1,
"_source": {
"account_number": 25,
"balance": 40540,
"firstname": "Virginia",
"lastname": "Ayala",
"age": 39,
"gender": "F",
"address": "171 Putnam Avenue",
"employer": "Filodyne",
"email": "virginiaayala@filodyne.com",
"city": "Nicholson",
"state": "PA"
}
}
,
{
"_index": "bank",
"_type": "account",
"_id": "44",
"_score": 1,
"_source": {
"account_number": 44,
"balance": 34487,
"firstname": "Aurelia",
"lastname": "Harding",
"age": 37,
"gender": "M",
"address": "502 Baycliff Terrace",
"employer": "Orbalix",
"email": "aureliaharding@orbalix.com",
"city": "Yardville",
"state": "DE"
}
}
,
{
"_index": "bank",
"_type": "account",
"_id": "99",
"_score": 1,
"_source": {
"account_number": 99,
"balance": 47159,
"firstname": "Ratliff",
"lastname": "Heath",
"age": 39,
"gender": "F",
"address": "806 Rockwell Place",
"employer": "Zappix",
"email": "ratliffheath@zappix.com",
"city": "Shaft",
"state": "ND"
}
}
,
{
"_index": "bank",
"_type": "account",
"_id": "119",
"_score": 1,
"_source": {
"account_number": 119,
"balance": 49222,
"firstname": "Laverne",
"lastname": "Johnson",
"age": 28,
"gender": "F",
"address": "302 Howard Place",
"employer": "Senmei",
"email": "lavernejohnson@senmei.com",
"city": "Herlong",
"state": "DC"
}
}
,
{
"_index": "bank",
"_type": "account",
"_id": "126",
"_score": 1,
"_source": {
"account_number": 126,
"balance": 3607,
"firstname": "Effie",
"lastname": "Gates",
"age": 39,
"gender": "F",
"address": "620 National Drive",
"employer": "Digitalus",
"email": "effiegates@digitalus.com",
"city": "Blodgett",
"state": "MD"
}
}
Each document has a state field and a price field. How can I write a query that returns only one result per distinct state, sorted by balance in ascending order?
I tried a terms aggregation, but it did not help.
UPDATE
POST _search
{
"size": 0,
"aggs": {
"states": {
"terms": {
"field": "state"
},
"aggs": {
"balances": {
"top_hits": {
"from" : 0,
"size": 1,
"sort": {"balance": "asc"}
}
}
}
}
}
}
For this query I get the top hit for each "state" key, sorted by price within that bucket. But what I want is results sorted by balance across all buckets, with unique state values.
For the above query, I am getting the following response:
"buckets": [
{
"key": "tx",
"doc_count": 30,
"balances": {
"hits": {
"total": 30,
"max_score": null,
"hits": [
{
"_index": "bank",
"_type": "account",
"_id": "161",
"_score": null,
"_source": {
"account_number": 161,
"balance": 4659,
"firstname": "Doreen",
"lastname": "Randall",
"age": 37,
"gender": "F",
"address": "178 Court Street",
"employer": "Calcula",
"email": "doreenrandall@calcula.com",
"city": "Belmont",
"state": "TX"
},
"sort": [
4659
]
}
]
}
}
},
{
"key": "md",
"doc_count": 28,
"balances": {
"hits": {
"total": 28,
"max_score": null,
"hits": [
{
"_index": "bank",
"_type": "account",
"_id": "527",
"_score": null,
"_source": {
"account_number": 527,
"balance": 2028,
"firstname": "Carver",
"lastname": "Peters",
"age": 35,
"gender": "M",
"address": "816 Victor Road",
"employer": "Housedown",
"email": "carverpeters@housedown.com",
"city": "Nadine",
"state": "MD"
},
"sort": [
2028
]
}
]
}
}
},
{
"key": "id",
"doc_count": 27,
"balances": {
"hits": {
"total": 27,
"max_score": null,
"hits": [
{
"_index": "bank",
"_type": "account",
"_id": "402",
"_score": null,
"_source": {
"account_number": 402,
"balance": 1282,
"firstname": "Pacheco",
"lastname": "Rosales",
"age": 32,
"gender": "M",
"address": "538 Pershing Loop",
"employer": "Circum",
"email": "pachecorosales@circum.com",
"city": "Elbert",
"state": "ID"
},
"sort": [
1282
]
}
]
}
}
},
which is not sorted by price.

Try like this:
POST bank/_search
{
"size": 0,
"aggs": {
"states": {
"terms": {
"field": "state",
"order": {
"balances": "asc"
}
},
"aggs": {
"balances": {
"sum": {
"field": "balance"
}
}
}
}
}
}
Note: I don't see a price field, but a balance one, maybe that's the one you meant.
If you're interested in getting all documents by state sorted by price, then you can try this, too:
POST bank/_search
{
"size": 0,
"aggs": {
"states": {
"terms": {
"field": "state"
},
"aggs": {
"balances": {
"top_hits": {
"size": 5,
"sort": {"balance": "asc"}
}
}
}
}
}
}

Related

Specify Elasticsearch aggregation fields when finding duplicates

I am using the following ES query when looking for duplicates:
"aggs": {
"duplicates": {
"terms": {
"field": "phone",
"min_doc_count": 2,
"size": 99999,
"order": {
"_term": "asc"
}
},
"aggs": {
"_docs": {
"top_hits": {
"size": 99999
}
}
}
}
}
It works well: it returns the key, which in this case is the phone, and inside it returns all the matches. The main problem is exactly that: the _source brings back everything, which is a lot of fields in my case, and I want to specify only the ones I need. Example of what it returns:
"duplicates": {
"1": {
"key": "1",
"doc_count": 2,
"_docs": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "local:company_id:1:sync",
"_type": "leads",
"_id": "23",
"_score": 1,
"_source": {
"id": 23,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": "", // .... and so on
I want to specify the fields that will be returned on the _source, is that possible?
Another problem I'm having is that I want to order the aggregation results by a specific field (by id), but if I put any field name in place of _term it gives me an error.
Thank you!
In the example below, the documents with id 29 and 23 have the same phone, hence they are duplicates. The search query returns only two fields, i.e. id and phone (you can change these fields according to your needs), and sorts the top hits on the basis of id.
Adding a working example with index data, search query, and search result
Index Data:
{
"id": 29,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": ""
}
{
"id": 23,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": ""
}
{
"id": 30,
"phone": 1235,
"areacode_id": 92,
"areacode_state_id": 10,
"firstName": "Mark",
"lastName": "Smith",
"state": ""
}
Search Query:
{
"size": 0,
"aggs": {
"duplicates": {
"terms": {
"field": "phone",
"min_doc_count": 2,
"size": 99999
},
"aggs": {
"_docs": {
"top_hits": {
"_source": {
"includes": [
"phone",
"id"
]
},
"sort": [
{
"id": {
"order": "asc"
}
}
]
}
}
}
}
}
}
Search Result:
"aggregations": {
"duplicates": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123456,
"doc_count": 2,
"_docs": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "66896259",
"_type": "_doc",
"_id": "1",
"_score": null,
"_source": {
"phone": 123456,
"id": 23
},
"sort": [
23 // note this
]
},
{
"_index": "66896259",
"_type": "_doc",
"_id": "2",
"_score": null,
"_source": {
"phone": 123456,
"id": 29
},
"sort": [
29 // note this
]
}
]
}
}
}
]
}
}

Is it possible to get whole nested object from a document in Elasticsearch?

Imagine I have a document like this:
{
"_index": "bank-accounts",
"_type": "_doc",
"_id": "1",
"_version": 4,
"_seq_no": 3,
"_primary_term": 1,
"found": true,
"_source": {
"id": 1,
"balance": 140,
"transactions": [
{
"id": "42f52474-a49b-4707-86e4-e983efb4ab31",
"type": "Deposit",
"amount": 100
},
{
"id": "3f8396a3-d747-4a4c-8926-cdcedea6b5c3",
"type": "Deposit",
"amount": 50
},
{
"id": "5693585d-6356-4d1a-8d7b-cac5d0dab39f",
"type": "Withdraw",
"amount": 10
}
],
"accountCreatedAt": 1614029062764
}
}
I want to return only the transactions array from a query.
How would I do this within Elasticsearch? Is this even possible? I've achieved a result using fields[ "transactions.*" ], but it returns each of the fields in separate arrays:
{
...
"hits": [
{
"_index": "bank-accounts",
"_type": "_doc",
"_id": "1",
"_score": 1,
"fields": {
"transactions.id": [
"42f52474-a49b-4707-86e4-e983efb4ab31",
"3f8396a3-d747-4a4c-8926-cdcedea6b5c3",
"5693585d-6356-4d1a-8d7b-cac5d0dab39f"
],
"transactions.amount": [
100,
50,
10
],
"transactions.type": [
"Deposit",
"Deposit",
"Withdraw"
],
...
}
}
]
}
}
I mean, I could very well be using this, but I want something more simple to handle. I expect to get something like this:
*I have to use the document id in my search
{
...
"hits": [
{
"_index": "bank-accounts",
"_type": "_doc",
"_id": "1",
"_score": 3,
"transactions": [
{
"id": "42f52474-a49b-4707-86e4-e983efb4ab31",
"type": "Deposit",
"amount": 100
},
{
"id": "3f8396a3-d747-4a4c-8926-cdcedea6b5c3",
"type": "Deposit",
"amount": 50
},
{
"id": "5693585d-6356-4d1a-8d7b-cac5d0dab39f",
"type": "Withdraw",
"amount": 10
},
....
]
}
]
}
}
Is this possible to achieve?
If you only want to return the transactions array (as you have not mentioned any query condition, on which you need to search), you can achieve that using source filtering.
Adding a working example
Index Mapping:
{
"mappings": {
"properties": {
"transactions": {
"type": "nested"
}
}
}
}
Index Data:
{
"id": 1,
"balance": 140,
"transactions": [
{
"id": "42f52474-a49b-4707-86e4-e983efb4ab31",
"type": "Deposit",
"amount": 100
},
{
"id": "3f8396a3-d747-4a4c-8926-cdcedea6b5c3",
"type": "Deposit",
"amount": 50
},
{
"id": "5693585d-6356-4d1a-8d7b-cac5d0dab39f",
"type": "Withdraw",
"amount": 10
}
],
"accountCreatedAt": 1614029062764
}
Search Query:
{
"_source": [
"transactions.*"
]
}
Search Result:
"hits": [
{
"_index": "66324257",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"transactions": [
{
"amount": 100,
"id": "42f52474-a49b-4707-86e4-e983efb4ab31",
"type": "Deposit"
},
{
"amount": 50,
"id": "3f8396a3-d747-4a4c-8926-cdcedea6b5c3",
"type": "Deposit"
},
{
"amount": 10,
"id": "5693585d-6356-4d1a-8d7b-cac5d0dab39f",
"type": "Withdraw"
}
]
}
}
]

Sum of fields between two dates in ElasticSearch

I'm very new to Elasticsearch.
I want the sum of salaries between two dates, 27/08/2020 and 31/08/2020, which I'm not able to achieve.
I'm posting here the simple query and the outcome of the query.
I read about date-histogram but could not find a specific answer. If the desired outcome is this simple, why does date-histogram require an interval, and why does it return the data in buckets?
Documents in employees index are
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "employees",
"_type": "_doc",
"_id": "wO8AMXQBDHla7ClA8iDV",
"_score": 1.0,
"_source": {
"FirstName": "JOYE",
"LastName": "WIATR",
"Designation": "CEO",
"Salary": 144000,
"DateOfJoining": "25/05/2009",
"Address": "9068 SW. Grove St. Waynesboro, PA 17268",
"Gender": "Female",
"Age": 58,
"MaritalStatus": "Unmarried",
"Interests": "Renting movies,Scuba Diving,Snowboarding,Butterfly Watching,Dumpster Diving,Badminton,Church/church activities"
}
},
{
"_index": "employees",
"_type": "_doc",
"_id": "wu8CMXQBDHla7ClAwCDT",
"_score": 1.0,
"_source": {
"FirstName": "Ajay",
"LastName": "Jaiswal",
"Designation": "CEO",
"Salary": 44000,
"DateOfJoining": "28/08/2020",
"Address": "Hyderabad",
"Gender": "Male",
"Age": 29,
"MaritalStatus": "Unmarried",
"Interests": "Watching movies , learing from scratch"
}
},
{
"_index": "employees",
"_type": "_doc",
"_id": "w-8KMXQBDHla7ClAICC9",
"_score": 1.0,
"_source": {
"FirstName": "MR.X",
"LastName": "Jaiswal",
"Designation": "CEO",
"Salary": 56000,
"DateOfJoining": "30/08/2020",
"Address": "Hyderabad",
"Gender": "Male",
"Age": 39,
"MaritalStatus": "Married",
"Interests": "Watching movies,Watching war movies"
}
}
]
}
}
Here is my query
{
"query": {
"bool": {
"must": [
{
"range": {
"DateOfJoining": {
"gte": "27/08/2020",
"lte": "31/08/2020"
}
}
}
]
}
}
}
and here is the outcome of the query
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "employees",
"_type": "_doc",
"_id": "wu8CMXQBDHla7ClAwCDT",
"_score": 1.0,
"_source": {
"FirstName": "Ajay",
"LastName": "Jaiswal",
"Designation": "CEO",
"Salary": 44000,
"DateOfJoining": "28/08/2020",
"Address": "Hyderabad",
"Gender": "Male",
"Age": 29,
"MaritalStatus": "Unmarried",
"Interests": "Watching movies , learing from scratch"
}
},
{
"_index": "employees",
"_type": "_doc",
"_id": "w-8KMXQBDHla7ClAICC9",
"_score": 1.0,
"_source": {
"FirstName": "MR.X",
"LastName": "Jaiswal",
"Designation": "CEO",
"Salary": 56000,
"DateOfJoining": "30/08/2020",
"Address": "Hyderabad",
"Gender": "Male",
"Age": 39,
"MaritalStatus": "Married",
"Interests": "Watching movies,Watching war movies"
}
}
]
}
}
I'm really not able to achieve this.
I want the sum of salaries between two dates, 27/08/2020 and 31/08/2020, which I'm not able to achieve.
Try below query:
{
"size": 0,
"query": {
"range": {
"DateOfJoining": {
"gte": "27/08/2020",
"lte": "31/08/2020"
}
}
},
"aggs": {
"sum_of_salary": {
"sum": {
"field": "Salary"
}
}
}
}
I read about date-histogram but could not find a specific answer.
If the desired outcome is this simple, why does date-histogram require
an interval, and why does it return the data in buckets?
In a date histogram, data is bucketed by a time interval.
E.g., if you want the sum of salaries paid to employees month over month, a date histogram will help.

How to make aggregations work for text fields

I am trying to write an Elasticsearch query to get unique locality towns. My locality_town_keyword field is of keyword type. When I run the search with an aggregation on locality_town_keyword, I get search hits but nothing in the "buckets" array under "aggregations".
This is what my schema looks like:
"locality_town": {
"type": "text"
},
"locality_town_keyword": {
"type": "keyword"
},
My search query looks like the following:
{
"query":
{
"prefix" : { "locality_town" : "m" }
},
"size": "1",
"_source": {
"includes": [
"locality_town*"
]
},
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword",
"size": 5,
"order": {
"_count": "desc"
}
}
}
}
}
Here is the output it gives
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 799,
"max_score": 1.0,
"hits": [
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_score": 1.0,
"_source": {
"locality_town": "Manchester",
"locality_town_keyword": "Manchester"
}
}
]
},
"aggregations": {
"loc": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
This is what one document looks like:
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_version": 1,
"_seq_no": 39,
"_primary_term": 1,
"found": true,
"_source": {
"title": "Legal Services",
"buyers": "CENTRAL MANCHESTER UNIVERSITY HOSPITALS NHS FOUNDATION TRUST",
"postal_code": "M13 0JR",
"publish_date": "2015-03-03T15:48:45Z",
"status": "cancelled",
"start_date": "2017-03-03T00:00:00Z",
"endt_date": "2020-03-03T00:00:00Z",
"url": "https://www.temp.com",
"country": "England",
"description": "desc......",
"language": "en-GB",
"service": "OPEN_CONTRACTING",
"value": "0",
"value_currency": "GBP",
"winner": "",
"create_time": "2019-05-11T21:39:42Z",
"deadline_date": "1970-01-01T00:00:00Z",
"address": "Central Manchester University Hospitals NHS Foundation Trust Wilmslow Park",
"locality_town": "Manchester",
"locality_town_keyword": "Manchester",
"region": "North West",
"tender_type": "planning",
"cpv": "Health services ",
"strpublish_date": "2015-03-03T15:48:45Z",
"strstart_date": "2017-03-03T00:00:00Z",
"strend_date": "2020-03-03T00:00:00Z",
"strdeadline_date": "",
"winner_email": "",
"winner_address": "",
"winner_town": "",
"winner_postalcode": "",
"winner_phone": "",
"cpvs": "[\"Health services (85100000-0)\"]"
}
}
Looks like you have a typo in your aggregation query:
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword", <== here
"size": 5,
Try with locality_town_keyword instead!
Hope this helps!

Elasticsearch query to process log data

I have an event log of an e-commerce website in Elasticsearch.
Each event is a record in ES
{
"_index": "event_log",
"_type": "log_type",
"_id": "3ud-kmoBazYRVz7KCgIy",
"_score": 1,
"_source": {
"user_id": 123,
"event": "click",
"category": "abc",
"product_id": 1112
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "4Od-kmoBazYRVz7KCgLr",
"_score": 1,
"_source": {
"user_id": 123,
"event": "click",
"category": "abc",
"product_id": 1118
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "4ud-kmoBazYRVz7KkwL2",
"_score": 1,
"_source": {
"user_id": 123,
"event": "cart",
"category": "xyz",
"product_id": 1
}
},
{
"_index": "event_log",
"_type": "log_type",
"_id": "2ud-kmoBazYRVz7KCALB",
"_score": 1,
"_source": {
"user_id": 123,
"event": "cart",
"category": "xyz",
"product_id": 11
}
},
I want a list of all the product_ids, grouped by event, category, and user.
Expected output:
{"click": {
"abc": {
"123": {
"product_id": [1112, 1118]
}
}
},
"cart": {
"xyz": {
"123": {
"product_id": [1, 11]
}
}
}
}
I will have millions of records in the index. Querying all the records and processing them client-side is time-consuming. Is there a way to produce the output in a single query? I'm sure it is not possible to generate it exactly in the given format, but something close to it would be very useful.
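Hi, here is my suggestion (first try). Note that it groups by event and category only; to also group by user, nest an additional terms aggregation on user_id between category and product_id.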
GET event_log/_search
{
"size": 0,
"aggs": {
"event": {
"terms": {
"field": "event"
},
"aggs": {
"category": {
"terms": {
"field": "category"
},
"aggs": {
"product_id": {
"terms": {
"field": "product_id"
}
}
}
}
}
}
}
}

Resources