I'm very new to Elasticsearch.
I want to get the sum of the salaries of employees who joined between two dates, 27/08/2020 and 31/08/2020, but I'm not able to achieve it.
I'm posting here the simple query and the outcome of the query.
I read about the date histogram aggregation but could not find a specific answer. If the desired outcome is this simple, why does a date histogram require an interval, and why does it return the data in buckets?
The documents in the employees index are:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "employees",
"_type": "_doc",
"_id": "wO8AMXQBDHla7ClA8iDV",
"_score": 1.0,
"_source": {
"FirstName": "JOYE",
"LastName": "WIATR",
"Designation": "CEO",
"Salary": 144000,
"DateOfJoining": "25/05/2009",
"Address": "9068 SW. Grove St. Waynesboro, PA 17268",
"Gender": "Female",
"Age": 58,
"MaritalStatus": "Unmarried",
"Interests": "Renting movies,Scuba Diving,Snowboarding,Butterfly Watching,Dumpster Diving,Badminton,Church/church activities"
}
},
{
"_index": "employees",
"_type": "_doc",
"_id": "wu8CMXQBDHla7ClAwCDT",
"_score": 1.0,
"_source": {
"FirstName": "Ajay",
"LastName": "Jaiswal",
"Designation": "CEO",
"Salary": 44000,
"DateOfJoining": "28/08/2020",
"Address": "Hyderabad",
"Gender": "Male",
"Age": 29,
"MaritalStatus": "Unmarried",
"Interests": "Watching movies , learing from scratch"
}
},
{
"_index": "employees",
"_type": "_doc",
"_id": "w-8KMXQBDHla7ClAICC9",
"_score": 1.0,
"_source": {
"FirstName": "MR.X",
"LastName": "Jaiswal",
"Designation": "CEO",
"Salary": 56000,
"DateOfJoining": "30/08/2020",
"Address": "Hyderabad",
"Gender": "Male",
"Age": 39,
"MaritalStatus": "Married",
"Interests": "Watching movies,Watching war movies"
}
}
]
}
}
Here is my query
{
"query": {
"bool": {
"must": [
{
"range": {
"DateOfJoining": {
"gte": "27/08/2020",
"lte": "31/08/2020"
}
}
}
]
}
}
}
and here is the outcome of the query
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "employees",
"_type": "_doc",
"_id": "wu8CMXQBDHla7ClAwCDT",
"_score": 1.0,
"_source": {
"FirstName": "Ajay",
"LastName": "Jaiswal",
"Designation": "CEO",
"Salary": 44000,
"DateOfJoining": "28/08/2020",
"Address": "Hyderabad",
"Gender": "Male",
"Age": 29,
"MaritalStatus": "Unmarried",
"Interests": "Watching movies , learing from scratch"
}
},
{
"_index": "employees",
"_type": "_doc",
"_id": "w-8KMXQBDHla7ClAICC9",
"_score": 1.0,
"_source": {
"FirstName": "MR.X",
"LastName": "Jaiswal",
"Designation": "CEO",
"Salary": 56000,
"DateOfJoining": "30/08/2020",
"Address": "Hyderabad",
"Gender": "Male",
"Age": 39,
"MaritalStatus": "Married",
"Interests": "Watching movies,Watching war movies"
}
}
]
}
}
I'm really not able to achieve this.
I want to get the sum of the salaries of employees who joined between two dates, 27/08/2020 and 31/08/2020, but I'm not able to achieve it.
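Try the query below. It filters the documents by the DateOfJoining range and then applies a sum aggregation to the Salary field ("size": 0 suppresses the individual hits so that only the aggregation result is returned):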
{
"size": 0,
"query": {
"range": {
"DateOfJoining": {
"gte": "27/08/2020",
"lte": "31/08/2020"
}
}
},
"aggs": {
"sum_of_salary": {
"sum": {
"field": "Salary"
}
}
}
}
I read about the date histogram aggregation but could not find a specific answer.
If the desired outcome is this simple, why does a date histogram require
an interval, and why does it return the data in buckets?
In a date histogram, the data is grouped into buckets based on a time interval.
For example, if you want to get the sum of salaries paid to employees month over month, then a date histogram will help there.
Related
I am trying to write an Elasticsearch query to get unique locality towns. My locality_town_keyword field is of keyword type. When I search on locality_town_keyword, I get search hits but nothing in the "buckets" of "aggregations".
This is what my schema looks like:
"locality_town": {
"type": "text"
},
"locality_town_keyword": {
"type": "keyword"
},
My search query looks like the following:
{
"query":
{
"prefix" : { "locality_town" : "m" }
},
"size": "1",
"_source": {
"includes": [
"locality_town*"
]
},
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword",
"size": 5,
"order": {
"_count": "desc"
}
}
}
}
}
Here is the output it gives
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 799,
"max_score": 1.0,
"hits": [
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_score": 1.0,
"_source": {
"locality_town": "Manchester",
"locality_town_keyword": "Manchester"
}
}
]
},
"aggregations": {
"loc": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
This is what one document looks like:
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_version": 1,
"_seq_no": 39,
"_primary_term": 1,
"found": true,
"_source": {
"title": "Legal Services",
"buyers": "CENTRAL MANCHESTER UNIVERSITY HOSPITALS NHS FOUNDATION TRUST",
"postal_code": "M13 0JR",
"publish_date": "2015-03-03T15:48:45Z",
"status": "cancelled",
"start_date": "2017-03-03T00:00:00Z",
"endt_date": "2020-03-03T00:00:00Z",
"url": "https://www.temp.com",
"country": "England",
"description": "desc......",
"language": "en-GB",
"service": "OPEN_CONTRACTING",
"value": "0",
"value_currency": "GBP",
"winner": "",
"create_time": "2019-05-11T21:39:42Z",
"deadline_date": "1970-01-01T00:00:00Z",
"address": "Central Manchester University Hospitals NHS Foundation Trust Wilmslow Park",
"locality_town": "Manchester",
"locality_town_keyword": "Manchester",
"region": "North West",
"tender_type": "planning",
"cpv": "Health services ",
"strpublish_date": "2015-03-03T15:48:45Z",
"strstart_date": "2017-03-03T00:00:00Z",
"strend_date": "2020-03-03T00:00:00Z",
"strdeadline_date": "",
"winner_email": "",
"winner_address": "",
"winner_town": "",
"winner_postalcode": "",
"winner_phone": "",
"cpvs": "[\"Health services (85100000-0)\"]"
}
}
Looks like you have a typo in your aggregation query:
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword", <== here
"size": 5,
Try with locality_town_keyword instead!
Hope this helps!
To simplify:
PUT /test/vendors/1
{
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
PUT /test/vendors/2
{
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
PUT /test/vendors/3
{
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
Then search:
GET /test/_search
{
"query": {
"multi_match" : {
"query": "doctor in Boston",
"fields": [ "type", "place" ]
}
}
}
I understand why I get Jack, who works in San Fran -- it's because he's a doctor too. However, I can't figure out why the match score is the SAME for him. The other two were matched on the place too, weren't they? Why aren't Ron and Tom scored higher?
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.9245277,
"hits": [
{
"_index": "test",
"_type": "vendors",
"_id": "2",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "1",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "3",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
}
]
}
}
Is there a way to force it to score lower when fewer search keywords are found? Also, if I'm going the wrong way about this kind of search and there's a better pattern/way to do it, I'd appreciate being pointed in the right direction.
Your search structure is incorrect. The search query above is ignoring the place property, and that's why you get the same score for all documents (only the type property is taken into account). The reason is that works_at is a nested mapping, which should be treated differently when searching.
First, you should define works_at as a nested mapping (read more here). Then you'll have to adjust your query to work with that nested mapping; see an example here.
GET /test/_search
{
"query": {
"multi_match" : {
"query": "doctor in Boston",
"fields": [ "type", "place" ],
"type": "most_fields" . <---- I WAS MISSING THIS
}
}
}
Once that was added, the query gave the correct results, where the "San Fran" guy is scored lower.
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.2122098,
"hits": [
{
"_index": "test",
"_type": "vendors",
"_id": "2",
"_score": 1.2122098,
"_source": {
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "1",
"_score": 1.2122098,
"_source": {
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "3",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
}
]
}
}
I have bulk documents in elasticsearch and as an example I have taken the elasticsearch documentation example as banks
{
"_index": "bank",
"_type": "account",
"_id": "25",
"_score": 1,
"_source": {
"account_number": 25,
"balance": 40540,
"firstname": "Virginia",
"lastname": "Ayala",
"age": 39,
"gender": "F",
"address": "171 Putnam Avenue",
"employer": "Filodyne",
"email": "virginiaayala#filodyne.com",
"city": "Nicholson",
"state": "PA"
}
}
,
{
"_index": "bank",
"_type": "account",
"_id": "44",
"_score": 1,
"_source": {
"account_number": 44,
"balance": 34487,
"firstname": "Aurelia",
"lastname": "Harding",
"age": 37,
"gender": "M",
"address": "502 Baycliff Terrace",
"employer": "Orbalix",
"email": "aureliaharding#orbalix.com",
"city": "Yardville",
"state": "DE"
}
}
,
{
"_index": "bank",
"_type": "account",
"_id": "99",
"_score": 1,
"_source": {
"account_number": 99,
"balance": 47159,
"firstname": "Ratliff",
"lastname": "Heath",
"age": 39,
"gender": "F",
"address": "806 Rockwell Place",
"employer": "Zappix",
"email": "ratliffheath#zappix.com",
"city": "Shaft",
"state": "ND"
}
}
,
{
"_index": "bank",
"_type": "account",
"_id": "119",
"_score": 1,
"_source": {
"account_number": 119,
"balance": 49222,
"firstname": "Laverne",
"lastname": "Johnson",
"age": 28,
"gender": "F",
"address": "302 Howard Place",
"employer": "Senmei",
"email": "lavernejohnson#senmei.com",
"city": "Herlong",
"state": "DC"
}
}
,
{
"_index": "bank",
"_type": "account",
"_id": "126",
"_score": 1,
"_source": {
"account_number": 126,
"balance": 3607,
"firstname": "Effie",
"lastname": "Gates",
"age": 39,
"gender": "F",
"address": "620 National Drive",
"employer": "Digitalus",
"email": "effiegates#digitalus.com",
"city": "Blodgett",
"state": "MD"
}
}
Now, there are fields called state and price in each document. How can I write a query that returns only results with distinct state values, sorted by balance in ascending order?
I tried a terms aggregation, but to no avail.
UPDATE
POST _search
{
"size": 0,
"aggs": {
"states": {
"terms": {
"field": "state"
},
"aggs": {
"balances": {
"top_hits": {
"from" : 0,
"size": 1,
"sort": {"balance": "asc"}
}
}
}
}
}
}
With this query, I get the top hits sorted by price within each "state" key. But what I want is results sorted by balance overall, with unique state values.
For the above query, I am getting the following response:
"buckets": [
{
"key": "tx",
"doc_count": 30,
"balances": {
"hits": {
"total": 30,
"max_score": null,
"hits": [
{
"_index": "bank",
"_type": "account",
"_id": "161",
"_score": null,
"_source": {
"account_number": 161,
"balance": 4659,
"firstname": "Doreen",
"lastname": "Randall",
"age": 37,
"gender": "F",
"address": "178 Court Street",
"employer": "Calcula",
"email": "doreenrandall#calcula.com",
"city": "Belmont",
"state": "TX"
},
"sort": [
4659
]
}
]
}
}
},
{
"key": "md",
"doc_count": 28,
"balances": {
"hits": {
"total": 28,
"max_score": null,
"hits": [
{
"_index": "bank",
"_type": "account",
"_id": "527",
"_score": null,
"_source": {
"account_number": 527,
"balance": 2028,
"firstname": "Carver",
"lastname": "Peters",
"age": 35,
"gender": "M",
"address": "816 Victor Road",
"employer": "Housedown",
"email": "carverpeters#housedown.com",
"city": "Nadine",
"state": "MD"
},
"sort": [
2028
]
}
]
}
}
},
{
"key": "id",
"doc_count": 27,
"balances": {
"hits": {
"total": 27,
"max_score": null,
"hits": [
{
"_index": "bank",
"_type": "account",
"_id": "402",
"_score": null,
"_source": {
"account_number": 402,
"balance": 1282,
"firstname": "Pacheco",
"lastname": "Rosales",
"age": 32,
"gender": "M",
"address": "538 Pershing Loop",
"employer": "Circum",
"email": "pachecorosales#circum.com",
"city": "Elbert",
"state": "ID"
},
"sort": [
1282
]
}
]
}
}
},
which is not sorted by price.
Try like this:
POST bank/_search
{
"size": 0,
"aggs": {
"states": {
"terms": {
"field": "state",
"order": {
"balances": "asc"
}
},
"aggs": {
"balances": {
"sum": {
"field": "balance"
}
}
}
}
}
}
Note: I don't see a price field, but a balance one, maybe that's the one you meant.
If you're interested in getting all documents by state sorted by price, then you can try this, too:
POST bank/_search
{
"size": 0,
"aggs": {
"states": {
"terms": {
"field": "state"
},
"aggs": {
"balances": {
"top_hits": {
"size": 5,
"sort": {"balance": "asc"}
}
}
}
}
}
}
I'm trying a simple terms aggregation but the result is not creating buckets. Here is a sample document:
"hits": {
"total": 27330,
"max_score": 0.8293952,
"hits": [
{
"_index": "policy",
"_type": "policy",
"_id": "W0051311PNWO",
"_score": 0.8293952,
"_source": {
"productname": "UK CARGO",
"alternateproductname": "ABC39393939",
"brokername": "Name***",
"agentname": "Name***",
"policyref": "ABC33333",
"client": "International Cargo Limited",
"addressline1": "",
"post/zipcode": "",
"telephone": null,
"bapolicyendorseid": 123334,
"prevcertnum": "",
"policystatus": "Endorsed",
"#version": "1",
"#timestamp": "2015-10-09T11:11:02.018Z"
}
},
Here is the aggregate search (in sense):
get policy/policy/_search
{
"aggs": {
"statuses": {
"terms": {
"field": "policystatus"
}
}
}
}
I'm trying to get the equivalent of:
select policystatus, count(*) from policy group by policystatus
The result is not showing buckets. It is showing regular document results:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 227398,
"max_score": 1,
"hits": [
{
"_index": "policy",
"_type": "policy",
"_id": "04/QQQ/04UKI0018",
"_score": 1,
"_source": {
"productname": "2 RES 01/09/04",
"alternateproductname": "2 RES 01/09/04",
"brokername": "Blah LTD",
"agentname": "Insurance",
"policyref": "blah",
"client": "blah",
"addressline1": "blah",
"post/zipcode": "blah",
"telephone": null,
"bapolicyendorseid": 21427,
"prevcertnum": "04UKI0018",
"policystatus": "Pending",
"#version": "1",
"#timestamp": "2015-10-09T11:10:10.146Z"
}
},
Try this:
GET /policy/policy/_search?search_type=count
{
"aggs": {
"statuses": {
"terms": {
"field": "policystatus"
}
}
}
}
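That is, use GET in capital letters and add search_type=count to get only the buckets, without the hits. (Note: search_type=count was removed in later Elasticsearch versions; there, put "size": 0 in the request body instead.)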
After much reading, I still cannot tell whether this kind of query is possible with Elasticsearch. I found the "Getting Started" guide really excellent, but the rest of the guide lacks examples (from my point of view).
See my structure below. I need to retrieve all ids that are not in my blacklist. My blacklist is a list of reference ids. In this example, I am id 1 with the firstname "me". In the structure we can see that I blacklisted "bob", so bob's id (2) is in my blacklist array because I don't want to find bob in my search results. :)
Is it possible to retrieve (dynamically, of course) all ids that are not in my blacklist in a single query?
If you come from SQL, the equivalent logic could be:
SELECT id FROM index WHERE id NOT IN (SELECT * FROM blacklist WHERE id = 1)
I would like to avoid a two-step query. If my schema is bad and should be reconsidered, I'm totally open to advice or suggestions.
Here is the structure:
{
"id: 1,
"balance": 16623,
"firstname": "me",
"blacklist" : [2,1982,939,1982,98716,7611,983838, and thousands others ....],
}
{
"id: 2,
"balance": 16623,
"firstname": "bob,
"blacklist" : [18,1982,939,1982,98716,7611,983838, and thousands others ....],
}
{
"id: 3,
"balance": 16623,
"firstname": "jhon",
"blacklist" : [18,1982,939,1982,98716,7611,983838, and thousands others ....],
}
You can use a terms filter lookup together with a not filter, as follows.
I set up the index with the three docs you have listed:
DELETE /test_index
PUT /test_index
PUT /test_index/doc/1
{
"id": 1,
"balance": 16623,
"firstname": "me",
"blacklist" : [2,1982,939,1982,98716,7611,983838]
}
PUT /test_index/doc/2
{
"id": 2,
"balance": 16623,
"firstname": "bob",
"blacklist" : [18,1982,939,1982,98716,7611,983838]
}
PUT /test_index/doc/3
{
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist" : [18,1982,939,1982,98716,7611,983838]
}
Then set up a query that filters out docs that are in the blacklist for "me":
POST /test_index/doc/_search
{
"filter": {
"not": {
"filter": {
"terms": {
"id": {
"index": "test_index",
"type": "doc",
"id": "1",
"path": "blacklist"
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"id": 1,
"balance": 16623,
"firstname": "me",
"blacklist": [2,1982,939,1982,98716,7611,983838]
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist": [18,1982,939,1982,98716,7611,983838]
}
}
]
}
}
If you also want to filter out the user whose blacklist is being used, you can set up a slightly more complex filter using or:
POST /test_index/doc/_search
{
"filter": {
"not": {
"filter": {
"or": {
"filters": [
{
"terms": {
"id": {
"index": "test_index",
"type": "doc",
"id": "1",
"path": "blacklist"
}
}
},
{
"term": {
"id": "1"
}
}
]
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist": [18,1982,939,1982,98716,7611,983838]
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/0b6808414f9447d4f7d23eb4c0d3e937ec2ea4e7