Incorrect output result of "aggs" query - elasticsearch

I have a query that counts the number of entries in a given datetime window (i.e. between 2017-02-17T15:00:00.000 and 2017-02-17T16:00:00.000). When I execute this query, I get an incorrect (or, better said, unexpected) result:
POST /myindex/_search
{
"size": 0,
"aggs": {
"range": {
"date_range": {
"field": "Datetime",
"ranges": [
{ "to": "2017-02-17T16:00:00||-1H/H" },
{ "from": "2017-02-17T16:00:00||/H" }
]
}
}
}
}
This is the output:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 11,
"max_score": 0,
"hits": []
},
"aggregations": {
"range": {
"buckets": [
{
"key": "*-2017-02-17T15:00:00.000Z",
"to": 1487343600000,
"to_as_string": "2017-02-17T15:00:00.000Z",
"doc_count": 0
},
{
"key": "2017-02-17T16:00:00.000Z-*",
"from": 1487347200000,
"from_as_string": "2017-02-17T16:00:00.000Z",
"doc_count": 0
}
]
}
}
}
In myindex I have two entries with the following values of Datetime:
2017-02-17T15:15:00.000Z
2017-02-17T15:02:00.000Z
So, the result should be equal to 2.
I don't understand how to interpret the current output. Which field defines the number of entries?
UPDATE:
data structure:
PUT /myindex
{
"mappings": {
"intensity": {
"_all": {
"enabled": false
},
"properties": {
"Country_Id": {
"type":"keyword"
},
"Datetime": {
"type":"date"
}
}
}
}
}
sample data:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 1,
"hits": [
{
"_index": "myindex",
"_type": "intensity",
"_id": "4",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-02-18T15:01:00.000Z"
}
},
{
"_index": "myindex",
"_type": "intensity",
"_id": "6",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-03-16T16:15:00.000Z"
}
},
{
"_index": "myindex",
"_type": "intensity",
"_id": "1",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-02-17T15:15:00.000Z"
}
},
{
"_index": "myindex",
"_type": "intensity",
"_id": "7",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-03-16T16:18:00.000Z"
}
},
{
"_index": "myindex",
"_type": "intensity",
"_id": "3",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-02-17T15:02:00.000Z"
}
}
]
}
}
The answer that I get:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 11,
"max_score": 0,
"hits": []
},
"aggregations": {
"range": {
"buckets": [
{
"key": "2017-02-17T15:00:00.000Z-2017-02-17T16:00:00.000Z",
"from": 1487343600000,
"from_as_string": "2017-02-17T15:00:00.000Z",
"to": 1487347200000,
"to_as_string": "2017-02-17T16:00:00.000Z",
"doc_count": 0
}
]
}
}
}

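Your ranges are wrong: the query defines two separate open-ended buckets (everything before 15:00 and everything from 16:00 onward) instead of a single bucket bounded on both sides. Put "from" and "to" in the same range object, like this instead: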
POST /myindex/_search
{
"size": 0,
"aggs": {
"range": {
"date_range": {
"field": "Datetime",
"ranges": [
{
"from": "2017-02-17T16:00:00Z||-1H/H",
"to": "2017-02-17T16:00:00Z||/H"
}
]
}
}
}
}

Related

Iterating over doc to return a particular key's value in an array based on a match

data
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1000,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "learn",
"_id": "OeCLr4QBPMAw7FiXknKz",
"_score": 1,
"_source": {
"user_rating_size": 80,
"ratingdescription": 80,
"rating": "PG-13",
"release_year": 2004,
"user_rating_score": 82,
"title": "White Chicks",
"ratinglevel": "crude and sexual humor, language and some drug content"
}
},
{
"_index": "learn",
"_id": "QuCLr4QBPMAw7FiXknKz",
"_score": 1,
"_source": {
"user_rating_size": 80,
"ratingdescription": 90,
"rating": "TV-14",
"release_year": 2016,
"user_rating_score": 96,
"title": "Pretty Little Liars",
"ratinglevel": "Parents strongly cautioned. May be unsuitable for children ages 14 and under."
}
}
]
}
}
Mapping
{
"learn": {
"mappings": {
"_meta": {
"created_by": "file-data-visualizer"
},
"properties": {
"rating": {
"type": "keyword"
},
"ratingdescription": {
"type": "long"
},
"ratinglevel": {
"type": "text"
},
"release_year": {
"type": "long"
},
"title": {
"type": "text"
},
"user_rating_score": {
"type": "long"
},
"user_rating_size": {
"type": "long"
}
}
}
}
}
All I want is to return all the values of title as an array, grouped by matching rating.
I tried to group by rating, but that returns the matching documents, so I have to loop through them again to get just the values.
In the aggregation documentation, all I see are sum and other statistics-based aggregations.
I also tried to do it through a Painless script but can't seem to figure out a way.
I had to add a keyword field type to title to be able to aggregate on it:
PUT learn
{
"mappings": {
"_meta": {
"created_by": "file-data-visualizer"
},
"properties": {
"rating": {
"type": "keyword"
},
"ratingdescription": {
"type": "long"
},
"ratinglevel": {
"type": "text"
},
"release_year": {
"type": "long"
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"user_rating_score": {
"type": "long"
},
"user_rating_size": {
"type": "long"
}
}
}
}
Via Aggregations
GET learn/_search
{
"size": 0,
"query": {
"match": {
"title": "pretty"
}
},
"aggs": {
"ratings": {
"terms": {
"field": "rating",
"size": 10
},
"aggs": {
"titles": {
"terms": {
"field": "title.keyword",
"size": 10
}
}
}
}
}
}
Results
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"ratings": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "TV-14",
"doc_count": 2,
"titles": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Another Pretty TV-14 movie",
"doc_count": 1
},
{
"key": "Pretty Little Liars",
"doc_count": 1
}
]
}
},
{
"key": "PG-13",
"doc_count": 1,
"titles": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Pretty White Chicks",
"doc_count": 1
}
]
}
}
]
}
}
}
Via Collapse query
GET learn/_search
{
"_source": false,
"query": {
"match": {
"title": "pretty"
}
},
"collapse": {
"field": "rating",
"inner_hits": {
"name": "titles",
"size": 5,
"_source": ["title"]
}
}
}
Results
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "learn",
"_id": "JVV4vIQBtNG1OrZoVQ2v",
"_score": 0.7361701,
"fields": {
"rating": [
"TV-14"
]
},
"inner_hits": {
"titles": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.7361701,
"hits": [
{
"_index": "learn",
"_id": "JVV4vIQBtNG1OrZoVQ2v",
"_score": 0.7361701,
"_source": {
"title": "Pretty Little Liars"
}
},
{
"_index": "learn",
"_id": "_FV4vIQBtNG1OrZo-Q95",
"_score": 0.5897495,
"_source": {
"title": "Another Pretty TV-14 movie"
}
}
]
}
}
}
},
{
"_index": "learn",
"_id": "wcV5vIQB5Gw0WET8ve-k",
"_score": 0.7361701,
"fields": {
"rating": [
"PG-13"
]
},
"inner_hits": {
"titles": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0.7361701,
"hits": [
{
"_index": "learn",
"_id": "wcV5vIQB5Gw0WET8ve-k",
"_score": 0.7361701,
"_source": {
"title": "Pretty White Chicks"
}
}
]
}
}
}
}
]
}
}

Exact match Query in Elastic Search issue

I have an index in Elasticsearch with 4 documents.
Here's the Data:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "sample4",
"_type": "logs",
"_id": "UQBMOHABHstawU4w4_z3",
"_score": 1,
"_source": {
"date": "2020-02-12T07:28:48",
"target": {
"http://localhost/wordpress/index.php/2020/01/13/hello-world/": {
"clicks": {
"868 278": 12
}
}
}
}
},
{
"_index": "sample4",
"_type": "logs",
"_id": "UgBNOHABHstawU4wT_wn",
"_score": 1,
"_source": {
"date": "2020-02-12T07:29:15",
"target": {
"http://localhost/wordpress/": {
"clicks": {
"958 250": 5
}
}
}
}
},
{
"_index": "sample4",
"_type": "logs",
"_id": "UABMOHABHstawU4wC_y9",
"_score": 1,
"_source": {
"date": "2020-02-12T07:27:52",
"target": {
"http://localhost/wordpress/": {
"clicks": {
"880 257": 6
}
}
}
}
},
{
"_index": "sample4",
"_type": "logs",
"_id": "UwBOOHABHstawU4wFvxV",
"_score": 1,
"_source": {
"date": "2020-02-12T07:30:06",
"target": {
"http://localhost/wordpress/index.php/2020/01/13/hello-world/": {
"clicks": {
"389 60": 33
}
},
"http://localhost/wordpress/": {
"clicks": {
"657 235": 8
}
}
}
}
}
]
}
}
I want to match the target key in the index against the value http://localhost/wordpress/. If the given value exactly matches a key inside target in the ES index, I should get 3 documents. The target key holds an object, so I don't know how to write a match query for it.
Here's the query i tried:
{
"query": {
"wildcard": {
"target.http://localhost/wordpress/": {
"value": "*"
}
}
}
}
But it returns 0 results.
Output I got:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 0,
"relation": "eq"
},
"max_score": null,
"hits": []
}
}
Required Output:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "sample4",
"_type": "logs",
"_id": "UgBNOHABHstawU4wT_wn",
"_score": 1,
"_source": {
"date": "2020-02-12T07:29:15",
"target": {
"http://localhost/wordpress/": {
"clicks": {
"958 250": 5
}
}
}
}
},
{
"_index": "sample4",
"_type": "logs",
"_id": "UABMOHABHstawU4wC_y9",
"_score": 1,
"_source": {
"date": "2020-02-12T07:27:52",
"target": {
"http://localhost/wordpress/": {
"clicks": {
"880 257": 6
}
}
}
}
},
{
"_index": "sample4",
"_type": "logs",
"_id": "UwBOOHABHstawU4wFvxV",
"_score": 1,
"_source": {
"date": "2020-02-12T07:30:06",
"target": {
"http://localhost/wordpress/index.php/2020/01/13/hello-world/": {
"clicks": {
"389 60": 33
}
},
"http://localhost/wordpress/": {
"clicks": {
"657 235": 8
}
}
}
}
}
]
}
}
Please help me solve this problem.
Since you're checking on a field name and not a value, you should try this query instead
{
"query": {
"exists": {
"field": "target.http://localhost/wordpress/"
}
}
}

What does total value shows inside the _search query result in elasticsearch?

When we call the elasticsearch, say as follows:
POST https://<host>/<index-name>/_search with body:
{
"from": 0,
"size": 1,
"query": {
"bool": {
"must": [
{
"range": {
"createdAt": {
"gt": "2019-11-11T10:00:00"
}
}
}
]
}
},
"sort": [
{
"createdAt" : {
"order" : "desc"
}
}
]
}
I see that I get only 1 result as pagination is set to 1 but total inside hits in response shows 2. This is the response I get:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "<index-name>",
"_type": "_doc",
"_id": "5113c843-dff3-499f-a12e-44c7ac103bcf_0",
"_score": null,
"_source": {
"oId": "5113c843-dff3-499f-a12e-44c7ac103bcf",
"oItemId": 0,
"createdAt": "2019-11-13T11:00:00"
},
"sort": [
1573642800000
]
}
]
}
}
Doesn't total ignore the pagination part and only care about the query itself? It should show the total count of items matching the query irrespective of the pagination set, right?
Yes, you are right that total doesn't take pagination into account and only reflects the query, i.e. the total number of documents that match the given query.
To be precise, this is how the official ES docs explain it:
total (Object) Metadata about the number of returned documents.
Returned parameters include:
value: Total number of returned documents. relation: Indicates whether
the number of returned documents in the value parameter is accurate or a lower bound. Returned values are:
eq: Accurate. gte: Lower bound, including returned documents.
This means it is the total number of documents matching the query; because the page size is set to 1 in your example, the inner hits array contains just 1 document. You can cross-check this understanding easily by creating a sample as below:
Create a sample index with just 1 text field:
URL:- http://localhost:9200/{your-index-name}/ --> PUT method
{
"mappings": {
"properties": {
"name": {
"type": "text"
}
}
},
"settings": {
"index": {
"number_of_shards": "1",
"number_of_replicas": "1"
}
}
}
Once the above index is created, index the 4 documents below:
URL:- http://localhost:9200/{your-index-name}/_doc/{1,2,like..} --> POST method
{
"name": "foo 1"
}
{
"name": "foo bar"
}
{
"name": "foo"
}
{
"name": "foo 2"
}
Now when you hit below search query without pagination:
{
"query": {
"bool": {
"must": [
{
"match": {
"name": "foo"
}
}
]
}
}
}
It gives below response:
{
"took": 9,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4, --> Note 4 here
"relation": "eq"
},
"max_score": 0.12199639,
"hits": [
{
"_index": "59638303",
"_type": "_doc",
"_id": "1",
"_score": 0.12199639,
"_source": {
"name": "foo"
}
},
{
"_index": "59638303",
"_type": "_doc",
"_id": "3",
"_score": 0.12199639,
"_source": {
"name": "foo"
}
},
{
"_index": "59638303",
"_type": "_doc",
"_id": "2",
"_score": 0.09271725,
"_source": {
"name": "foo bar"
}
},
{
"_index": "59638303",
"_type": "_doc",
"_id": "4",
"_score": 0.09271725,
"_source": {
"name": "foo 1"
}
}
]
}
}
But when you hit a search query with pagination:
{
"from": 0,
"size": 1,--> note size 1
"query": {
"bool": {
"must": [
{
"match": {
"name": "foo"
}
}
]
}
}
}
it gives below response
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4, --> this is still 4
"relation": "eq"
},
"max_score": 0.12199639,
"hits": [
{
"_index": "59638303",
"_type": "_doc",
"_id": "1",
"_score": 0.12199639,
"_source": {
"name": "foo"
}
}
]
}
}
Now, in the above query, you can change the size and verify that only the inner hits array changes, while the outer hits object, which contains total, always remains 4. This confirms your understanding is correct.

Find Duplicate Documents in Elastic Search

I'm looking for a solution to find duplicate(exact) Docs in ElasticSearch.
I've read https://qbox.io/blog/minimizing-document-duplication-in-elasticsearch and tried it, but its results are not what I expected. For example, this is my simple sample query:
GET /last_month_ads/_search
{
"size": 0,
"fields": [
"title"
],
"aggs": {
"duplicateCount": {
"terms": {
"field": "title",
"size" : 3
},
"aggs": {
"duplicateDocuments": {
"top_hits": {}
}
}
}
}
}
and the result is
{
"took": 981,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"hits": {
"total": 482909,
"max_score": 0,
"hits": []
},
"aggregations": {
"duplicateCount": {
"doc_count_error_upper_bound": 11667,
"sum_other_doc_count": 1958146,
"buckets": [
{
"key": "CM",
"doc_count": 46867,
"duplicateDocuments": {
"hits": {
"total": 46867,
"max_score": 1,
"hits": [
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73EtoBQTqkjEa7YQG1",
"_score": 1,
"_source": {
"id": "20642316",
"cat_id": "43606",
"user_id": "1825875",
"title": "125 CM HOME",
"desc": "DESC"
}
},
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73EtpdQTqkjEa7YQHc",
"_score": 1,
"_source": {
"id": "20642379",
"cat_id": "43604",
"user_id": "4642299",
"title": "Home with Big CM",
"desc": "DESC"
}
},
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73Etp6QTqkjEa7YQHp",
"_score": 1,
"_source": {
"id": "20642409",
"cat_id": "43607",
"user_id": "4813303",
"title": "100 of live CM is here ",
"desc": "DESC"
}
}
]
}
}
},
}
]
}
}
}
I'm looking for exact (or similar) titles, not the most frequent words in titles. How can I get duplicate (similar) docs in Elasticsearch?

Terms aggregation not giving buckets

I'm trying a simple terms aggregation but the result is not creating buckets. Here is a sample document:
"hits": {
"total": 27330,
"max_score": 0.8293952,
"hits": [
{
"_index": "policy",
"_type": "policy",
"_id": "W0051311PNWO",
"_score": 0.8293952,
"_source": {
"productname": "UK CARGO",
"alternateproductname": "ABC39393939",
"brokername": "Name***",
"agentname": "Name***",
"policyref": "ABC33333",
"client": "International Cargo Limited",
"addressline1": "",
"post/zipcode": "",
"telephone": null,
"bapolicyendorseid": 123334,
"prevcertnum": "",
"policystatus": "Endorsed",
"#version": "1",
"#timestamp": "2015-10-09T11:11:02.018Z"
}
},
Here is the aggregate search (in sense):
get policy/policy/_search
{
"aggs": {
"statuses": {
"terms": {
"field": "policystatus"
}
}
}
}
I'm trying to get the equivalent of:
select policystatus, count(*) from policy group by policystatus
The result is not showing buckets. It is showing regular document results:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 227398,
"max_score": 1,
"hits": [
{
"_index": "policy",
"_type": "policy",
"_id": "04/QQQ/04UKI0018",
"_score": 1,
"_source": {
"productname": "2 RES 01/09/04",
"alternateproductname": "2 RES 01/09/04",
"brokername": "Blah LTD",
"agentname": "Insurance",
"policyref": "blah",
"client": "blah",
"addressline1": "blah",
"post/zipcode": "blah",
"telephone": null,
"bapolicyendorseid": 21427,
"prevcertnum": "04UKI0018",
"policystatus": "Pending",
"#version": "1",
"#timestamp": "2015-10-09T11:10:10.146Z"
}
},
Try this:
GET /policy/policy/_search?search_type=count
{
"aggs": {
"statuses": {
"terms": {
"field": "policystatus"
}
}
}
}
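That is, write GET in capital letters and add search_type=count so that only the buckets are returned, without the hits. (On Elasticsearch 2.x and later, search_type=count is deprecated; put "size": 0 in the request body instead.)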

Resources