Elasticsearch separate aggregation based on values from first - elasticsearch

I'm using Elasticsearch 6.8.8 and trying to aggregate the number of entities and relationships over a given time period.
Here is the data structure and examples values of the index:
date entityOrRelationshipId startId endId type
=========================================================================
DATETIMESTAMP ENT1_ID null null ENTITY
DATETIMESTAMP ENT2_ID null null ENTITY
DATETIMESTAMP ENT3_ID null null ENTITY
DATETIMESTAMP REL1_ID ENT1_ID ENT2_ID RELATIONSHIP
DATETIMESTAMP REL2_ID ENT3_ID ENT1_ID RELATIONSHIP
etc.
For a given entity ID, I want to get the top 50 relationships. I have started with the following query.
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"date": {
"gte": "2020-04-01T00:00:00.000+00:00",
"lt": "2020-04-28T00:00:00.000+00:00"
}
}
}
]
}
},
"aggs": {
"my_rels": {
"filter": {
"bool": {
"must": [
{
"term": {
"type": "RELATIONSHIP"
}
},
{
"bool": {
"should": [
{
"term": {"startId": "ENT1_ID"}
},
{
"term": {"endId": "ENT1_ID"}
}
]
}
}
]
}
},
"aggs": {
"my_rels2": {
"terms": {
"field": "entityOrRelationshipId",
"size": 50
},
"aggs": {
"my_rels3": {
"top_hits": {
"_source": {
"includes": ["startId","endId"]
},
"size": 1
}
}
}
}
}
}
}
}
This produces the following results:
{
"took": 54,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 93122,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"my_rels": {
"doc_count": 332,
"my_rels2": {
"doc_count_error_upper_bound": 6,
"sum_other_doc_count": 259,
"buckets": [
{
"key": "REL1_ID",
"doc_count": 47,
"my_rels3": {
"hits": {
"total": 47,
"max_score": 1.0,
"hits": [
{
"_index": "trends",
"_type": "trend",
"_score": 1.0,
"_source": {
"endId": "ENT2_ID",
"startId": "ENT1_ID"
}
}
]
}
}
},
{
"key": "REL2_ID",
"doc_count": 26,
"my_rels3": {
"hits": {
"total": 26,
"max_score": 1.0,
"hits": [
{
"_index": "trends",
"_type": "trend",
"_score": 1.0,
"_source": {
"endId": "ENT1_ID",
"startId": "ENT3_ID"
}
}
]
}
}
}
]
}
}
}
}
This lists the top 50 relationships. For each relationship it lists the relationship ID, the count and the entity ids (startId, endId). What I would like to do now is produce another aggregation of entity counts for those distinct entities. Ideally this would not be a nested aggregation but a separate one using the rel ids identified in the first aggregation.
Is that possible to do in this query?

Unfortunately you cannot aggregate over the results of top_hits in Elasticsearch.
Here is the link to GitHub issue.
You can have other aggregations at a parallel level to top_hits, but you cannot have any sub-aggregation below top_hits.
You can have a parallel level aggregation like:
"aggs": {
"top_hits_agg": {
"top_hits": {
"size": 10,
"_source": {
"includes": ["score"]
}
}
},
"avg_agg": {
"avg": {
"field": "score"
}
}
}

Related

Elasticsearch - How do i search on 2 fields. 1 must be null and other must match search text

I am trying to do a search on elasticsearch 6.8.
I don't have control over the Elasticsearch instance, meaning I cannot control how the data is indexed.
I have data structured like this when I do a match_all search:
{ "took": 4,
"timed_out": false,
"_shards": {
"total": 13,
"successful": 13,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 15.703552,
"hits": [ {
"_index": "(removed index)",
"_type": "_doc",
"_id": "******** (Removed id)",
"_score": 15.703552,
"_source": {
"VCompany": {
"cvrNummer": 12345678,
"penheder": [
{
"pNummer": 1234567898,
"periode": {
"gyldigFra": "2013-04-10",
"gyldigTil": "2014-09-30"
}
}
],
"vMetadata": {
"nyesteNavn": {
"navn": "company1",
"periode": {
"gyldigFra": "2013-04-10",
"gyldigTil": "2014-09-30"
}
},
}
}
}
}
}]
The JSON might not be fully complete because I removed some unneeded data. So what I am trying to do is search where: "vCompany.vMetaData.nyesteNavn.gyldigTil" is null and where "vCompany.vMetaData.nyesteNavn.navn" will match a text string.
I tried something like this:
{
"query": {
"bool": {
"must": [
{"match": {"Vrvirksomhed.virksomhedMetadata.nyesteNavn.navn": "company1"}}
],
"should": {
"terms": {
"Vrvirksomhed.penheder.periode.gyldigTil": null
}
}
}
}
You need to use must_not with exists query like below to check if field is null or not. Below query will give result where company1 is matching and Vrvirksomhed.penheder.periode.gyldigTil field is null.
{
"query": {
"bool": {
"must": [
{
"match": {
"Vrvirksomhed.virksomhedMetadata.nyesteNavn.navn": "company1"
}
}
],
"must_not": [
{
"exists": {
"field": "Vrvirksomhed.penheder.periode.gyldigTil"
}
}
]
}
}
}

Is it possible to use a query result into another query in ElasticSearch?

I have two queries that I want to combine, the first one returns a document with some fields.
Now I want to use one of these fields into the new query without creating two separates ones.
Is there a way to combine them in order to accomplish my task?
This is the first query
{
"_source": {
"includes": [
"data.session"
]
},
"query": {
"bool": {
"must": [
{
"match": {
"field1": "9419"
}
},
{
"match": {
"field2": "5387"
}
}
],
"filter": [
{
"range": {
"timestamp": {
"time_zone": "+00:00",
"gte": "2020-10-24 10:16",
"lte": "2020-10-24 11:16"
}
}
}
]
}
},
"size" : 1
}
And this is the response returned:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 109,
"relation": "eq"
},
"max_score": 3.4183793,
"hits": [
{
"_index": "file",
"_type": "_doc",
"_id": "UBYCkgsEzLKoXh",
"_score": 3.4183793,
"_source": {
"data": {
"session": "123456789"
}
}
}
]
}
}
I want to use that "data.session" into another query, instead of rewriting the value of the field by passing the result of the first query.
{
"_source": {
"includes": [
"data.session"
]
},
"query": {
"bool": {
"must": [
{
"match": {
"data.session": "123456789"
}
}
]
}
},
"sort": [
{
"timestamp": {
"order": "asc"
}
}
]
}
If you mean to use the result of the first query as an input to the second query, then it's not possible in Elasticsearch. But if you share your query and use-case, we might suggest a better way.
ElasticSearch does not allow sub queries or inner queries.

How to get the number of documents for each occurence in Elastic?

I have an Elastic index (say file) where I append a document every time the file is downloaded by a client.
Each document is quite basic, it contains a field filename and a date when to indicate the time of the download.
What I want to achieve is to get, for each file the number of times it has been downloaded in the last 3 months.
For the moment, the closest I get it with this query:
{
"query": {
"range": {
"when": {
"gte": "now-3M"
}
}
},
"aggs": {
"downloads": {
"terms": {
"field": "filename.keyword"
}
}
}
}
The result is something like that:
{
"took": 793,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": 1.0,
"hits": [
{
"_index": "file",
"_type": "_doc",
"_id": "8DkTFHQB3kG435svAA3O",
"_score": 1.0,
"_source": {
"filename": "taz",
"id": 24009,
"when": "2020-08-21T08:11:54.943Z"
}
},
...
]
},
"aggregations": {
"downloads": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 418486,
"buckets": [
{
"key": "file1",
"doc_count": 313873
},
{
"key": "file2",
"doc_count": 281504
},
...,
{
"key": "file10",
"doc_count": 10662
}
]
}
}
}
So I am quite interested in the aggregations.downloads.buckets, but this is limited to 10 results.
What do I need to change in my query to have all the list (in my case, I will have ~15,000 different files)?
Thanks.
The size of the terms buckets defaults to 10. If you want to increase it, go with
{
"query": {
"range": {
"when": {
"gte": "now-3M"
}
}
},
"aggs": {
"downloads": {
"terms": {
"field": "filename.keyword",
"size": 15000 <-------
}
}
}
}
Note that there are strategies to paginate those buckets using a composite aggregation.
Also note that as your index grows, you may hit the default limit as well. It's a dynamic cluster-wide setting so it can be changed.

Elasticsearch order by _score or max_score from SearchResponse Java API

I have an index which contain documents with same employee name and email address but varies with other information such as meetings attended and amount spent.
{
"emp_name" : "Raju",
"emp_email" : "raju#abc.com",
"meeting" : "World cup 2019",
"cost" : "2000"
}
{
"emp_name" : "Sanju",
"emp_email" : "sanju#abc.com",
"meeting" : "International Academy",
"cost" : "3000"
}
{
"emp_name" : "Sanju",
"emp_email" : "sanju#abc.com",
"meeting" : "School of Education",
"cost" : "4000"
}
{
"emp_name" : "Sanju",
"emp_email" : "sanju#abc.com",
"meeting" : "Water world",
"cost" : "1200"
}
{
"emp_name" : "Sanju",
"emp_email" : "sanju#abc.com",
"meeting" : "Event of Tech",
"cost" : "5200"
}
{
"emp_name" : "Bajaj",
"emp_email" : "bajaju#abc.com",
"meeting" : "Event of Tech",
"cost" : "4500"
}
Now, when I do a search based on the emp_name field like "raj", then I should get one each of the Raju, Sanju and Bajaj documents, since I am using fuzzy search functionality (fuzziness(auto)).
I am implementing elasticsearch using Java High level rest client 6.8 API.
TermsAggregationBuilder termAggregation = AggregationBuilders.terms("employees")
.field("emp_email.keyword")
.size(2000);
TopHitsAggregationBuilder termAggregation1 = AggregationBuilders.topHits("distinct")
.sort(new ScoreSortBuilder().order(SortOrder.DESC))
.size(1)
.fetchSource(includeFields, excludeFields);
Based on the above code, it's getting distinct documents but Raju's record is not on the top of the response instead we see Sanju document due to the number of counts.
Below is the JSON created based on the searchrequest.
{
"size": 0,
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "raj",
"fields": [
"emp_name^1.0",
"emp_email^1.0"
],
"boost": 1.0
}
}
],
"filter": [
{
"range": {
"meeting_date": {
"from": "2019-12-01",
"to": null,
"boost": 1.0
}
}
}
],
"adjust_pure_negative": true,
"boost": 1.0
}
},
"aggregations": {
"employees": {
"terms": {
"field": "emp_email.keyword",
"size": 2000,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
},
"aggregations": {
"distinct": {
"top_hits": {
"from": 0,
"size": 1,
"version": false,
"explain": false,
"_source": {
"includes": [
"all_uid",
"emp_name",
"emp_email",
"meeting",
"country",
"cost"
],
"excludes": [
]
},
"sort": [
{
"_score": {
"order": "desc"
}
}
]
}
}
}
}
}
}
I think if we order by max_score or _score then Raju's record will be on top of the response.
Could you please let me know how to get order by _score or max_score of the document returned by response?
Sample response is
{
"took": 264,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 232,
"max_score": 0.0,
"hits": [
]
},
"aggregations": {
"sterms#employees": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Sanju",
"doc_count": 4,
"top_hits#distinct": {
"hits": {
"total": 4,
"max_score": 35.71312,
"hits": [
{
"_index": "indexone",
"_type": "employeedocs",
"_id": "1920424",
"_score": 35.71312,
"_source": {
"emp_name": "Sanju",
...
}
}
]
}
}
},
{
"key": "Raju",
"doc_count": 1,
"top_hits#distinct": {
"hits": {
"total": 1,
"max_score": 89.12312,
"hits": [
{
"_index": "indexone",
"_type": "employeedocs",
"_id": "1920424",
"_score": 89.12312,
"_source": {
"emp_name": "Raju",
...
}
}
]
}
}
}
Let me know if you have any question.
Note: I see many similar kind of questions but none of them helped me. Please advise.
Thanks,
Chetan

Aggregation on geo_point elasticsearch

Is there a way to aggregate on a geo_point field and to receive the actual lat long?
All I managed to do is get the geohash.
what i did so far:
creating the index
PUT geo_test
{
"mappings": {
"sharon_test": {
"properties": {
"location": {
"type": "geo_point"
}
}
}
}
}
adding X docs with different lat long
POST geo_test/sharon_test
{
"location": {
"lat": 45,
"lon": -7
}
}
ran this agg:
GET geo_test/sharon_test/_search
{
"query": {
"bool": {
"must": [
{
"match_all": {}
}
]
}
},
"aggs": {
"locationsAgg": {
"geohash_grid": {
"field": "location",
"precision" : 12
}
}
}
}
i got this result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "geo_test",
"_type": "sharon_test",
"_id": "fGb4uGEBfEDTRjcEmr6i",
"_score": 1,
"_source": {
"location": {
"lat": 41.12,
"lon": -71.34
}
}
},
{
"_index": "geo_test",
"_type": "sharon_test",
"_id": "oWb4uGEBfEDTRjcE7b6R",
"_score": 1,
"_source": {
"location": {
"lat": 4,
"lon": -7
}
}
}
]
},
"aggregations": {
"locationsAgg": {
"buckets": [
{
"key": "ebenb8nv8nj9",
"doc_count": 1
},
{
"key": "drm3btev3e86",
"doc_count": 1
}
]
}
}
}
I want to know if i can get one of the 2:
1. convert the "key" that is currently represented as a geohash back to the source's lat/long
2. show the lat, long in the aggregation in the first place
Thanks!
P.S
I also tried the other geo aggregations but all they give me is the number of docs that fit my aggregation conditions; I need the actual values
E.G
wanted this aggregation to return all the locations i had in my index, but it only returned the count
GET geo_test/sharon_test/_search
{
"query": {
"bool": {
"must": [
{
"match_all": {}
}
]
}
},
"aggs": {
"distanceRanges": {
"geo_distance": {
"field": "location",
"origin": "50.0338, 36.2242 ",
"unit": "meters",
"ranges": [
{
"key": "All Locations",
"from": 1
}
]
}
}
}
}
You can actually use geo_bounds inside the geo_hash to get a bounding box to narrow it down precisely but to get the exact location you will need to decode the geohash
GET geo_test/sharon_test/_search
{
"query":{
"bool":{
"must":[
{
"match_all":{
}
}
]
}
},
"aggs":{
"locationsAgg":{
"geohash_grid":{
"field":"location",
"precision":12
},
"aggs":{
"cell":{
"geo_bounds":{
"field":"location"
}
}
}
}
}
}

Resources