Elasticsearch aggregation with custom query parser - elasticsearch

I cannot seem to aggregate my query results when using my custom query parser. I get a result set by these are not aggregated. When using a standard query parser like match everything turns out well.
What works:
GET pages/_search
{
"query": {
"match": {
"text": "binomial"
}
},
"aggs": {
"docs": {
"terms": {
"field": "rooturl"
}
}
}
}
returns a nice aggregated result:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 11.11176,
"hits": [
...
{
"_index": "pages",
"_type": "doc",
"_id": "AVcq6z6lzDazctHi91RE",
"_score": 3.3503218,
"_source": {
"rooturl": "document",
"type": "equation",
"url": "document:poly",
"text": "coefficient"
}
},
{
"_index": "pages",
"_type": "doc",
"_id": "AVcq6z6xzDazctHi91RF",
"_score": 3.3503218,
"_source": {
"rooturl": document",
"type": "equation",
"url": "document:poly",
"text": "dot"
}
}
...
]
},
"aggregations": {
"docs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "document",
"doc_count": 10
}
]
}
}
}
But when using my custom query parser, The result is not aggregated.
Query:
GET pages/_search
{
"query": {
"my_custom_query_parser": {
"query": "binomial"
}
},
"aggs": {
"docs": {
"terms": {
"field": "rooturl"
}
}
}
}
Can anyone point me into the right direction?

Related

Elasticsearch returns documents with a query must_not exists

Elasticsearch: 6.5.4
Issue: I'm executing a bool query (sample to follow) where I'm checking for the existence of a specific field. The issue is, I'm getting results back where the field does exist but has an empty array.
My question is, how do I properly execute a query and only get results where nlp is not added to the document at all.
Sample query:
{
"size": 100,
"sort": [{
"publishedAt": {
"order": "asc"
}
}],
"_source": {
"includes": ["nlp"]
},
"query": {
"bool": {
"must_not": {
"exists": {
"field": "nlp.categories.gcp"
}
}
}
}
}
Sample Mapping:
(This was automatically created by Elastic Search, with the exception of the null_value, I tried adding that).
{
"mapping": {
"article": {
"properties": {
"nlp": {
"properties": {
"categories": {
"properties": {
"gcp": {
"properties": {
"confidence": {
"type": "float"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"null_value": "[]",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
}
}
}
}
Sample Result:
{
"took": 68,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1126581,
"max_score": null,
"hits": [
{
"_index": "news",
"_type": "article",
"_id": "UTuVmmsBE1H01hY9Rn6i",
"_score": null,
"_source": {
"nlp": {
"categories": {
"gcp": []
}
}
},
"sort": [
1509940860000
]
},
{
"_index": "news",
"_type": "article",
"_id": "2w6PmmsBIpi-jAhhO13F",
"_score": null,
"_source": {
"nlp": {
"categories": {
"gcp": []
}
}
},
"sort": [
1510027260000
]
}
]
}
}
When the nlp.categories.gcp has values in it, a typical response would look like this.
{
"took": 26,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 475690,
"max_score": null,
"hits": [
{
"_index": "news",
"_type": "article",
"_id": "6Q6JmmsBIpi-jAhhAlcm",
"_score": null,
"_source": {
"nlp": {
"categories": {
"gcp": [
{
"confidence": 0.8999999761581421,
"name": "/Travel/Hotels & Accommodations"
}
]
}
}
},
"sort": [
1510215565000
]
},
{
"_index": "news",
"_type": "article",
"_id": "rzunmmsBE1H01hY9sLyE",
"_score": null,
"_source": {
"nlp": {
"categories": {
"gcp": [
{
"confidence": 0.9399999976158142,
"name": "/Travel/Hotels & Accommodations"
}
]
}
}
},
"sort": [
1510228881000
]
}
]
}
}

Unique search results from ElasticSearch

I am new to ElasticSearch and can't quite figure out what I want is possible or not.
I can query like this:
GET entity/_search
{
"query": {
"bool": {
"must": [
{ "match": { "searchField": "searchValue" }}
]
}
},
"aggs" : {
"uniq_Id" : {
"terms" : { "field" : "Id", "size":500 }
}
}
}
and it will return top search results and the term aggregation buckets. But ideally what I would like for the search results to return, is only one (perhaps the top one, does not matter) for each of unique Id's defined in the aggregation terms.
You can make use of Terms Aggregation along with the Top Hits Aggregation to give you the result you are looking for.
Now once you do that, specify the size as 1 in the Top Hits Aggregation
Based on your query I've created sample mapping,documents, aggregation query and the response for your reference.
Mapping:
PUT mysampleindex
{
"mappings": {
"mydocs": {
"properties": {
"searchField":{
"type": "text"
},
"Id": {
"type": "keyword"
}
}
}
}
}
Sample Documents:
POST mysampleindex/mydocs/1
{
"searchField": "elasticsearch",
"Id": "1000"
}
POST mysampleindex/mydocs/2
{
"searchField": "elasticsearch is awesome",
"Id": "1000"
}
POST mysampleindex/mydocs/3
{
"searchField": "elasticsearch is awesome",
"Id": "1001"
}
POST mysampleindex/mydocs/4
{
"searchField": "elasticsearch is pretty cool",
"Id": "1001"
}
POST mysampleindex/mydocs/5
{
"searchField": "elasticsearch is pretty cool",
"Id": "1002"
}
Query:
POST mysampleindex/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"searchField": "elasticsearch"
}
}
]
}
},
"aggs": {
"myUniqueIds": {
"terms": {
"field": "Id",
"size": 10
},
"aggs": {
"myDocs": {
"top_hits": { <---- Top Hits Aggregation
"size": 1 <---- Note this
}
}
}
}
}
}
Sample Response:
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"myUniqueIds": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1000",
"doc_count": 2,
"myDocs": {
"hits": {
"total": 2,
"max_score": 0.2876821,
"hits": [
{
"_index": "mysampleindex",
"_type": "mydocs",
"_id": "1",
"_score": 0.2876821,
"_source": {
"searchField": "elasticsearch",
"Id": "1000"
}
}
]
}
}
},
{
"key": "1001",
"doc_count": 2,
"myDocs": {
"hits": {
"total": 2,
"max_score": 0.25316024,
"hits": [
{
"_index": "mysampleindex",
"_type": "mydocs",
"_id": "3",
"_score": 0.25316024,
"_source": {
"searchField": "elasticsearch is awesome",
"Id": "1001"
}
}
]
}
}
},
{
"key": "1002",
"doc_count": 1,
"myDocs": {
"hits": {
"total": 1,
"max_score": 0.2876821,
"hits": [
{
"_index": "mysampleindex",
"_type": "mydocs",
"_id": "5",
"_score": 0.2876821,
"_source": {
"searchField": "elasticsearch is pretty cool",
"Id": "1002"
}
}
]
}
}
}
]
}
}
}
Notice that I am not returning any bool results in the above, the search result you are looking for comes in the form of Top Hits Aggregation.
Hope this helps!

How to return actual value (not lowercase) when performing search with terms aggregation?

I am working on an ElasticSearch (6.2) project where the index has many keyword fields and they are normalized with lowercase filter for performing case-insensitive searches. The search working great and returning actual values (not lowercase) of the normalized fields. However, the aggregations not returning the actual value (returning lowercase) of the fields.
The following example has been taken from ElasticSearch doc.
https://www.elastic.co/guide/en/elasticsearch/reference/master/normalizer.html
Creating index:
PUT index
{
"settings": {
"analysis": {
"normalizer": {
"my_normalizer": {
"type": "custom",
"char_filter": [],
"filter": ["lowercase", "asciifolding"]
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"foo": {
"type": "keyword",
"normalizer": "my_normalizer"
}
}
}
}
}
Inserting a doc:
PUT index/_doc/1
{
"foo": "Bar"
}
PUT index/_doc/2
{
"foo": "Baz"
}
Search with aggregation:
GET index/_search
{
"size": 0,
"aggs": {
"foo_terms": {
"terms": {
"field": "foo"
}
}
}
}
Result:
{
"took": 43,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped" : 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.0,
"hits": {
"total": 2,
"max_score": 0.47000363,
"hits": [
{
"_index": "index",
"_type": "_doc",
"_id": "1",
"_score": 0.47000363,
"_source": {
"foo": "Bar"
}
},
{
"_index": "index",
"_type": "_doc",
"_id": "2",
"_score": 0.47000363,
"_source": {
"foo": "Baz"
}
}
]
}
},
"aggregations": {
"foo_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bar",
"doc_count": 1
},
{
"key": "baz",
"doc_count": 1
}
]
}
}
}
If you check the aggregation, you will see that lowercase value has been returned. e.g. "key": "bar".
Is there any way to change the aggregation to return actual value?
e.g. "key": "Bar"
If you want to do case-insensitive search yet return exact values in your aggregations you don't need any normalizer. You can simply have a text field (which lowercases the tokens and allows case-insensitive search by default) with a keyword sub-field. You'd use the former for search and the latter for aggregations. It goes like this:
PUT index
{
"mappings": {
"_doc": {
"properties": {
"foo": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
}
}
}
}
}
After indexing your two documents, your can issue a terms aggregation on foo.keyword:
GET index/_search
{
"size": 2,
"aggs": {
"foo_terms": {
"terms": {
"field": "foo.keyword"
}
}
}
}
And the result would look like this:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "index",
"_type": "_doc",
"_id": "2",
"_score": 1,
"_source": {
"foo": "Baz"
}
},
{
"_index": "index",
"_type": "_doc",
"_id": "1",
"_score": 1,
"_source": {
"foo": "Bar"
}
}
]
},
"aggregations": {
"foo_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Bar",
"doc_count": 1
},
{
"key": "Baz",
"doc_count": 1
}
]
}
}
}

Find Duplicate Documents in Elastic Search

I'm looking for a solution to find duplicate(exact) Docs in ElasticSearch.
I've read https://qbox.io/blog/minimizing-document-duplication-in-elasticsearch and tried it but its results are not as I expected as example this is my sample simple query :
GET /last_month_ads/_search
{
"size": 0,
"fields": [
"title"
],
"aggs": {
"duplicateCount": {
"terms": {
"field": "title",
"size" : 3
},
"aggs": {
"duplicateDocuments": {
"top_hits": {}
}
}
}
}
}
and the result is
{
"took": 981,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"hits": {
"total": 482909,
"max_score": 0,
"hits": []
},
"aggregations": {
"duplicateCount": {
"doc_count_error_upper_bound": 11667,
"sum_other_doc_count": 1958146,
"buckets": [
{
"key": "CM",
"doc_count": 46867,
"duplicateDocuments": {
"hits": {
"total": 46867,
"max_score": 1,
"hits": [
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73EtoBQTqkjEa7YQG1",
"_score": 1,
"_source": {
"id": "20642316",
"cat_id": "43606",
"user_id": "1825875",
"title": "125 CM HOME",
"desc": "DESC"
}
},
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73EtpdQTqkjEa7YQHc",
"_score": 1,
"_source": {
"id": "20642379",
"cat_id": "43604",
"user_id": "4642299",
"title": "Home with Big CM",
"desc": "DESC"
}
},
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73Etp6QTqkjEa7YQHp",
"_score": 1,
"_source": {
"id": "20642409",
"cat_id": "43607",
"user_id": "4813303",
"title": "100 of live CM is here ",
"desc": "DESC"
}
}
]
}
}
},
}
]
}
}
}
I'm looking for Exact (or similar) titles not abundance words in titles, how can I get get Duplicate(similar) Docs in Elastic Search?

Using an aggregation on data with forward slash in elasticsearch

I have data, that has an attribute like this
apiUrl:/REST/endpoint/123
Now I would like to show all the urls and I am trying to use an aggregate function (apiUrl.raw is not_analyzed part of the multifield):
POST /index/type/_search
{
"aggregations": {
"application": {
"terms": {
"field": "apiUrl.raw"
}
}
}
}
When running this query, no results get returned. What am I doing wrong? I would expect something along the lines (and the count of occurence):
/REST/api1/123
/REST/otherApi/345
Thanks!
Your query does return non-empty results. Compare and let us know what was the difference:
PUT index
PUT index/type/_mapping
{
"properties" : {
"apiUrl": {
"type": "multi_field",
"fields": {
"apiUrl": {"type":"string", "index":"analyzed"},
"raw": {"type":"string", "index":"not_analyzed"}
}
}
}
}
GET index/type/_mapping
PUT index/type/1
{
"apiUrl":"/REST/api1/123"
}
PUT index/type/2
{
"apiUrl":"/REST/otherApi/345"
}
GET index/type/_search?fields=apiUrl.raw
GET index/type/_search
{
"aggregations": {
"application": {
"terms": {
"field": "apiUrl.raw"
}
}
}
}
Response:
{
"took": 76,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "index",
"_type": "type",
"_id": "1",
"_score": 1,
"_source": {
"apiUrl": "/REST/api1/123"
}
},
{
"_index": "index",
"_type": "type",
"_id": "2",
"_score": 1,
"_source": {
"apiUrl": "/REST/otherApi/345"
}
}
]
},
"aggregations": {
"application": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "/REST/api1/123",
"doc_count": 1
},
{
"key": "/REST/otherApi/345",
"doc_count": 1
}
]
}
}
}

Resources