How to make aggregations work for text fields - elasticsearch

I am trying to write an Elasticsearch query to get unique locality towns. My locality_town_keyword field is of keyword type. When I try to search on locality_town_keyword, I get search hits, but the "buckets" array under "aggregations" is empty.
This is what my schema looks like:
"locality_town": {
"type": "text"
},
"locality_town_keyword": {
"type": "keyword"
},
My search query looks like the following:
{
"query":
{
"prefix" : { "locality_town" : "m" }
},
"size": "1",
"_source": {
"includes": [
"locality_town*"
]
},
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword",
"size": 5,
"order": {
"_count": "desc"
}
}
}
}
}
Here is the output it gives
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 799,
"max_score": 1.0,
"hits": [
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_score": 1.0,
"_source": {
"locality_town": "Manchester",
"locality_town_keyword": "Manchester"
}
}
]
},
"aggregations": {
"loc": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
This is what one document looks like:
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_version": 1,
"_seq_no": 39,
"_primary_term": 1,
"found": true,
"_source": {
"title": "Legal Services",
"buyers": "CENTRAL MANCHESTER UNIVERSITY HOSPITALS NHS FOUNDATION TRUST",
"postal_code": "M13 0JR",
"publish_date": "2015-03-03T15:48:45Z",
"status": "cancelled",
"start_date": "2017-03-03T00:00:00Z",
"endt_date": "2020-03-03T00:00:00Z",
"url": "https://www.temp.com",
"country": "England",
"description": "desc......",
"language": "en-GB",
"service": "OPEN_CONTRACTING",
"value": "0",
"value_currency": "GBP",
"winner": "",
"create_time": "2019-05-11T21:39:42Z",
"deadline_date": "1970-01-01T00:00:00Z",
"address": "Central Manchester University Hospitals NHS Foundation Trust Wilmslow Park",
"locality_town": "Manchester",
"locality_town_keyword": "Manchester",
"region": "North West",
"tender_type": "planning",
"cpv": "Health services ",
"strpublish_date": "2015-03-03T15:48:45Z",
"strstart_date": "2017-03-03T00:00:00Z",
"strend_date": "2020-03-03T00:00:00Z",
"strdeadline_date": "",
"winner_email": "",
"winner_address": "",
"winner_town": "",
"winner_postalcode": "",
"winner_phone": "",
"cpvs": "[\"Health services (85100000-0)\"]"
}
}

Looks like you have a typo in your aggregation query:
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword", <== here
"size": 5,
Try with locality_town_keyword instead!
Hope this helps!

Related

Iterating over doc to return a particular key's value in an array based on a match

data
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1000,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "learn",
"_id": "OeCLr4QBPMAw7FiXknKz",
"_score": 1,
"_source": {
"user_rating_size": 80,
"ratingdescription": 80,
"rating": "PG-13",
"release_year": 2004,
"user_rating_score": 82,
"title": "White Chicks",
"ratinglevel": "crude and sexual humor, language and some drug content"
}
},
{
"_index": "learn",
"_id": "QuCLr4QBPMAw7FiXknKz",
"_score": 1,
"_source": {
"user_rating_size": 80,
"ratingdescription": 90,
"rating": "TV-14",
"release_year": 2016,
"user_rating_score": 96,
"title": "Pretty Little Liars",
"ratinglevel": "Parents strongly cautioned. May be unsuitable for children ages 14 and under."
}
}
]
}
}
Mapping
{
"learn": {
"mappings": {
"_meta": {
"created_by": "file-data-visualizer"
},
"properties": {
"rating": {
"type": "keyword"
},
"ratingdescription": {
"type": "long"
},
"ratinglevel": {
"type": "text"
},
"release_year": {
"type": "long"
},
"title": {
"type": "text"
},
"user_rating_score": {
"type": "long"
},
"user_rating_size": {
"type": "long"
}
}
}
}
}
All I want is to return all the values of title as an array, grouped by matching rating.
I tried to group by rating, but that returns the matching documents. In this case I have to loop through them again to get just the values.
In the aggregations documentation, all I see are sum and other statistics-based aggregations.
I also tried to do it through a Painless script but can't seem to figure out a way.
I had to add a keyword field type to title to be able to aggregate on it:
PUT learn
{
"mappings": {
"_meta": {
"created_by": "file-data-visualizer"
},
"properties": {
"rating": {
"type": "keyword"
},
"ratingdescription": {
"type": "long"
},
"ratinglevel": {
"type": "text"
},
"release_year": {
"type": "long"
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"user_rating_score": {
"type": "long"
},
"user_rating_size": {
"type": "long"
}
}
}
}
Via Aggregations
GET learn/_search
{
"size": 0,
"query": {
"match": {
"title": "pretty"
}
},
"aggs": {
"ratings": {
"terms": {
"field": "rating",
"size": 10
},
"aggs": {
"titles": {
"terms": {
"field": "title.keyword",
"size": 10
}
}
}
}
}
}
Results
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"ratings": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "TV-14",
"doc_count": 2,
"titles": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Another Pretty TV-14 movie",
"doc_count": 1
},
{
"key": "Pretty Little Liars",
"doc_count": 1
}
]
}
},
{
"key": "PG-13",
"doc_count": 1,
"titles": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Pretty White Chicks",
"doc_count": 1
}
]
}
}
]
}
}
}
Via Collapse query
GET learn/_search
{
"_source": false,
"query": {
"match": {
"title": "pretty"
}
},
"collapse": {
"field": "rating",
"inner_hits": {
"name": "titles",
"size": 5,
"_source": ["title"]
}
}
}
Results
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "learn",
"_id": "JVV4vIQBtNG1OrZoVQ2v",
"_score": 0.7361701,
"fields": {
"rating": [
"TV-14"
]
},
"inner_hits": {
"titles": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.7361701,
"hits": [
{
"_index": "learn",
"_id": "JVV4vIQBtNG1OrZoVQ2v",
"_score": 0.7361701,
"_source": {
"title": "Pretty Little Liars"
}
},
{
"_index": "learn",
"_id": "_FV4vIQBtNG1OrZo-Q95",
"_score": 0.5897495,
"_source": {
"title": "Another Pretty TV-14 movie"
}
}
]
}
}
}
},
{
"_index": "learn",
"_id": "wcV5vIQB5Gw0WET8ve-k",
"_score": 0.7361701,
"fields": {
"rating": [
"PG-13"
]
},
"inner_hits": {
"titles": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0.7361701,
"hits": [
{
"_index": "learn",
"_id": "wcV5vIQB5Gw0WET8ve-k",
"_score": 0.7361701,
"_source": {
"title": "Pretty White Chicks"
}
}
]
}
}
}
}
]
}
}

Specify Elasticsearch aggregation fields when finding duplicates

I am using the following ES query when looking for duplicates:
"aggs": {
"duplicates": {
"terms": {
"field": "phone",
"min_doc_count": 2,
"size": 99999,
"order": {
"_term": "asc"
}
},
"aggs": {
"_docs": {
"top_hits": {
"size": 99999
}
}
}
}
}
It works well: it returns the key, which in this case is the phone, and inside it returns all the matches. The main problem is exactly that: the _source brings back everything, which is a lot of fields in my case, and I want to specify only the ones I need. Example of what is returned:
"duplicates": {
"1": {
"key": "1",
"doc_count": 2,
"_docs": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "local:company_id:1:sync",
"_type": "leads",
"_id": "23",
"_score": 1,
"_source": {
"id": 23,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": "", // .... and so on
I want to specify the fields that will be returned on the _source, is that possible?
Another problem I'm having is that I want to order the aggregation results by a specific field (by id), but if I put any field name in place of _term, it gives me an error.
Thank you!
In the example below, the documents with id 29 and 23 have the same phone, hence they are duplicates. The search query returns only two fields, i.e. id and phone (you can change these fields to suit your needs), and sorts the top hits by id.
Adding a working example with index data, search query, and search result
Index Data:
{
"id": 29,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": ""
}
{
"id": 23,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": ""
}
{
"id": 30,
"phone": 1235,
"areacode_id": 92,
"areacode_state_id": 10,
"firstName": "Mark",
"lastName": "Smith",
"state": ""
}
Search Query:
{
"size": 0,
"aggs": {
"duplicates": {
"terms": {
"field": "phone",
"min_doc_count": 2,
"size": 99999
},
"aggs": {
"_docs": {
"top_hits": {
"_source": {
"includes": [
"phone",
"id"
]
},
"sort": [
{
"id": {
"order": "asc"
}
}
]
}
}
}
}
}
}
Search Result:
"aggregations": {
"duplicates": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123456,
"doc_count": 2,
"_docs": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "66896259",
"_type": "_doc",
"_id": "1",
"_score": null,
"_source": {
"phone": 123456,
"id": 23
},
"sort": [
23 // note this
]
},
{
"_index": "66896259",
"_type": "_doc",
"_id": "2",
"_score": null,
"_source": {
"phone": 123456,
"id": 29
},
"sort": [
29 // note this
]
}
]
}
}
}
]
}
}

How can I fetch all distinct objects within a field over an Elasticsearch index?

I am trying to get all distinct objects from within a field over the whole index.
What I tried so far is:
POST http://es5server:9200/indexname/_search
Content-Type: application/json
{
"size": "1",
"aggs": {
"tags": {
"terms": {
"field": "tags"
}
}
}
}
Currently this returns the following for me (I set size to 1 to include a sample document):
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.0,
"hits": [
{
"_index": "indexname",
"_type": "news",
"_id": "51",
"_score": 1.0,
"_source": {
"localized": {
"de": {
"title": null,
"shorttext": null,
"text": null
},
"en": {
"title": "test new title",
"shorttext": "hello my name is mayur and this is testnews text",
"text": null
}
},
"type": "object",
"key": "testnews",
"path": "\/",
"tags": {
"38": {
"name": "I AM",
"parent": "0"
},
"45": {
"name": "ffddd",
"parent": "43"
},
"43": {
"name": "kkjjttdd",
"parent": "0"
}
}
}
}
]
},
"aggregations": {
"tags": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
The buckets are empty, is this because the tags field contains objects and not text?
How can I get ES to return all distinct objects within the tags fields of all documents?
Looks like you need nested objects for your solution. See this question answered here

Find Duplicate Documents in Elastic Search

I'm looking for a solution to find duplicate(exact) Docs in ElasticSearch.
I've read https://qbox.io/blog/minimizing-document-duplication-in-elasticsearch and tried it, but the results are not what I expected. As an example, this is my simple sample query:
GET /last_month_ads/_search
{
"size": 0,
"fields": [
"title"
],
"aggs": {
"duplicateCount": {
"terms": {
"field": "title",
"size" : 3
},
"aggs": {
"duplicateDocuments": {
"top_hits": {}
}
}
}
}
}
and the result is
{
"took": 981,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"hits": {
"total": 482909,
"max_score": 0,
"hits": []
},
"aggregations": {
"duplicateCount": {
"doc_count_error_upper_bound": 11667,
"sum_other_doc_count": 1958146,
"buckets": [
{
"key": "CM",
"doc_count": 46867,
"duplicateDocuments": {
"hits": {
"total": 46867,
"max_score": 1,
"hits": [
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73EtoBQTqkjEa7YQG1",
"_score": 1,
"_source": {
"id": "20642316",
"cat_id": "43606",
"user_id": "1825875",
"title": "125 CM HOME",
"desc": "DESC"
}
},
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73EtpdQTqkjEa7YQHc",
"_score": 1,
"_source": {
"id": "20642379",
"cat_id": "43604",
"user_id": "4642299",
"title": "Home with Big CM",
"desc": "DESC"
}
},
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73Etp6QTqkjEa7YQHp",
"_score": 1,
"_source": {
"id": "20642409",
"cat_id": "43607",
"user_id": "4813303",
"title": "100 of live CM is here ",
"desc": "DESC"
}
}
]
}
}
},
}
]
}
}
}
I'm looking for exact (or similar) titles, not words that occur frequently in titles. How can I get duplicate (similar) docs in Elasticsearch?

Elasticsearch - Search with wildcards

I've managed to populate my index with 4 documents using this bulk request:
POST localhost:9200/titles/movies/_bulk
{"index":{"_id":"1"}}
{"id": "1","level": "first","titles": [{"value": "The Bad and the Beautiful","type": "Catalogue","main": true},{"value": "The Bad and the Beautiful (1945)","type": "International","main": false}]}
{"index":{"_id":"2"}}
{"id": "2","level": "first","titles": [{"value": "Bad Day at Black Rock","type": "Drama","main": true}]}
{"index":{"_id":"3"}}
{"id": "3","level": "second","titles": [{"value": "Baker's Wife","type": "AnotherType","main": true},{"value": "Baker's Wife (1940)","type": "Trasmitted","main": false}]}
{"index":{"_id":"4"}}
{"id": "4","level": "second","titles": [{"value": "Bambi","type": "Educational","main": true},{"value": "The Baby Deer and the hunter (1942)","type": "Fantasy","main": false}]}
Now how can I perform searches with wildcards on all available titles?
Something like
localhost:9200/titles/movies/_search?q=*&sort=level:asc
but providing one or more wildcards. For instance, searching for "The % the %" and parsing the response from Elasticsearch to eventually return something like:
{
"count":2,
"results":[{
"id":"1",
"level":"first",
"foundInTitleTypes":["Catalogue","International"]
},{
"id":"4",
"level":"second",
"foundInTitleTypes":["Fantasy"]
}]
}
Thanks!
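You can run this with the regular match query. Note that match analyzes the query text rather than interpreting patterns: with the standard analyzer the * characters are dropped, so the query below effectively matches titles containing the term "the". For true pattern matching, use a wildcard, regexp, or query_string query instead.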
GET titles/movies/_search
{
"query": {
"match" : { "titles.value" : "The * the *" }
}
}
Gives you this
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.6406528,
"hits": [
{
"_index": "titles",
"_type": "movies",
"_id": "4",
"_score": 1.6406528,
"_source": {
"id": "4",
"level": "second",
"titles": [
{
"value": "Bambi",
"type": "Educational",
"main": true
},
{
"value": "The Baby Deer and the hunter (1942)",
"type": "Fantasy",
"main": false
}
]
}
},
{
"_index": "titles",
"_type": "movies",
"_id": "1",
"_score": 0.9026783,
"_source": {
"id": "1",
"level": "first",
"titles": [
{
"value": "The Bad and the Beautiful",
"type": "Catalogue",
"main": true
},
{
"value": "The Bad and the Beautiful (1945)",
"type": "International",
"main": false
}
]
}
}
]
}
}
As an update regarding the URI search in your question: I'm not sure whether that is possible, but if you do it with curl you can just send the query DSL as the request body:
curl localhost:9200/titles/movies/_search -d '{"query":{"match":{"titles.value":"The * the *"}}}'
{"took":46,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":2,"max_score":1.6406528,"hits":[{"_index":"titles","_type":"movies","_id":"4","_score":1.6406528,"_source":{"id": "4","level": "second","titles": [{"value": "Bambi","type": "Educational","main": true},{"value": "The Baby Deer and the hunter (1942)","type": "Fantasy","main": false}]}},{"_index":"titles","_type":"movies","_id":"1","_score":0.9026783,"_source":{"id": "1","level": "first","titles": [{"value": "The Bad and the Beautiful","type": "Catalogue","main": true},{"value": "The Bad and the Beautiful (1945)","type": "International","main": false}]}}]}}
Update to latest question:
Well if you want to sort by level, you need to provide a mapping for elasticsearch. What I did:
Delete index
DELETE titles
Add mapping
PUT titles
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"movies": {
"properties": {
"level": {
"type": "keyword"
}
}
}
}
}
Refine Query DSL
GET titles/movies/_search
{
"_source": [
"id",
"level",
"titles.value"
],
"sort": [
{
"level": {
"order": "asc"
}
}
],
"query": {
"match": {
"titles.value": "The * the *"
}
}
}
That gives me
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": null,
"hits": [
{
"_index": "titles",
"_type": "movies",
"_id": "1",
"_score": null,
"_source": {
"level": "first",
"id": "1",
"titles": [
{
"value": "The Bad and the Beautiful"
},
{
"value": "The Bad and the Beautiful (1945)"
}
]
},
"sort": [
"first"
]
},
{
"_index": "titles",
"_type": "movies",
"_id": "4",
"_score": null,
"_source": {
"level": "second",
"id": "4",
"titles": [
{
"value": "Bambi"
},
{
"value": "The Baby Deer and the hunter (1942)"
}
]
},
"sort": [
"second"
]
}
]
}
}

Resources