Find Duplicate Documents in Elastic Search - elasticsearch

I'm looking for a way to find exact duplicate documents in Elasticsearch.
I've read https://qbox.io/blog/minimizing-document-duplication-in-elasticsearch and tried it, but the results are not what I expected. For example, this is my simple sample query:
GET /last_month_ads/_search
{
"size": 0,
"fields": [
"title"
],
"aggs": {
"duplicateCount": {
"terms": {
"field": "title",
"size" : 3
},
"aggs": {
"duplicateDocuments": {
"top_hits": {}
}
}
}
}
}
and the result is
{
"took": 981,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"hits": {
"total": 482909,
"max_score": 0,
"hits": []
},
"aggregations": {
"duplicateCount": {
"doc_count_error_upper_bound": 11667,
"sum_other_doc_count": 1958146,
"buckets": [
{
"key": "CM",
"doc_count": 46867,
"duplicateDocuments": {
"hits": {
"total": 46867,
"max_score": 1,
"hits": [
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73EtoBQTqkjEa7YQG1",
"_score": 1,
"_source": {
"id": "20642316",
"cat_id": "43606",
"user_id": "1825875",
"title": "125 CM HOME",
"desc": "DESC"
}
},
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73EtpdQTqkjEa7YQHc",
"_score": 1,
"_source": {
"id": "20642379",
"cat_id": "43604",
"user_id": "4642299",
"title": "Home with Big CM",
"desc": "DESC"
}
},
{
"_index": "last_month_ads",
"_type": "ads",
"_id": "AV73Etp6QTqkjEa7YQHp",
"_score": 1,
"_source": {
"id": "20642409",
"cat_id": "43607",
"user_id": "4813303",
"title": "100 of live CM is here ",
"desc": "DESC"
}
}
]
}
}
},
}
]
}
}
}
I'm looking for exact (or similar) titles, not the words that occur most frequently across titles. How can I get duplicate (or similar) documents in Elasticsearch?

Related

Sorting aggregated data in elastic search

I am doing a search that is doing an aggregation by xyz field and getting the latest version. Now I need to sort the aggregated data based on created field. Let me know how we can do that.
{
"query": {
"query_string": {
"query": ""
}
},
"aggs": {
"uuid": {
"terms": {
"field": "xyz.keyword"
},
"aggs": {
"top_trades_hits": {
"top_hits": {
"sort": [
{
"version": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
The above query returns:
{
"aggregations": {
"uuid": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "794a5b8f-3e22-4ff9-98bb-b8b54c85948e",
"doc_count": 3,
"agg": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "index",
"_type": "doc",
"_id": "7",
"_score": null,
"_source": {
"uuid": "794a5b8f-3e22-4ff9-98bb-b8b54c85948e",
"type": "qsdn",
"discontinued": false,
"minSupportedPlatformVersion": "11.5.3.3",
"version": 2,
"created": 1658428291346
},
"sort": [
2
]
}
]
}
}
},
{
"key": "03504029-a029-417d-bd67-fb1b5fc5055b",
"doc_count": 2,
"agg": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "index",
"_type": "doc",
"_id": "9",
"_score": null,
"_source": {
"uuid": "03504029-a029-417d-bd67-fb1b5fc5055b",
"type": "gdsg",
"discontinued": false,
"version": 1.1,
"created": 1554904300799
},
"sort": [
1.1
]
}
]
}
}
}
]
}
}
}
Document for the elastic search is as follows
{
"_index": "index",
"_type": "doc",
"_id": "3",
"_version": 2,
"_seq_no": 1,
"_primary_term": 1,
"found": true,
"_source": {
"doc": {
"uuid": "abcd",
"type": "strifn",
"name": "default",
"version": 3.12,
"s3ObjectVersionId": "",
"created": 165842829134
}
}
}
Expected result
{
"aggregations": {
"uuid": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "03504029-a029-417d-bd67-fb1b5fc5055b",
"doc_count": 2,
"agg": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "index",
"_type": "doc",
"_id": "9",
"_score": null,
"_source": {
"uuid": "03504029-a029-417d-bd67-fb1b5fc5055b",
"type": "gdsg",
"discontinued": false,
"version": 1.1,
"created": 1554904300799
},
"sort": [
1.1
]
}
]
}
}
},
{
"key": "794a5b8f-3e22-4ff9-98bb-b8b54c85948e",
"doc_count": 3,
"agg": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "index",
"_type": "doc",
"_id": "7",
"_score": null,
"_source": {
"uuid": "794a5b8f-3e22-4ff9-98bb-b8b54c85948e",
"type": "qsdn",
"discontinued": false,
"minSupportedPlatformVersion": "11.5.3.3",
"version": 2,
"created": 1658428291346
},
"sort": [
2
]
}
]
}
}
}
]
}
}
}
I am using AWS opensearch for the same
Your query is correct; you just need to increase the top_hits "size" above 1 to see all the documents in each bucket, sorted according to the version field in your Elasticsearch index.
If the above doesn't help, can you share more information, such as sample documents and the index mapping?

How to make aggregations work for text fields

I am trying to write an Elasticsearch query to get unique locality towns. My locality_town_keyword field is of keyword type. When I run the search, I get search hits, but the "buckets" array under "aggregations" is empty.
The following is what my schema looks like:
"locality_town": {
"type": "text"
},
"locality_town_keyword": {
"type": "keyword"
},
My search query looks like the following:
{
"query":
{
"prefix" : { "locality_town" : "m" }
},
"size": "1",
"_source": {
"includes": [
"locality_town*"
]
},
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword",
"size": 5,
"order": {
"_count": "desc"
}
}
}
}
}
Here is the output it gives
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 799,
"max_score": 1.0,
"hits": [
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_score": 1.0,
"_source": {
"locality_town": "Manchester",
"locality_town_keyword": "Manchester"
}
}
]
},
"aggregations": {
"loc": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
This is what one document looks like:
{
"_index": "tenderindex_2",
"_type": "tender_2",
"_id": "290077",
"_version": 1,
"_seq_no": 39,
"_primary_term": 1,
"found": true,
"_source": {
"title": "Legal Services",
"buyers": "CENTRAL MANCHESTER UNIVERSITY HOSPITALS NHS FOUNDATION TRUST",
"postal_code": "M13 0JR",
"publish_date": "2015-03-03T15:48:45Z",
"status": "cancelled",
"start_date": "2017-03-03T00:00:00Z",
"endt_date": "2020-03-03T00:00:00Z",
"url": "https://www.temp.com",
"country": "England",
"description": "desc......",
"language": "en-GB",
"service": "OPEN_CONTRACTING",
"value": "0",
"value_currency": "GBP",
"winner": "",
"create_time": "2019-05-11T21:39:42Z",
"deadline_date": "1970-01-01T00:00:00Z",
"address": "Central Manchester University Hospitals NHS Foundation Trust Wilmslow Park",
"locality_town": "Manchester",
"locality_town_keyword": "Manchester",
"region": "North West",
"tender_type": "planning",
"cpv": "Health services ",
"strpublish_date": "2015-03-03T15:48:45Z",
"strstart_date": "2017-03-03T00:00:00Z",
"strend_date": "2020-03-03T00:00:00Z",
"strdeadline_date": "",
"winner_email": "",
"winner_address": "",
"winner_town": "",
"winner_postalcode": "",
"winner_phone": "",
"cpvs": "[\"Health services (85100000-0)\"]"
}
}
Looks like you have a typo in your aggregation query:
"aggs": {
"loc": {
"terms": {
"field": "locality_town_keyoword", <== here
"size": 5,
Try with locality_town_keyword instead!
Hope this helps!

Unique search results from ElasticSearch

I am new to ElasticSearch and can't quite figure out what I want is possible or not.
I can query like this:
GET entity/_search
{
"query": {
"bool": {
"must": [
{ "match": { "searchField": "searchValue" }}
]
}
},
"aggs" : {
"uniq_Id" : {
"terms" : { "field" : "Id", "size":500 }
}
}
}
and it will return the top search results and the terms aggregation buckets. But ideally, I would like the search results to contain only one document (perhaps the top one; it does not matter which) for each of the unique Ids defined in the terms aggregation.
You can make use of the Terms Aggregation along with the Top Hits Aggregation to get the result you are looking for.
Once you do that, specify the size as 1 in the Top Hits Aggregation.
Based on your query, I've created a sample mapping, sample documents, an aggregation query, and the response for your reference.
Mapping:
PUT mysampleindex
{
"mappings": {
"mydocs": {
"properties": {
"searchField":{
"type": "text"
},
"Id": {
"type": "keyword"
}
}
}
}
}
Sample Documents:
POST mysampleindex/mydocs/1
{
"searchField": "elasticsearch",
"Id": "1000"
}
POST mysampleindex/mydocs/2
{
"searchField": "elasticsearch is awesome",
"Id": "1000"
}
POST mysampleindex/mydocs/3
{
"searchField": "elasticsearch is awesome",
"Id": "1001"
}
POST mysampleindex/mydocs/4
{
"searchField": "elasticsearch is pretty cool",
"Id": "1001"
}
POST mysampleindex/mydocs/5
{
"searchField": "elasticsearch is pretty cool",
"Id": "1002"
}
Query:
POST mysampleindex/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"searchField": "elasticsearch"
}
}
]
}
},
"aggs": {
"myUniqueIds": {
"terms": {
"field": "Id",
"size": 10
},
"aggs": {
"myDocs": {
"top_hits": { <---- Top Hits Aggregation
"size": 1 <---- Note this
}
}
}
}
}
}
Sample Response:
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"myUniqueIds": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1000",
"doc_count": 2,
"myDocs": {
"hits": {
"total": 2,
"max_score": 0.2876821,
"hits": [
{
"_index": "mysampleindex",
"_type": "mydocs",
"_id": "1",
"_score": 0.2876821,
"_source": {
"searchField": "elasticsearch",
"Id": "1000"
}
}
]
}
}
},
{
"key": "1001",
"doc_count": 2,
"myDocs": {
"hits": {
"total": 2,
"max_score": 0.25316024,
"hits": [
{
"_index": "mysampleindex",
"_type": "mydocs",
"_id": "3",
"_score": 0.25316024,
"_source": {
"searchField": "elasticsearch is awesome",
"Id": "1001"
}
}
]
}
}
},
{
"key": "1002",
"doc_count": 1,
"myDocs": {
"hits": {
"total": 1,
"max_score": 0.2876821,
"hits": [
{
"_index": "mysampleindex",
"_type": "mydocs",
"_id": "5",
"_score": 0.2876821,
"_source": {
"searchField": "elasticsearch is pretty cool",
"Id": "1002"
}
}
]
}
}
}
]
}
}
}
Notice that I am not returning any hits from the bool query itself in the above ("size": 0); the search results you are looking for come back in the Top Hits Aggregation instead.
Hope this helps!

Incorrect output result of "aggs" query

I have a query that searches the number of entries in a given datetime window (i.e. between 2017-02-17T15:00:00.000 and 2017-02-17T16:00:00.000). When I execute this query, I get the incorrect result (it's better said that the result is unexpected):
POST /myindex/_search
{
"size": 0,
"aggs": {
"range": {
"date_range": {
"field": "Datetime",
"ranges": [
{ "to": "2017-02-17T16:00:00||-1H/H" },
{ "from": "2017-02-17T16:00:00||/H" }
]
}
}
}
}
This is the output:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 11,
"max_score": 0,
"hits": []
},
"aggregations": {
"range": {
"buckets": [
{
"key": "*-2017-02-17T15:00:00.000Z",
"to": 1487343600000,
"to_as_string": "2017-02-17T15:00:00.000Z",
"doc_count": 0
},
{
"key": "2017-02-17T16:00:00.000Z-*",
"from": 1487347200000,
"from_as_string": "2017-02-17T16:00:00.000Z",
"doc_count": 0
}
]
}
}
}
In myindex I have two entries with the following values of Datetime:
2017-02-17T15:15:00.000Z
2017-02-17T15:02:00.000Z
So, the result should be equal to 2.
I don't understand how to interpret the current output. Which field defines the number of entries?
UPDATE:
data structure:
PUT /myindex
{
"mappings": {
"intensity": {
"_all": {
"enabled": false
},
"properties": {
"Country_Id": {
"type":"keyword"
},
"Datetime": {
"type":"date"
}
}
}
}
}
sample data:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 1,
"hits": [
{
"_index": "myindex",
"_type": "intensity",
"_id": "4",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-02-18T15:01:00.000Z"
}
},
{
"_index": "myindex",
"_type": "intensity",
"_id": "6",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-03-16T16:15:00.000Z"
}
},
{
"_index": "myindex",
"_type": "intensity",
"_id": "1",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-02-17T15:15:00.000Z"
}
},
{
"_index": "myindex",
"_type": "intensity",
"_id": "7",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-03-16T16:18:00.000Z"
}
},
{
"_index": "myindex",
"_type": "intensity",
"_id": "3",
"_score": 1,
"_source": {
"Country_Id": "1",
"Datetime": "2017-02-17T15:02:00.000Z"
}
}
]
}
}
The answer that I get:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 11,
"max_score": 0,
"hits": []
},
"aggregations": {
"range": {
"buckets": [
{
"key": "2017-02-17T15:00:00.000Z-2017-02-17T16:00:00.000Z",
"from": 1487343600000,
"from_as_string": "2017-02-17T15:00:00.000Z",
"to": 1487347200000,
"to_as_string": "2017-02-17T16:00:00.000Z",
"doc_count": 0
}
]
}
}
}
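Your ranges are wrong: you defined two separate open-ended ranges (everything before 15:00 and everything from 16:00 onward), so neither of them covers the 15:00-16:00 window that contains your two documents. Put "from" and "to" in a single range instead, like this: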
POST /myindex/_search
{
"size": 0,
"aggs": {
"range": {
"date_range": {
"field": "Datetime",
"ranges": [
{
"from": "2017-02-17T16:00:00Z||-1H/H",
"to": "2017-02-17T16:00:00Z||/H"
}
]
}
}
}
}

Elasticsearch aggregation with custom query parser

I cannot seem to aggregate my query results when using my custom query parser. I get a result set, but the results are not aggregated. When using a standard query parser like match, everything turns out well.
What works:
GET pages/_search
{
"query": {
"match": {
"text": "binomial"
}
},
"aggs": {
"docs": {
"terms": {
"field": "rooturl"
}
}
}
}
returns a nice aggregated result:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 10,
"max_score": 11.11176,
"hits": [
...
{
"_index": "pages",
"_type": "doc",
"_id": "AVcq6z6lzDazctHi91RE",
"_score": 3.3503218,
"_source": {
"rooturl": "document",
"type": "equation",
"url": "document:poly",
"text": "coefficient"
}
},
{
"_index": "pages",
"_type": "doc",
"_id": "AVcq6z6xzDazctHi91RF",
"_score": 3.3503218,
"_source": {
"rooturl": "document",
"type": "equation",
"url": "document:poly",
"text": "dot"
}
}
...
]
},
"aggregations": {
"docs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "document",
"doc_count": 10
}
]
}
}
}
But when I use my custom query parser, the result is not aggregated.
Query:
GET pages/_search
{
"query": {
"my_custom_query_parser": {
"query": "binomial"
}
},
"aggs": {
"docs": {
"terms": {
"field": "rooturl"
}
}
}
}
Can anyone point me into the right direction?

Resources