I have several indices for storing my data, one per week, following the template myindex-2022-weekOfYear.
How can I find all duplicates by id across these indices?
I've tried to use aggregations (based on other questions here):
GET myindex-*/_search
{
"stored_fields": [
"myKey"
],
"size": 100,
"aggs": {
"duplicateNames": {
"terms": {
"field": "myKey",
"min_doc_count": 2
},
"aggs": {
"duplicateDocuments": {
"top_hits": {}
}
}
}
}
}
But it looks like that query is not working properly: searching for a single document by id (taken from the query result) returns it in only one index, so I assume min_doc_count is not working the way I expect.
EDIT:
I see this in the response:
"genres" : {
"doc_count_error_upper_bound" : 530,
"sum_other_doc_count" : 357290963,
"buckets" : [ ]
}
so probably shard_size is too low (and I can't really increase it, due to ES resources).
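(For reference, the terms aggregation accepts explicit size and shard_size parameters; a minimal sketch of the same query with placeholder values, assuming the cluster can afford them:)
GET myindex-*/_search
{
  "size": 0,
  "aggs": {
    "duplicateNames": {
      "terms": {
        "field": "myKey",
        "min_doc_count": 2,
        "size": 1000,
        "shard_size": 5000
      },
      "aggs": {
        "duplicateDocuments": {
          "top_hits": {
            "size": 3
          }
        }
      }
    }
  }
}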
TL;DR:
I could not find why this is not working for you, but I made a POC which shows it working correctly
(for a rather small data set).
POC
POST _bulk
{"index": {"_index": "74473038-0", "_id": "1"}}
{"data": "some dummy data", "id": 1}
{"index": {"_index": "74473038-1", "_id": "1"}}
{"data": "some dummy data", "id": 1}
{"index": {"_index": "74473038-2", "_id": "1"}}
{"data": "some dummy data", "id": 1}
{"index": {"_index": "74473038-3", "_id": "1"}}
{"data": "some dummy data", "id": 1}
{"index": {"_index": "74473038-0", "_id": "2"}}
{"data": "some dummy data", "id": 2}
{"index": {"_index": "74473038-2", "_id": "2"}}
{"data": "some dummy data", "id": 2}
{"index": {"_index": "74473038-0", "_id": "3"}}
{"data": "some dummy data", "id": 3}
{"index": {"_index": "74473038-1", "_id": "3"}}
{"data": "some dummy data", "id": 3}
{"index": {"_index": "74473038-3", "_id": "3"}}
{"data": "some dummy data", "id": 3}
{"index": {"_index": "74473038-0", "_id": "4"}}
{"data": "some dummy data", "id": 4}
{"index": {"_index": "74473038-2", "_id": "4"}}
{"data": "some dummy data", "id": 4}
{"index": {"_index": "74473038-3", "_id": "4"}}
{"data": "some dummy data", "id": 4}
{"index": {"_index": "74473038-0", "_id": "5"}}
{"data": "some dummy data", "id": 5}
GET 74473038-*/_search
{
"size": 0,
"aggs": {
"duplicateNames": {
"terms": {
"field": "id",
"min_doc_count": 2
},
"aggs": {
"duplicateDocuments": {
"top_hits": {}
}
}
}
}
}
As expected, I get the documents with ids 1, 2, 3, and 4,
omitting 5.
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 4,
"successful": 4,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 13,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"duplicateNames": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 4,
"duplicateDocuments": {
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "74473038-0",
"_id": "1",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 1
}
},
{
"_index": "74473038-1",
"_id": "1",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 1
}
},
{
"_index": "74473038-3",
"_id": "1",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 1
}
}
]
}
}
},
{
"key": 3,
"doc_count": 3,
"duplicateDocuments": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "74473038-0",
"_id": "3",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 3
}
},
{
"_index": "74473038-1",
"_id": "3",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 3
}
},
{
"_index": "74473038-3",
"_id": "3",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 3
}
}
]
}
}
},
{
"key": 4,
"doc_count": 3,
"duplicateDocuments": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "74473038-3",
"_id": "4",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 4
}
},
{
"_index": "74473038-2",
"_id": "4",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 4
}
},
{
"_index": "74473038-0",
"_id": "4",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 4
}
}
]
}
}
},
{
"key": 2,
"doc_count": 2,
"duplicateDocuments": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "74473038-2",
"_id": "2",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 2
}
},
{
"_index": "74473038-0",
"_id": "2",
"_score": 1,
"_source": {
"data": "some dummy data",
"id": 2
}
}
]
}
}
}
]
}
}
}
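If the real data set is too large for the terms aggregation's shard_size (as the huge sum_other_doc_count in the question suggests), one alternative worth considering is paging over all keys with a composite aggregation and filtering on the client side; a sketch, using the field name from the question:
GET myindex-*/_search
{
  "size": 0,
  "aggs": {
    "all_keys": {
      "composite": {
        "size": 1000,
        "sources": [
          { "myKey": { "terms": { "field": "myKey" } } }
        ]
      }
    }
  }
}
Each response returns an after_key; pass it back as "after" in the next request to page through every key without the shard_size accuracy trade-off. The composite aggregation has no min_doc_count, so keep only the buckets whose doc_count is greater than 1 on the client side.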
Related
To simplify:
PUT /test/vendors/1
{
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
PUT /test/vendors/2
{
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
PUT /test/vendors/3
{
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
Then search:
GET /test/_search
{
"query": {
"multi_match" : {
"query": "doctor in Boston",
"fields": [ "type", "place" ]
}
}
}
I understand why I get Jack, who works in San Fran -- it's because he's a doctor too. However, I can't figure out why the match score is the SAME for him. The other two were matched on the place too, weren't they? Why aren't Ron and Tom scored higher?
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.9245277,
"hits": [
{
"_index": "test",
"_type": "vendors",
"_id": "2",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "1",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "3",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
}
]
}
}
Is there a way to force it to score lower when fewer search keywords are found? Also, if I'm going the wrong way about this kind of search and there's a better pattern/way to do it -- I'd appreciate being pointed in the right direction.
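(As an aside, one way to see exactly which clauses contribute to each document's score is to re-run the same search with explain enabled; a minimal sketch:)
GET /test/_search
{
  "explain": true,
  "query": {
    "multi_match" : {
      "query": "doctor in Boston",
      "fields": [ "type", "place" ]
    }
  }
}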
Your search structure is incorrect. The search query above is ignoring the place property, and that's why you get the same score for all documents (only the type property is taken into account). The reason is that works_at is a nested mapping, which has to be treated differently when searching.
First, you should define works_at as a nested mapping (read more here). Then you'll have to adjust your query to work with that nested mapping; see an example here.
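Purely to illustrate what the answer describes, a nested mapping definition would look roughly like this (works_at is the field named in the answer; it does not appear in the sample documents above, and the index would have to be created fresh with this mapping, so treat this as a syntax sketch only):
PUT /test
{
  "mappings": {
    "vendors": {
      "properties": {
        "works_at": {
          "type": "nested",
          "properties": {
            "place": { "type": "text" }
          }
        }
      }
    }
  }
}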
GET /test/_search
{
"query": {
"multi_match" : {
"query": "doctor in Boston",
"fields": [ "type", "place" ],
"type": "most_fields" . <---- I WAS MISSING THIS
}
}
}
Once added, that gave the correct results, where the "San Fran" guy is scored lower:
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.2122098,
"hits": [
{
"_index": "test",
"_type": "vendors",
"_id": "2",
"_score": 1.2122098,
"_source": {
"type": "doctor",
"name": "Tom",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "1",
"_score": 1.2122098,
"_source": {
"type": "doctor",
"name": "Ron",
"place": "Boston"
}
},
{
"_index": "test",
"_type": "vendors",
"_id": "3",
"_score": 0.9245277,
"_source": {
"type": "doctor",
"name": "Jack",
"place": "San Fran"
}
}
]
}
}
I have several documents that look like the following stored in my Elasticsearch index:
PUT tests
{
"mappings": {
"_doc": {
"dynamic": false,
"properties": {
"objects": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"text": {
"type": "text"
}
}
}
}
}
PUT tests/_doc/1
{
"text": "lel",
"objects": ["A"]
}
PUT tests/_doc/2
{
"text": "lol",
"objects": ["B"]
}
PUT tests/_doc/3
{
"text": "lil",
"objects": ["C"]
}
PUT tests/_doc/4
{
"text": "lul",
"objects": ["A", "B", "C"]
}
I want to query for objects with the following query:
GET _search
{
"query": {
"terms": {
"objects.keyword": ["A", "B", "C"]
}
}
}
The result includes all four sample documents I provided.
My question is simply whether I can give higher importance (boost) to a document that has a full match (all keywords in the objects array) over one with only a partial match, and if so, how. I could not find any information on this in the Elasticsearch documentation.
This is the result I am currently receiving:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 11,
"successful": 11,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1,
"hits": [
{
"_index": "tests",
"_type": "_doc",
"_id": "2",
"_score": 1,
"_source": {
"text": "lol",
"objects": [
"B"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "4",
"_score": 1,
"_source": {
"text": "lul",
"objects": [
"A",
"B",
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "1",
"_score": 1,
"_source": {
"text": "lel",
"objects": [
"A"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "3",
"_score": 1,
"_source": {
"text": "lil",
"objects": [
"C"
]
}
}
]
}
}
I think your best bet is to use a bool query with should clauses and minimum_should_match: 1.
GET _search
{
"query": {
"bool": {
"should": [
{
"term": {
"objects.keyword": "A"
}
},
{
"term": {
"objects.keyword": "B"
}
},
{
"term": {
"objects.keyword": "C"
}
}
],
"minimum_should_match": 1
}
}
}
Results:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 6,
"successful": 6,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1.5686159,
"hits": [
{
"_index": "tests",
"_type": "_doc",
"_id": "4",
"_score": 1.5686159,
"_source": {
"text": "lul",
"objects": [
"A",
"B",
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "1",
"_score": 0.2876821,
"_source": {
"text": "lel",
"objects": [
"A"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "3",
"_score": 0.2876821,
"_source": {
"text": "lil",
"objects": [
"C"
]
}
},
{
"_index": "tests",
"_type": "_doc",
"_id": "2",
"_score": 0.18232156,
"_source": {
"text": "lol",
"objects": [
"B"
]
}
}
]
}
}
EDIT: Here's why, as explained by the docs (https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html):
The bool query takes a more-matches-is-better approach, so the score from each matching must or should clause will be added together to provide the final _score for each document.
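If you want the ranking to depend only on how many of the keywords match, rather than on term frequencies and field lengths, a variation worth trying wraps each term in a constant_score clause; with this sketch (same tests index as above), a full match scores about 3.0 and a single match about 1.0:
GET tests/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "constant_score": {
            "filter": { "term": { "objects.keyword": "A" } },
            "boost": 1
          }
        },
        {
          "constant_score": {
            "filter": { "term": { "objects.keyword": "B" } },
            "boost": 1
          }
        },
        {
          "constant_score": {
            "filter": { "term": { "objects.keyword": "C" } },
            "boost": 1
          }
        }
      ],
      "minimum_should_match": 1
    }
  }
}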
Currently the ES logs are indexed in a way that some fields have a list instead of a single value.
For example:
"_source": {
  "field1": ["item1", "item2", "item3"],
  "field2": "something",
  "field3": "something_else"
}
Of course, the length of the list is not always the same. I'm trying to find a way to count, for each item, the number of logs that contain it (so some logs will be counted multiple times).
I know I have to use aggs, but how can I form the right query (the JSON body after -d)?
You can use the query below, which uses a terms aggregation (with a script) and top_hits.
{
"size": 0,
"aggs": {
"group": {
"terms": {
"script": "_source.field1.each{}"
},
"aggs":{
"top_hits_log" :{
"top_hits" :{
}
}
}
}
}
}
Output will be:
"buckets": [
{
"key": "item1",
"doc_count": 3,
"top_hits_log": {
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
},
{
"_index": "so",
"_type": "test",
"_id": "2",
"_score": 1,
"_source": {
"field1": [
"item1"
],
"field2": "something2"
}
},
{
"_index": "so",
"_type": "test",
"_id": "3",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2"
],
"field2": "something3"
}
}
]
}
}
},
{
"key": "item2",
"doc_count": 2,
"top_hits_log": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
},
{
"_index": "so",
"_type": "test",
"_id": "3",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2"
],
"field2": "something3"
}
}
]
}
}
},
{
"key": "item3",
"doc_count": 1,
"top_hits_log": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "so",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"field1": [
"item1",
"item2",
"item3"
],
"field2": "something1"
}
}
]
}
}
}
]
Make sure dynamic scripting is enabled: set script.disable_dynamic: false (in elasticsearch.yml).
Hope this helps.
There is no need to use scripting; it will be slow, especially the _source parsing. You also need to make sure field1 is not_analyzed, or you will get odd results, since the terms aggregation works on the unique tokens in the inverted index.
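For reference, a mapping along these lines keeps field1 as exact, unanalyzed values (index and type names mirror the example output above; on Elasticsearch 5.x and later you would use a keyword field instead):
PUT so
{
  "mappings": {
    "test": {
      "properties": {
        "field1": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}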
{
"size": 0,
"aggs": {
"unique_items": {
"terms": {
"field": "field1",
"size": 100
},
"aggs": {
"documents": {
"top_hits": {
"size": 10
}
}
}
}
}
}
Here the size is 100 inside the terms aggregation; change this according to how many unique values you think you have (the default is 10).
Hope this helps!
I am using Elasticsearch via NEST (C#). I have a large list of information about people:
{
  "firstName": "Frank",
  "lastName": "Jones",
  "City": "New York"
}
I'd like to be able to filter and sort this list by lastName, and also order by name length, so that people with only 5 characters in their last name appear at the beginning of the result set, before people with 10 characters.
So, in pseudo code, I'd like to do something like:
list.wildcard("j*").sort(m => lastName.length)
You can do the sorting with script-based sorting.
As a toy example, I set up a trivial index with a few documents:
PUT /test_index
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"name":"Bob"}
{"index":{"_id":2}}
{"name":"Jeff"}
{"index":{"_id":3}}
{"name":"Darlene"}
{"index":{"_id":4}}
{"name":"Jose"}
Then I can order search results like this:
POST /test_index/_search
{
"query": {
"match_all": {}
},
"sort": {
"_script": {
"script": "doc['name'].value.length()",
"type": "number",
"order": "asc"
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": null,
"_source": {
"name": "Bob"
},
"sort": [
3
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
To filter by length, I can use a script filter in a similar way:
POST /test_index/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"script": {
"script": "doc['name'].value.length() > 3",
"params": {}
}
}
}
},
"sort": {
"_script": {
"script": "doc['name'].value.length()",
"type": "number",
"order": "asc"
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"name": "Jose"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": null,
"_source": {
"name": "Jeff"
},
"sort": [
4
]
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"name": "Darlene"
},
"sort": [
7
]
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/22fef6dc5453eaaae3be5fb7609663cc77c43dab
P.S.: If any of the last names contain spaces, you might want to use "index": "not_analyzed" on that field.
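A sketch of what that might look like for the toy index above (recreating test_index with an explicit mapping; the not_analyzed syntax matches the pre-5.x versions used here):
PUT /test_index
{
  "mappings": {
    "doc": {
      "properties": {
        "name": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}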
After a lot of reading, I cannot tell whether this kind of query is possible with Elasticsearch. I found the "Getting Started" guide really excellent, but the rest of the guide lacks examples (from my point of view).
See my structure below. I need to retrieve all ids that are not in my blacklist. My blacklist is a list of reference ids. For this example I am id 1, with the firstname "me". In the structure you can see that I blacklisted "bob", so bob's id (2) is in my blacklist array, because I don't want to find bob in my search results. :)
Is it possible to retrieve (dynamically, of course) only the ids that are not in my blacklist, in one query?
If you come from SQL, the same logic could be:
SELECT id FROM index WHERE id NOT IN (SELECT * FROM blacklist WHERE id = 1)
I would like to avoid a two-step query. If my schema is bad and should be reconsidered, I'm totally open to advice or suggestions.
Here is the structure :
{
  "id": 1,
  "balance": 16623,
  "firstname": "me",
  "blacklist": [2, 1982, 939, 1982, 98716, 7611, 983838, and thousands of others ...]
}
{
  "id": 2,
  "balance": 16623,
  "firstname": "bob",
  "blacklist": [18, 1982, 939, 1982, 98716, 7611, 983838, and thousands of others ...]
}
{
  "id": 3,
  "balance": 16623,
  "firstname": "jhon",
  "blacklist": [18, 1982, 939, 1982, 98716, 7611, 983838, and thousands of others ...]
}
You can use a terms lookup filter together with a not filter, as follows.
I set up the index with the three docs you have listed:
DELETE /test_index
PUT /test_index
PUT /test_index/doc/1
{
"id": 1,
"balance": 16623,
"firstname": "me",
"blacklist" : [2,1982,939,1982,98716,7611,983838]
}
PUT /test_index/doc/2
{
"id": 2,
"balance": 16623,
"firstname": "bob",
"blacklist" : [18,1982,939,1982,98716,7611,983838]
}
PUT /test_index/doc/3
{
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist" : [18,1982,939,1982,98716,7611,983838]
}
Then set up a query that filters out docs that are in the blacklist for "me":
POST /test_index/doc/_search
{
"filter": {
"not": {
"filter": {
"terms": {
"id": {
"index": "test_index",
"type": "doc",
"id": "1",
"path": "blacklist"
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"id": 1,
"balance": 16623,
"firstname": "me",
"blacklist": [2,1982,939,1982,98716,7611,983838]
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist": [18,1982,939,1982,98716,7611,983838]
}
}
]
}
}
If you also want to filter out the user whose blacklist is being used, you can set up a slightly more complex filter using or:
POST /test_index/doc/_search
{
"filter": {
"not": {
"filter": {
"or": {
"filters": [
{
"terms": {
"id": {
"index": "test_index",
"type": "doc",
"id": "1",
"path": "blacklist"
}
}
},
{
"term": {
"id": "1"
}
}
]
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"id": 3,
"balance": 16623,
"firstname": "john",
"blacklist": [18,1982,939,1982,98716,7611,983838]
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/0b6808414f9447d4f7d23eb4c0d3e937ec2ea4e7
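For what it's worth, the not and or filters (and mapping types) were removed in later Elasticsearch versions; on a recent cluster the same idea would look roughly like this (a sketch, assuming the same test_index with a blacklist field per document):
POST /test_index/_search
{
  "query": {
    "bool": {
      "must_not": [
        {
          "terms": {
            "id": {
              "index": "test_index",
              "id": "1",
              "path": "blacklist"
            }
          }
        },
        {
          "term": {
            "id": 1
          }
        }
      ]
    }
  }
}
The terms lookup excludes any document whose id appears in document 1's blacklist, and the extra term clause also excludes document 1 itself, mirroring the or-based example above.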