Why doesn't Elasticsearch return the right results? - elasticsearch

I'm using Elasticsearch 6.2 configured with one cluster of 2 nodes.
GET _cluster/health:
{
"cluster_name": "cluster_name",
"status": "green",
"timed_out": false,
"number_of_nodes": 2,
"number_of_data_nodes": 2,
"active_primary_shards": 47,
"active_shards": 94,
"relocating_shards": 0,
"initializing_shards": 0,
"unassigned_shards": 0,
"delayed_unassigned_shards": 0,
"number_of_pending_tasks": 0,
"number_of_in_flight_fetch": 0,
"task_max_waiting_in_queue_millis": 0,
"active_shards_percent_as_number": 100
}
GET myindex/_settings:
{
"myindex": {
"settings": {
"index": {
"number_of_shards": "3",
"analysis": {
"analyzer": {
"url_split_analyzer": {
"filter": "lowercase",
"tokenizer": "url_split"
}
},
"tokenizer": {
"url_split": {
"pattern": "[^a-zA-Z0-9]",
"type": "pattern"
}
}
},
"number_of_replicas": "1",
"version": {
"created": "6020499"
}
}
}
}
}
here a snapshot of the _mappings structure:
"myindex": {
"mappings": {
"mytype": {
"properties": {
"#timestamp": {
"type": "date"
},
............
"active": {
"type": "short"
},
"id_domain": {
"type": "short",
"ignore_malformed": true
},
"url": {
"type": "text",
"similarity": "boolean",
"analyzer": "url_split_analyzer"
}
}
.......
By chance I found documents in my index that I cannot retrieve when I query the index using the id_domain property.
For example:
GET /myindex/mytype/_search
{
"query": {
"bool": {
"must": [
{
"match": { "active": 1 }
}
]
}
}
}
output example:
{
"_index": "myindex",
"_type": "mytype",
"_id": "myurl",
"_score": 1,
"_source": {
"id_domain": "73993",
"active": 1,
"url": "myurl",
"#timestamp": "2018-05-21T10:55:16.247Z"
}
}
....
This returns a list of documents containing id_domain values that I cannot find when I query against that id_domain, like this:
GET /myindex/mytype/_search
{
"query": {
"match": {
"id_domain": 73993 // with or without " got the same result
}
}
}
output
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
I cannot understand why this happens.
I also tried to reindex the index but I got the same result.
I am convinced that I'm missing something.
Is there any reason about that behaviour?
Thank you

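In your mapping, id_domain has type short, but in your document you have a value that is out of the bounds for short values ([-32,768 to 32,767]), i.e. 73993. Because the field is mapped with "ignore_malformed": true, the out-of-range value is silently skipped at index time: the document is still indexed and the value still appears in _source, but nothing is indexed for id_domain, so a query on that field cannot match it.
You need to change the type to integer (which requires creating the index with the new mapping and reindexing the data) and all will be fine.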

Related

Elasticsearch terms aggregation returns no buckets

New elasticsearch user here and having an issue with a terms aggregation.
I have indexed 187 documents with fields like "name","host","risk" etc.
The field risk has 5 unique values ("Critical","High","Medium","Low","Informational")
I am running a terms aggregations like this:
POST http://localhost:9200/{index_name}/_search?size=0
{
"aggs":{
"riskCount":{
"terms":{
"field":"risk.keyword"
}
}
}
}
I was expecting a result stating that I have x of Critical, x of High, etc.
The thing is, I get no buckets returned.
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 187,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"riskCount": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
My Elasticsearch version is 7.12.0. Any ideas?
Edit:
So, here's the mapping:
"findings": {
"mappings": {
"properties": {
"date_uploaded": {
"type": "date"
},
"host": {
"type": "text"
},
"name": {
"type": "text"
},
"risk": {
"type": "text"
}
}
}
}
And here's the document:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 187,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "findings",
"_type": "_doc",
"_id": "f86b6b5b-f09e-4350-9a66-d88a3a78f640",
"_score": 1.0,
"_source": {
"risk": "Informational",
"name": "HTTP Server Type and Version",
"host": "10.10.9.10",
"date_uploaded": "2021-05-07T19:39:10.810663+00:00"
}
}
]
}
}
Since the risk field is of text type, you need to update your index mapping as
PUT /_mapping
{
"properties": {
"risk": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
}
}
}
Then run the update_by_query API to reindex the data
You don't have any risk.keyword field in your mapping. You need to change your mapping as follows. Just run the following command to update your mapping and create the risk.keyword sub-field:
PUT index-name/_mapping
{
"properties": {
"date_uploaded": {
"type": "date"
},
"host": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"risk": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
}
}
}
Then reindex your data using this command:
POST index-name/_update_by_query
And then your query can be run like this:
{
"aggs":{
"riskCount":{
"terms":{
"field":"risk.keyword"
}
}
}
}

Phrase suggester Elasticsearch not correcting two words

I have the following mapping to my phrase suggester:
{
"settings": {
"analysis": {
"analyzer": {
"suggests_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"shingle_filter"
],
"type": "custom"
}
},
"filter": {
"shingle_filter": {
"min_shingle_size": 2,
"max_shingle_size": 6,
"type": "shingle"
}
}
}
},
"mappings": {
"sample_data": {
"properties": {
"name": {
"type": "string",
"analyzer": "suggests_analyzer"
}
}
}
}
}
I have "lung cancer", "colorectal cancer", "breast cancer" indexed in my index. But when I send a misspelt query in which both words are misspelt, like "lhng cancar", it returns zero results when I use the collate functionality. My sample query is as follows.
{
"suggest": {
"text": "lhng cancar",
"simple_phrase": {
"phrase": {
"field": "name",
"size": 5,
"real_word_error_likelihood": 0.95,
"max_errors": 0.5,
"direct_generator": [
{
"field": "name",
"suggest_mode": "always",
"size": 5
}
],
"collate": {
"query": {
"inline": {
"match_phrase": {
"{{field_name}}": "{{suggestion}}"
}
}
},
"params": {
"field_name": "name"
},
"prune": false
}
}
}
},
"size": 0
}
The response to the above query is:
{
"took": 17,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1868381,
"max_score": 0,
"hits": []
},
"suggest": {
"simple_phrase": [
{
"text": "lhng cancar",
"offset": 0,
"length": 11,
"options": []
}
]
}
}
What changes do I need to make to the query so that I get the expected result "lung cancer" in the suggestions?
You have to raise max_errors to 0.8 or more.
Same answer is given here
ElasticSearch - Phrase Suggestor
Raising the parameter of max_errors: 2 solved my problem.

Elastic Search : Restricting the search result in array

My index metadata :
{
"never": {
"aliases": {},
"mappings": {
"userDetails": {
"properties": {
"Residence_address": {
"type": "nested",
"include_in_parent": true,
"properties": {
"Address_type": {
"type": "string",
"analyzer": "standard"
},
"Pincode": {
"type": "string",
"analyzer": "standard"
},
"address": {
"type": "string",
"analyzer": "standard"
}
}
}
}
}
},
"settings": {
"index": {
"creation_date": "1468850158519",
"number_of_shards": "5",
"number_of_replicas": "1",
"version": {
"created": "1060099"
},
"uuid": "v2njuC2-QwSau4DiwzfQ-g"
}
},
"warmers": {}
}
}
My setting :
POST never
{
"settings": {
"number_of_shards" : 5,
"analysis": {
"analyzer": {
"standard": {
"tokenizer": "keyword",
"filter" : ["lowercase","reverse"]
}
}
}
}
}
My data :
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.375,
"hits": [
{
"_index": "never",
"_type": "userDetails",
"_id": "1",
"_score": 0.375,
"_source": {
"Residence_address": [
{
"address": "Omega Residency",
"Address_type": "Owned",
"Pincode": "500004"
},
{
"address": "Collage of Engineering",
"Address_type": "Rented",
"Pincode": "411005"
}
]
}
}
]
}
}
My query :
POST /never/_search?pretty
{
"query": {
"match": {
"Residence_address.address": "Omega"
}
}
}
My Result :
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.375,
"hits": [
{
"_index": "never",
"_type": "userDetails",
"_id": "1",
"_score": 0.375,
"_source": {
"Residence_address": [
{
"address": "Omega Residency",
"Address_type": "Owned",
"Pincode": "500004"
},
{
"address": "Collage of Engineering",
"Address_type": "Rented",
"Pincode": "411005"
}
]
}
}
]
}
}
Is there any way to restrict my result to only object containing address = Omega Residency and NOT the other object having address = Collage of Engineering?
You can only do it with nested query and inner_hits. I see that you have include_in_parent: true and not using nested queries though. If you only want to get the matched nested objects you'd need to use inner_hits from nested queries:
GET /never/_search?pretty
{
"_source": false,
"query": {
"nested": {
"path": "Residence_address",
"query": {
"match": {
"Residence_address.address": "Omega Residency"
}
},
"inner_hits" : {}
}
}
}

elasticsearch: term query fails

I have a mapping for some documents, and term queries against some fields fail. I don't understand why:
"mappings":{
"timeslot":{
"properties":{
"FOB_IN":{
"type":"long"
},
"TRIGGER_CODE":{
"type":"long"
},
"FLIGHT_PHASE":{
"type":"long"
},
"REP16_TRIG":{
"type":"long"
},
"fwot":{
"type":"string"
},
"FOB_OUT":{
"type":"long"
},
"FP":{
"type":"long"
},
"FLTNB":{
"type":"string"
},
"Date":{
"format":"strict_date_optional_time||epoch_millis",
"type":"date"
}
}
}
}
I can make a term query against TRIGGER_CODE, for example, and it works fine
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 4.4446826,
"hits": [
{
"_index": "merged-2016-04",
"_type": "timeslot",
"_id": "AVRS8VnirVLwfvMnwpXb",
"_score": 4.4446826,
"_source": {
"Date": "2016-04-03T08:42:44+0000",
"FLIGHT_PHASE": 20,
"TRIGGER_CODE": 4000,
"fwot": "A6-APA"
}
}
]
}
}
Now the same query against fwot fails. What's wrong?
GET merged-2016-04/_search?size=1
{
"query" : {
"term" : { "fwot": "A6-APA"}
}
}
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
You need fwot to be "index": "not_analyzed" for that to work. And you need to reindex the data for the above change to work.
Here's the complete list of commands for the mapping change and some test data:
PUT /merged-2016-04
{
"mappings": {
"timeslot": {
"properties": {
"FOB_IN": {
"type": "long"
},
"TRIGGER_CODE": {
"type": "long"
},
"FLIGHT_PHASE": {
"type": "long"
},
"REP16_TRIG": {
"type": "long"
},
"fwot": {
"type": "string",
"index": "not_analyzed"
},
"FOB_OUT": {
"type": "long"
},
"FP": {
"type": "long"
},
"FLTNB": {
"type": "string"
},
"Date": {
"format": "strict_date_optional_time||epoch_millis",
"type": "date"
}
}
}
}
}
POST /merged-2016-04/timeslot
{
"Date": "2016-04-03T08:42:44+0000",
"FLIGHT_PHASE": 20,
"TRIGGER_CODE": 4000,
"fwot": "A6-APA"
}
GET merged-2016-04/_search?size=1
{
"query": {
"term": {
"fwot": "A6-APA"
}
}
}
See the doc page Query DSL term query, note "Why doesn’t the term query match my document" for a detailed explanation.
We can also query the keyword sub-field, provided the mapping defines one:
GET merged-2016-04/_search?size=1
{
"query": {
"term": {
"fwot.keyword": "A6-APA"
}
}
}

Querying a string consisting exactly a part of a query

I have a field named "lang" which contains values such as "en_US", "en_GB", "ru_RU", etc., with this mapping:
"lang": {
"type": "string",
"index": "not_analyzed",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
How to filter for documents, e.g. from "US"?
One way you can do it is to change "index": "not_analyzed" on the upper-level field, and set up a pattern analyzer for that field. Since you already have the "lang.raw" field set up, you'll still be able to get the untouched version for faceting or whatever.
So, to test it I set up an index like this:
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"whitespace_underscore": {
"type": "pattern",
"pattern": "[\\s_]+",
"lowercase": false
}
}
}
},
"mappings": {
"doc": {
"properties": {
"name": {
"type": "string"
},
"lang": {
"type": "string",
"index_analyzer": "whitespace_underscore",
"search_analyzer": "standard",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"ignore_above": 256
}
}
}
}
}
}
}
And added a few docs:
POST /test_index/doc/_bulk
{"index":{"_id":1}}
{"name":"doc1","lang":"en_US"}
{"index":{"_id":2}}
{"name":"doc2","lang":"en_GB"}
{"index":{"_id":3}}
{"name":"doc3","lang":"ru_RU"}
Now I can filter by "US" like this:
POST /test_index/_search
{
"query": {
"filtered": {
"filter": {
"term": {
"lang": "US"
}
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"name": "doc1",
"lang": "en_US"
}
}
]
}
}
And I can still get a list of values with a terms aggregation on "lang.raw":
POST /test_index/_search?search_type=count
{
"aggs": {
"lang_terms": {
"terms": {
"field": "lang.raw"
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"lang_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en_GB",
"doc_count": 1
},
{
"key": "en_US",
"doc_count": 1
},
{
"key": "ru_RU",
"doc_count": 1
}
]
}
}
}
Here is the code I used to test it:
http://sense.qbox.io/gist/ac3f3fd66ea649c0c3a8010241d1f6981a7e012c

Resources