Elasticsearch equivalent SQL In Query with multiple fields - elasticsearch

I am looking for a Elasticsearch equivalent SQL In query like below
SELECT * from table
WHERE (section , class) in (('a','1'), ('b', '2'))
I know how to use In query for single fields in elasticsearch
SELECT * FROM table WHERE class IN ('1', '2');
Elasticsearch query -
{
"query" : {
"bool" : {
"filter" : {
"terms" : {
"class" : ['1', '2']
}
}
}
}
}
My actual problem statement -
Sample index data :
[
{
"_index" : "some_index",
"_type" : "_doc",
"_id" : "41",
"_score" : 1.0,
"_source" : {
"class" : "1",
"section" : "a",
"attribute_3" : "hello world"
},
{
"_index" : "some_index",
"_type" : "_doc",
"_id" : "42",
"_score" : 1.0,
"_source" : {
"class" : "2",
"section" : "a",
"attribute_3" : "hello world"
},
{
"_index" : "some_index",
"_type" : "_doc",
"_id" : "43",
"_score" : 1.0,
"_source" : {
"class" : "1",
"section" : "b",
"attribute_3" : "hello world"
},
{
"_index" : "some_index",
"_type" : "_doc",
"_id" : "44",
"_score" : 1.0,
"_source" : {
"class" : "2",
"section" : "b",
"attribute_3" : "hello world"
},
{
"_index" : "some_index",
"_type" : "_doc",
"_id" : "45",
"_score" : 1.0,
"_source" : {
"class" : "3",
"section" : "b",
"attribute_3" : "hello world"
}
]
I want to use a filter on data where (class is 1 AND section is a) OR (class is 2 AND section is b)
Note : I'm preparing this 'OR' combination dynamically and its going to be more than two combinations.
My expected search result should be -
[{
"_index" : "some_index",
"_type" : "_doc",
"_id" : "41",
"_score" : 1.0,
"_source" : {
"class" : "1",
"section" : "a",
"attribute_3" : "hello world"
},
{
"_index" : "some_index",
"_type" : "_doc",
"_id" : "44",
"_score" : 1.0,
"_source" : {
"class" : "2",
"section" : "b",
"attribute_3" : "hello world"
}]

This would translate to:
{
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"term": {
"a": 0
}
},
{
"term": {
"b": 9
}
}
]
}
},
{
"bool": {
"must": [
{
"term": {
"a": 0
}
},
{
"term": {
"b": 4
}
}
]
}
}
]
}
}
}
but if a is always 0 as you mentioned in the example the query can be rephrased to:
{
"query": {
"bool": {
"must": [
{
"term": {
"a": 0
}
},
{
"terms": {
"b": [
9,
4
]
}
}
]
}
}
}

Related

Order results by smallest absolute difference from input

Can elasticsearch find the closest number to an input?
Example: I have apartments with 1, 2, 5, 6 and 10 rooms. I want a search for apartments with 5 rooms to order results by absolute difference (e.g. |6-5| = 1, |2-5| = 3 etc)
What I want to see: 5, 6, 2, 1, 10.
GET appartaments/_search
{
"query": {
"bool": {
"must":[
{
"match":{
"properties.id":1
}
},
{
"match":{
"properties.value":"5"
}
}
]
}
}
}
You can probably achieve what you want using script-based sorting:
GET appartaments/_search
{
"sort": {
"_script": {
"type": "number",
"script": {
"lang": "painless",
"source": "Math.abs(params.value - Integer.parseInt(doc['properties.value.keyword'].value))",
"params": {
"value": 5
}
},
"order": "asc"
}
},
"query": {
"bool": {
"must": [
{
"match": {
"properties.id": 1
}
}
]
}
}
}
Results =>
"hits" : [
{
"_index" : "appartaments",
"_type" : "_doc",
"_id" : "5",
"_score" : null,
"_source" : {
"properties" : {
"id" : 1,
"value" : "5"
}
},
"sort" : [
0.0
]
},
{
"_index" : "appartaments",
"_type" : "_doc",
"_id" : "6",
"_score" : null,
"_source" : {
"properties" : {
"id" : 1,
"value" : "6"
}
},
"sort" : [
1.0
]
},
{
"_index" : "appartaments",
"_type" : "_doc",
"_id" : "2",
"_score" : null,
"_source" : {
"properties" : {
"id" : 1,
"value" : "2"
}
},
"sort" : [
3.0
]
},
{
"_index" : "appartaments",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"properties" : {
"id" : 1,
"value" : "1"
}
},
"sort" : [
4.0
]
},
{
"_index" : "appartaments",
"_type" : "_doc",
"_id" : "10",
"_score" : null,
"_source" : {
"properties" : {
"id" : 1,
"value" : "10"
}
},
"sort" : [
5.0
]
}
]
}

Search as per the relevance in Elastic Search

We are trying to apply a fuzzy search on zipcodes using following analyzer
PUT test_index
{
"settings": {
"index": {
"max_ngram_diff": 40
},
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "whitespace",
"filter": [
"lowercase",
"autocomplete"
]
},
"autocomplete_search": {
"tokenizer": "whitespace",
"filter": [
"lowercase"
]
}
},
"filter": {
"autocomplete": {
"type": "ngram",
"min_gram": 2,
"max_gram": 40
}
}
}
},
"mappings": {
"properties": {
"zipcode": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
Sample data in the index is as follows
PUT test_index/_doc/1
{ "zipcode": "01103" }
PUT test_index/_doc/2
{ "zipcode": "01104" }
PUT test_index/_doc/3
{ "zipcode": "11010" }
PUT test_index/_doc/4
{ "zipcode": "11016" }
PUT test_index/_doc/5
{ "zipcode": "11020" }
PUT test_index/_doc/6
{ "zipcode": "01107" }
PUT test_index/_doc/7
{ "zipcode": "11024" }
PUT test_index/_doc/8
{ "zipcode": "04110" }
Search query used on zipcode field is as follows :
GET test_index/_search
{
"query": {
"match": {
"zipcode": {
"query": "110",
"operator": "and"
}
}
}
}
We expected the data to be returned as per the more relevant one which is :
11010
11020
11024
11016
01103
01104
01107
but the actual data is returned is in this order .How can we boost the documents starting from 110.. to appear first
{
"_index" : "test_index",
"_type" : "_doc",
"_id" : "4",
"_score" : 0.45532414,
"_source" : {
"zipcode" : "11016"
}
},
{
"_index" : "test_index",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.45532414,
"_source" : {
"zipcode" : "01103"
}
},
{
"_index" : "test_index",
"_type" : "_doc",
"_id" : "3",
"_score" : 0.2885665,
"_source" : {
"zipcode" : "11010"
}
},
{
"_index" : "test_index",
"_type" : "_doc",
"_id" : "5",
"_score" : 0.2885665,
"_source" : {
"zipcode" : "11020"
}
},
{
"_index" : "test_index",
"_type" : "_doc",
"_id" : "7",
"_score" : 0.2885665,
"_source" : {
"zipcode" : "11024"
}
},
{
"_index" : "test_index",
"_type" : "_doc",
"_id" : "8",
"_score" : 0.2885665,
"_source" : {
"zipcode" : "04110"
}
},
{
"_index" : "test_index",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.2885665,
"_source" : {
"zipcode" : "01104"
}
},
{
"_index" : "test_index",
"_type" : "_doc",
"_id" : "6",
"_score" : 0.2885665,
"_source" : {
"zipcode" : "01107"
}
}
Following Query gives the order as expected but i am not sure which one should i use for my use case
GET test_index/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"zipcode": "110"
}
},
{
"match_phrase_prefix": {
"zipcode": "110"
}
}
]
}
}
}
vs
GET test_index/_search
{
"query": {
"prefix" : { "zipcode" : "110" }
}
}
Add a should clause with prefix query. It will give higher score to documents which start with that prefix.
Query
{
"query": {
"bool": {
"should": [
{
"match": {
"zipcode": {
"query": "110",
"operator": "and"
}
}
},
{
"prefix": {
"zipcode": "110"
}
}
]
}
}
}
Result
"hits" : [
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "3",
"_score" : 5.0441885,
"_source" : {
"zipcode" : "11010"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "4",
"_score" : 5.0441885,
"_source" : {
"zipcode" : "11016"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "5",
"_score" : 5.0441885,
"_source" : {
"zipcode" : "11020"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "7",
"_score" : 5.0441885,
"_source" : {
"zipcode" : "11024"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "1",
"_score" : 3.0168114,
"_source" : {
"zipcode" : "01103"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "2",
"_score" : 3.0168114,
"_source" : {
"zipcode" : "01104"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "6",
"_score" : 3.0168114,
"_source" : {
"zipcode" : "01107"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "8",
"_score" : 0.18093312,
"_source" : {
"zipcode" : "04110"
}
}
]

Date histogram with different Timezones with Elasticsearch v.7.13.3

I have to acquire datas from different part of Timezones (example New York -6.00 and Rome +2.00).
In the document I have a field 'timestamp' defined as "data" and I have for example create a "date_histogram" for example from 8.00 AM to 9.00 AM. How can I match the USA 8.00-9.00 and the ITA 8.00-9.00 datas in order to compare the two data from the same period?
This is my datas with two different fuse. 2 from USA and 2 from ITA:
"hits" : [
{
"_index" : "test-data-2021-8-4",
"_type" : "_doc",
"_id" : "9tS4EHsB4Ke8qtFfYqbg",
"_score" : 1.0,
"_source" : {
"id" : "mtKDIsEfSr3I8AwCDE1Gjw_11",
"value" : 87.2,
"timestamp" : "2021-08-04T12:32:04+02:00"
}
},
{
"_index" : "test-data-2021-8-4",
"_type" : "_doc",
"_id" : "99S4EHsB4Ke8qtFfYqbg",
"_score" : 1.0,
"_source" : {
"id" : "mtKDIsEfSr3I8AwCDE1Gjw_5",
"value" : 31.0025,
"timestamp" : "2021-08-04T12:32:04+02:00"
}
},
{
"_index" : "test-data-2021-8-4",
"_type" : "_doc",
"_id" : "wdOREHsB4Ke8qtFfuZAf",
"_score" : 1.0,
"_source" : {
"id" : "mtKDIsEfSr3I8AwCDE1Gjw_11",
"value" : 15.1,
"timestamp" : "2021-08-04T05:49:50-04:00"
}
},
{
"_index" : "test-data-2021-8-4",
"_type" : "_doc",
"_id" : "wtOREHsB4Ke8qtFfuZAg",
"_score" : 1.0,
"_source" : {
"id" : "mtKDIsEfSr3I8AwCDE1Gjw_5",
"value" : 27.9457,
"timestamp" : "2021-08-04T05:49:50-04:00"
}
}
]
This is my date_histogram query:
GET /test-data-*/_search?size=10000
{
"aggs": {
"agg_sum": {
"date_histogram": {
"field": "timestamp",
"fixed_interval": "1h"
},
"aggs": {
"aggregazione": {
"sum": {
"field": "value"
}
}
}
}
},
"size": 0,
"fields": [
{
"field": "timestamp",
"format": "date_time"
}
],
"stored_fields": [
"*"
],
"query": {
"bool": {
"must": [],
"filter": [
{
"range": {
"timestamp": {
"gte": "2021-08-04T08:00:00.000",
"lte": "2021-08-04T09:00:00.000",
"format": "strict_date_optional_time"
}
}
}
],
"should": [],
"must_not": []
}
}
}
Thanks in advance for the response.

How to access the date_histogram key field in the child aggregation in elasticsearch?

I want to apply some filters on the bucket response generated by the date_histogram, that filter is dependent on the key of the date_histogram output buckets.
Suppose I have following data in
{
"entryTime":"",
"soldTime:""
}
the elastic query is something like this
{
"aggs": {
"date": {
"date_histogram": {
"field": "entryTime",
"interval": "month",
"keyed": true
},
"aggs": {
"filter_try": {
"filter": {
"bool": {
"must": [
{
"range": {
"entryTime": {
"lte": 1588840533000
}
}
},
{
"bool": {
"should": [
{
"bool": {
"must": [
{
"exists": {
"field": "soldTime"
}
},
{
"range": {
"soldTime": {
"gt": 1588840533000
}
}
}
]
}
},
{
"bool": {
"must_not": [
{
"exists": {
"field": "soldTime"
}
}
]
}
}
]
}
}
]
}
}
}
}
}
}
}
so here in that bool query, I want to use the date generated for the specific bucket by date_histogram aggregation in both the range clauses instead of the hardcoded epoch time.
Even if we can access using script then also it's fine.
for further clarification, this is the boolean query and in the query want to replace this "DATE" with the date_histogram bucket key.
# (entryTime < DATE)
# AND
# (
# (soldTime != null AND soldTime > DATE)
# OR
# (soldTime == NULL)
# )
Consider below 10 Document I have:
"hits" : [
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1577869200000",
"soldTime" : "1578646800000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1578214800000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1578560400000",
"soldTime" : "1579942800000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "4",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1579683600000",
"soldTime" : "1581325200000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "5",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1580893200000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "6",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1582189200000",
"soldTime" : "1582362000000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "7",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1582621200000",
"soldTime" : "1584349200000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "8",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1583053200000",
"soldTime" : "1583830800000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "9",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1584262800000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "10",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1585472400000"
}
}
]
Now the end of January 2020 in epoch is -> 1580515199000
So if I apply on the above-mentioned bool query,
Will get the output as the
"hits" : [
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "4",
"_score" : 3.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1579683600000",
"soldTime" : "1581325200000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1578214800000"
}
}
]
As document with ID 4 satisfy (soldTime != null AND soldTime > DATE) and document with ID 2 satisfy (soldTime == null) condition from OR part.
Now for the same bool request If I use the date of end February 2020 -> 1583020799000, will get the hits as follows
"hits" : [
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "7",
"_score" : 3.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1582621200000",
"soldTime" : "1584349200000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1578214800000"
}
},
{
"_index" : "vi_test",
"_type" : "_doc",
"_id" : "5",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1580893200000"
}
}
]
ID 7: Entry in Feb, but sold in March so is in stock for Feb-2020
ID 2: Entry in Jan, not sold yet means in the stock
ID 5: Entry in Feb, not sold yet means in the stock
Now the same data required for each end of the month of a whole year to plot the trend.
Thank you
I couldn't find a way using normal queries as parent aggregation key is not available in sub aggregation. I have written a script for this which selects documents where soldTime is either null or doesnot fall in same month as entryTime
Query:
{
"query": {
"script": {
"script": """
ZonedDateTime entry;
ZonedDateTime sold;
if(doc['entryTime'].size()>0)
{
entry= doc['entryTime'].value;
}
if(doc['soldTime'].size()>0)
{
sold = doc['soldTime'].value;
}
if(sold==null || ( entry.getMonthValue()!==sold.getMonthValue()|| entry.getYear()!==sold.getYear()))
{
return true;
}
else false;
"""
}
},
"size": 10,
"aggs": {
"monthly_trend": {
"date_histogram": {
"field": "entryTime",
"interval": "month"
},
"aggs": {
"docs": {
"top_hits": {
"size": 10
}
}
}
}
}
}
Result:
"hits" : [
{
"_index" : "index22",
"_type" : "_doc",
"_id" : "55Kv83EB8a54AbXfngYU",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1578214800000"
}
}
]
},
"aggregations" : {
"monthly_trend" : {
"buckets" : [
{
"key_as_string" : "2020-01-01T00:00:00.000Z",
"key" : 1577836800000,
"doc_count" : 1,
"docs" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "index22",
"_type" : "_doc",
"_id" : "55Kv83EB8a54AbXfngYU",
"_score" : 1.0,
"_source" : {
"deaerId" : "4",
"entryTime" : "1578214800000"
}
}
]
}
}
}
]
}
}

Is there a way in ElasticSearch to get the shortest (closest) word at top?

I have the words inside my index: "Kem, Kemi, Kemah, Kemer, Kemerburgaz, Kemang, Kembs, Kemnay, Kempley, Kempsey, Kemerovo".
When i search for "Kem" i want "Kemi" to come at top because it is the closest word. (Kem + i = Kemi). But it doesn't go the way i want.
Index:
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 15
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"name": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text",
"similarity": "classic",
"analyzer": "autocomplete",
"search_analyzer": "standard"
},
"id": {
"type": "keyword"
},
"slug": {
"type": "keyword"
},
"type": {
"type": "keyword"
}
}
}
}
}
Query:
{
"from" : 0, "size" : 10,
"query": {
"bool": {
"must": [
{
"match": {
"name": "Kem"
}
}
],
"should": [
{
"term": {
"name.keyword": {
"value": "Kem"
}
}
}
]
}
}
}
'
Result:
{
"took" : 6,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 143,
"max_score" : 20.795834,
"hits" : [
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "lPL8Y2YBqxTX_xwrZlGc",
"_score" : 20.795834,
"_source" : {
"id" : "c6317201",
"name" : "Kem",
"slug" : "yurtdisi/karelya-cumhuriyeti/kem"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "se78Y2YBqxTX_xwrVFIU",
"_score" : 8.61574,
"_source" : {
"id" : "c121023",
"name" : "Kemah",
"slug" : "yurtdisi/houston-ve-civari/kemah"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "ze78Y2YBqxTX_xwrVFo5",
"_score" : 8.61574,
"_source" : {
"id" : "c1783",
"name" : "Kemerovo",
"slug" : "yurtdisi/kemerovo-oblasti/kemerovo"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "xe78Y2YBqxTX_xwrVFs9",
"_score" : 8.61574,
"_source" : {
"id" : "c1786",
"name" : "Kemi",
"slug" : "yurtdisi/rovaniemi/kemi"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "Tu78Y2YBqxTX_xwrVG-X",
"_score" : 8.61574,
"_source" : {
"id" : "c1900",
"name" : "Kempsey",
"slug" : "yurtdisi/new-south-wales/kempsey"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "Bu78Y2YBqxTX_xwrVILt",
"_score" : 8.61574,
"_source" : {
"id" : "c3000010982",
"name" : "Kempley",
"slug" : "yurtdisi/dymock/kempley"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "B-78Y2YBqxTX_xwrVILt",
"_score" : 8.61574,
"_source" : {
"id" : "c3000010983",
"name" : "Kemnay",
"slug" : "yurtdisi/inverurie/kemnay"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "CO78Y2YBqxTX_xwrVIb_",
"_score" : 8.61574,
"_source" : {
"id" : "c3000013079",
"name" : "Kemerburgaz",
"slug" : "eyup/kemerburgaz"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "-fL8Y2YBqxTX_xwrZQxf",
"_score" : 8.61574,
"_source" : {
"id" : "c6190744",
"name" : "Kembs",
"slug" : "yurtdisi/haut-rhin-bolge/kembs"
}
},
{
"_index" : "destinations",
"_type" : "_doc",
"_id" : "xfL8Y2YBqxTX_xwrZSG-",
"_score" : 8.61574,
"_source" : {
"id" : "c6216986",
"name" : "Kemang",
"slug" : "yurtdisi/cakarta/kemang"
}
}
]
}
}
Now they are at same score because everyone have the "Kem" i guess. But if i do "match" or "match_phrase" the outcome is the same.
In your example it seems that you want your results sorted by length. You can do that with a script.
POST your_index/_doc/_search
{
"from": 0,
"size": 10,
"query": {
"bool": {
"must": [
{
"match": {
"name": "Kem"
}
}
],
"should": [
{
"term": {
"name.keyword": {
"value": "Kem"
}
}
}
]
}
},
"sort": [
{
"_score": {"order": "desc"}
},
{
"_script": {
"script": "doc['name.keyword'].value.length()",
"type": "number",
"order": "asc"
}
},
{
"name.keyword": {"order": "asc"}
}
]
}

Resources