ElasticSearch join data within the same index - elasticsearch

I am quite new to Elasticsearch and I am collecting application logs within the same index, which have this format:
{
"_index" : "app_logs",
"_type" : "_doc",
"_id" : "JVMYi20B0a2qSId4rt12",
"_source" : {
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "STARTED",
"ts" : "2019-10-02T08:11:53Z"
}
}
I can have different event types. In this case I am interested in STARTED and FINISHED. I would like to query ES to get all the apps that started on a certain day and enrich them with their end time. Basically I want to create start/end couples (an end might also be missing, but that's fine).
I have realized that SQL-style join relations cannot be used in ES, and I was wondering if I can exploit some other feature to get this result in one query.
Edit: these are the details of the index mapping
{
"app_logs" : {
"mappings" : {
"_doc" : {
"properties" : {
"event_type" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"app_id" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"ts" : {
"type" : "date"
},
"event_type" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}}}}

What I understand is that you want to collate the documents having the same app_id along with their status, either STARTED or FINISHED.
I do not think Elasticsearch is meant to perform JOIN operations. You can, but then you have to design your documents as mentioned in this link.
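For reference, that approach relies on the join field type. A minimal sketch of such a mapping might look like the following; the index name app_logs_joined and the relation names are just placeholders, not something from your setup:
PUT app_logs_joined
{
  "mappings": {
    "properties": {
      "app_events": {
        "type": "join",
        "relations": {
          "started": "finished"
        }
      }
    }
  }
}
For this question, though, you don't need a join at all.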
What you would need is an Aggregation query.
Below are the sample mapping, documents, the aggregation query, and the response as it appears, which should help you get the desired result.
Mapping:
PUT mystatusindex
{
"mappings": {
"properties": {
"username":{
"type": "keyword"
},
"app_id":{
"type": "keyword"
},
"event_type":{
"type":"keyword"
},
"ts":{
"type": "date"
}
}
}
}
Sample Documents
POST mystatusindex/_doc/1
{
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "STARTED",
"ts" : "2019-10-02T08:11:53Z"
}
POST mystatusindex/_doc/2
{
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "FINISHED",
"ts" : "2019-10-02T08:12:53Z"
}
POST mystatusindex/_doc/3
{
"username" : "mapred",
"app_id" : "application_1569623930006_490201",
"event_type" : "STARTED",
"ts" : "2019-10-02T09:30:53Z"
}
POST mystatusindex/_doc/4
{
"username" : "mapred",
"app_id" : "application_1569623930006_490202",
"event_type" : "STARTED",
"ts" : "2019-10-02T09:45:53Z"
}
POST mystatusindex/_doc/5
{
"username" : "mapred",
"app_id" : "application_1569623930006_490202",
"event_type" : "FINISHED",
"ts" : "2019-10-02T09:45:53Z"
}
POST mystatusindex/_doc/6
{
"username" : "mapred",
"app_id" : "application_1569623930006_490203",
"event_type" : "STARTED",
"ts" : "2019-10-03T09:30:53Z"
}
POST mystatusindex/_doc/7
{
"username" : "mapred",
"app_id" : "application_1569623930006_490203",
"event_type" : "FINISHED",
"ts" : "2019-10-03T09:45:53Z"
}
Query:
POST mystatusindex/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"ts": {
"gte": "2019-10-02T00:00:00Z",
"lte": "2019-10-02T23:59:59Z"
}
}
}
],
"should": [
{
"match": {
"event_type": "STARTED"
}
},
{
"match": {
"event_type": "FINISHED"
}
}
]
}
},
"aggs": {
"application_IDs": {
"terms": {
"field": "app_id"
},
"aggs": {
"ids": {
"top_hits": {
"size": 10,
"_source": ["event_type", "app_id"],
"sort": [
{ "event_type": { "order": "desc"}}
]
}
}
}
}
}
}
Notice that for filtering I've made use of a Range Query, as you only want documents for that date, and added a bool should clause to filter on STARTED and FINISHED.
Once I have the documents, I've made use of Terms Aggregation and Top Hits Aggregation to get the desired result.
Result
{
"took" : 12,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"application_IDs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "application_1569623930006_490200", <----- APP ID
"doc_count" : 2,
"ids" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "1", <--- Document with STARTED status
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490200"
},
"sort" : [
"STARTED"
]
},
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "2", <--- Document with FINISHED status
"_score" : null,
"_source" : {
"event_type" : "FINISHED",
"app_id" : "application_1569623930006_490200"
},
"sort" : [
"FINISHED"
]
}
]
}
}
},
{
"key" : "application_1569623930006_490202",
"doc_count" : 2,
"ids" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "4",
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490202"
},
"sort" : [
"STARTED"
]
},
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "5",
"_score" : null,
"_source" : {
"event_type" : "FINISHED",
"app_id" : "application_1569623930006_490202"
},
"sort" : [
"FINISHED"
]
}
]
}
}
},
{
"key" : "application_1569623930006_490201",
"doc_count" : 1,
"ids" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "3",
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490201"
},
"sort" : [
"STARTED"
]
}
]
}
}
}
]
}
}
}
Note that the last document with only STARTED appears in the aggregation result as well.
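If what you ultimately need is just the start/end timestamp pair per application rather than the full documents, a lighter alternative is a pair of filter sub-aggregations with min/max on ts. The below is only a sketch against the sample mystatusindex mapping above; when an application has no FINISHED event, its end_time.ts simply comes back as null:
POST mystatusindex/_search
{
  "size": 0,
  "query": {
    "range": {
      "ts": {
        "gte": "2019-10-02T00:00:00Z",
        "lte": "2019-10-02T23:59:59Z"
      }
    }
  },
  "aggs": {
    "application_IDs": {
      "terms": {
        "field": "app_id"
      },
      "aggs": {
        "start_time": {
          "filter": { "term": { "event_type": "STARTED" } },
          "aggs": { "ts": { "min": { "field": "ts" } } }
        },
        "end_time": {
          "filter": { "term": { "event_type": "FINISHED" } },
          "aggs": { "ts": { "max": { "field": "ts" } } }
        }
      }
    }
  }
}
Each app_id bucket then carries start_time.ts.value_as_string and end_time.ts.value_as_string, which are exactly the start/end couples described in the question.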
Updated Answer
{
"size":0,
"query":{
"bool":{
"must":[
{
"range":{
"ts":{
"gte":"2019-10-02T00:00:00Z",
"lte":"2019-10-02T23:59:59Z"
}
}
}
],
"should":[
{
"term":{
"event_type.keyword":"STARTED" <----- Changed this
}
},
{
"term":{
"event_type.keyword":"FINISHED" <----- Changed this
}
}
]
}
},
"aggs":{
"application_IDs":{
"terms":{
"field":"app_id.keyword" <----- Changed this
},
"aggs":{
"ids":{
"top_hits":{
"size":10,
"_source":[
"event_type",
"app_id"
],
"sort":[
{
"event_type.keyword":{ <----- Changed this
"order":"desc"
}
}
]
}
}
}
}
}
}
Note the changes I've made. Whenever you need exact matches or want to run aggregations, you need to make use of the keyword type.
In the mapping you've shared, there is no username field but there are two event_type fields. I'm assuming it's just a human error and that one of those fields should be username.
Now, if you look carefully, the field event_type has a text field and a sibling keyword field. I've modified the query to make use of the keyword field, and when doing that, I'm using a Term Query.
Try this out and let me know if it helps!

Related

Elastic search (Kibana) - intersect between boolean results

I am facing a problem in Kibana with how to correctly filter data. Basically my aim is to filter PASSED or FAILED tests from the following data structure.
{
"_index":"qa-reporting-2020-04",
"_type":"qa-reporting",
"_id":"456.0",
"_version":1,
"_score":null,
"_source":{
"TestId":"CXXX01",
"TestStatus":0,
"Issues":[
],
"MetaData":{
"TestName":"Test1",
"LastException":null,
"DurationMs":1980.5899000000002,
"Platform":{
"BrowserName":"chrome",
"OS":"windows",
"OsVersion":"10"
},
"Categories":[
"Cat1",
"Cat2",
"CXXX01"
],
"Priority":null,
"TestStatusStr":"Passed",
"JobName":"My-Demo/dev/ServerJob1",
"Logs":"PASS - Passed - CXXX01",
"SuiteName":"Tests.ServerTests",
"LastMessage":"PASS - Passed - CXXX01: \n",
"Environment":"dev"
}
}
}
The problem is that over a given period (a day), the logs will have several entries (e.g. the test failed and later in the same day it passed). I have an aggregation query that gives me both results, which is not desired. I want the result to return the intersection, so the report will contain either a failed or a passed test.
Here is my query (I am a beginner), which gives me an aggregation for a specific test that both failed and passed.
GET qa-reporting-*/_search
{"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"MetaData.Environment": "dev"
}
},
{
"match": {
"TestId": "CXXX01"
}
},
{
"range": {
"Created": {
"gte": "now-0d/d"
}
}
}
]
}
},
"aggs": {
"tests": {
"terms": {"field": "MetaData.TestStatusStr.keyword"}
}
}
}
It returns the following (excerpt from the entire object)
"aggregations": {
"tests": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Passed",
"doc_count": 10
},
{
"key": "Failed",
"doc_count": 1
}
]
}
}
As mentioned above, how do I get the intersection between failed/passed tests during a specific date range? Is it possible to do this in Kibana, or does it need to be solved at the script level outside of it?
Currently my report looks like this:
DateTime | TestId | Status
Apr 24,12:00 | CXXX01 | Failed
Apr 24,15:00 | CXXX01 | Passed
The wanted report will contain only the intersection of the above statuses:
Apr 24 | CXXX01 | Failed
or
Apr 24 | CXXX01 | Passed
So the latest result will have more weight I think.
I have taken data for two days, with two entries for each day. The last entry for day 1 is "Failed" and the last entry for day 2 is "Passed".
Data:
"hits" : [
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "QKlosXEBuBOc-UQkKecO",
"_score" : 1.0,
"_source" : {
"TestId" : "CXXX01",
"TestStatus" : 0,
"Issues" : [ ],
"Created" : "2020-04-23T01:00:00",
"MetaData" : {
"TestName" : "Test1",
"LastException" : null,
"DurationMs" : 1980.5899000000002,
"Platform" : {
"BrowserName" : "chrome",
"OS" : "windows",
"OsVersion" : "10"
},
"Categories" : [
"Cat1",
"Cat2",
"CXXX01"
],
"Priority" : null,
"TestStatusStr" : "Passed",
"JobName" : "My-Demo/dev/ServerJob1",
"Logs" : "PASS - Passed - CXXX01",
"SuiteName" : "Tests.ServerTests",
"LastMessage" : "PASS - Passed - CXXX01: \n",
"Environment" : "dev"
}
}
},
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "QalosXEBuBOc-UQkieeR",
"_score" : 1.0,
"_source" : {
"TestId" : "CXXX01",
"TestStatus" : 0,
"Issues" : [ ],
"Created" : "2020-04-23T10:00:00",
"MetaData" : {
"TestName" : "Test1",
"LastException" : null,
"DurationMs" : 1980.5899000000002,
"Platform" : {
"BrowserName" : "chrome",
"OS" : "windows",
"OsVersion" : "10"
},
"Categories" : [
"Cat1",
"Cat2",
"CXXX01"
],
"Priority" : null,
"TestStatusStr" : "Failed",
"JobName" : "My-Demo/dev/ServerJob1",
"Logs" : "PASS - Passed - CXXX01",
"SuiteName" : "Tests.ServerTests",
"LastMessage" : "PASS - Passed - CXXX01: \n",
"Environment" : "dev"
}
}
},
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "QqlosXEBuBOc-UQkoue4",
"_score" : 1.0,
"_source" : {
"TestId" : "CXXX01",
"TestStatus" : 0,
"Issues" : [ ],
"Created" : "2020-04-24T10:00:00",
"MetaData" : {
"TestName" : "Test1",
"LastException" : null,
"DurationMs" : 1980.5899000000002,
"Platform" : {
"BrowserName" : "chrome",
"OS" : "windows",
"OsVersion" : "10"
},
"Categories" : [
"Cat1",
"Cat2",
"CXXX01"
],
"Priority" : null,
"TestStatusStr" : "Failed",
"JobName" : "My-Demo/dev/ServerJob1",
"Logs" : "PASS - Passed - CXXX01",
"SuiteName" : "Tests.ServerTests",
"LastMessage" : "PASS - Passed - CXXX01: \n",
"Environment" : "dev"
}
}
},
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "Q6losXEBuBOc-UQkwecl",
"_score" : 1.0,
"_source" : {
"TestId" : "CXXX01",
"TestStatus" : 0,
"Issues" : [ ],
"Created" : "2020-04-24T11:00:00",
"MetaData" : {
"TestName" : "Test1",
"LastException" : null,
"DurationMs" : 1980.5899000000002,
"Platform" : {
"BrowserName" : "chrome",
"OS" : "windows",
"OsVersion" : "10"
},
"Categories" : [
"Cat1",
"Cat2",
"CXXX01"
],
"Priority" : null,
"TestStatusStr" : "Passed",
"JobName" : "My-Demo/dev/ServerJob1",
"Logs" : "PASS - Passed - CXXX01",
"SuiteName" : "Tests.ServerTests",
"LastMessage" : "PASS - Passed - CXXX01: \n",
"Environment" : "dev"
}
}
}
]
Query: I have used a date_histogram aggregation to create a bucket for each day and a top_hits aggregation to get the last document of that day.
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"MetaData.Environment": "dev"
}
},
{
"match": {
"TestId": "CXXX01"
}
},
{
"range": {
"Created": {
"gte": "now-3d/d"
}
}
}
]
}
},
"aggs": {
"daily":{
"date_histogram": {
"field": "Created",
"interval": "day"
},
"aggs": {
"last_result": {
"top_hits": {
"size": 1,
"_source": ["MetaData.TestStatusStr"],
"sort": [
{"Created":"desc"}]
}
}
}
}
}
}
Result:
"aggregations" : {
"daily" : {
"buckets" : [
{
"key_as_string" : "2020-04-23T00:00:00.000Z",
"key" : 1587600000000,
"doc_count" : 2,
"last_result" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "QalosXEBuBOc-UQkieeR",
"_score" : null,
"_source" : {
"MetaData" : {
"TestStatusStr" : "Failed"
}
},
"sort" : [
1587636000000
]
}
]
}
}
},
{
"key_as_string" : "2020-04-24T00:00:00.000Z",
"key" : 1587686400000,
"doc_count" : 2,
"last_result" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "Q6losXEBuBOc-UQkwecl",
"_score" : null,
"_source" : {
"MetaData" : {
"TestStatusStr" : "Passed"
}
},
"sort" : [
1587726000000
]
}
]
}
}
}
]
}
}
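One small caveat about the query above: starting with Elasticsearch 7.2, the interval parameter of date_histogram is deprecated in favour of calendar_interval / fixed_interval. On a newer cluster, the same daily bucketing would be written roughly as follows (only the aggregation part shown, everything else unchanged):
"aggs": {
  "daily": {
    "date_histogram": {
      "field": "Created",
      "calendar_interval": "day"
    },
    "aggs": {
      "last_result": {
        "top_hits": {
          "size": 1,
          "_source": ["MetaData.TestStatusStr"],
          "sort": [{ "Created": "desc" }]
        }
      }
    }
  }
}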

Elasticsearch returns 0.0 for metrics sum aggregation

Elasticsearch returns 0.0 for a metrics sum aggregation. The expected output would be the sum of the metric probe_http_duration_seconds.
Elasticsearch version: 7.1.1
Query used for aggregation:
GET some_metric/_search
{
"query": {
"bool": {
"must": [
{
"range": { "time": { "gte" : "now-1m", "lt": "now" } }
},
{
"match": {"name": "probe_http_duration_seconds"}
},
{
"match": {"labels.instance": "some-instance"}
}
]
}
},
"aggs" : {
"sum_is" : { "sum": { "field" : "value" } }
}
}
The above query returns the matching documents, followed by:
"aggregations" : {
"sum_is" : {
"value" : 0.0
}
}
Each document in the index looks like:
{
"_index" : "some_metric-2019.12.03-000004",
"_type" : "_doc",
"_id" : "_wCjz24Bk6FPpmW1lC31",
"_score" : 5.3475914,
"_source" : {
"name" : "probe_http_duration_seconds",
"time" : 1575441630181,
"value" : 0,
"labels" : {
"__name__" : "probe_http_duration_seconds",
"app" : "some-events",
"i" : "some_metric",
"instance" : "some-instance",
"job" : "someproject-k8s-service",
"kubernetes_name" : "some-events",
"kubernetes_namespace" : "deploytest",
"phase" : "connect",
"t" : "type",
"v" : "1"
}
}
}
In the query, on changing must to should, I get:
"aggregations" : {
"sum_is" : {
"value" : 1.5389155527088604E16
}
}
The index dynamic mapping looks something like this:
"mappings" : {
"dynamic_templates" : [
{
"strings" : {
"unmatch" : "*seconds*",
"match_mapping_type" : "string",
"mapping" : {
"type" : "keyword"
}
}
},
{
"to_float" : {
"match" : "*seconds*",
"mapping" : {
"type" : "float"
}
}
}
],
However, in our requirement we need results matching all of the clauses in the query.
For metrics aggregations, Elasticsearch converts everything to double, but that still doesn't explain a result of zero.
Any pointers will be helpful. Thanks for your attention.
NOTE: I see that in the example document the value field is zero. Maybe I made a mistake while drafting/editing.
Below is the result for the past 2 minutes. It shows that the value field is actually a float.
Query:
GET some_metric/_search?size=3
{
"_source": ["value"],
"query": {
"bool": {
"must": [
{
"range": { "time": { "gte" : "now-2m", "lt": "now" } }
},
{
"match": {"name": "probe_http_duration_seconds"}
},
{
"match": {"labels.instance": "some-instance"}
}
]
}
}
}
Result:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10,
"relation" : "eq"
},
"max_score" : 14.551308,
"hits" : [
{
"_index" : "some_metric-2019.12.04-000005",
"_type" : "_doc",
"_id" : "7oog0G4Bk6EPplW1ibD1",
"_score" : 14.551308,
"_source" : {
"value" : 0.040022423
}
},
{
"_index" : "some_metric-2019.12.04-000005",
"_type" : "_doc",
"_id" : "74og0G4Bk6EPplW1ibD1",
"_score" : 14.551308,
"_source" : {
"value" : 3.734E-5
}
},
{
"_index" : "some_metric-2019.12.04-000005",
"_type" : "_doc",
"_id" : "A4og0G4Bk6EPplW1ibH1",
"_score" : 14.551308,
"_source" : {
"value" : 0.015694122
}
}
]
}
}
What you see is just what you indexed in the source document; ES will never modify your source document. However, since the type is long, as I suspected, it will index that float value as a long and not as a float.
This usually happens when the very first document to be indexed has an integer value, such as 0.
You can either reindex your data with the proper mapping... or, since you have time-based indexes, just modify the dynamic template and tomorrow's index will be created correctly.
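For illustration, a sketch of what the extra dynamic template entry could look like; the name value_as_double is just a placeholder, and it would sit alongside the existing strings and to_float entries so that the value field is mapped as double in newly created indices:
{
  "value_as_double": {
    "match": "value",
    "mapping": {
      "type": "double"
    }
  }
}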

Nested boolean aggregation in elastic

I have JSON payloads like this:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 61,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "CAojVWwBO8H0jj7a_j3P",
"_score" : 1.0,
"_source" : {
"appName" : "BigApp",
"appVer" : "1.0",
"reviews" : {
"reviewer" : {
"value" : "Bob"
},
"testsPass" : [
{
"name" : "unit",
"pass" : false
},
{
"name" : "integraton",
"pass" : false
},
{
"name" : "ui",
"pass" : false
}
]
}
}
}
]
}
}
In Elasticsearch I want to aggregate the boolean values under testsPass and return true only if all of the pass values are true.
I am new to Elastic and struggling to write a query in that shape; can someone please help?
So far I have tried nested aggregations but can't get the syntax right.
Looking at your data, I'm assuming the structure of your mapping is as follows:
Mapping:
PUT myindex
{
"mappings": {
"properties": {
"appName":{
"type": "keyword"
},
"appVer": {
"type": "keyword"
},
"reviews":{
"properties": {
"reviewer":{
"properties":{
"value": {
"type": "keyword"
}
}
},
"testsPass":{
"type": "nested"
}
}
}
}
}
}
Sample Documents:
POST myindex/_doc/1
{
"appName":"BigApp",
"appVer":"1.0",
"reviews":{
"reviewer":{
"value":"Bob"
},
"testsPass":[
{
"name":"unit",
"pass":false
},
{
"name":"integraton",
"pass":false
},
{
"name":"ui",
"pass":false
}
]
}
}
POST myindex/_doc/2
{
"appName":"MidApp",
"appVer":"1.0",
"reviews":{
"reviewer":{
"value":"Bob"
},
"testsPass":[
{
"name":"unit",
"pass":true
},
{
"name":"integraton",
"pass":true
},
{
"name":"ui",
"pass":true
}
]
}
}
POST myindex/_doc/3
{
"appName":"SmallApp",
"appVer":"1.0",
"reviews":{
"reviewer":{
"value":"Bob"
},
"testsPass":[
{
"name":"unit",
"pass":true
},
{
"name":"integraton",
"pass":true
},
{
"name":"ui",
"pass":false
}
]
}
}
Note that among the documents above, only the one with appName: MidApp (the 2nd document) has all of its pass values set to true.
Aggregation Query:
POST myindex/_search
{
"size":0,
"aggs":{
"pass_reviewers":{
"filter":{
"bool":{
"must":[
{
"nested":{
"path":"reviews.testsPass",
"query":{
"match":{
"reviews.testsPass.pass":"true"
}
}
}
}
],
"must_not":[
{
"nested":{
"path":"reviews.testsPass",
"query":{
"match":{
"reviews.testsPass.pass":"false"
}
}
}
}
]
}
},
"aggs":{
"myhits":{
"top_hits":{
"size":10
}
}
}
}
}
}
Note that the above returns only the relevant document as the result of the Top Hits aggregation. The main aggregation here is the filter section, which is just a Filter Aggregation.
Response:
{
"took" : 7,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"pass_reviewers" : {
"doc_count" : 1, <------ Note this. Returns count of docs. This is result of filtered aggregation
"myhits" : { <------ Start of top hits aggregation
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "2", <----- Document
"_score" : 1.0,
"_source" : {
"appName" : "MidApp",
"appVer" : "1.0",
"reviews" : {
"reviewer" : {
"value" : "Bob"
},
"testsPass" : [
{
"name" : "unit",
"pass" : true
},
{
"name" : "integraton",
"pass" : true
},
{
"name" : "ui",
"pass" : true
}
]
}
}
}
]
}
}
}
}
}
In case you just want the query to return the documents having all true values, without necessarily making use of an aggregation, you can simply use the query below:
Query:
POST myindex/_search
{
"query":{
"bool":{
"must":[
{
"nested":{
"path":"reviews.testsPass",
"query":{
"match":{
"reviews.testsPass.pass":"true"
}
}
}
}
],
"must_not":[
{
"nested":{
"path":"reviews.testsPass",
"query":{
"match":{
"reviews.testsPass.pass":"false"
}
}
}
}
]
}
}
}
Basically, the core execution logic is the same in both queries; I've just narrowed it down to the logic you are looking for.
Response:
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.597837,
"hits" : [
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.597837,
"_source" : {
"appName" : "MidApp",
"appVer" : "1.0",
"reviews" : {
"reviewer" : {
"value" : "Bob"
},
"testsPass" : [
{
"name" : "unit",
"pass" : true
},
{
"name" : "integraton",
"pass" : true
},
{
"name" : "ui",
"pass" : true
}
]
}
}
}
]
}
}
Hope this helps!

Fetching unique data in Elasticsearch

I have the following data:
ID: 1, fldname: pawan
ID: 1, fldname: pawan1
ID: 1, fldname: pawan2
ID: 2, fldname: pawan3
ID: 3, fldname: pawan4
ID: 4, fldname: pawan5
I am trying to get unique data based on the ID field, similar to what we get in MySQL when firing group by queries like:
select * from table_name where fldname like 'pawan%' group by ID
This will return unique values. The same works in Sphinx search when we use the group by function.
Is there any way to get unique values in Elasticsearch?
Below is my sample mapping:
"mappings": {
"my_type": {
"properties": {
"docid": {
"type": "keyword"
},
"flgname": {
"type": "text"
}
}
}
}
I suggest that you slightly modify your mapping:
{
"record" : {
"dynamic" : "false",
"_all" : {
"enabled" : false
},
"properties" : {
"docid" : {
"type" : "long"
},
"flgname" : {
"type" : "text"
}
}
}
}
so that docid is a long.
Then you could try fuzzy queries for filtering, together with aggregations, like the one below, which retrieves the minimum, maximum, average, and count of docids:
{
"from" : 0,
"size" : 10,
"_source" : true,
"query" : {
"bool" : {
"must" : [ {
"match" : {
"flgname" : {
"query" : "pawan",
"operator" : "OR",
"fuzziness" : "1",
"prefix_length" : 1,
"max_expansions" : 50,
"fuzzy_transpositions" : true,
"lenient" : false,
"zero_terms_query" : "NONE",
"boost" : 1.0
}
}
} ]
}
},
"aggs" : {
"my_cardinality" : {
"cardinality" : {
"field" : "docid"
}
},
"my_avg" : {
"avg" : {
"field" : "docid"
}
},
"my_min" : {
"min" : {
"field" : "docid"
}
},
"my_max" : {
"max" : {
"field" : "docid"
}
}
}
}
By the way this is the result of the above query on the data you proposed:
{
"took" : 47,
"timed_out" : false,
"_shards" : {
"total" : 3,
"successful" : 3,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.9808292,
"hits" : [ {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "40b5eac0-743b-4a6a-a06d-3ae4d56f4aca",
"_score" : 0.9808292,
"_source" : {
"docid" : "1",
"flgname" : "pawan"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "27821c39-e722-4361-bc07-0dcd5181a1ad",
"_score" : 0.7846634,
"_source" : {
"docid" : "2",
"flgname" : "pawan3"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "86fcd9c1-a688-4a6a-9c45-e91791a8b902",
"_score" : 0.7846634,
"_source" : {
"docid" : "4",
"flgname" : "pawan5"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "fb00a3cc-f1b8-4073-8808-f2ddbc4979e2",
"_score" : 0.55451775,
"_source" : {
"docid" : "1",
"flgname" : "pawan1"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "18e5e20d-17a7-4d59-b2f1-7bf325a4c4df",
"_score" : 0.55451775,
"_source" : {
"docid" : "3",
"flgname" : "pawan4"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "fbf49af6-f574-4ad2-8686-cbbedc5e70c4",
"_score" : 0.23014566,
"_source" : {
"docid" : "1",
"flgname" : "pawan2"
}
} ]
},
"aggregations" : {
"my_cardinality" : {
"value" : 4
},
"my_max" : {
"value" : 4.0
},
"my_avg" : {
"value" : 2.0
},
"my_min" : {
"value" : 1.0
}
}
}
If you make flgname a keyword as well, then you can use a sub-aggregation: aggregate over docid and sub-aggregate over flgname. The result will be similar to the SQL query you mentioned.
The query would look like this:
{ "size": 0,
"query": {
"regexp":{
"flgname": "pawa.*"
}
},
"aggs" : {
"docids": {
"terms": {"field": "docid"},
"aggs": { "flgnam": { "terms": {"field": "flgname"}}}}
}
}
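Another option worth mentioning: if you are on Elasticsearch 5.3 or later, field collapsing returns one top hit per docid without needing an aggregation at all. A sketch, with the index name being only a placeholder:
GET my_index/_search
{
  "query": {
    "regexp": { "flgname": "pawa.*" }
  },
  "collapse": {
    "field": "docid"
  }
}
Collapsing requires docid to be a keyword or numeric field with doc_values, which both the keyword mapping in the question and the long mapping above satisfy.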

Elasticsearch wildcard query with spaces

I'm trying to do a wildcard query with spaces. It easily matches the words on a term basis but not on a field basis.
I've read the documentation, which says that I need to set the field as not_analyzed, but with that setting it returns nothing.
This is the mapping with which it works on term basis:
{
"denshop" : {
"mappings" : {
"products" : {
"properties" : {
"code" : {
"type" : "string"
},
"id" : {
"type" : "long"
},
"name" : {
"type" : "string"
},
"price" : {
"type" : "long"
},
"url" : {
"type" : "string"
}
}
}
}
}
}
This is the mapping with which the exact same query returns nothing:
{
"denshop" : {
"mappings" : {
"products" : {
"properties" : {
"code" : {
"type" : "string"
},
"id" : {
"type" : "long"
},
"name" : {
"type" : "string",
"index" : "not_analyzed"
},
"price" : {
"type" : "long"
},
"url" : {
"type" : "string"
}
}
}
}
}
}
The query is here:
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*test*"}}}'
Response with the not_analyzed property:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : [ ]
}
}
Response without not_analyzed:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 5,
"max_score" : 1.0,
"hits" : [ {
...
EDIT: Adding requested info
Here is the list of documents:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 5,
"max_score" : 1.0,
"hits" : [ {
"_index" : "denshop",
"_type" : "products",
"_id" : "3L1",
"_score" : 1.0,
"_source" : {
"id" : 3,
"name" : "Testovací produkt 2",
"code" : "",
"price" : 500,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-2/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "4L1",
"_score" : 1.0,
"_source" : {
"id" : 4,
"name" : "Testovací produkt 3",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-3/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "2L1",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "Testovací produkt",
"code" : "",
"price" : 500,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "5L1",
"_score" : 1.0,
"_source" : {
"id" : 5,
"name" : "Testovací produkt 4",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-4/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "6L1",
"_score" : 1.0,
"_source" : {
"id" : 6,
"name" : "Testovací produkt 5",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/tricka-tilka-tuniky/testovaci-produkt-5/"
}
} ]
}
}
Without not_analyzed, it returns hits with this:
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*testovací*"}}}'
But not with this (notice the space before the asterisk):
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*testovací *"}}}'
When I add not_analyzed to the mapping, it returns no hits no matter what I put in the wildcard query.
Add a custom analyzer that lowercases the text. Then, in your client application, lowercase the search text before passing it to the query.
To also keep the original analysis chain, I've added a sub-field to your name field that uses the custom analyzer.
PUT /denshop
{
"settings": {
"analysis": {
"analyzer": {
"keyword_lowercase": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"products": {
"properties": {
"name": {
"type": "string",
"fields": {
"lowercase": {
"type": "string",
"analyzer": "keyword_lowercase"
}
}
}
}
}
}
}
And the query will work on the sub-field:
GET /denshop/products/_search
{
"query": {
"wildcard": {
"name.lowercase": "*testovací *"
}
}
}
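For completeness, the same search in the curl form used earlier in the question. Wildcard queries are not analyzed, so the client has to send the search text already lowercased:
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name.lowercase":"*testovací *"}}}'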
