must_not not working for nested elastic query - elasticsearch

I have a question.
Suppose there are 2 transactions for the same customer id '11'. In one transaction the customer bought a 'CLEANSING' product, and in the second transaction the customer bought a 'SKIN CARE' product. Now I want to find the customers who bought a 'CLEANSING' product but not a 'SKIN CARE' product. But when I aggregate by customer id, I still get customer '11', because in the 1st transaction he did not purchase a 'SKIN CARE' product. How can I make Elasticsearch look at all of a customer's transactions together rather than at each transaction on its own? Please help me out.
These are the transactions -
{
"transactionId" : "1211",
"CDID" : "11",
"transactionDate" : "2019-06-24T09:35:30.2117315Z",
"lineItems" : [
{
"description" : "BUBBLE BUBBLE MILD FOAMING CLEANSER",
"markdownFlag" : "N",
"quantity" : 1,
"rate" : 14,
"value" : 14,
"discount" : 0,
"amount" : 13.33,
"grossAmount" : 14,
"itemDetails" : {
"itemName" : "BUBBLE BUBBLE MILD FOAMING CLEANSER",
"retailDepartmentName" : "CLEANSING",
}
}
]
}
{
"transactionId" : "1232",
"CDID" : "11",
"transactionDate" : "2019-06-24T09:35:30.2117315Z",
"lineItems" : [
{
"description" : "BUBBLE BUBBLE MILD FOAMING CLEANSER",
"markdownFlag" : "N",
"quantity" : 1,
"rate" : 14,
"value" : 14,
"discount" : 0,
"amount" : 13.33,
"grossAmount" : 14,
"itemDetails" : {
"itemName" : "BUBBLE BUBBLE MILD FOAMING CLEANSER",
"retailDepartmentName" : "SKIN CARE",
}
}
]
}
lineItems is of nested type
The transactions are made by the same customer
I am trying to get the customers who bought 'CLEANSING' but did not buy 'SKIN CARE'. For the data above I should get no results.
My query -
{
"aggs": {
"CDID": {
"terms": {
"field": "CDID.keyword",
"size": 10
},
"aggs": {
"lineItems1": {
"filter": {
"nested": {
"path": "lineItems",
"query": {
"bool": {
"must": [
{
"bool": {
"must_not": [
{
"match": {
"lineItems.itemDetails.retailDepartmentName.keyword": "SKIN CARE"
}
}
],
"must": [
{
"match": {
"lineItems.itemDetails.retailDepartmentName.keyword": "CLEANSING"
}
}
]
}
}
]
}
}
}
},
"aggs": {
"nested_path": {
"nested": {
"path": "lineItems"
},
"aggs": {
"sum1": {
"sum": {
"field": "lineItems.quantity"
}
}
}
}
}
}
}
}
}
}
Result -
"aggregations" : {
"CDID" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "11",
"doc_count" : 2,
"lineItems1" : {
"doc_count" : 1,
"nested_path" : {
"doc_count" : 1,
"sum1" : {
"value" : 1.0
}
}
}
}
]
}
}
UPDATE - I still haven't found the answer.

You can achieve the result with the query below.
Mapping Query:
PUT /<index_name>
{
"mappings" : {
"properties" : {
"transactionId": {
"type": "text"
},
"CDID": {
"type": "text"
},
"transactionDate": {
"type": "text"
},
"lineItems" : {
"type" : "nested"
}
}
}
}
Sample data (index a document):
POST /<index_name>/_doc
{
"transactionId": "1211",
"CDID": "11",
"transactionDate": "2019-06-24T09:35:30.2117315Z",
"lineItems": [
{
"description": "BUBBLE BUBBLE MILD FOAMING CLEANSER",
"markdownFlag": "N",
"quantity": 1,
"rate": 14,
"value": 14,
"discount": 0,
"amount": 13.33,
"grossAmount": 14,
"itemDetails": {
"itemName": "BUBBLE BUBBLE MILD FOAMING CLEANSER",
"retailDepartmentName": "CLEANSING"
}
}
]
}
Search Query:
GET /test_trans/_search
{
"query": {
"nested": {
"path": "lineItems",
"query": {
"bool": {
"must": [
{
"match": {
"lineItems.itemDetails.retailDepartmentName": "CLEANSING"
}
}
],
"must_not": [
{
"match": {
"lineItems.itemDetails.retailDepartmentName": "SKIN CARE"
}
}
]
}
},
"score_mode": "avg"
}
},
"aggs": {
"nested_path": {
"nested": {
"path": "lineItems"
},
"aggs": {
"sum1": {
"sum": {
"field": "lineItems.quantity"
}
}
}
}
}
}

Related

Elasticsearch sub agregation

With the following query, I get the minimum value over each 15-minute window, using the moving_fn function. Now I need to get the maximum of those values over each 1-hour window. As I understand it, another aggregation cannot be applied on top of the moving_fn result. How can I do this?
This is my query:
GET logstash-2021.12.2*/_search
{
"query": {
"bool": {
"filter": [
{
"range": {
"#timestamp": {
"gte": "now-24h"
}
}
},
{
"bool": {
"should": [
{
"match_phrase": {
"company": "BLAH-BLAH"
}
}
]
}
}
]
}
},
"size": 0,
"aggs": {
"myDatehistogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"offset": "+30s"
}, "aggs": {
"the_count": {
"moving_fn": {
"buckets_path": "_count",
"window": 15,
"script": "MovingFunctions.min(values)"
}
}
}
}
}
}
My response:
"aggregations" : {
"myDatehistogram" : {
"buckets" : [
{
"key_as_string" : "2021-12-25T05:58:30.000Z",
"key" : 1640411910000,
"doc_count" : 1196,
"the_count" : {
"value" : null
}
},
{
"key_as_string" : "2021-12-25T05:59:30.000Z",
"key" : 1640411970000,
"doc_count" : 1942,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:00:30.000Z",
"key" : 1640412030000,
"doc_count" : 1802,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:01:30.000Z",
"key" : 1640412090000,
"doc_count" : 1735,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:02:30.000Z",
"key" : 1640412150000,
"doc_count" : 1699,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:03:30.000Z",
"key" : 1640412210000,
"doc_count" : 1506,
"the_count" : {
"value" : 1196.0
}
}
From this response, I need to get the maximum value for each hour. Thank you in advance
Just add a second agg:
"myDatehistogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"offset": "+30s"
}, "aggs": {
"min_15": {
"moving_fn": {
"buckets_path": "_count",
"window": 15,
"script": "MovingFunctions.min(values)"
}
},
"max_60": {
"moving_fn": {
"buckets_path": "_count",
"window": 60,
"script": "MovingFunctions.max(values)"
}
}
}
}

How do I compare two source IPs from two different specific logs in Elasticsearch

In Elasticsearch I want to compare two logs (the NAT log and the gateway log) with a DSL query.
In the NAT log there is a field srcip1, and in the gateway log there is a field srcip2.
If the condition srcip1 === srcip2 is satisfied, I want "agent.id" to be displayed in the result.
On top of that I will add the correlation query that I have already written:
{
"query": {
"bool": {
"should": [
{
"match": {
"location": "\\Users\\Saad\\Desktop\\nat.log"
}
},
{
"match": {
"location": "\\Users\\Saad\\Desktop\\attendance-logs-with-ports.log"
}
}
],
"must": [
{
"term": {
"data.srcip": "1.1.1.1"
}
}
]
}
},
"fields": [
"data.srcip1"
],
"_source": false
}
I tried multiple things but did not succeed.
To display summaries of data, you use aggregations. If you want to compare the different agents per log type for a certain IP, the query looks like this:
Ingest data
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\nat.log",
"data": {
"srcip1": "1.1.1.1"
},
"agent": {
"id": "agent_1"
}
}
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\attendance-logs-with-ports.log",
"data": {
"srcip2": "1.1.1.1"
},
"agent": {
"id": "agent_1"
}
}
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\nat.log",
"data": {
"srcip1": "1.1.1.1"
},
"agent": {
"id": "agent_2"
}
}
Request
POST test_saad/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"term": {
"data.srcip1.keyword": "1.1.1.2"
}
},
{
"term": {
"data.srcip2.keyword": "1.1.1.2"
}
}
],
"minimum_should_match": 1
}
},
{
"bool": {
"should": [
{
"term": {
"location.keyword": """\Users\Saad\Desktop\nat.log"""
}
},
{
"term": {
"location.keyword": """\Users\Saad\Desktop\attendance-logs-with-ports.log"""
}
}
],
"minimum_should_match": 1
}
}
]
}
},
"aggs": {
"log_types": {
"terms": {
"field": "location.keyword",
"size": 10
},
"aggs": {
"agent_types": {
"terms": {
"field": "agent.id.keyword",
"size": 10
}
}
}
}
}
}
Response
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"log_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : """\Users\Saad\Desktop\nat.log""",
"doc_count" : 2,
"agent_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "agent_1",
"doc_count" : 1
},
{
"key" : "agent_2",
"doc_count" : 1
}
]
}
},
{
"key" : """\Users\Saad\Desktop\attendance-logs-with-ports.log""",
"doc_count" : 1,
"agent_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "agent_1",
"doc_count" : 1
}
]
}
}
]
}
}
}

How to count number of fields inside nested field? - Elasticsearch

I created the following mapping. I would like to count the number of products in the nested field "products" for each document separately. I would also like to do a histogram aggregation over those counts, so that I know how many documents fall into each product-count bucket.
PUT /receipts
{
"mappings": {
"properties": {
"id" : {
"type": "integer"
},
"user_id" : {
"type": "integer"
},
"date" : {
"type": "date"
},
"sum" : {
"type": "double"
},
"products" : {
"type": "nested",
"properties": {
"name" : {
"type" : "text"
},
"number" : {
"type" : "double"
},
"price_single" : {
"type" : "double"
},
"price_total" : {
"type" : "double"
}
}
}
}
}
}
I've tried this query, but I get the number of all the products instead of number of products for each document separately.
GET /receipts/_search
{
"query": {
"match_all": {}
},
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products"
}
}
}
}
}
}
Result of the query:
"aggregations" : {
"terms" : {
"doc_count" : 6552,
"bucket_size" : {
"value" : 0
}
}
}
UPDATE
Now I have this code where I make separate buckets for each id and count the number of products inside them.
GET /receipts/_search
{
"query": {
"match_all": {}
},
"size" : 0,
"aggs": {
"terms":{
"terms":{
"field": "_id"
},
"aggs": {
"nested": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products.number"
}
}
}
}
}
}
}
}
Result of the query:
"aggregations" : {
"terms" : {
"doc_count_error_upper_bound" : 5,
"sum_other_doc_count" : 490,
"buckets" : [
{
"key" : "1",
"doc_count" : 1,
"nested" : {
"doc_count" : 21,
"bucket_size" : {
"value" : 21
}
}
},
{
"key" : "10",
"doc_count" : 1,
"nested" : {
"doc_count" : 5,
"bucket_size" : {
"value" : 5
}
}
},
{
"key" : "100",
"doc_count" : 1,
"nested" : {
"doc_count" : 12,
"bucket_size" : {
"value" : 12
}
}
},
...
Is it possible to group these values (21, 5, 12, ...) into buckets to make a histogram of them?
products is only the path to the array of individual products, not an aggregatable field. So you'll need to use it on one of your product's field -- such as the number:
GET receipts/_search
{
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products.number"
}
}
}
}
}
}
Note that if a product has no number, it will not contribute to the total count. It's therefore best practice to always include an ID in each product and then aggregate on that field.
Alternatively you could use a script to account for missing values. Luckily value_count does not deduplicate -- meaning if two products are alike and/or have empty values, they'll still be counted as two:
GET receipts/_search
{
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"script": {
"source": "doc['products.number'].toString()"
}
}
}
}
}
}
}
UPDATE
You could also use a nested composite aggregation which'll give you the histogrammed product count w/ the corresponding receipt id:
GET /receipts/_search
{
"size": 0,
"aggs": {
"my_aggs": {
"nested": {
"path": "products"
},
"aggs": {
"composite_parent": {
"composite": {
"sources": [
{
"receipt_id": {
"terms": {
"field": "_id"
}
}
},
{
"product_number": {
"histogram": {
"field": "products.number",
"interval": 1
}
}
}
]
}
}
}
}
}
}
The interval is modifiable.

Documents repeating in the query of elasticsearch

I'm new to Elasticsearch. I need to build the query dynamically, so that for each field name the corresponding file is fetched.
I have the query below; can anyone say whether it's the right approach? Also, with this query, the documents are just repeating for one particular file name.
Please let me know how to go about it.
GET index_name/_search
{
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"match_phrase": {
"field_name": "program"
}
},
{
"match_phrase": {
"field_value": "aaa-123"
}
}
]
}
},
{
"bool": {
"must": [
{
"match_phrase": {
"field_name": "species"
}
},
{
"match_phrase": {
"field_value": "mouse"
}
}
]
}
},
{
"bool": {
"must": [
{
"match_phrase": {
"field_name": "model name"
}
},
{
"match_phrase": {
"field_value": "b45"
}
}
]
}
}
]
}
},"aggs": {
"2": {
"terms": {
"field": "myfile_file_name.keyword",
"size": 1000,
"order": {
"_key": "asc"
}
},
"aggs": {
"3": {
"terms": {
"field": "field_name.keyword",
"size": 1000,
"order": {
"_key": "asc"
}
}
}
}
}
}
}
mapping and Output
{
"_index" : "test",
"_type" : "test_data",
"_id" : "123",
"_score" : 1.0,
"_source" : {
"document_id" : 123,
"m_id" : 1,
"source" : "ADDD",
"type" : "M",
"name" : "Animal",
"value" : "None",
"test_type" : "Test123",
"file_name" : "AA.zip",
"description" : "testing",
"program" : ["hello"],
"species" : ["mouse"],
"study" : ["Study1"],
"create_date" : "2020-08-20 11:51:21.152",
"update_date" : "2020-08-20 11:51:21.152",
"source_name" : "Anim",
"auth" : ["na"],
"treatment" : ["TR001", "TR002", "TR004"],
"timepoint" : ["72", "48"],
"findings_reports" : "na",
"model" : ["None",],
"additional" : "{'view': '', 'load': []}",
"data" : "Pre"
}
},
]
}
}

Elastic Search: Selecting multiple values in aggregates

In Elastic Search I have the following index with 'allocated_bytes', 'total_bytes' and other fields:
{
"_index" : "metrics-blockstore_capacity-2017_06",
"_type" : "datapoint",
"_id" : "AVzHwgsi9KuwEU6jCXy5",
"_score" : 1.0,
"_source" : {
"timestamp" : 1498000001000,
"resource_guid" : "2185d15c-5298-44ac-8646-37575490125d",
"allocated_bytes" : 1.159196672E9,
"resource_type" : "machine",
"total_bytes" : 1.460811776E11,
"machine" : "2185d15c-5298-44ac-8646-37575490125d"
}
I have the following query to
1)get a point for 30 minute interval using date-histogram
2)group by field on resource_guid.
3)max aggregate to find the max value.
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1497992400000,
"lte": 1497996000000
}
}
}
]
}
},
"aggregations": {
"groupByTime": {
"date_histogram": {
"field": "timestamp",
"interval": "30m",
"order": {
"_key": "desc"
}
},
"aggregations": {
"groupByField": {
"terms": {
"size": 1000,
"field": "resource_guid"
},
"aggregations": {
"maxValue": {
"max": {
"field": "allocated_bytes"
}
}
}
},
"sumUnique": {
"sum_bucket": {
"buckets_path": "groupByField>maxValue"
}
}
}
}
}
}
But with this query I am able to get only allocated_bytes, but I need to have both allocated_bytes and total_bytes at the result point.
Following is the result from the above query:
{
"key_as_string" : "2017-06-20T21:00:00.000Z",
"key" : 1497992400000,
"doc_count" : 9,
"groupByField" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "2185d15c-5298-44ac-8646-37575490125d",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156182016E9
}
}, {
"key" : "c3513cdd-58bb-4f8e-9b4c-467230b4f6e2",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156165632E9
}
}, {
"key" : "eff13403-9737-4d08-9dca-fb6c12c3a6fa",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156182016E9
}
} ]
},
"sumUnique" : {
"value" : 3.468529664E9
}
}
I need both allocated_bytes and total_bytes. How do I get multiple fields (allocated_bytes, total_bytes) for each point?
For example:
"sumUnique" : {
"Allocatedvalue" : 3.468529664E9,
"TotalValue" : 9.468529664E9
}
or like this:
"allocatedBytessumUnique" : {
"value" : 3.468529664E9
}
"totalBytessumUnique" : {
"value" : 9.468529664E9
},
You can just add another aggregation:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1497992400000,
"lte": 1497996000000
}
}
}
]
}
},
"aggregations": {
"groupByTime": {
"date_histogram": {
"field": "timestamp",
"interval": "30m",
"order": {
"_key": "desc"
}
},
"aggregations": {
"groupByField": {
"terms": {
"size": 1000,
"field": "resource_guid"
},
"aggregations": {
"maxValueAllocated": {
"max": {
"field": "allocated_bytes"
}
},
"maxValueTotal": {
"max": {
"field": "total_bytes"
}
}
}
},
"sumUniqueAllocatedBytes": {
"sum_bucket": {
"buckets_path": "groupByField>maxValueAllocated"
}
},
"sumUniqueTotalBytes": {
"sum_bucket": {
"buckets_path": "groupByField>maxValueTotal"
}
}
}
}
}
}
Be aware that sum_bucket only operates on the output of a sibling aggregation; in this case it gives the sum of the per-bucket max values, not the sum of total_bytes. If you want the sum of total_bytes, you can use a sum aggregation instead.

Resources