ElasticSearch - How to aggregation access log ignore GET parameter? - elasticsearch

I want to aggregate access by function path.
{
"query": {
"bool": {
"must": [
{
"wildcard": {
"path.keyword": "/hex/*"
}
}
]
}
},
"from": 0,
"size": 0,
"aggs": {
"path": {
"terms": {
"field": "path.keyword"
}
}
}
}
And i get the result like these..
{
"key": "/hex/user/admin_user/auth",
"doc_count": 38
},
{
"key": "/hex/report/chart/fastreport_lobby_all?start_date=2017-06-29&end_date=2017-07-05&category=date_range&value[]=payoff",
"doc_count": 35
},
{
"key": "/hex/report/chart/fastreport_lobby_all?start_date=2017-06-29&end_date=2017-07-05&category=lobby&value[]=payoff",
"doc_count": 35
},
{
"key": "/hex/report/chart/online_membership?start_date=2017-06-29&end_date=2017-07-05&category=datetime_range&value[]=user_total",
"doc_count": 34
}
There are two /hex/report/chart/fastreport_lobby_all?balabala... result.
It's not the real count about this function.
Do i have any method to count these as one?
{
"key": "/hex/report/chart/fastreport_lobby_all",
"doc_count": 70
}

I don't think this is possible without a custom analyzer like
PUT your_index
{
"settings": {
"analysis": {
"analyzer": {
"query_analyzer": {
"type": "custom",
"tokenizer": "split_query",
"filter": ["top1"
]
}
},
"filter":{
"top1":{
"type": "limit",
"max_token_count": 1
}
},
"tokenizer":{
"split_query":{
"type": "pattern",
"pattern": "\\?"
}
}
}
},
"mappings": {
"your_log_type": {
"properties": {
"path": {
"type": "text",
"fields": {
"keyword": {
"type":"keyword"
},
"no_query": {
"type":"string",
"fielddata":true,
"analyzer":"query_analyzer"
}
}
}
}
}
}
}
And then query on
POST test/log_type/_search
{
"query": {
"bool": {
"must": [
{
"wildcard": {
"path.keyword": "/hex/*"
}
}
]
}
},
"from": 0,
"size": 0,
"aggs" : {
"genres" : {
"terms" : { "field" : "path.no_query" }
}
}
}

Related

How to aggregate matched terms in a query_string search?

I wish to search wildcard terms in a nested list of dict and then obtain a list of terms and its uuid grouped by matched wildcard.
I've the following mapping in my index:
"mappings": {
"properties": {
"uuid": {
"type": "keyword"
},
"urls": {
"type": "nested",
"properties": {
"url": {
"type": "keyword"
},
"is_visited": {
"type": "boolean"
}
}
}
}
}
and a lot of data such this:
{
"uuid":"afa9ac03-0723-4d66-ae18-08a51e2973bd"
"urls": [
{
"is_visited": true,
"url": "https://www.google.com"
},
{
"is_visited": false,
"url": "https://www.facebook.com"
},
{
"is_visited": true,
"url": "https://www.twitter.com"
},
]
},
{
"uuid":"4a1c695d-756b-4d9d-b3a0-cf524d955884"
"urls": [
{
"is_visited": true,
"url": "https://www.stackoverflow.com"
},
{
"is_visited": false,
"url": "https://www.facebook.com"
},
{
"is_visited": false,
"url": "https://drive.google.com"
},
{
"is_visited": false,
"url": "https://maps.google.com"
},
]
}
...
I wish to search via wildcard "*google.com OR *twitter.com" and obtain something like this:
"hits": [
"*google.com": [
{
"uuid": "4a1c695d-756b-4d9d-b3a0-cf524d955884",
"_source": {
"is_visited": false,
"url": "https://drive.google.com"
}
},
{
"id": "4a1c695d-756b-4d9d-b3a0-cf524d955884",
"_source": {
"is_visited": false,
"url": "https://maps.google.com"
}
},
{
"uuid":"afa9ac03-0723-4d66-ae18-08a51e2973bd",
"_source": {
"is_visited": true,
"url": "https://www.google.com"
}
}
]
"*twitter.com": [
{
"uuid":"afa9ac03-0723-4d66-ae18-08a51e2973bd",
"_source": {
"is_visited": true,
"url": "https://www.twitter.com"
},
},
]
]
This is my (python) search query:
body = {
#"_source": False,
"size": 100,
"query": {
"nested": {
"path": "urls",
"query":{
"query_string":{
"query": f"urls.url:{urlToSearch}",
}
}
,"inner_hits": {
"size":100 # returns top 100 results
}
}
}
}
but it returns an hit for each matched term instead of aggregate them in a list similar to what I would like to get.
EDIT
This is my setting and mapping:
{
"settings": {
"analysis": {
"char_filter": {
"my_filter": {
"type": "mapping",
"mappings": [
"- => _",
]
},
},
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"my_filter"
],
"filter": [
"lowercase",
]
}
}
}
},
"mappings": {
"properties": {
"uuid": {
"type": "keyword"
},
"urls": {
"type": "nested",
"properties": {
"url": {
"type": "keyword"
},
"is_visited": {
"type": "boolean"
}
}
}
}
}
}
Elasticsearch will not provide the output you want the way you set up the query.
This scenario to be an aggregation. My suggestion was to apply the nested query and use aggregation on the results.
Attention point wildcard query:
Avoid beginning patterns with * or ?. This can increase the iterations
needed to find matching terms and slow search performance.
{
"size": 0,
"query": {
"nested": {
"path": "urls",
"query": {
"bool": {
"should": [
{
"wildcard": {
"urls.url": {
"value": "*google.com"
}
}
},
{
"wildcard": {
"urls.url": {
"value": "*twitter.com"
}
}
}
]
}
}
}
},
"aggs": {
"agg_providers": {
"nested": {
"path": "urls"
},
"aggs": {
"google.com": {
"terms": {
"field": "urls.url",
"include": ".*google.com",
"size": 10
}
},
"twitter.com": {
"terms": {
"field": "urls.url",
"include": ".*twitter.com",
"size": 10
}
}
}
}
}
}
Results:
"aggregations": {
"agg_providers": {
"doc_count": 7,
"twitter.com": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "https://www.twitter.com",
"doc_count": 1
}
]
},
"google.com": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "https://drive.google.com",
"doc_count": 1
},
{
"key": "https://maps.google.com",
"doc_count": 1
},
{
"key": "https://www.google.com",
"doc_count": 1
}
]
}
}
}

How to ignore the nested objects that have null value or don't exist

I have the below aggregations query.
{
"aggs": {
"selected_brand": {
"filter": {
"term": {
"brandId": "b1d28821-3730-4266-8f55-eb69596004fb"
}
}
},
"sponsorshipSets": {
"nested": {
"path": "sponsorshipSets"
},
"aggs": {
"sponsorships": {
"nested": {
"path": "sponsorshipSets.sponsorships"
},
"aggs": {
"count": {
"value_count": {
"field": "sponsorshipSets.sponsorships.id"
}
}
}
}
}
}
}
}
The response is the below.
{
"hits": {
"total": {
"value": 2980,
"relation": "eq"
}
},
"aggregations": {
"selected_brand": {
"doc_count": 314
},
"sponsorshipSets": {
"doc_count": 2635,
"sponsorships": {
"doc_count": 1076,
"count": {
"value": 1076
}
}
}
}
}
The response shows the count of sponsorship documents is 1076, now I want to retrieve the documents these documents and tried with the below query.
{
"query": {
"bool": {
"must": {
"nested": {
"path": "sponsorshipSets",
"query": {
"nested": {
"path": "sponsorshipSets.sponsorships",
"query": {
"bool": {
"must_not": [
{
"match": {
"sponsorshipSets.sponsorships": "null"
}
}
]
}
}
}
}
}
},
"filter": [
{
"term": {
"brandId": "b1d28821-3730-4266-8f55-eb69596004fb"
}
}
]
}
}
}
The interesting thing for the second query is the hits below is only 82.
"hits": {
"total": {
"value": 82,
"relation": "eq"
},
What I really want is to retrieve the count of all the sponsorshipSets.sponsorships documents that are not null or not exist. SponsorshipSets can be missing as well.
Find below the abbreviated template.
{
"index_patterns": "campaigns*",
"order": 4,
"version": 4,
"aliases": {
"campaigns": {
}
},
"settings": {
"number_of_shards": 5
},
"mappings": {
"dynamic": "false",
"properties": {
"brandId": {
"type": "keyword"
},
"sponsorshipSets": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"sponsorships": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
}
}
}
}
}
}
You can use exists query to filter such documents. Below query should help.
Query Request:
POST <your_index_name>/_search
{
"query":{
"bool":{
"must":{
"nested":{
"path":"sponsorshipSets",
"query":{
"nested":{
"path":"sponsorshipSets.sponsorships",
"query":{
"bool":{
"must_not":[
{
"exists":{
"field":"sponsorshipSets.sponsorships"
}
}
]
}
}
}
}
}
},
"filter":[
{
"term":{
"brandId":"b1d28821-3730-4266-8f55-eb69596004fb"
}
}
]
}
}
}
This should return all three scenarios where your document JSON structure would be
sponsorshipSets.sponsorships: {} i.e. you have empty structure for sponsorships
sponsorshipSets.sponsorships: null i.e. the value is set as null
Or your document doesn't have sponsorships field in first place.
You don't need to use any aggregations for this as ES would return you the count of such documents in hits.total.value of the response.
Let me know if this helps!

Elastic Search: Bool Query in nested properties

Lets assume I have data structured like this:
{ "id": "120400871755634330808993320",
"name": "Metaalschroef binnenzeskant, DIN 912 RVS A4-80",
"description": "m16x70 cilinderschroef bzk a4-80 din912 klasse 80",
"fullDescription": "Metaalschroef met een binnenzeskant cilinderkop",
"synonyms": [],
"properties": [
{
"name": "draad",
"value": "16",
"sort": 99
},
{
"name": "lengte",
"value": "70",
"sort": 99
},
{
"name": "materiaal",
"value": "roestvaststaal",
"sort": 99
},
{
"name": "kwaliteit (materiaal)",
"value": "A4",
"sort": 99
},
{
"name": "DIN",
"value": "912",
"sort": 99
},
{
"name": "AISI",
"value": "316",
"sort": 99
},
{
"name": "draadsoort",
"value": "metrisch",
"sort": 99
},
{
"name": "Merk",
"value": "Elcee Holland",
"sort": 1
}
]
}
How do I write a boolean query where I select all documents that have a property with name "draad" and value "16" and a property with name "lengte" and value "70".
Right now I have this but it returns 0 results:
"query" : {
"nested" : {
"path" : "properties",
"query" : {
"bool" : {
"must" : [{
"bool" : {
"must" : [{
"term" : {
"properties.name" : "Merk"
}
}, {
"term" : {
"properties.value" : "Facom"
}
}
]
}
}, {
"bool" : {
"must" : [{
"term" : {
"properties.name" : "materiaal"
}
}, {
"term" : {
"properties.value" : "kunststof"
}
}
]
}
}
]
}
}
}
}
Replacing the highest level "must" with "should" returns too many results, which makes sense as it translates to an "or".
When using must, the engine is trying to search for nested documents with name:Merk and value:Facom. But also with name:materiaal and value:kunststof - which is impossible to happen in the same nested document at once.
When using should as you mentioned, it translate to or - which is indeed possible.
Problem is, you also getting the entire parent document with all it's nested documents.
In my own answer I'm showing the steps to create an index with nested documents (you should mark the field properties as nested type`).
After complete those steps, you'll be able to get results with the following query:
{
"_source": [
"id",
"name",
"description"
],
"query": {
"bool": {
"must": [
{
"nested": {
"path": "properties",
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"term": {
"properties.name": "Merk"
}
},
{
"term": {
"properties.value": "Facom"
}
}
]
}
},
{
"bool": {
"must": [
{
"term": {
"properties.name": "materiaal"
}
},
{
"term": {
"properties.value": "kunststof"
}
}
]
}
}
]
}
},
"inner_hits":{
"size": 10
}
}
}
]
}
}
}
I found a solution that is working very well!
My property object now looks like this:
{
"name": "breedte(mm)",
"value": "1000",
"unit": "mm",
"sort": 99,
"nameSlug": "breedte-mm",
"slug": "breedte-mm-1000"
},
I added a slug (containing a normalized string for key + value) and a nameslug which is a normalized string for the name.
My index is mapped like this:
"properties": {
"type": "nested",
"include_in_parent": true,
"properties": {
"name": {
"type": "keyword"
},
"nameSlug": {
"type": "keyword"
},
"slug": {
"type": "keyword"
},
"sort": {
"type": "long"
},
"unit": {
"type": "text",
"index": false
},
"value": {
"type": "keyword"
}
}
}
The "include_in_parent" is important here. It allows me to do the query below:
"query": {
"bool": {
"must": [
{
"terms": {
"properties.slug": [
"merk-orbis",
"merk-bahco"
]
}
},
{
"terms": {
"properties.slug": [
"materiaal-staal",
"materiaal-kunststof"
]
}
}
]
}
},
This queries searches for all documents where "merk" is "Orbis" or "Bahco" and where "materiaal" is "staal" or "kunststof".
My aggregations look like this:
"merk_query": {
"filter": {
"bool": {
"must": [
{
"terms": {
"properties.slug": [
"materiaal-staal",
"materiaal-kunststof"
]
}
}
]
}
},
"aggs": {
"merk_facets": {
"nested": {
"path": "properties"
},
"aggs": {
"merk_only": {
"filter": {
"term": {
"properties.nameSlug": {
"value": "merk"
}
}
},
"aggs": {
"facets": {
"terms": {
"field": "properties.name",
"size": 1
},
"aggs": {
"facetvalues": {
"terms": {
"field": "properties.value",
"size": 10
}
}
}
}
}
}
}
}
}
},
I run filteraggregate which filters all documents that match a facet (but not the current one I am bulding).
The result of this aggragate is something like this:
"merk_query": {
"doc_count": 7686,
"merk_facets": {
"doc_count": 68658,
"merk_only": {
"doc_count": 7659,
"facets": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Merk",
"doc_count": 7659,
"facetvalues": {
"doc_count_error_upper_bound": 10,
"sum_other_doc_count": 438,
"buckets": [
{
"key": "Orbis",
"doc_count": 6295
},
{
"key": "DX",
"doc_count": 344
},
{
"key": "AXA",
"doc_count": 176
},
{
"key": "Talen Tools",
"doc_count": 127
},
{
"key": "Nemef",
"doc_count": 73
},
{
"key": "bonfix",
"doc_count": 67
},
{
"key": "Bahco",
"doc_count": 64
},
{
"key": "Henderson",
"doc_count": 27
},
{
"key": "Maasland Groep",
"doc_count": 25
},
{
"key": "SYSTEC",
"doc_count": 23
}
]
}
}
]
}
}
}
}
},
And this is the end result in the browser:

Group by a part of string from a field rather than the full field in Elasticsearch

Here structure of my index:
[
{
"Id":"1",
"Path":"/Series/Current/SerieA/foo/foo",
"PlayCount":100
},
{
"Id":"2",
"Path":"/Series/Current/SerieA/bar/foo",
"PlayCount":1000
},
{
"Id":"3",
"Path":"/Series/Current/SerieA/bar/bar",
"PlayCount":50
},
{
"Id":"4",
"Path":"/Series/Current/SerieB/bla/bla",
"PlayCount":300
},
{
"Id":"5",
"Path":"/Series/Current/SerieB/goo/boo",
"PlayCount":200
},
{
"Id":"6",
"Path":"/Series/Current/SerieC/foo/zoo",
"PlayCount":100
}
]
I'd like to execute an aggregation that bring me sum of "PlayCount" for each Series like:
[
{
"key":"serieA",
"TotalPlayCount":1150
},
{
"key":"serieB",
"TotalPlayCount":500
},
{
"key":"serieC",
"TotalPlayCount":100
}
]
This is how I try to do it but obviously query fails since this is not the proper way:
{
"size": 0,
"query":{
"filtered":{
"query":{
"regexp":{
"Path":"/Series/Current/.*"
}
}
}
},
"aggs":{
"play_count_for_current_series":{
"terms": {
"field": "Path",
"regexp": "/Series/Current/([^/]+)"
},
"aggs":{
"Total_play": { "sum": { "field": "PlayCount" } }
}
}
}
}
Is there a way to do it?
My suggestion is as follows:
DELETE test
PUT /test
{
"settings": {
"analysis": {
"filter": {
"my_special_filter": {
"type": "pattern_capture",
"preserve_original": 0,
"patterns": [
"/Series/Current/([^/]+)"
]
}
},
"analyzer": {
"my_special_analyzer": {
"tokenizer": "whitespace",
"filter": [
"my_special_filter"
]
}
}
}
},
"mappings": {
"test": {
"properties": {
"Path": {
"type": "string",
"fields": {
"for_aggregations": {
"type": "string",
"analyzer": "my_special_analyzer"
},
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
Create a special analyzer that uses a pattern_capture filter to catch only those terms that you are interested. Because I didn't want to change your current mapping for that field I added a fields section with a sub-field that will use this special analyzer. I also added a raw field which is not_analyzed which will help with the query itself.
POST test/test/_bulk
{"index":{}}
{"Id":"1","Path":"/Series/Current/SerieA/foo/foo","PlayCount":100}
{"index":{}}
{"Id":"2","Path":"/Series/Current/SerieA/bar/foo","PlayCount":1000}
{"index":{}}
{"Id":"3","Path":"/Series/Current/SerieA/bar/bar","PlayCount":50}
{"index":{}}
{"Id":"4","Path":"/Series/Current/SerieB/bla/bla","PlayCount":300}
{"index":{}}
{"Id":"5","Path":"/Series/Current/SerieB/goo/boo","PlayCount":200}
{"index":{}}
{"Id":"6","Path":"/Series/Current/SerieC/foo/zoo","PlayCount":100}
{"index":{}}
{"Id":"7","Path":"/Sersdasdies/Curradent/SerieC/foo/zoo","PlayCount":100}
For the query, you don't need the regular expression in the query because your aggregation will use that sub-field which only has your needed SerieX terms.
GET /test/test/_search
{
"size": 0,
"query": {
"filtered": {
"query": {
"regexp": {
"Path.raw": "/Series/Current/.*"
}
}
}
},
"aggs": {
"play_count_for_current_series": {
"terms": {
"field": "Path.for_aggregations"
},
"aggs": {
"Total_play": {
"sum": {
"field": "PlayCount"
}
}
}
}
}
}
And the result is
"play_count_for_current_series": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "SerieA",
"doc_count": 3,
"Total_play": {
"value": 1150
}
},
{
"key": "SerieB",
"doc_count": 2,
"Total_play": {
"value": 500
}
},
{
"key": "SerieC",
"doc_count": 1,
"Total_play": {
"value": 100
}
}
]
}

ElasticSearch Nested Filter and Aggregation

So I have this mapping:
"employee": {
"properties": {
"DaysOff": {
"type": "nested",
"properties": {
"Date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"Days": {
"type": "double"
},
"ID": {
"type": "long"
}
}
}
}
}
So basically a employee can have days off. Each day off they have is stored in an array under the property DaysOff. Days can be a fraction of a day, so if an employee took half a day off then it would be 0.5.
So I have this search:
{
"size": 45,
"filter": {
"nested": {
"path": "DaysOff",
"filter": {
"range": {
"DaysOff.Date": {
"from": "now-2M",
"to": "now"
}
}
}
}
}
}
which brings me back 45 documents. which is correct. I'm just can't figure out how to now apply an aggregation to these documents in order to get back the sum of all the days that have been taken.
Using this resource I tried this aggs but didn't get me the correct result:
{
"size": 45,
"filter": {
"nested": {
"path": "DaysOff",
"filter": {
"range": {
"DaysOff.Date": {
"from": "now-2M",
"to": "now"
}
}
}
}
},
"aggs": {
"sum_docs": {
"nested": {
"path": "DaysOff"
},
"aggs": {
"stepped_down": {
"sum": {
"field": "DaysOff.Days"
}
}
}
}
}
}
You need to filter on those nested documents to get the correct results, From the docs
Because nested documents are indexed as separate documents, they can only be accessed within the scope of the nested query,
I created index like this
POST employee
{
"mappings": {
"emp_map": {
"properties": {
"DaysOff": {
"type": "nested",
"properties": {
"Date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"Days": {
"type": "double"
},
"ID": {
"type": "long"
}
}
},
"name": {
"type": "string"
}
}
}
}
},
Then I indexed few documents like this,
PUT employee/emp_map/1
{
"name" : "messi",
"DaysOff" : [
{
"Date" : "2015-11-01",
"Days" : 1,
"ID" : 11
},
{
"Date" : "2014-11-01",
"Days" : 2,
"ID" : 11
},
{
"Date" : "2015-12-01",
"Days" : 0.5,
"ID" : 11
}
]
}
PUT employee/emp_map/2
{
"name" : "ronaldo",
"DaysOff" : [
{
"Date" : "2015-10-01",
"Days" : 3,
"ID" : 12
},
{
"Date" : "2014-11-01",
"Days" : 2,
"ID" : 12
},
{
"Date" : "2015-12-01",
"Days" : 0.5,
"ID" : 12
}
]
}
PUT employee/emp_map/3
{
"name" : "suarez",
"DaysOff" : [
{
"Date" : "2015-11-01",
"Days" : 4,
"ID" : 13
},
{
"Date" : "2015-11-09",
"Days" : 2,
"ID" : 13
},
{
"Date" : "2015-12-01",
"Days" : 1.5,
"ID" : 13
}
]
}
This is my query, notice the filter aggregation in nested aggregation, without that ES will give you sum of all the days taken off.
GET employee/_search
{
"query": {
"bool": {
"filter": {
"nested": {
"path": "DaysOff",
"query": {
"range": {
"DaysOff.Date": {
"from": "now-2M",
"to": "now"
}
}
}
}
}
}
},
"aggs": {
"emp_name": {
"terms": {
"field": "name",
"size": 10
},
"aggs": {
"nesting": {
"nested": {
"path": "DaysOff"
},
"aggs": {
"filter_date": {
"filter": {
"range": {
"DaysOff.Date": {
"from": "now-2M",
"to": "now"
}
}
},
"aggs": {
"sum_taken_off_days": {
"sum": {
"field": "DaysOff.Days"
}
}
}
}
}
}
}
}
},
"size": 0
}
This is the result I get,
"aggregations": {
"emp_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "messi",
"doc_count": 1,
"nesting": {
"doc_count": 3,
"filter_date": {
"doc_count": 2,
"sum_taken_off_days": {
"value": 1.5
}
}
}
},
{
"key": "ronaldo",
"doc_count": 1,
"nesting": {
"doc_count": 3,
"filter_date": {
"doc_count": 1,
"sum_taken_off_days": {
"value": 0.5
}
}
}
},
{
"key": "suarez",
"doc_count": 1,
"nesting": {
"doc_count": 3,
"filter_date": {
"doc_count": 3,
"sum_taken_off_days": {
"value": 7.5
}
}
}
}
]
}
}
P.S : This is per employee, you can remove emp_name terms aggregation to get sum of all employees.

Resources