Elasticsearch sum first occurrences of a day by term - elasticsearch

I am trying to convert our MySQL statistic Queries to our new Elasticsearch (version 5.4) Server.
In mysql we have a statistik table and a second table with only the first request of a user within a day.
Currently we're using PHP/MySQL to fill that second table and ignore all the other requests that are in the statistik table.
The query we're running over the table looks like this:
SELECT
SUM(price_displayed * (query_amount / pricing_unit)) AS `requests`,
SUM(price_displayed * (order_amount / pricing_unit)) AS `orders`,
EXTRACT(YEAR_MONTH FROM `ts`) AS `date`
FROM statistics_values
The goal is to get rid of the second table.
Is it possible to get only the first document per day and user, and then use a script to calculate the result like in the MySQL query?
I tried using a date_histogram aggregation with a terms aggregation, but it doesn't work.
The Query looks like this:
{
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "(client_branch:DE*)"
}
},
{
"range": {
"created": {
"gte": "2017-01-01",
"lte": "2017-05-31",
"format": "date"
}
}
},
{
"term": {
"action": "query"
}
},
{
"term": {
"type": "enquiry"
}
}
],
"must_not": [
{
"term": {
"exclude_statistics": "1"
}
}
]
}
},
"aggs": {
"by_day": {
"date_histogram": {
"field": "created",
"interval": "day"
},
"aggs": {
"by_term": {
"terms": {
"field": "user_id"
}
}
}
}
},
"size": 0
}
Any ideas?

Related

What is the best way to aggregate the time between events in ElasticSearch?

I'm querying an ElasticSearch database in which several applications are logging every change they make to a shared entity - each application is responsible for managing different aspects of this shared entity. The entity is persisted in a document-database, but each change is persisted in this ElasticSearch database.
I'm attempting to query for changes to a specific property (status) in order to track the lifecycle of these Product entities over time. I need to be able to dynamically answer questions like:
Over the last N weeks, what's the average time it took for a Product to move from status-"Created" to status-"Details Submitted"?
During a specific time range, what's the average time it took for a Product to move from status-"Reviewed" to status-"Available Online"?
How long did it take for Products in Group-A to move from status-"Details Submitted" to status-"Reviewed"?
In SQL I might use the group-by clause and perhaps some sub-queries, like:
select avg(submitted), avg(reviewed)
from (
select id,
max(timestamp) as reviewed,
min(timestamp) as submitted,
count(*) as statusChanges
from changes
where (
(key = 'status' and previous = 'Created' and updated = 'Details Submitted')
or (key = 'status' and previous = 'Details Submitted' and updated = 'Reviewed')
) and timestamp > ? and timestamp < ? and group_id = ?
group by id
)
where statusChanges = 2
What's the best way to accomplish something comparable in ElasticSearch?
I've tried using a Composite Index, which works decently when I need to examine the specific dates of when each Product changed its status - since it allows pagination. However this doesn't allow any further sorting of results nor overall aggregation. You can only sort by the field you grouped-by and you can't aggregate across all products.
I've just recently come across the concept of a Transform index. Is that the best approach for aggregating the results of an aggregation? I haven't gotten access to try this out yet, but I'm attempting to formulate a potential Transform Index now and struggling a bit.
Here's the composite query I was able to write for finding out how long each Product remained in a specific status, although I couldn't figure out how to get min_doc_count to work in a composite query...
// GET: https://<my-cluster-hostname>:9092/product-index/_search
{
"size": 0,
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"match_phrase": {
"change.key": "status"
}
},
{
"match_phrase": {
"change.previousValue": "Created"
}
},
{
"match_phrase": {
"change.updatedValue": "Details Submitted"
}
}
]
}
},
{
"bool": {
"must": [
{
"match_phrase": {
"change.key": "status"
}
},
{
"match_phrase": {
"change.previousValue": "Details Submitted"
}
},
{
"match_phrase": {
"change.updatedValue": "Reviewed"
}
}
]
}
}
]
}
},
"aggs": {
"how-long-before-submitted-details-reviewed": {
"composite": {
"size": 20,
"after": {
"item": "<last_uuid_from_previous_page>"
},
"sources": [
{
"product": {
"terms": {
"field": "metadata.uuid.keyword",
"order": "desc"
}
}
}
]
},
"aggs": {
"detailsSubmitted": {
"min": {
"field": "timestamp"
}
},
"detailsReviewed": {
"max": {
"field": "timestamp"
}
}
}
}
}
}
Here's the Transform Index I'm thinking of submitting. But I wonder if there's a way of getting it to cover all status changes, or if instead I'll need to create an index for each status change like this and then filter/sort/aggregate over this Transform Index:
// PUT: https://<my-cluster-hostname>:9092/_transform/details-submitted-to-reviewed
{
"source": {
"index": "product-index",
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"match_phrase": {
"change.key": "status"
}
},
{
"match_phrase": {
"change.previousValue": "Created"
}
},
{
"match_phrase": {
"change.updatedValue": "Details Submitted"
}
}
]
}
},
{
"bool": {
"must": [
{
"match_phrase": {
"change.key": "status"
}
},
{
"match_phrase": {
"change.previousValue": "Details Submitted"
}
},
{
"match_phrase": {
"change.updatedValue": "Reviewed"
}
}
]
}
}
]
}
}
},
"dest": {
"index": "details-submitted-to-reviewed"
},
"pivot": {
"group_by": {
"product-id": {
"terms": {
"field": "metadata.uuid.keyword"
}
}
},
"aggregations": {
"detailsSubmitted": {
"min": {
"field": "timestamp"
}
},
"detailsReviewed": {
"max": {
"field": "timestamp"
}
}
}
}
}

Need aggregation of only the query results

I need to do an aggregation, but only over the limited results I get from the query. It is not working: it returns other results outside the size limit of the query. Here is the query I am doing:
{
"size": 500,
"query": {
"bool": {
"must": [
{
"term": {
"tags.keyword": "possiblePurchase"
}
},
{
"term": {
"clientName": "Ci"
}
},
{
"range": {
"firstSeenDate": {
"gte": "now-30d"
}
}
}
],
"must_not": [
{
"term": {
"tags.keyword": "skipPurchase"
}
}
]
}
},
"sort": [
{
"firstSeenDate": {
"order": "desc"
}
}
],
"aggs": {
"byClient": {
"terms": {
"field": "clientName",
"size": 25
},
"aggs": {
"byTarget": {
"terms": {
"field": "targetName",
"size": 6
},
"aggs": {
"byId": {
"terms": {
"field": "id",
"size": 5
}
}
}
}
}
}
}
}
I need the aggregations to only consider the first 500 results of the query, sorted by the field I am requesting on the query. I am completely lost. Thanks for the help
The scope of the aggregation is the full set of hits matching your query; the size parameter is only used to specify the number of hits to fetch and display.
If you want to restrict the scope of the aggregation to the first n hits of a query, I would suggest the sampler aggregation in combination with your query.

Elasticsearch: Group by timeframe

I made this query to get the number of requests a user made in the last month (or day) compared to the rest of the users.
{
"query": {
"bool": {
"must": [
{
"range": {
"created": {
"gte": 1554854400000
}
}
}
]
}
},
"aggs": {
"requests": {
"filters": {
"other_bucket_key": "all",
"filters": {
"user": {
"match": {
"user_id": "XXXXXX"
}
}
}
}
}
}
}
These are all the requests made in the selected period of time.
Now, I want to get the number of requests / day the user made in the last month compared to the rest of the users.
I was able to obtain this using Date Histogram Aggregation for the total number of requests made but I can't figure out how to split that into user and the rest.
I don't know if that's possible or maybe there's another way of doing this.
You're on the right path, you simply need to combine the date_histogram daily aggregation and the filters aggregation you already have:
{
"query": {
"bool": {
"must": [
{
"range": {
"created": {
"gte": 1554854400000
}
}
}
]
}
},
"aggs": {
"per_day": {
"date_histogram": {
"field": "created",
"interval": "day"
},
"aggs": {
"requests": {
"filters": {
"other_bucket_key": "all",
"filters": {
"user": {
"match": {
"user_id": "XXXXXX"
}
}
}
}
}
}
}
}
}
For each day, you're going to get the number of requests made by the user vs the number of requests for all other users.

Elasticsearch aggregate field between dates

I want to compare two buckets against each other and find new occurrences that appear in the second bucket. The below query returns all entries in the "query.keyword" field between the two UNIX timestamps provided, but I want the UNIX timestamps to be a part of the aggregation section itself.
GET _search
{
"size": 0,
"query": {
"range" :{
"ts": {
"gte":1535155200,
"lte":1535414399
}
}
},
"aggs": {
"domains": {
"terms": {
"field":"query.keyword"
}
}
}
}
I've also tried this but received the error:
"Found two aggregation type definitions in [domains_prev]: [range] and [terms]",
GET _search
{
"size": 0,
"aggs": {
"domains_prev": {
"range" :{
"field":"ts",
"ranges": [
{"to" : 1535414399},
{"from" : 1535155200}
]
},
"terms": {
"field":"query.keyword"
}
}
}
}
The goal is to have something similar to this:
Agg1
"domains_prev"
"field":"query.keyword"
date:gte:timestamp, lte:timestamp
Agg2
"domains_today"
"field":"query.keyword"
date:today
show all "query.keyword" in agg2 that does not appear in agg1.
This is the SQL query that I use to achieve the intended result:
select domains FROM table WHERE date >= 20171123 and domains NOT IN (SELECT domains FROM table WHERE date < 20171123 group by domains)
You'll want to do a nested bucket aggregation starting with date range:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-daterange-aggregation.html
From their page, start with an aggregation like this at the top level:
{
"aggs": {
"range": {
"date_range": {
"field": "date",
"format": "MM-yyy",
"ranges": [
{ "to": "now-10M/M" },
{ "from": "now-10M/M" }
]
}
}
}
}
Then nest your existing terms aggregation using query.keyword under that.
The end result should be something like:
{
"aggs": {
"range": {
"date_range": {
"field": "date",
"format": "MM-yyy",
"ranges": [
{ "to": "now-10M/M" },
{ "from": "now-10M/M" }
]
},
"aggs": {
"domains": {
"terms": {
"field":"query.keyword"
}
}
}
}
}
}

Elasticsearch count number of occurrences

I am trying to write an Elasticsearch query that will show me the number of returning users to a site. The following query returns all unique users for a day by site. I am looking for the number of users that landed on a site only once for the time period.
GET 2015.*/_search?search_type=count
{
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"range": {
"#timestamp": {
"gte": "now-1d/d",
"lte": "now-1d/d"
}
}
},
{
"fquery": {
"query": {
"query_string": {
"query": "event:script_initiated"
}
}
}
},
{
"fquery": {
"query": {
"query_string": {
"query": "session_depth:0"
}
}
}
}
]
}
}
}
},
"aggs": {
"Site Name": {
"terms": {
"field": "site_name",
"size": 1
},
"aggs": {
"uniques": {
"cardinality": {
"field": "user_id"
}
}
}
}
}
}
You will need to use a metric aggregation and write a script for this.
In the script, you can check whether the same user name has come up across multiple documents and hence see the number of occurrences of a user.
Or you can wait for the issue referred to in this bug to be resolved.

Resources