How to get the maximum value below the selected date? - clickhouse

I've got dataset:
date | used_key
2000-01-01 | 1
2000-01-01 | 2
2000-01-01 | 3
2000-01-01 | 4
2000-01-02 | 1
2000-01-02 | 3
2000-01-03 | 1
2000-01-04 | 5
2000-01-04 | 6
2000-01-06 | 3
I need to get the maximum key value that was reached before the selected day:
date | max_key
2000-01-01 | 4
2000-01-02 | 4
2000-01-03 | 4
2000-01-04 | 6
2000-01-06 | 6
Something like this (without a JOIN section), but done the right way:
-- Asker's sketch, not a working solution.
-- 'date_below' is a string literal; presumably a placeholder for "every earlier date" (confirm with the asker).
-- As written, max() is computed separately inside each date group.
SELECT max(used_key) max_key, date
FROM t1
WHERE 'date_below' <= date
GROUP BY date

Try this query:
/*
 * Running maximum of used_key by date: each date is paired with the largest key
 * seen on that date or on any earlier date.
 *
 * The innermost aggregate yields one row per date (newest first). The rows are
 * folded into two parallel arrays; for position `pos`, the slice starting at `pos`
 * holds the daily maxima of that date and of all earlier dates, so its maximum is
 * the running maximum. arrayJoin then unfolds the array back into rows.
 */
SELECT
    running.1 AS date,
    running.2 AS max_key
FROM
(
    SELECT
        groupArray(date) AS dates,
        groupArray(max_used_key) AS max_used_keys,
        arrayMap(
            (d, pos) -> (d, arrayReduce('max', arraySlice(max_used_keys, pos))),
            dates,
            arrayEnumerate(dates)
        ) AS result_array,
        arrayJoin(result_array) AS running
    FROM
    (
        -- Daily maximum, newest date first (the slicing above relies on this order).
        SELECT
            date,
            max(used_key) AS max_used_key
        FROM
        (
            /* test data */
            SELECT
                data.1 AS date,
                data.2 AS used_key
            FROM
            (
                SELECT arrayJoin([
                    (toDate('2000-01-01'), 1),
                    (toDate('2000-01-01'), 2),
                    (toDate('2000-01-01'), 3),
                    (toDate('2000-01-01'), 4),
                    (toDate('2000-01-02'), 1),
                    (toDate('2000-01-02'), 3),
                    (toDate('2000-01-03'), 1),
                    (toDate('2000-01-04'), 5),
                    (toDate('2000-01-04'), 6),
                    (toDate('2000-01-06'), 3)
                ]) AS data
            )
        )
        GROUP BY date
        ORDER BY date DESC
    )
);
/* Result:
┌───────date─┬──max_key─┐
│ 2000-01-06 │ 6 │
│ 2000-01-04 │ 6 │
│ 2000-01-03 │ 4 │
│ 2000-01-02 │ 4 │
│ 2000-01-01 │ 4 │
└────────────┴──────────┘
*/

Related

How to check missing values in Clickhouse

I have a table that is filled with data every 15 minutes. I need to check that there is data for every day of the entire period. There is a time column in which the data is in the format yyyy-mm-dd hh:mm:ss.
I've already found the start date and the last date.
I found out that you can generate an array of dates from this interval (start and end dates) against which each row is compared; a date with no match is a missing date.
I've tried this:
-- Asker's attempt, kept exactly as posted; it fails with NOT_FOUND_COLUMN_IN_BLOCK.
-- dates_range: first and last calendar day present in the table.
WITH dates_range AS (SELECT toDate(min(time)) AS start_date,
toDate(max(time)) AS end_date
FROM table)
SELECT dates
FROM (
-- The left side exposes a single array column named `dates` (never unnested into rows),
-- so there is no `date` column for USING to join on.
-- Also, range(0, n) stops at n - 1, so end_date itself is not generated.
SELECT arrayFlatten(arrayMap(x -> start_date + x, range(0, toUInt64(end_date - start_date)))) AS dates
FROM dates_range
)
LEFT JOIN (
-- Distinct days that actually have data.
SELECT toDate(time) AS date
FROM table
GROUP BY toDate(time)
) USING date
WHERE date IS NULL;
But it fails with Code: 10. DB::Exception: Not found column date in block. There are only columns: dates. (NOT_FOUND_COLUMN_IN_BLOCK) and I can't resolve it.
You can also use the WITH FILL modifier: https://clickhouse.com/docs/en/sql-reference/statements/select/order-by/#order-by-expr-with-fill-modifier
-- Sample data: 550 timestamps offset from 2020-01-01 by number * 60 * 24 seconds;
-- rows whose number is a multiple of 33 get a tripled offset.
CREATE TABLE T (time DateTime) ENGINE = Memory
AS SELECT toDateTime('2020-01-01') + (((number * 60) * 24) * if((number % 33) = 0, 3, 1))
FROM numbers(550);

-- Count rows per day and let WITH FILL insert the days that are absent between the
-- first and the last day; the inserted rows have c = 0, which the outer filter keeps.
SELECT
    t,
    c
FROM
(
    SELECT
        toDate(time) AS t,
        count() AS c
    FROM T
    GROUP BY t
    ORDER BY t ASC WITH FILL
)
WHERE c = 0
┌──────────t─┬─c─┐
│ 2020-01-11 │ 0 │
│ 2020-01-13 │ 0 │
│ 2020-01-16 │ 0 │
│ 2020-01-18 │ 0 │
│ 2020-01-21 │ 0 │
│ 2020-01-23 │ 0 │
│ 2020-01-26 │ 0 │
└────────────┴───┘
-- Sample data: 550 timestamps offset from 2020-01-01 by number * 60 * 24 seconds;
-- rows whose number is a multiple of 33 get a tripled offset.
CREATE TABLE T (time DateTime) ENGINE = Memory
AS SELECT toDateTime('2020-01-01') + (((number * 60) * 24) * if((number % 33) = 0, 3, 1))
FROM numbers(550);

-- bounds = (first day, last day) present in T.
-- type = 2 rows: one per calendar day in [first day, last day] (the expected days).
-- type = 1 rows: one per day that actually has data, with its row count.
-- A day is reported when either side is missing (c1 = 0 means no data for that day).
-- The lambda parameter is named day_offset so that it cannot shadow the WITH alias
-- when the tuple elements are read inside the lambda.
WITH (SELECT (toDate(min(time)), toDate(max(time))) FROM T) AS bounds
SELECT
    date,
    sumIf(cnt, type = 1) AS c1,
    sumIf(cnt, type = 2) AS c2
FROM
(
    SELECT
        arrayJoin(arrayMap(day_offset -> bounds.1 + day_offset, range(0, toUInt64(bounds.2 - bounds.1 + 1)))) AS date,
        2 AS type,
        1 AS cnt
    UNION ALL
    SELECT
        toDate(time) AS date,
        1 AS type,
        count() AS cnt
    FROM T
    GROUP BY toDate(time)
)
GROUP BY date
HAVING c1 = 0 OR c2 = 0;
┌───────date─┬─c1─┬─c2─┐
│ 2020-01-11 │ 0 │ 1 │
│ 2020-01-13 │ 0 │ 1 │
│ 2020-01-16 │ 0 │ 1 │
│ 2020-01-18 │ 0 │ 1 │
│ 2020-01-21 │ 0 │ 1 │
│ 2020-01-23 │ 0 │ 1 │
│ 2020-01-26 │ 0 │ 1 │
└────────────┴────┴────┘
-- Sample data: 550 timestamps offset from 2020-01-01 by number * 60 * 24 seconds;
-- rows whose number is a multiple of 33 get a tripled offset.
CREATE TABLE T (time DateTime) ENGINE = Memory
AS SELECT toDateTime('2020-01-01') + (((number * 60) * 24) * if((number % 33) = 0, 3, 1))
FROM numbers(550);

-- bounds = (first day, last day) present in T.
-- l: every calendar day in [first day, last day]; r: the days that have data.
-- join_use_nulls = 1 makes unmatched right-side columns NULL, which the filter relies on.
-- The lambda parameter is named day_offset so that it cannot shadow the WITH alias
-- when the tuple elements are read inside the lambda.
WITH (SELECT (toDate(min(time)), toDate(max(time))) FROM T) AS bounds
SELECT l.*, r.*
FROM
(
    SELECT arrayJoin(arrayMap(day_offset -> bounds.1 + day_offset, range(0, toUInt64(bounds.2 - bounds.1 + 1)))) AS date
) l
LEFT JOIN
(
    SELECT toDate(time) AS date
    FROM T
    GROUP BY toDate(time)
) r USING date
WHERE r.date IS NULL
SETTINGS join_use_nulls = 1;
┌───────date─┬─r.date─┐
│ 2020-01-11 │ ᴺᵁᴸᴸ │
│ 2020-01-13 │ ᴺᵁᴸᴸ │
│ 2020-01-16 │ ᴺᵁᴸᴸ │
│ 2020-01-18 │ ᴺᵁᴸᴸ │
│ 2020-01-21 │ ᴺᵁᴸᴸ │
│ 2020-01-23 │ ᴺᵁᴸᴸ │
│ 2020-01-26 │ ᴺᵁᴸᴸ │
└────────────┴────────┘
-- Sample data: 550 timestamps offset from 2020-01-01 by number * 60 * 24 seconds;
-- rows whose number is a multiple of 33 get a tripled offset.
CREATE TABLE T (time DateTime) ENGINE = Memory
AS SELECT toDateTime('2020-01-01') + (((number * 60) * 24) * if((number % 33) = 0, 3, 1))
FROM numbers(550);

-- For every day that has data, compute the distance to the previous such day
-- (a one-row frame "1 PRECEDING .. 1 PRECEDING" acts as a lag).
-- A distance above 1 means days were skipped; the row returned is the first day
-- WITH data after the gap, not the skipped day itself.
-- NOTE(review): the upper bound 10000 presumably discards the first row, which has
-- no preceding row to subtract -- confirm.
SELECT b
FROM
(
    SELECT
        b,
        (b - any(b) OVER (ORDER BY b ASC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING)) AS days_since_prev
    FROM
    (
        SELECT toDate(time) AS b
        FROM T
        GROUP BY b
        ORDER BY b ASC
    )
)
WHERE days_since_prev > 1 AND days_since_prev < 10000
┌──────────b─┐
│ 2020-01-12 │
│ 2020-01-14 │
│ 2020-01-17 │
│ 2020-01-19 │
│ 2020-01-22 │
│ 2020-01-24 │
│ 2020-01-27 │
└────────────┘

How to realize funnel analysis in ClickHouse

I want to do funnel analysis based on buried point data that are stored in ClickHouse. Let's define a few elements for funnel analysis:
A series of events: A (event_id = 1) -> B (event_id = 2) -> C (event_id = 3)
Time period: 0 (event_ms) ~ 500 (event_ms)
Time window: 100 (event_ms)
I want to know, for each user, if there is an event series (A->B->C) happened within the time period, and intervals between A and C is within the time window.
Here is my test dataset:
-- Event log used by the funnel examples: one row per (event, timestamp, user).
-- NOTE(review): AggregatingMergeTree collapses rows that share the full sorting key;
-- with every column in the key this presumably only de-duplicates identical events.
-- A plain MergeTree may be what is intended -- confirm.
CREATE TABLE test_dataset
(
    `event_id` UInt64,
    `event_ms` UInt64,
    `uid` UInt64 -- user_id
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(toDate(event_ms))
ORDER BY (event_id, event_ms, uid)
SETTINGS index_granularity = 8192;
-- Sample events, grouped by event_id (1 = A, 2 = B, 3 = C).
INSERT INTO test_dataset (event_id, event_ms, uid) VALUES
    (1, 100, 123), (1, 120, 123), (1, 130, 123), (1, 150, 345), (1, 180, 345),
    (2, 150, 123), (2, 200, 234), (2, 140, 345), (2, 210, 345), (2, 300, 345),
    (3, 180, 123), (3, 250, 123), (3, 290, 234), (3, 270, 345);
I use join to find all qualified event series:
-- Funnel A -> B -> C via chained ASOF joins, reading the test_dataset table defined
-- and populated in this thread.
--   t1: A events inside the period
--   t2: closest B event after the A event
--   t3: closest C event after that B event
--   t4: a C event that falls before (A time + 100), i.e. inside the time window
-- NOTE(review): the final filter presumably relies on unmatched LEFT JOIN rows
-- carrying the default value 0 in t4.event_ms -- confirm join_use_nulls is off.
SELECT
    t1.event_ms, t2.event_ms, t3.event_ms, t4.event_ms,
    t1.uid, t2.uid, t3.uid, t4.uid
FROM
(
    SELECT uid, event_ms
    FROM test_dataset
    WHERE event_id = 1 AND event_ms >= 0 AND event_ms <= 500
) AS t1
ASOF LEFT JOIN
(
    SELECT uid, event_ms
    FROM test_dataset
    WHERE event_id = 2 AND event_ms >= 0 AND event_ms <= 500
) AS t2
    ON t1.uid = t2.uid AND t1.event_ms < t2.event_ms
ASOF LEFT JOIN
(
    SELECT uid, event_ms
    FROM test_dataset
    WHERE event_id = 3 AND event_ms >= 0 AND event_ms <= 500
) AS t3
    ON t2.uid = t3.uid AND t2.event_ms < t3.event_ms
ASOF LEFT JOIN
(
    SELECT uid, event_ms
    FROM test_dataset
    WHERE event_id = 3 AND event_ms >= 0 AND event_ms <= 500
) AS t4
    ON t3.uid = t4.uid AND t4.event_ms < t1.event_ms + 100
WHERE t4.event_ms > 0;
Here are all qualified event series:
┌─t1.event_ms─┬─t2.event_ms─┬─t3.event_ms─┬─t4.event_ms─┬─t1.uid─┬─t2.uid─┬─t3.uid─┬─t4.uid─┐
│ 180 │ 210 │ 270 │ 270 │ 345 │ 345 │ 345 │ 345 │
└─────────────┴─────────────┴─────────────┴─────────────┴────────┴────────┴────────┴────────┘
┌─t1.event_ms─┬─t2.event_ms─┬─t3.event_ms─┬─t4.event_ms─┬─t1.uid─┬─t2.uid─┬─t3.uid─┬─t4.uid─┐
│ 120 │ 150 │ 180 │ 180 │ 123 │ 123 │ 123 │ 123 │
└─────────────┴─────────────┴─────────────┴─────────────┴────────┴────────┴────────┴────────┘
┌─t1.event_ms─┬─t2.event_ms─┬─t3.event_ms─┬─t4.event_ms─┬─t1.uid─┬─t2.uid─┬─t3.uid─┬─t4.uid─┐
│ 130 │ 150 │ 180 │ 180 │ 123 │ 123 │ 123 │ 123 │
└─────────────┴─────────────┴─────────────┴─────────────┴────────┴────────┴────────┴────────┘
┌─t1.event_ms─┬─t2.event_ms─┬─t3.event_ms─┬─t4.event_ms─┬─t1.uid─┬─t2.uid─┬─t3.uid─┬─t4.uid─┐
│ 100 │ 150 │ 180 │ 180 │ 123 │ 123 │ 123 │ 123 │
└─────────────┴─────────────┴─────────────┴─────────────┴────────┴────────┴────────┴────────┘
Then I know user 123 and 345 have such event series within the time period. Using join is pretty slow in ClickHouse, is there any other way to work around this problem?
By the way, I don't need to know all qualified series, I only want to know if there is one such event series for each user.
There is a function, windowFunnel, that searches for a chain of events in a sliding time window.
-- Longest prefix of the chain A (event_id = 1) -> B (2) -> C (3) matched per user,
-- with a window of 100 event_ms units.
-- The period bounds are inclusive, matching the period 0 ~ 500 as the question filters it.
SELECT
    uid,
    windowFunnel(100)(event_ms, event_id = 1, event_id = 2, event_id = 3) AS chain_len
FROM test_dataset
WHERE event_ms >= 0 AND event_ms <= 500
GROUP BY uid;
Result:
┌─uid─┬─chain_len─┐
│ 234 │ 0 │
│ 345 │ 3 │
│ 123 │ 3 │
└─────┴───────────┘
It returns the length of the matched chain, so for users 345 and 123 the value 3 means that the whole chain is matched.
If we decrease the window to 10, it finds only the beginning of the chain and does not match further events, because the condition "timestamp of event 2 <= timestamp of event 1 + window" does not hold.
-- Same chain with a window of only 10 event_ms units.
-- The period bounds are inclusive, matching the period 0 ~ 500 as the question filters it.
SELECT
    uid,
    windowFunnel(10)(event_ms, event_id = 1, event_id = 2, event_id = 3) AS chain_len
FROM test_dataset
WHERE event_ms >= 0 AND event_ms <= 500
GROUP BY uid
Result:
┌─uid─┬─chain_len─┐
│ 234 │ 0 │
│ 345 │ 1 │
│ 123 │ 1 │
└─────┴───────────┘
So, to check whether such a chain exists for a user, check that windowFunnel matched the full number of events.
The restriction on the time interval (time period: 0 (event_ms) ~ 500 (event_ms)) is handled simply in the WHERE clause.
Add more events outside the period:
-- A complete A -> B -> C chain for user 234 at event_ms 600..602.
INSERT INTO test_dataset (event_id, event_ms, uid) VALUES (1, 600, 234), (2, 601, 234), (3, 602, 234);
Then check:
-- Window of 100 with the period filter applied: events outside 0 ~ 500 are ignored.
-- The period bounds are inclusive, matching the period 0 ~ 500 as the question filters it.
SELECT
    uid,
    windowFunnel(100)(event_ms, event_id = 1, event_id = 2, event_id = 3) AS chain_len
FROM test_dataset
WHERE event_ms >= 0 AND event_ms <= 500
GROUP BY uid
Result:
┌─uid─┬─chain_len─┐
│ 234 │ 0 │
│ 345 │ 3 │
│ 123 │ 3 │
└─────┴───────────┘
Without WHERE
-- Same funnel with no period filter: every event of every user takes part.
SELECT uid, windowFunnel(100)(event_ms, event_id = 1, event_id = 2, event_id = 3) AS chain_len
FROM test_dataset
GROUP BY uid
Result:
┌─uid─┬─chain_len─┐
│ 234 │ 3 │
│ 345 │ 3 │
│ 123 │ 3 │
└─────┴───────────┘

Record ranking in Clickhouse

For example, i have a table:
CREATE DATABASE IF NOT EXISTS example;

-- One row per observation: when it started and which name it carries.
-- ClickHouse type names are case-sensitive (String, not string), and MergeTree
-- needs a sorting key; starttime is used because the queries order by it.
CREATE TABLE IF NOT EXISTS example.etable
(
    starttime DateTime,
    name String
)
ENGINE = MergeTree
ORDER BY starttime;
In order to apply a GROUP BY operation, I need to calculate a rank for each record, so that consecutive ordered records with the same value of the "name" field share the same rank. If the current record's name differs from the previous record's, the rank is incremented.
In MySQL this can be done with query like that:
-- MySQL: rank runs of consecutive equal names using user-defined variables.
-- @prev holds the previous row's name, @curr the current one; @rank is kept when
-- they are equal and incremented otherwise. The derived table r initialises the
-- variables before the first row is read.
SELECT name, starttime,
       @prev := @curr,
       @curr := name,
       @rank := IF(@prev = @curr, @rank, @rank + 1) AS rank
FROM example.etable,
     (SELECT @curr := NULL, @prev := NULL, @rank := 0) r
ORDER BY starttime ASC;
Example output:
+------+---------------------+----------------+---------------+------+
| name | starttime | @prev := @curr | @curr := name | rank |
+------+---------------------+----------------+---------------+------+
| s1 | 2020-05-14 15:56:46 | NULL | s1 | 1 |
| s1 | 2020-05-14 15:56:49 | s1 | s1 | 1 |
| s1 | 2020-05-14 15:56:51 | s1 | s1 | 1 |
| s2 | 2020-05-14 15:56:53 | s1 | s2 | 2 |
| s1 | 2020-05-14 15:56:56 | s2 | s1 | 3 |
| s3 | 2020-05-14 15:56:59 | s1 | s3 | 4 |
+------+---------------------+----------------+---------------+------+
So, here is the question, how can I achieve this in Clickhouse?
Calculating the rank consists of three steps:
transform relation to arrays (groupArray)
calculate ranks (arrayCumSum)
transform arrays to relation (arrayJoin).
/*
 * Rank runs of consecutive equal names in starttime order.
 *
 *   1. groupArray folds the ordered rows into parallel arrays.
 *   2. arrayCumSum adds 1 at the first element and at every position whose name
 *      differs from the previous one, 0 otherwise; the running sum is the rank.
 *   3. arrayZip + arrayJoin unfold the arrays back into rows.
 */
SELECT
    ranked.1 AS starttime,
    ranked.2 AS name,
    ranked.3 AS rank
FROM
(
    SELECT
        groupArray(starttime) AS starttime_arr,
        groupArray(name) AS name_arr,
        arrayCumSum(
            (cur_name, pos) -> if(pos = 1, 1, if(name_arr[pos - 1] = cur_name, 0, 1)),
            name_arr,
            arrayEnumerate(name_arr)
        ) AS ranks,
        arrayZip(starttime_arr, name_arr, ranks) AS result_array,
        arrayJoin(result_array) AS ranked
    FROM
    (
        SELECT *
        FROM
        (
            /* emulate the 'example.etable'-table */
            SELECT
                toDateTime(test_data.1) AS starttime,
                test_data.2 AS name
            FROM
            (
                SELECT arrayJoin([
                    ('2020-05-14 15:56:46', 's1'),
                    ('2020-05-14 15:56:49', 's1'),
                    ('2020-05-14 15:56:51', 's1'),
                    ('2020-05-14 15:56:53', 's2'),
                    ('2020-05-14 15:56:56', 's1'),
                    ('2020-05-14 15:56:59', 's3')
                ]) AS test_data
            )
        )
        ORDER BY starttime
    )
)
/* result
┌───────────starttime─┬─name─┬─rank─┐
│ 2020-05-14 15:56:46 │ s1 │ 1 │
│ 2020-05-14 15:56:49 │ s1 │ 1 │
│ 2020-05-14 15:56:51 │ s1 │ 1 │
│ 2020-05-14 15:56:53 │ s2 │ 2 │
│ 2020-05-14 15:56:56 │ s1 │ 3 │
│ 2020-05-14 15:56:59 │ s3 │ 4 │
└─────────────────────┴──────┴──────┘
*/

Efficient way to get rows immediately outside of a time window

We have time series data stored in a clickhouse table, similar to:
timestamp value
2020-03-05 11:03:00 2
2020-03-05 11:12:00 3
2020-03-05 11:13:00 4
2020-03-05 11:27:00 5
2020-03-05 11:31:00 6
2020-03-05 11:39:00 7
When visualising this data, we request a time range, like 2020-03-05 11:15:00 - 2020-03-05 11:30:00.
It is easy to select data within this range, but what is more useful for visualisation is to also get the points either side, i.e.:
2020-03-05 11:12:00 3
2020-03-05 11:13:00 4
2020-03-05 11:27:00 5
2020-03-05 11:31:00 6
Is there an efficient way to do this in clickhouse? At the moment I am doing (potentially) 3 separate queries:
Select data within range:
select * from data where timestamp >= "from" and timestamp <= "to" order by timestamp
If timestamp of first point != "from" timestamp:
select * from data where timestamp < "from" order by timestamp desc limit 1
If timestamp of last point != "to" timestamp:
select * from data where timestamp > "to" order by timestamp limit 1
It would be great if it were possible to get this in one query.
It looks like you just need to combine all three queries into one and slightly change the comparison operators:
-- One query returning the rows inside the requested range plus the nearest row on
-- each side of it. The inline test data is repeated in each of the three branches.
SELECT *
FROM (
-- Branch 1: rows strictly inside the range (from, to).
SELECT *
FROM
(
/* test data */
SELECT data.1 AS timestamp, data.2 AS value
FROM (SELECT arrayJoin([(toDateTime('2020-03-05 11:03:00'), 2), (toDateTime('2020-03-05 11:12:00'), 3), (toDateTime('2020-03-05 11:13:00'), 4), (toDateTime('2020-03-05 11:27:00'), 5), (toDateTime('2020-03-05 11:31:00'), 6), (toDateTime('2020-03-05 11:39:00'), 7)]) AS data)
)
WHERE timestamp > '2020-03-05 11:15:00' AND timestamp < '2020-03-05 11:30:00'
UNION ALL
-- Branches 2 and 3 are wrapped in DISTINCT, so a row returned by both boundary
-- lookups appears only once.
SELECT DISTINCT *
FROM (
-- Branch 2: latest row at or before the range start.
-- NOTE(review): ORDER BY / LIMIT 1 are written without parentheses; this relies on
-- them applying to this SELECT only, not to the whole UNION ALL -- confirm for the
-- server version in use.
SELECT *
FROM
(
/* test data */
SELECT data.1 AS timestamp, data.2 AS value
FROM (SELECT arrayJoin([(toDateTime('2020-03-05 11:03:00'), 2), (toDateTime('2020-03-05 11:12:00'), 3), (toDateTime('2020-03-05 11:13:00'), 4), (toDateTime('2020-03-05 11:27:00'), 5), (toDateTime('2020-03-05 11:31:00'), 6), (toDateTime('2020-03-05 11:39:00'), 7)]) AS data)
)
WHERE timestamp <= '2020-03-05 11:15:00'
ORDER BY timestamp DESC
LIMIT 1
UNION ALL
-- Branch 3: earliest row at or after the range end.
SELECT *
FROM
(
/* test data */
SELECT data.1 AS timestamp, data.2 AS value
FROM (SELECT arrayJoin([(toDateTime('2020-03-05 11:03:00'), 2), (toDateTime('2020-03-05 11:12:00'), 3), (toDateTime('2020-03-05 11:13:00'), 4), (toDateTime('2020-03-05 11:27:00'), 5), (toDateTime('2020-03-05 11:31:00'), 6), (toDateTime('2020-03-05 11:39:00'), 7)]) AS data)
)
WHERE timestamp >= '2020-03-05 11:30:00'
ORDER BY timestamp ASC
LIMIT 1))
-- Final ordering of the combined rows.
ORDER BY timestamp;
/* result
┌───────────timestamp─┬─value─┐
│ 2020-03-05 11:13:00 │ 4 │
│ 2020-03-05 11:27:00 │ 5 │
│ 2020-03-05 11:31:00 │ 6 │
└─────────────────────┴───────┘
*/
..
WHERE timestamp > '2020-03-05 11:13:00' AND timestamp < '2020-03-05 11:30:00'
..
/* result
┌───────────timestamp─┬─value─┐
│ 2020-03-05 11:13:00 │ 4 │
│ 2020-03-05 11:27:00 │ 5 │
│ 2020-03-05 11:31:00 │ 6 │
└─────────────────────┴───────┘
*/
..
WHERE timestamp > '2020-03-05 11:15:00' AND timestamp < '2020-03-05 11:31:00'
..
/* result
┌───────────timestamp─┬─value─┐
│ 2020-03-05 11:13:00 │ 4 │
│ 2020-03-05 11:27:00 │ 5 │
│ 2020-03-05 11:31:00 │ 6 │
└─────────────────────┴───────┘
*/
..
WHERE timestamp > '2020-03-05 11:27:00' AND timestamp < '2020-03-05 11:31:00'
..
/* result
┌───────────timestamp─┬─value─┐
│ 2020-03-05 11:27:00 │ 5 │
│ 2020-03-05 11:31:00 │ 6 │
└─────────────────────┴───────┘
*/
..
WHERE timestamp > '2020-03-05 11:28:00' AND timestamp < '2020-03-05 11:28:00'
..
/* result
┌───────────timestamp─┬─value─┐
│ 2020-03-05 11:27:00 │ 5 │
│ 2020-03-05 11:31:00 │ 6 │
└─────────────────────┴───────┘
*/
..
WHERE timestamp > '2020-03-05 11:31:00' AND timestamp < '2020-03-05 11:31:00'
..
/*
result
┌───────────timestamp─┬─value─┐
│ 2020-03-05 11:31:00 │ 6 │
└─────────────────────┴───────┘
*/
I would expand the range with a function such as toStartOfTenMinutes('2020-03-05 11:15:00')
and toStartOfTenMinutes('2020-03-05 11:30:00') + 600, or simply by -600 and +600 seconds,
and filter out the excess rows on the client side,
because 3 queries are slower than 1.

How to get the parameter value by day?

I need to find the value of attributes for each player in the ranking by day.
I have got table below:
player_id | date | level
----------------------------
pl1 |2018-01-01| 3
pl1 |2018-01-02| 3
pl1 |2018-01-03| 4
pl1 |2018-01-05| 4
pl1 |2018-01-06| 4
pl1 |2018-01-08| 5
pl2 |2018-01-05| 1
I need to get next result:
player_id | level_by_date
-----------------------------
pl1 | (3,3,4,4,4,4,4,5)
pl2 | (0,0,0,0,1,1,1,1)
I tried to do it the following way, but failed:
-- Asker's failing attempt, kept exactly as posted.
-- NOTE(review): max() is nested directly inside groupArray() and there is no GROUP BY;
-- `date` is compared against an array built by arrayMap; the column is called
-- module_level here while the sample table above names it level.
SELECT
player_id,
groupArray(max(module_level))
FROM d_Modules
WHERE
date>='2018-01-01' AND date<=arrayMap(i -> (toDate('2018-12-31') + toIntervalDay(i)), range(toUInt64((toDate('2018-12-31') - toDate('2018-01-01')) + 1)))
How can I do it?
Query can look like this:
-- Level per calendar day for every player, carrying the last known level forward
-- over days that have no record.
--
-- Each element is a tuple (day, level, is_known):
--   origin_level_by_date  - the player's real records, is_known = 1
--   missed_level_by_date  - days of the overall range with no record, level 0, is_known = 0
--   level_by_date         - both lists merged and sorted by day
--   result                - arrayFill replaces every is_known = 0 element with its
--                           predecessor (the first element is never replaced), then
--                           only the level is kept
SELECT
    player_id,
    arrayMap(d -> (d, 0, 0), arrayFilter(d -> (arrayExists(known -> (known.1 = d), origin_level_by_date) = 0), dateRange)) AS missed_level_by_date,
    arraySort(x -> x.1, arrayConcat(origin_level_by_date, missed_level_by_date)) AS level_by_date,
    arrayMap(x -> x.2, arrayFill(x -> (x.3 = 1), level_by_date)) AS result
FROM
(
    SELECT
        player_id,
        groupArray((toDate(date), level, 1)) AS origin_level_by_date
    FROM d_Modules
    WHERE (date >= '2018-01-01 00:00:00') AND (date < '2019-01-01 00:00:00')
    GROUP BY player_id
)
CROSS JOIN
(
    -- Every calendar day between the first and the last recorded day, over all players.
    SELECT
        min(toDate(date)) AS first_day,
        max(toDate(date)) AS last_day,
        arrayMap(i -> (first_day + i), range(toUInt32((last_day - first_day) + 1))) AS dateRange
    FROM d_Modules
    WHERE (date >= '2018-01-01 00:00:00') AND (date < '2019-01-01 00:00:00')
)
ORDER BY player_id ASC
FORMAT Vertical
Days without a record carry the last known level forward; days before a player's first record stay at zero (only player_id and result are shown):
┌─player_id─┬─result────────────┐
│ pl1 │ [3,3,4,4,4,4,4,5] │
│ pl2 │ [0,0,0,0,1,1,1,1] │
└───────────┴───────────────────┘

Resources