Record ranking in Clickhouse - clickhouse

For example, I have a table:
-- Sample schema for the ranking question: one row per (starttime, name) observation.
CREATE DATABASE IF NOT EXISTS example;
CREATE TABLE IF NOT EXISTS example.etable
(
    starttime DateTime,
    name String  -- ClickHouse type names are case-sensitive here: String, not string
)
ENGINE = MergeTree
-- MergeTree needs a sorting key; the ranking query reads rows in starttime order.
ORDER BY starttime;
In order to apply a GROUP BY operation, I need to calculate a rank for each record, so that consecutive records (ordered by starttime) with the same value of the "name" field share the same rank. If the current record's name differs from that of the previous record, the rank is incremented.
In MySQL this can be done with query like that:
-- MySQL: assign the same rank to consecutive rows (by starttime) that share a name.
-- User variables use the @ sigil; '#' would start a comment in MySQL.
SELECT name, starttime,
-- remember the name of the previous row before overwriting @curr
@prev := @curr,
@curr := name,
-- keep the rank while the name repeats, otherwise move to the next rank
@rank := IF(@prev = @curr, @rank, @rank+1) AS `rank`
FROM example.etable,
-- single-row derived table that initialises the three variables
(SELECT @curr := null, @prev := null, @rank := 0) r
ORDER BY starttime ASC;
Example output:
+------+---------------------+----------------+---------------+------+
| name | starttime | @prev := @curr | @curr := name | rank |
+------+---------------------+----------------+---------------+------+
| s1 | 2020-05-14 15:56:46 | NULL | s1 | 1 |
| s1 | 2020-05-14 15:56:49 | s1 | s1 | 1 |
| s1 | 2020-05-14 15:56:51 | s1 | s1 | 1 |
| s2 | 2020-05-14 15:56:53 | s1 | s2 | 2 |
| s1 | 2020-05-14 15:56:56 | s2 | s1 | 3 |
| s3 | 2020-05-14 15:56:59 | s1 | s3 | 4 |
+------+---------------------+----------------+---------------+------+
So, here is the question, how can I achieve this in Clickhouse?

Calculating the rank consists of three steps:
transform the relation into arrays (groupArray)
calculate the ranks (arrayCumSum)
transform the arrays back into a relation (arrayJoin).
-- Rank consecutive rows that share a name: collect the ordered rows into arrays,
-- take a running sum of "name changed" flags, then unfold the arrays back into rows.
SELECT
    result.1 AS starttime,
    result.2 AS name,
    result.3 AS rank
FROM
(
    SELECT
        groupArray(starttime) AS starttime_arr,
        groupArray(name) AS name_arr,
        -- flag is 1 for the first row and for every row whose name differs from the previous one
        arrayCumSum(
            (cur_name, pos) -> if(pos = 1, 1, if(name_arr[pos - 1] = cur_name, 0, 1)),
            name_arr,
            arrayEnumerate(name_arr)
        ) AS ranks,
        arrayZip(starttime_arr, name_arr, ranks) AS result_array,
        arrayJoin(result_array) AS result
    FROM
    (
        SELECT
            starttime,
            name
        FROM
        (
            /* emulate the 'example.etable'-table */
            SELECT
                toDateTime(test_data.1) AS starttime,
                test_data.2 AS name
            FROM
            (
                SELECT arrayJoin([
                    ('2020-05-14 15:56:46', 's1'),
                    ('2020-05-14 15:56:49', 's1'),
                    ('2020-05-14 15:56:51', 's1'),
                    ('2020-05-14 15:56:53', 's2'),
                    ('2020-05-14 15:56:56', 's1'),
                    ('2020-05-14 15:56:59', 's3')
                ]) AS test_data
            )
        )
        -- the arrays above are built in this order
        ORDER BY starttime
    )
)
/* result
┌───────────starttime─┬─name─┬─rank─┐
│ 2020-05-14 15:56:46 │ s1 │ 1 │
│ 2020-05-14 15:56:49 │ s1 │ 1 │
│ 2020-05-14 15:56:51 │ s1 │ 1 │
│ 2020-05-14 15:56:53 │ s2 │ 2 │
│ 2020-05-14 15:56:56 │ s1 │ 3 │
│ 2020-05-14 15:56:59 │ s3 │ 4 │
└─────────────────────┴──────┴──────┘
*/

Related

How to check missing values in Clickhouse

I have a table that is filled with data every 15 minutes. I need to check that there is data for all days of the entire period. There is a time column in which the data is in the format yyyy-mm-dd hh:mm:ss.
I've found the start date and the last date with min(time) and max(time).
I found out that you can generate an array of dates from this interval (start and end dates) against which each row is compared; any generated date without a match is a missing date.
I've tried this:
-- Asker's attempt (kept as posted): build every date between the first and last row,
-- then LEFT JOIN against the dates that actually occur to find the missing ones.
WITH dates_range AS (SELECT toDate(min(time)) AS start_date,
toDate(max(time)) AS end_date
FROM table)
SELECT dates
FROM (
-- The left side exposes a single column named `dates`, holding the whole array in one row.
-- NOTE(review): if range() excludes its upper bound, end_date itself is not generated — confirm.
SELECT arrayFlatten(arrayMap(x -> start_date + x, range(0, toUInt64(end_date - start_date)))) AS dates
FROM dates_range
)
LEFT JOIN (
SELECT toDate(time) AS date
FROM table
GROUP BY toDate(time)
-- USING date needs a `date` column on both sides; the left side only has `dates`,
-- which matches the "Not found column date in block" error reported for this query.
) USING date
WHERE date IS NULL;
but it fails with "Code: 10. DB::Exception: Not found column date in block. There are only columns: dates. (NOT_FOUND_COLUMN_IN_BLOCK)" and I can't figure out how to fix it.
You can also use WITH FILL modifier https://clickhouse.com/docs/en/sql-reference/statements/select/order-by/#order-by-expr-with-fill-modifier
-- Sample data: 550 timestamps starting at 2020-01-01. The offset in seconds is
-- number * 60 * 24, tripled when number is a multiple of 33, which leaves some days empty.
CREATE TABLE T (time DateTime) ENGINE = Memory AS
SELECT toDateTime('2020-01-01') + number * 60 * 24 * if(number % 33 = 0, 3, 1)
FROM numbers(550);

-- Count rows per day, let WITH FILL insert the days that have no rows,
-- and keep only those inserted days (their count is 0).
SELECT
    t,
    c
FROM
(
    SELECT
        toDate(time) AS t,
        count() AS c
    FROM T
    GROUP BY t
    ORDER BY t ASC WITH FILL
)
WHERE c = 0;
┌──────────t─┬─c─┐
│ 2020-01-11 │ 0 │
│ 2020-01-13 │ 0 │
│ 2020-01-16 │ 0 │
│ 2020-01-18 │ 0 │
│ 2020-01-21 │ 0 │
│ 2020-01-23 │ 0 │
│ 2020-01-26 │ 0 │
└────────────┴───┘
-- Sample data: 550 timestamps starting at 2020-01-01. The offset in seconds is
-- number * 60 * 24, tripled when number is a multiple of 33, which leaves some days empty.
CREATE TABLE T (time DateTime) ENGINE = Memory AS
SELECT toDateTime('2020-01-01') + (((number * 60) * 24) * if((number % 33) = 0, 3, 1))
FROM numbers(550);

-- bounds = (first day, last day) present in T.
-- The lambda parameter is named day_offset so it does not shadow the bounds alias.
WITH (SELECT (toDate(min(time)), toDate(max(time))) FROM T) AS bounds
SELECT
    date,
    sumIf(cnt, type = 1) AS c1,  -- rows actually present in T for this day
    sumIf(cnt, type = 2) AS c2   -- 1 when the day belongs to the generated calendar
FROM
(
    -- type 2: one row per calendar day from bounds.1 to bounds.2 inclusive
    SELECT
        arrayJoin(arrayMap(day_offset -> bounds.1 + day_offset, range(0, toUInt64(bounds.2 - bounds.1 + 1)))) AS date,
        2 AS type,
        1 AS cnt
    UNION ALL
    -- type 1: per-day row counts taken from T
    SELECT
        toDate(time) AS date,
        1 AS type,
        count() AS cnt
    FROM T
    GROUP BY toDate(time)
)
GROUP BY date
-- c1 = 0: calendar day with no data; c2 = 0: data on a day outside the calendar
HAVING c1 = 0 OR c2 = 0;
┌───────date─┬─c1─┬─c2─┐
│ 2020-01-11 │ 0 │ 1 │
│ 2020-01-13 │ 0 │ 1 │
│ 2020-01-16 │ 0 │ 1 │
│ 2020-01-18 │ 0 │ 1 │
│ 2020-01-21 │ 0 │ 1 │
│ 2020-01-23 │ 0 │ 1 │
│ 2020-01-26 │ 0 │ 1 │
└────────────┴────┴────┘
-- Sample data: 550 timestamps starting at 2020-01-01. The offset in seconds is
-- number * 60 * 24, tripled when number is a multiple of 33, which leaves some days empty.
CREATE TABLE T (time DateTime) ENGINE = Memory AS
SELECT toDateTime('2020-01-01') + (((number * 60) * 24) * if((number % 33) = 0, 3, 1))
FROM numbers(550);

-- bounds = (first day, last day) present in T.
-- The lambda parameter is named day_offset so it does not shadow the bounds alias.
WITH (SELECT (toDate(min(time)), toDate(max(time))) FROM T) AS bounds
SELECT l.*, r.*
FROM
(
    -- one row per calendar day from bounds.1 to bounds.2 inclusive
    SELECT arrayJoin(arrayMap(day_offset -> bounds.1 + day_offset, range(0, toUInt64(bounds.2 - bounds.1 + 1)))) AS date
) AS l
LEFT JOIN
(
    -- days that actually occur in T
    SELECT toDate(time) AS date
    FROM T
    GROUP BY toDate(time)
) AS r USING date
-- join_use_nulls = 1 makes unmatched right-side columns NULL, so IS NULL selects the missing days
WHERE r.date IS NULL
SETTINGS join_use_nulls = 1;
┌───────date─┬─r.date─┐
│ 2020-01-11 │ ᴺᵁᴸᴸ │
│ 2020-01-13 │ ᴺᵁᴸᴸ │
│ 2020-01-16 │ ᴺᵁᴸᴸ │
│ 2020-01-18 │ ᴺᵁᴸᴸ │
│ 2020-01-21 │ ᴺᵁᴸᴸ │
│ 2020-01-23 │ ᴺᵁᴸᴸ │
│ 2020-01-26 │ ᴺᵁᴸᴸ │
└────────────┴────────┘
-- Sample data: 550 timestamps starting at 2020-01-01. The offset in seconds is
-- number * 60 * 24, tripled when number is a multiple of 33, which leaves some days empty.
CREATE TABLE T (time DateTime) ENGINE = Memory AS
SELECT toDateTime('2020-01-01') + number * 60 * 24 * if(number % 33 = 0, 3, 1)
FROM numbers(550);

-- For each day present in T, measure the distance to the previous present day
-- and return the days that come right after a gap.
SELECT b
FROM
(
    SELECT
        b,
        -- a frame of exactly the one preceding row stands in for a lag() function
        b - any(b) OVER (ORDER BY b ASC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS gap_days
    FROM
    (
        SELECT toDate(time) AS b
        FROM T
        GROUP BY b
        ORDER BY b ASC
    )
)
-- NOTE(review): the upper bound presumably discards the first row, which has no
-- preceding row to compare against — confirm.
WHERE gap_days > 1 AND gap_days < 10000;
┌──────────b─┐
│ 2020-01-12 │
│ 2020-01-14 │
│ 2020-01-17 │
│ 2020-01-19 │
│ 2020-01-22 │
│ 2020-01-24 │
│ 2020-01-27 │
└────────────┘

Join two datasets with key duplicates one by one

I need to join two datasets from e.g. left and right source to match values by some keys. Datasets can contain duplicates:
┌─key─┬─value──┬─source──┐
│ 1 │ val1 │ left │
│ 1 │ val1 │ left │ << duplicate from left source
│ 1 │ val1 │ left │ << another duplicate from left source
│ 1 │ val1 │ right │
│ 1 │ val1 │ right │ << duplicate from right source
│ 2 │ val2 │ left │
│ 2 │ val3 │ right │
└─────┴────────┴─────────┘
I can't use a FULL JOIN, because it produces the Cartesian product of all duplicates.
I am trying to use GROUP BY instead:
-- Asker's query: one output row per key, with one value taken from each source.
-- NOTE(review): because GROUP BY key yields a single row per key, repeated rows
-- from the same source cannot be paired one-to-one with this query.
select
`key`,
anyIf(value, source = 'left') as left_value,
anyIf(value, source = 'right') as right_value
from test_raw
group by key;
It works well, but is there any way to match left and right duplicates one by one?
Expected result:
┌─key─┬─left_value─┬─right_value─┐
│ 1 │ val1 │ val1 │
│ 1 │ val1 │ val1 │
│ 1 │ val1 │ │
│ 2 │ val2 │ val3 │
└─────┴────────────┴─────────────┘
Scripts to reproduce:
-- Reproduction script: key/value rows tagged with the source they came from.
CREATE TABLE test_raw
(
    `key` Int64,
    `value` String,
    `source` String
)
ENGINE = Memory;

-- Key 1 has three 'left' rows and two 'right' rows; key 2 has one of each.
INSERT INTO test_raw (`key`, `value`, `source`) VALUES
    (1, 'val1', 'left'),
    (1, 'val1', 'left'),
    (1, 'val1', 'left'),
    (1, 'val1', 'right'),
    (1, 'val1', 'right'),
    (2, 'val2', 'left'),
    (2, 'val3', 'right');

-- Baseline attempt: one row per key, one value from each source.
SELECT
    `key`,
    anyIf(value, source = 'left') AS left_value,
    anyIf(value, source = 'right') AS right_value
FROM test_raw
GROUP BY key;
-- Pair left and right values of each key position by position:
-- the i-th sorted left value is matched with the i-th sorted right value.
SELECT
    key,
    left_value,
    right_value
FROM
(
    SELECT
        key,
        arraySort(groupArrayIf(value, source = 'left')) AS left_vals,
        arraySort(groupArrayIf(value, source = 'right')) AS right_vals,
        -- one tuple per position up to the longer of the two arrays
        arrayMap(
            pos -> (left_vals[pos + 1], right_vals[pos + 1]),
            range(greatest(length(left_vals), length(right_vals)))
        ) AS pairs
    FROM test_raw
    GROUP BY key
)
-- unfold the tuples back into one row per pair
ARRAY JOIN
    pairs.1 AS left_value,
    pairs.2 AS right_value
ORDER BY key ASC
┌─key─┬─left_value─┬─right_value─┐
│ 1 │ val1 │ val1 │
│ 1 │ val1 │ val1 │
│ 1 │ val1 │ │
│ 1 │ val1 │ │
│ 2 │ val2 │ val3 │
└─────┴────────────┴─────────────┘

How to realize funnel analysis in ClickHouse

I want to do funnel analysis based on buried point data that are stored in ClickHouse. Let's define a few elements for funnel analysis:
A series of events: A (event_id = 1) -> B (event_id = 2) -> C (event_id = 3)
Time period: 0 (event_ms) ~ 500 (event_ms)
Time window: 100 (event_ms)
I want to know, for each user, if there is an event series (A->B->C) happened within the time period, and intervals between A and C is within the time window.
Here is my test dataset:
-- Funnel test data: one row per (event_id, event_ms, uid) event.
CREATE TABLE test_dataset
(
    `event_id` UInt64,
    `event_ms` UInt64,
    `uid` UInt64 -- user_id
)
ENGINE = AggregatingMergeTree
-- NOTE(review): event_ms is a small integer here, so toDate() may map each distinct
-- value to its own day and therefore its own partition — confirm this is intended.
PARTITION BY toYYYYMMDD(toDate(event_ms))
ORDER BY (event_id, event_ms, uid)
SETTINGS index_granularity = 8192;

-- Explicit column list keeps the INSERT valid if the table's column order ever changes.
INSERT INTO TABLE test_dataset (event_id, event_ms, uid) VALUES
(1, 100, 123),
(1, 120, 123),
(1, 130, 123),
(1, 150, 345),
(1, 180, 345),
(2, 150, 123),
(2, 200, 234),
(2, 140, 345),
(2, 210, 345),
(2, 300, 345),
(3, 180, 123),
(3, 250, 123),
(3, 290, 234),
(3, 270, 345);
I use join to find all qualified event series:
-- Join-based search for A -> B -> C chains (event_id 1 -> 2 -> 3) per user.
-- Reads test_dataset, the table created and filled above.
SELECT
    t1.event_ms, t2.event_ms, t3.event_ms, t4.event_ms,
    t1.uid, t2.uid, t3.uid, t4.uid
FROM
(
    -- event A inside the 0 ~ 500 period
    SELECT uid, event_ms
    FROM test_dataset
    WHERE event_id = 1 AND event_ms >= 0 AND event_ms <= 500
) AS t1
ASOF LEFT JOIN
(
    -- event B inside the period, later than A for the same user
    SELECT uid, event_ms
    FROM test_dataset
    WHERE event_id = 2 AND event_ms >= 0 AND event_ms <= 500
) AS t2
    ON t1.uid = t2.uid AND t1.event_ms < t2.event_ms
ASOF LEFT JOIN
(
    -- event C inside the period, later than B for the same user
    SELECT uid, event_ms
    FROM test_dataset
    WHERE event_id = 3 AND event_ms >= 0 AND event_ms <= 500
) AS t3
    ON t2.uid = t3.uid AND t2.event_ms < t3.event_ms
ASOF LEFT JOIN
(
    -- event C again, this time required to fall within 100 of event A
    SELECT uid, event_ms
    FROM test_dataset
    WHERE event_id = 3 AND event_ms >= 0 AND event_ms <= 500
) AS t4
    ON t3.uid = t4.uid AND t4.event_ms < t1.event_ms + 100
-- NOTE(review): presumably drops chains where t4 found no match and event_ms
-- was left at its default of 0 — confirm.
WHERE t4.event_ms > 0;
Here are all qualified event series:
┌─t1.event_ms─┬─t2.event_ms─┬─t3.event_ms─┬─t4.event_ms─┬─t1.uid─┬─t2.uid─┬─t3.uid─┬─t4.uid─┐
│ 180 │ 210 │ 270 │ 270 │ 345 │ 345 │ 345 │ 345 │
└─────────────┴─────────────┴─────────────┴─────────────┴────────┴────────┴────────┴────────┘
┌─t1.event_ms─┬─t2.event_ms─┬─t3.event_ms─┬─t4.event_ms─┬─t1.uid─┬─t2.uid─┬─t3.uid─┬─t4.uid─┐
│ 120 │ 150 │ 180 │ 180 │ 123 │ 123 │ 123 │ 123 │
└─────────────┴─────────────┴─────────────┴─────────────┴────────┴────────┴────────┴────────┘
┌─t1.event_ms─┬─t2.event_ms─┬─t3.event_ms─┬─t4.event_ms─┬─t1.uid─┬─t2.uid─┬─t3.uid─┬─t4.uid─┐
│ 130 │ 150 │ 180 │ 180 │ 123 │ 123 │ 123 │ 123 │
└─────────────┴─────────────┴─────────────┴─────────────┴────────┴────────┴────────┴────────┘
┌─t1.event_ms─┬─t2.event_ms─┬─t3.event_ms─┬─t4.event_ms─┬─t1.uid─┬─t2.uid─┬─t3.uid─┬─t4.uid─┐
│ 100 │ 150 │ 180 │ 180 │ 123 │ 123 │ 123 │ 123 │
└─────────────┴─────────────┴─────────────┴─────────────┴────────┴────────┴────────┴────────┘
Then I know user 123 and 345 have such event series within the time period. Using join is pretty slow in ClickHouse, is there any other way to work around this problem?
By the way, I don't need to know all qualified series, I only want to know if there is one such event series for each user.
There is a function, windowFunnel, that searches for a chain of events in a sliding window.
-- Per user: length of the longest prefix of the chain event_id 1 -> 2 -> 3
-- matched inside a sliding window of 100 (same unit as event_ms).
SELECT
    uid,
    windowFunnel(100)(event_ms, event_id = 1, event_id = 2, event_id = 3) AS chain_len
FROM test_dataset
-- inclusive bounds, matching the 0 ~ 500 period filter used in the question's join query
WHERE (event_ms >= 0) AND (event_ms <= 500)
GROUP BY uid;
Result:
┌─uid─┬─chain_len─┐
│ 234 │ 0 │
│ 345 │ 3 │
│ 123 │ 3 │
└─────┴───────────┘
It returns the length of the matched chain, so for users 345 and 123 the value 3 means that the whole chain is matched.
If we decrease the window to 10, it finds only the beginning of the chain and doesn't match further events, because the condition "timestamp of event 2 <= timestamp of event 1 + window" does not hold.
-- Same funnel with the sliding window narrowed to 10 (same unit as event_ms).
SELECT
    uid,
    windowFunnel(10)(event_ms, event_id = 1, event_id = 2, event_id = 3) AS chain_len
FROM test_dataset
-- inclusive bounds, matching the 0 ~ 500 period filter used in the question's join query
WHERE (event_ms >= 0) AND (event_ms <= 500)
GROUP BY uid;
Result:
┌─uid─┬─chain_len─┐
│ 234 │ 0 │
│ 345 │ 1 │
│ 123 │ 1 │
└─────┴───────────┘
So, to check whether such a chain exists for a user, check that windowFunnel matched the expected number of events.
The restriction on the time interval (time period: 0 (event_ms) ~ 500 (event_ms)) is simply handled in the WHERE clause.
Add more events out of period:
-- A complete 1 -> 2 -> 3 chain for user 234, placed after the 0 ~ 500 period.
INSERT INTO TABLE test_dataset (event_id, event_ms, uid) VALUES (1, 600, 234), (2, 601, 234), (3, 602, 234);
Then check:
-- Re-run of the window-100 funnel after adding events outside the period:
-- the WHERE clause keeps the added events at 600..602 out of the match.
SELECT
    uid,
    windowFunnel(100)(event_ms, event_id = 1, event_id = 2, event_id = 3) AS chain_len
FROM test_dataset
-- inclusive bounds, matching the 0 ~ 500 period filter used in the question's join query
WHERE (event_ms >= 0) AND (event_ms <= 500)
GROUP BY uid;
Result:
┌─uid─┬─chain_len─┐
│ 234 │ 0 │
│ 345 │ 3 │
│ 123 │ 3 │
└─────┴───────────┘
Without WHERE
-- Same funnel with no event_ms filter: every stored event takes part in the match.
SELECT
    uid,
    windowFunnel(100)(
        event_ms,
        event_id = 1,
        event_id = 2,
        event_id = 3
    ) AS chain_len
FROM test_dataset
GROUP BY uid
Result:
┌─uid─┬─chain_len─┐
│ 234 │ 3 │
│ 345 │ 3 │
│ 123 │ 3 │
└─────┴───────────┘

How to get the maximum value below the selected date?

I've got dataset:
date | used_key
2000-01-01 | 1
2000-01-01 | 2
2000-01-01 | 3
2000-01-01 | 4
2000-01-02 | 1
2000-01-02 | 3
2000-01-03 | 1
2000-01-04 | 5
2000-01-04 | 6
2000-01-06 | 3
I need to get the maximum key value that was reached before the selected day:
date | max_key
2000-01-01 | 4
2000-01-02 | 4
2000-01-03 | 4
2000-01-04 | 6
2000-01-06 | 6
Something like that (without join section) but in right way:
-- Asker's sketch of the intended query, kept as posted.
-- NOTE(review): 'date_below' is a string literal here, presumably a placeholder for
-- "each earlier-or-equal date"; GROUP BY date alone only aggregates rows of the same date.
SELECT max(used_key) max_key, date
FROM t1
WHERE 'date_below' <= date
GROUP BY date
Try this query:
-- Running maximum of used_key by date: collect the per-day maxima (newest first) into
-- arrays, take for every position the max of the slice from that position to the end
-- (i.e. that day and all earlier days), then unfold the arrays back into rows.
SELECT
    result.1 AS date,
    result.2 AS max_key
FROM
(
    SELECT
        groupArray(date) AS dates,
        groupArray(max_used_key) AS max_used_keys,
        arrayMap(
            (cur_date, pos) -> (cur_date, arrayReduce('max', arraySlice(max_used_keys, pos))),
            dates,
            arrayEnumerate(dates)
        ) AS result_array,
        arrayJoin(result_array) AS result
    FROM
    (
        SELECT
            date,
            max(used_key) AS max_used_key
        FROM
        (
            /* test data */
            SELECT
                data.1 AS date,
                data.2 AS used_key
            FROM
            (
                SELECT arrayJoin([
                    (toDate('2000-01-01'), 1),
                    (toDate('2000-01-01'), 2),
                    (toDate('2000-01-01'), 3),
                    (toDate('2000-01-01'), 4),
                    (toDate('2000-01-02'), 1),
                    (toDate('2000-01-02'), 3),
                    (toDate('2000-01-03'), 1),
                    (toDate('2000-01-04'), 5),
                    (toDate('2000-01-04'), 6),
                    (toDate('2000-01-06'), 3)
                ]) AS data
            )
        )
        GROUP BY date
        -- newest first, so the tail of each array holds the earlier days
        ORDER BY date DESC
    )
);
/* Result:
┌───────date─┬──max_key─┐
│ 2000-01-06 │ 6 │
│ 2000-01-04 │ 6 │
│ 2000-01-03 │ 4 │
│ 2000-01-02 │ 4 │
│ 2000-01-01 │ 4 │
└────────────┴──────────┘
*/

How to get the parameter value by day?

I need to find the value of attributes for each player in the ranking by day.
I have got table below:
player_id | date | level
----------------------------
pl1 |2018-01-01| 3
pl1 |2018-01-02| 3
pl1 |2018-01-03| 4
pl1 |2018-01-05| 4
pl1 |2018-01-06| 4
pl1 |2018-01-08| 5
pl2 |2018-01-05| 1
I need to get next result:
player_id | level_by_date
-----------------------------
pl1 | (3,3,4,4,4,4,4,5)
pl2 | (0,0,0,0,1,1,1,1)
I tried to do it next way but failed
-- Asker's failed attempt, kept as posted.
-- NOTE(review): max() is nested directly inside groupArray() and there is no GROUP BY;
-- presumably ClickHouse rejects this — confirm.
SELECT
player_id,
groupArray(max(module_level))
FROM d_Modules
WHERE
-- NOTE(review): the right-hand side of <= is an array of dates built by arrayMap, not a single date.
date>='2018-01-01' AND date<=arrayMap(i -> (toDate('2018-12-31') + toIntervalDay(i)), range(toUInt64((toDate('2018-12-31') - toDate('2018-01-01')) + 1)))
How can I do it?
Query can look like this:
-- Per player: the level on every day of the observed date range.
--   result        : level per day, 0 on days without a row for that player
--   result_filled : same, with each 0 replaced by the closest earlier value
--                   (days before the player's first row stay 0)
SELECT
    player_id,
    -- (date, 0) placeholders for the days of dateRange that have no row for this player
    arrayMap(date -> (date, 0), arrayFilter(date -> (arrayExists(i -> (i.1 = date), origin_level_by_date) = 0), dateRange)) AS missed_level_by_date,
    -- real and placeholder entries merged and ordered by date
    arraySort(x -> x.1, arrayConcat(origin_level_by_date, missed_level_by_date)) AS level_by_date,
    arrayMap(x -> x.2, level_by_date) AS result,
    -- NOTE(review): 0 is used as the "missing" marker, so a genuine level of 0
    -- would also be overwritten by the previous value — confirm levels start at 1.
    arrayFill(lvl -> lvl != 0, result) AS result_filled
FROM
(
    -- every (day, level) pair recorded for the player in 2018
    SELECT
        player_id,
        groupArray((toDate(date), level)) AS origin_level_by_date
    FROM d_Modules
    WHERE (date >= '2018-01-01 00:00:00') AND (date < '2019-01-01 00:00:00')
    GROUP BY player_id
)
CROSS JOIN
(
    -- all days between the first and last recorded day, inclusive
    SELECT
        min(toDate(date)) AS min,
        max(toDate(date)) AS max,
        arrayMap(i -> (min + i), range(toUInt32((max - min) + 1))) AS dateRange
    FROM d_Modules
    WHERE (date >= '2018-01-01 00:00:00') AND (date < '2019-01-01 00:00:00')
)
ORDER BY player_id ASC
FORMAT Vertical
The result is a little different from required because for missed days level-value is zero:
┌─player_id─┬─result────────────┐
│ pl1 │ [3,3,4,0,4,4,0,5] │
│ pl2 │ [0,0,0,0,1,0,0,0] │
└───────────┴───────────────────┘

Resources