Join two datasets with key duplicates one by one - clickhouse

I need to join two datasets from e.g. left and right source to match values by some keys. Datasets can contain duplicates:
┌─key─┬─value──┬─source──┐
│ 1 │ val1 │ left │
│ 1 │ val1 │ left │ << duplicate from left source
│ 1 │ val1 │ left │ << another duplicate from left source
│ 1 │ val1 │ right │
│ 1 │ val1 │ right │ << duplicate from right source
│ 2 │ val2 │ left │
│ 2 │ val3 │ right │
└─────┴────────┴─────────┘
I can't use a full join; it gives the Cartesian product of all duplicates.
I am trying to use group by instead:
/* One row per key: take an arbitrary value from each source. */
SELECT
    `key`,
    anyIf(value, source = 'left') AS left_value,
    anyIf(value, source = 'right') AS right_value
FROM test_raw
GROUP BY key;
It works well, but is there any way to match left and right duplicates?
Expected result:
┌─key─┬─left_value─┬─right_value─┐
│ 1 │ val1 │ val1 │
│ 1 │ val1 │ val1 │
│ 1 │ val1 │ │
│ 2 │ val2 │ val3 │
└─────┴────────────┴─────────────┘
Scripts to reproduce:
/* Sample table: rows from both sources live together, tagged by `source`. */
CREATE TABLE test_raw
(
    `key` Int64,
    `value` String,
    `source` String
)
ENGINE = Memory;

/* Key 1 has three left rows and two right rows; key 2 has one row per source. */
INSERT INTO test_raw (`key`, `value`, `source`) VALUES
    (1, 'val1', 'left'),
    (1, 'val1', 'left'),
    (1, 'val1', 'left'),
    (1, 'val1', 'right'),
    (1, 'val1', 'right'),
    (2, 'val2', 'left'),
    (2, 'val3', 'right');

/* Baseline query: collapses every key to a single row. */
SELECT
    `key`,
    anyIf(value, source = 'left') AS left_value,
    anyIf(value, source = 'right') AS right_value
FROM test_raw
GROUP BY key;

/* Pair the n-th left value with the n-th right value for every key.
   Per key, the values of each source are collected into a sorted array; both
   arrays are then walked in parallel up to the length of the longer one.
   A position past the end of the shorter array yields an empty string. */
SELECT
    key,
    left_value,
    right_value
FROM
(
    SELECT
        key,
        arraySort(groupArrayIf(value, source = 'left')) AS left_values,
        arraySort(groupArrayIf(value, source = 'right')) AS right_values,
        arrayMap(
            pos -> (left_values[pos], right_values[pos]),
            range(1, greatest(length(left_values), length(right_values)) + 1)
        ) AS pairs
    FROM test_raw
    GROUP BY key
)
ARRAY JOIN
    pairs.1 AS left_value,
    pairs.2 AS right_value
ORDER BY key ASC
┌─key─┬─left_value─┬─right_value─┐
│ 1 │ val1 │ val1 │
│ 1 │ val1 │ val1 │
│ 1 │ val1 │ │
│ 2 │ val2 │ val3 │
└─────┴────────────┴─────────────┘

Related

How to check missing values in Clickhouse

I have a table that is filled with data every 15 minutes. I need to check that there is data for every day of the entire period. There is a `time` column in which the data is in the format yyyy-mm-dd hh:mm:ss.
I've found the start date and the last date of the period.
I found out that you can generate an array of dates from this interval (start and end dates) and compare it with the dates present in the table; any date without a match is a missing date.
I've tried this:
/* Attempt from the question. The left subquery exposes a single array column named
   `dates`, while the join key in USING is `date`; this mismatch is what the
   NOT_FOUND_COLUMN_IN_BLOCK error quoted below the query complains about.
   NOTE(review): range(0, end_date - start_date) also appears to stop one day
   before end_date — confirm whether the last day should be included. */
WITH dates_range AS (SELECT toDate(min(time)) AS start_date,
toDate(max(time)) AS end_date
FROM table)
SELECT dates
FROM (
SELECT arrayFlatten(arrayMap(x -> start_date + x, range(0, toUInt64(end_date - start_date)))) AS dates
FROM dates_range
)
LEFT JOIN (
SELECT toDate(time) AS date
FROM table
GROUP BY toDate(time)
) USING date
WHERE date IS NULL;
but it returns with Code: 10. DB::Exception: Not found column date in block. There are only columns: dates. (NOT_FOUND_COLUMN_IN_BLOCK) and I can't
You can also use WITH FILL modifier https://clickhouse.com/docs/en/sql-reference/statements/select/order-by/#order-by-expr-with-fill-modifier
/* Sample data: 550 timestamps offset from 2020-01-01 in 24-minute steps
   (number * 60 * 24 seconds); the offset is tripled for every number divisible
   by 33, which spreads some rows over later days and leaves gaps between them. */
CREATE TABLE T (time DateTime) ENGINE = Memory
AS
SELECT toDateTime('2020-01-01') + number * 60 * 24 * if(number % 33 = 0, 3, 1)
FROM numbers(550);

/* Count rows per day, let WITH FILL insert the days that have no rows
   (their count takes the default value 0), and keep only those days. */
SELECT
    t,
    c
FROM
(
    SELECT
        toDate(time) AS t,
        count() AS c
    FROM T
    GROUP BY t
    ORDER BY t ASC WITH FILL
)
WHERE c = 0
┌──────────t─┬─c─┐
│ 2020-01-11 │ 0 │
│ 2020-01-13 │ 0 │
│ 2020-01-16 │ 0 │
│ 2020-01-18 │ 0 │
│ 2020-01-21 │ 0 │
│ 2020-01-23 │ 0 │
│ 2020-01-26 │ 0 │
└────────────┴───┘
/* Sample data: 550 timestamps offset from 2020-01-01 in 24-minute steps; the offset
   is tripled for every number divisible by 33, which leaves gaps in later days. */
CREATE TABLE T (time DateTime) ENGINE = Memory
AS
SELECT toDateTime('2020-01-01') + (((number * 60) * 24) * if((number % 33) = 0, 3, 1))
FROM numbers(550);

/* Find days without data by tagging two row sets and comparing them per date:
     type = 2  one row for every calendar day between the first and last day in T
     type = 1  one row per day that actually has data, carrying its row count
   A date with c1 = 0 exists only in the generated calendar, i.e. it is missing in T.
   `x` is the (first_day, last_day) tuple. The lambda parameter is named `day_offset`
   so that it does not shadow `x` (the original `x -> x.1 + x` reused the same name
   for both). arrayFlatten was dropped because the mapped array is already flat. */
WITH (SELECT (toDate(min(time)), toDate(max(time))) FROM T) AS x
SELECT
    date,
    sumIf(cnt, type = 1) AS c1,
    sumIf(cnt, type = 2) AS c2
FROM
(
    SELECT
        arrayJoin(arrayMap(day_offset -> x.1 + day_offset, range(0, toUInt64(x.2 - x.1 + 1)))) AS date,
        2 AS type,
        1 AS cnt
    UNION ALL
    SELECT
        toDate(time) AS date,
        1 AS type,
        count() AS cnt
    FROM T
    GROUP BY toDate(time)
)
GROUP BY date
HAVING c1 = 0 OR c2 = 0;
┌───────date─┬─c1─┬─c2─┐
│ 2020-01-11 │ 0 │ 1 │
│ 2020-01-13 │ 0 │ 1 │
│ 2020-01-16 │ 0 │ 1 │
│ 2020-01-18 │ 0 │ 1 │
│ 2020-01-21 │ 0 │ 1 │
│ 2020-01-23 │ 0 │ 1 │
│ 2020-01-26 │ 0 │ 1 │
└────────────┴────┴────┘
/* Sample data: 550 timestamps offset from 2020-01-01 in 24-minute steps; the offset
   is tripled for every number divisible by 33, which leaves gaps in later days. */
CREATE TABLE T (time DateTime) ENGINE = Memory
AS
SELECT toDateTime('2020-01-01') + (((number * 60) * 24) * if((number % 33) = 0, 3, 1))
FROM numbers(550);

/* Anti-join: generate every calendar day between the first and last day in T (left side)
   and keep the days that have no partner among the days present in T (right side).
   join_use_nulls = 1 makes unmatched right-side columns NULL, so `r.date IS NULL`
   identifies the missing days.
   `x` is the (first_day, last_day) tuple. The lambda parameter is named `day_offset`
   so that it does not shadow `x` (the original `x -> x.1 + x` reused the same name
   for both). arrayFlatten was dropped because the mapped array is already flat. */
WITH (SELECT (toDate(min(time)), toDate(max(time))) FROM T) AS x
SELECT l.*, r.*
FROM
(
    SELECT arrayJoin(arrayMap(day_offset -> x.1 + day_offset, range(0, toUInt64(x.2 - x.1 + 1)))) AS date
) AS l
LEFT JOIN
(
    SELECT toDate(time) AS date
    FROM T
    GROUP BY toDate(time)
) AS r USING date
WHERE r.date IS NULL
SETTINGS join_use_nulls = 1;
┌───────date─┬─r.date─┐
│ 2020-01-11 │ ᴺᵁᴸᴸ │
│ 2020-01-13 │ ᴺᵁᴸᴸ │
│ 2020-01-16 │ ᴺᵁᴸᴸ │
│ 2020-01-18 │ ᴺᵁᴸᴸ │
│ 2020-01-21 │ ᴺᵁᴸᴸ │
│ 2020-01-23 │ ᴺᵁᴸᴸ │
│ 2020-01-26 │ ᴺᵁᴸᴸ │
└────────────┴────────┘
/* Sample data: 550 timestamps offset from 2020-01-01 in 24-minute steps; the offset
   is tripled for every number divisible by 33, which leaves gaps in later days. */
CREATE TABLE T (time DateTime) ENGINE = Memory
AS
SELECT toDateTime('2020-01-01') + number * 60 * 24 * if(number % 33 = 0, 3, 1)
FROM numbers(550);

/* List the days that directly follow a gap: for every day present in T, take the
   distance in days to the previous present day and keep the days where it exceeds 1.
   Note that this returns the first day AFTER each gap, not the missing days themselves. */
SELECT b
FROM
(
    SELECT
        b,
        b - any(b) OVER (ORDER BY b ASC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS gap_days
    FROM
    (
        SELECT toDate(time) AS b
        FROM T
        GROUP BY b
        ORDER BY b ASC
    )
)
/* the upper bound presumably drops the first row, which has no previous day to compare with */
WHERE gap_days > 1 AND gap_days < 10000
┌──────────b─┐
│ 2020-01-12 │
│ 2020-01-14 │
│ 2020-01-17 │
│ 2020-01-19 │
│ 2020-01-22 │
│ 2020-01-24 │
│ 2020-01-27 │
└────────────┘

Clickhouse - How can I get distinct values from all values inside an array type column

In a ClickHouse database, I have an Array-type column and I want to get the distinct values across all elements inside the arrays.
Instead of getting this
/* Query from the question: DISTINCT is applied to whole arrays, not to their elements. */
SELECT DISTINCT errors.message_grouping_fingerprint
FROM views
WHERE session_date >= toDate('2022-07-21')
    AND session_date < toDate('2022-07-22')
    AND notEmpty(errors.message) = 1
    AND project_id = 162
SETTINGS distributed_group_by_no_merge = 0
[-8964675922652096680,-8964675922652096680]
[-8964675922652096680]
[-8964675922652096680,-8964675922652096680,-8964675922652096680,-8964675922652096680,-8964675922652096680,-8964675922652096680,-8964675922652096680,-827009490898812590,-8964675922652096680,-8964675922652096680,-8964675922652096680,-8964675922652096680]
[-8964675922652096680,-8964675922652096680,-8964675922652096680]
[-827009490898812590]
[-1660275624223727714,-1660275624223727714]
[1852265010681444046]
[-2552644061611887546]
[-7142229185866234523]
[-7142229185866234523,-7142229185866234523]
To get this
-8964675922652096680
-827009490898812590
-1660275624223727714
1852265010681444046
-2552644061611887546
-7142229185866234523
and finally, to get the count of all of them,
i.e. 6
groupUniqArrayArray
/* Ten sample rows, each holding an array of one to three pseudo-random digits. */
SELECT arrayMap(i -> rand() % 10, range(rand() % 3 + 1)) AS arr
FROM numbers(10);
┌─arr─────┐
│ [0] │
│ [1] │
│ [7,7,7] │
│ [8,8] │
│ [9,9,9] │
│ [6,6,6] │
│ [2,2] │
│ [8,8,8] │
│ [2] │
│ [8,8,8] │
└─────────┘
/* Merge the arrays of all rows into one array of distinct elements
   and report how many distinct elements there are. */
SELECT
    groupUniqArrayArray(arr) AS uarr,
    length(uarr)
FROM
(
    /* sample input: ten arrays of pseudo-random digits */
    SELECT arrayMap(i -> rand() % 10, range(rand() % 3 + 1)) AS arr
    FROM numbers(10)
)
┌─uarr──────────────┬─length(groupUniqArrayArray(arr))─┐
│ [0,5,9,4,2,8,7,3] │ 8 │
└───────────────────┴──────────────────────────────────┘
ARRAY JOIN
/* Unfold every array into one row per element, then group to get each distinct element once. */
SELECT A
FROM
(
    /* sample input: ten arrays of pseudo-random digits */
    SELECT arrayMap(i -> rand() % 10, range(rand() % 3 + 1)) AS arr
    FROM numbers(10)
)
ARRAY JOIN arr AS A
GROUP BY A
┌─A─┐
│ 0 │
│ 1 │
│ 4 │
│ 5 │
│ 6 │
│ 9 │
└───┘

ClickHouse - ORDER BY WITH FILL - MONTH interval

I've got a question regarding the WITH FILL modifier. I need a query grouped by month with empty rows to plot on a graph. I use the WITH FILL modifier.
I have a simple table:
/* Event table used by the WITH FILL examples. */
CREATE TABLE IF NOT EXISTS fillwith
(
    `event_timestamp` Datetime64,
    `event_date` Date,
    `event_type` String
)
ENGINE = Memory
With some sample data
/* Five events in five different months; 2020-10 is deliberately absent. */
INSERT INTO fillwith (event_timestamp, event_date, event_type)
VALUES ('2021-01-07 19:14:33.000', '2021-01-07', 'PRODUCT_VIEW');
INSERT INTO fillwith (event_timestamp, event_date, event_type)
VALUES ('2021-02-07 19:14:33.000', '2021-02-07', 'PRODUCT_CLICK');
INSERT INTO fillwith (event_timestamp, event_date, event_type)
VALUES ('2020-11-07 19:14:33.000', '2020-11-07', 'PRODUCT_VIEW');
INSERT INTO fillwith (event_timestamp, event_date, event_type)
VALUES ('2020-12-07 19:14:33.000', '2020-12-07', 'PRODUCT_VIEW');
INSERT INTO fillwith (event_timestamp, event_date, event_type)
VALUES ('2020-09-07 19:14:33.000', '2020-09-07', 'PRODUCT_VIEW');
With a day interval, I get a full list of days, but they are not sorted and look like random days:
/* Attempt from the question (daily buckets).
   NOTE(review): `date` is a Date, for which a STEP of 1 means one day, yet STEP here
   evaluates to the number of seconds in a day. The fill therefore presumably advances
   by that many days per row and wraps around the Date range, which would explain the
   seemingly random dates in the result — confirm against the WITH FILL documentation. */
SELECT
toDate(toStartOfInterval(event_date, toIntervalDay(1))) AS date,
countIf(event_type = 'PRODUCT_VIEW') AS views,
countIf(event_type = 'PRODUCT_CLICK') AS clicks
FROM fillwith
GROUP BY toDate(toStartOfInterval(event_date, toIntervalDay(1)))
ORDER BY date ASC
WITH FILL FROM toDate('2020-01-01') TO toDate('2021-12-01') STEP dateDiff('second', now(), now() + toIntervalDay(1))
Result:
┌───────date─┬─views─┬─clicks─┐
│ 2020-09-07 │ 1 │ 0 │
│ 2020-11-07 │ 1 │ 0 │
│ 2020-12-07 │ 1 │ 0 │
│ 2021-01-07 │ 1 │ 0 │
│ 2021-02-07 │ 0 │ 1 │
└────────────┴───────┴────────┘
┌───────date─┬─views─┬─clicks─┐
│ 2106-02-07 │ 0 │ 0 │
│ 2005-05-25 │ 0 │ 0 │
│ 2062-07-09 │ 0 │ 0 │
│ 2106-02-07 │ 0 │ 0 │
│ 1997-05-03 │ 0 │ 0 │
│ 2054-06-17 │ 0 │ 0 │
│ 2106-02-07 │ 0 │ 0 │
│ 1989-04-11 │ 0 │ 0 │
│ 2046-05-26 │ 0 │ 0 │
│ 2103-07-11 │ 0 │ 0 │
When I try the same for a Month interval:
/* Attempt from the question (monthly buckets).
   NOTE(review): STEP evaluates to the number of seconds in a month, but for a Date
   column the step is counted in days. The first step presumably already jumps past
   the TO bound, which would explain why only the FROM row (2020-01-01) is filled in —
   confirm against the WITH FILL documentation. */
select
toDate(toStartOfInterval(event_date, INTERVAL 1 month)) as date,
countIf(event_type = 'PRODUCT_VIEW') as views,
countIf(event_type = 'PRODUCT_CLICK') as clicks
from fillwith
GROUP BY toDate(toStartOfInterval(event_date, INTERVAL 1 month))
ORDER BY date ASC WITH FILL
FROM toDate('2020-01-01') TO toDate('2021-04-01') STEP dateDiff('second',
now(),
now() + INTERVAL 1 month)
Result:
┌───────date─┬─views─┬─clicks─┐
│ 2020-01-01 │ 0 │ 0 │
│ 2020-09-01 │ 1 │ 0 │
│ 2020-11-01 │ 1 │ 0 │
│ 2020-12-01 │ 1 │ 0 │
│ 2021-01-01 │ 1 │ 0 │
│ 2021-02-01 │ 0 │ 1 │
└────────────┴───────┴────────┘
But I expect:
┌───────date─┬─views─┬─clicks─┐
│ 2020-01-01 │ 0 │ 0 │
│ 2020-02-01 │ 0 │ 0 │
│ 2020-03-01 │ 0 │ 0 │
│ 2020-04-01 │ 0 │ 0 │
│ 2020-05-01 │ 0 │ 0 │
│ 2020-06-01 │ 0 │ 0 │
│ 2020-07-01 │ 0 │ 0 │
│ 2020-08-01 │ 0 │ 0 │
│ 2020-09-01 │ 1 │ 0 │
│ 2020-10-01 │ 0 │ 0 │
│ 2020-11-01 │ 1 │ 0 │
│ 2020-12-01 │ 1 │ 0 │
│ 2021-01-01 │ 1 │ 0 │
│ 2021-02-01 │ 0 │ 1 │
│ 2021-03-01 │ 0 │ 0 │
│ 2021-04-01 │ 0 │ 0 │
└────────────┴───────┴────────┘
Does someone know why this happens and how I can improve this?
Thanks for your help!
Try this query:
/* Month-by-month counts with the empty months filled in.
   WITH FILL runs over the integer month number, so STEP 1 is exactly one month.
   The outer query converts each month number back to a date by adding the
   month difference to toDate(0). */
SELECT
    addMonths(toDate(0), relative_month - toRelativeMonthNum(toDate(0))) AS month,
    views,
    clicks
FROM
(
    SELECT
        toRelativeMonthNum(event_date) AS relative_month,
        countIf(event_type = 'PRODUCT_VIEW') AS views,
        countIf(event_type = 'PRODUCT_CLICK') AS clicks
    FROM fillwith
    GROUP BY relative_month
    ORDER BY relative_month ASC WITH FILL
        FROM toRelativeMonthNum(toDate('2020-01-01'))
        TO toRelativeMonthNum(toDate('2021-12-01'))
        STEP 1
)
ORDER BY month ASC
/*
┌──────month─┬─views─┬─clicks─┐
│ 2020-01-01 │ 0 │ 0 │
│ 2020-02-01 │ 0 │ 0 │
│ 2020-03-01 │ 0 │ 0 │
│ 2020-04-01 │ 0 │ 0 │
│ 2020-05-01 │ 0 │ 0 │
│ 2020-06-01 │ 0 │ 0 │
│ 2020-07-01 │ 0 │ 0 │
│ 2020-08-01 │ 0 │ 0 │
│ 2020-09-01 │ 1 │ 0 │
│ 2020-10-01 │ 0 │ 0 │
│ 2020-11-01 │ 1 │ 0 │
│ 2020-12-01 │ 1 │ 0 │
│ 2021-01-01 │ 1 │ 0 │
│ 2021-02-01 │ 0 │ 1 │
│ 2021-03-01 │ 0 │ 0 │
│ 2021-04-01 │ 0 │ 0 │
│ 2021-05-01 │ 0 │ 0 │
│ 2021-06-01 │ 0 │ 0 │
│ 2021-07-01 │ 0 │ 0 │
│ 2021-08-01 │ 0 │ 0 │
│ 2021-09-01 │ 0 │ 0 │
│ 2021-10-01 │ 0 │ 0 │
│ 2021-11-01 │ 0 │ 0 │
└────────────┴───────┴────────┘
*/
or alternate way:
/* Alternative: fill the gaps day by day first, then roll the daily rows up to months. */
SELECT
    toStartOfMonth(date) AS month,
    sum(views) AS views,
    sum(clicks) AS clicks
FROM
(
    SELECT
        event_date AS date, /* or: toDate(toStartOfDay(event_timestamp)) AS date */
        countIf(event_type = 'PRODUCT_VIEW') AS views,
        countIf(event_type = 'PRODUCT_CLICK') AS clicks
    FROM fillwith
    GROUP BY date
    ORDER BY date ASC WITH FILL
        FROM toDate('2020-01-01')
        TO toDate('2021-12-01')
        STEP 1 /* `date` is a Date, so a step of 1 is one day */
)
GROUP BY month
ORDER BY month ASC
/*
┌──────month─┬─views─┬─clicks─┐
│ 2020-01-01 │ 0 │ 0 │
│ 2020-02-01 │ 0 │ 0 │
│ 2020-03-01 │ 0 │ 0 │
│ 2020-04-01 │ 0 │ 0 │
│ 2020-05-01 │ 0 │ 0 │
│ 2020-06-01 │ 0 │ 0 │
│ 2020-07-01 │ 0 │ 0 │
│ 2020-08-01 │ 0 │ 0 │
│ 2020-09-01 │ 1 │ 0 │
│ 2020-10-01 │ 0 │ 0 │
│ 2020-11-01 │ 1 │ 0 │
│ 2020-12-01 │ 1 │ 0 │
│ 2021-01-01 │ 1 │ 0 │
│ 2021-02-01 │ 0 │ 1 │
│ 2021-03-01 │ 0 │ 0 │
│ 2021-04-01 │ 0 │ 0 │
│ 2021-05-01 │ 0 │ 0 │
│ 2021-06-01 │ 0 │ 0 │
│ 2021-07-01 │ 0 │ 0 │
│ 2021-08-01 │ 0 │ 0 │
│ 2021-09-01 │ 0 │ 0 │
│ 2021-10-01 │ 0 │ 0 │
│ 2021-11-01 │ 0 │ 0 │
└────────────┴───────┴────────┘
*/

Clickhouse - how do I do Natural sort query with limit?

I want my select queries able to do natural sort using these concepts: https://rosettacode.org/wiki/Natural_sorting
You can play with the collation settings, as in the queries below.
Take into account that ClickHouse has a collation bug (#7482) and fails for some languages, such as en and de.
/* Baseline: plain ordering without a collation. */
SELECT
    arrayJoin(['kk 50', 'KK 01', ' KK 2', ' KK 3', 'kk 1', 'x9y99', 'x9y100']) AS item
ORDER BY item ASC
/*
Result:
┌─item──────┐
│ KK 2 │
│ KK 3 │
│ KK 01 │
│ kk 1 │
│ kk 50 │
│ x9y100 │
│ x9y99 │
└───────────┘
*/
/* Same items ordered with the collation 'tr-u-kn-true-ka-shifted'. */
SELECT
    arrayJoin(['kk 50', 'KK 01', ' KK 2', ' KK 3', 'kk 1', 'x9y99', 'x9y100']) AS item
ORDER BY item ASC COLLATE 'tr-u-kn-true-ka-shifted'
/*
Result:
┌─item──────┐
│ kk 1 │
│ KK 01 │
│ KK 2 │
│ KK 3 │
│ kk 50 │
│ x9y99 │
│ x9y100 │
└───────────┘
*/

How to extract ascending subsets from the sequence?

I have some data:
┌─id──┬─serial┐
│ 1 │ 1 │
│ 2 │ 2 │
│ 3 │ 3 │
│ 4 │ 1 │
│ 5 │ 3 │
│ 6 │ 2 │
│ 7 │ 1 │
│ 8 │ 2 │
│ 9 │ 3 │
│ 10 │ 1 │
│ 11 │ 2 │
│ 12 │ 1 │
│ 13 │ 2 │
│ 14 │ 3 │
└─────┴───────┘
I want to group by column 'serial' where the group rule is: any ascending subset (like this, 1 -> 2 -> 3) is a group.
I expect result:
┌─id──┬─serial┬─group─┐
│ 1 │ 1 │ 1 │
│ 2 │ 2 │ 1 │
│ 3 │ 3 │ 1 │
│ 4 │ 1 │ 2 │
│ 5 │ 3 │ 2 │
│ 6 │ 2 │ 3 │
│ 7 │ 1 │ 4 │
│ 8 │ 2 │ 4 │
│ 9 │ 3 │ 4 │
│ 10 │ 1 │ 5 │
│ 11 │ 2 │ 5 │
│ 12 │ 1 │ 6 │
│ 13 │ 2 │ 6 │
│ 14 │ 3 │ 6 │
└─────┴───────┴───────┘
If I understand correctly, you want to split the set into subsets with an ascending trend.
/* Split the id-ordered sequence into runs in which `serial` does not decrease,
   number the runs, and emit one row per element together with its run number. */
SELECT
    r.1 AS id,
    r.2 AS serial,
    r.3 AS group,
    arrayJoin(result) AS r
FROM
(
    SELECT
        groupArray((id, serial)) AS sourceArray,
        /* positions where a new run starts: the first element, or an element
           whose serial is smaller than its predecessor's */
        arrayFilter(
            pos -> (pos = 1 OR sourceArray[pos - 1].2 > sourceArray[pos].2),
            arrayEnumerate(sourceArray)
        ) AS trendBrokenIndexes,
        /* (run number, elements of the run); for the last run the slice length is NULL,
           which presumably means "through the end of the array" */
        arrayMap(
            n -> (
                n,
                arraySlice(
                    sourceArray,
                    trendBrokenIndexes[n],
                    if(n < length(trendBrokenIndexes), trendBrokenIndexes[n + 1] - trendBrokenIndexes[n], NULL)
                )
            ),
            arrayEnumerate(trendBrokenIndexes)
        ) AS groups,
        /* flatten the runs into (id, serial, run number) triples */
        arrayReduce(
            'groupArrayArray',
            arrayMap(run -> arrayMap(item -> (item.1, item.2, run.1), run.2), groups)
        ) AS result
    FROM
    (
        /* source data */
        SELECT
            arrayJoin([(1 , 1),(2 , 2),(3 , 3),(4 , 1),(5 , 3),(6 , 2),(7 , 1),(8 , 2),(9 , 3),(10, 1),(11, 2),(12, 1),(13, 2),(14, 3)]) AS a,
            a.1 AS id,
            a.2 AS serial
        ORDER BY id
    )
)
/* Result
┌─id─┬─serial─┬─group─┬─r────────┐
│ 1 │ 1 │ 1 │ (1,1,1) │
│ 2 │ 2 │ 1 │ (2,2,1) │
│ 3 │ 3 │ 1 │ (3,3,1) │
│ 4 │ 1 │ 2 │ (4,1,2) │
│ 5 │ 3 │ 2 │ (5,3,2) │
│ 6 │ 2 │ 3 │ (6,2,3) │
│ 7 │ 1 │ 4 │ (7,1,4) │
│ 8 │ 2 │ 4 │ (8,2,4) │
│ 9 │ 3 │ 4 │ (9,3,4) │
│ 10 │ 1 │ 5 │ (10,1,5) │
│ 11 │ 2 │ 5 │ (11,2,5) │
│ 12 │ 1 │ 6 │ (12,1,6) │
│ 13 │ 2 │ 6 │ (13,2,6) │
│ 14 │ 3 │ 6 │ (14,3,6) │
└────┴────────┴───────┴──────────┘
*/

Resources