Time series query based on another table - ClickHouse

Initial data
CREATE TABLE a_table (
id UInt8,
created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY tuple()
ORDER BY id;
CREATE TABLE b_table (
id UInt8,
started_at DateTime,
stopped_at DateTime
)
ENGINE = MergeTree()
PARTITION BY tuple()
ORDER BY id;
INSERT INTO a_table (id, created_at) VALUES
(1, '2020-01-01 00:00:00'),
(2, '2020-01-02 00:00:00'),
(3, '2020-01-03 00:00:00')
;
INSERT INTO b_table (id, started_at, stopped_at) VALUES
(1, '2020-01-01 00:00:00', '2020-01-01 23:59:59'),
(2, '2020-01-02 00:00:00', '2020-01-02 23:59:59'),
(3, '2020-01-04 00:00:00', '2020-01-04 23:59:59')
;
Expected result: the 'a_table' rows matching the condition
b_table.started_at >= a_table.created_at AND
b_table.stopped_at <= a_table.created_at
+----+---------------------+
| id | created_at |
+----+---------------------+
| 1 | 2020-01-01 00:00:00 |
+----+---------------------+
| 2 | 2020-01-02 00:00:00 |
+----+---------------------+
What I have tried
-- No errors, empty result
SELECT a_table.*
FROM a_table
INNER JOIN b_table
ON b_table.id = a_table.id
WHERE b_table.started_at >= a_table.created_at
AND b_table.stopped_at <= a_table.created_at
;
SELECT a_table.*
FROM a_table
ASOF INNER JOIN (
SELECT * FROM b_table
) q
ON q.id = a_table.id
AND q.started_at >= a_table.created_at
-- Error:
-- Invalid expression for JOIN ON.
-- ASOF JOIN expects exactly one inequality in ON section,
-- unexpected stopped_at <= created_at.
-- AND q.stopped_at <= a_table.created_at
;

WHERE b_table.started_at >= a_table.created_at
AND b_table.stopped_at <= a_table.created_at
The condition is wrong: the comparison operators need to be swapped (>= <= --> <= >=).
Tested on ClickHouse 20.8.7.15:
SELECT
a_table.*,
b_table.*
FROM a_table
INNER JOIN b_table ON b_table.id = a_table.id
WHERE (b_table.started_at <= a_table.created_at) AND (b_table.stopped_at >= a_table.created_at)
┌─id─┬──────────created_at─┬─b_table.id─┬──────────started_at─┬──────────stopped_at─┐
│ 1 │ 2020-01-01 00:00:00 │ 1 │ 2020-01-01 00:00:00 │ 2020-01-01 23:59:59 │
│ 2 │ 2020-01-02 00:00:00 │ 2 │ 2020-01-02 00:00:00 │ 2020-01-02 23:59:59 │
└────┴─────────────────────┴────────────┴─────────────────────┴─────────────────────┘
In real production such queries would not work, because JOIN is very slow.
It needs a re-design; it is hard to say how without knowing why you have the second table. Probably I would use a range_hashed external dictionary.
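If the JOIN route is still wanted, note that the ASOF attempt from the question fails only because ASOF JOIN allows exactly one inequality in the ON section; the second bound can be moved into WHERE. A sketch with the corrected operators (not benchmarked, not from the original answer, same performance caveat applies):
SELECT a_table.*
FROM a_table
ASOF INNER JOIN b_table
ON b_table.id = a_table.id
AND b_table.started_at <= a_table.created_at -- ASOF keeps the closest started_at <= created_at per row
WHERE b_table.stopped_at >= a_table.created_at -- the second bound as an ordinary filter
;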

Related

ClickHouse SQL Query: Average in intervals

I have a table:
deviceId, valueDateTime, value, valueType
Where valueType is temperature, pressure, etc.
I have several query parameters: begin, end (the period), and a time interval (for example 20 minutes).
I want to get charts for the period for each deviceId and valueType, with a series of average values for each interval in the period.
EDIT:
Above is the final task; at the moment I am just experimenting with it on https://play.clickhouse.tech/?file=playground, where I am trying to solve a similar task: calculate the average Age per time interval, grouped by the Title field. The problem is how to add the grouping by Title.
-- 2013-07-15 00:00:00 - begin
-- 2013-07-16 00:00:00 - end
-- 1200 - average in interval 20m
SELECT t, avg(Age) as Age FROM (
SELECT
arrayJoin(
arrayMap(x -> addSeconds(toDateTime('2013-07-15 00:00:00'), x * 1200),
range(toUInt64(dateDiff('second', toDateTime('2013-07-15 00:00:00'), toDateTime('2013-07-16 00:00:00'))/1200)))
) as t,
null as Age
UNION ALL
SELECT
(addSeconds(
toDateTime('2013-07-15 00:00:00'),
1200 * intDivOrZero(dateDiff('second', toDateTime('2013-07-15 00:00:00'), EventTime), 1200))
) as t,
avg(Age) as Age
FROM `hits_100m_obfuscated`
WHERE EventTime BETWEEN toDateTime('2013-07-15 00:00:00') AND toDateTime('2013-07-16 00:00:00')
GROUP BY t
)
GROUP BY t ORDER BY t;
EDIT 2:
The correct answer from vladimir, adapted and tested on https://play.clickhouse.tech/?file=playground:
SELECT
Title, -- as deviceId
JavaEnable, -- as valueType
groupArray((rounded_time, avg_value)) values
FROM (
WITH 60 * 20 AS interval
SELECT
Title,
JavaEnable,
toDateTime(intDiv(toUInt32(EventTime), interval) * interval)
AS rounded_time, -- EventTime as valueDateTime
avg(Age) avg_value -- Age as value
FROM `hits_100m_obfuscated`
WHERE
EventTime BETWEEN toDateTime('2013-07-15 00:00:00')
AND toDateTime('2013-07-16 00:00:00')
GROUP BY
Title,
JavaEnable,
rounded_time
ORDER BY rounded_time
)
GROUP BY
Title,
JavaEnable
ORDER BY
Title,
JavaEnable
Try this query:
SELECT
deviceId,
valueType,
groupArray((rounded_time, avg_value)) values
FROM (
WITH 60 * 20 AS interval
SELECT
deviceId,
valueType,
toDateTime(intDiv(toUInt32(valueDateTime), interval) * interval) AS rounded_time,
avg(value) avg_value
FROM
(
/* emulate the test dataset */
SELECT
number % 4 AS deviceId,
now() - (number * 60) AS valueDateTime,
number % 10 AS value,
if((number % 2) = 1, 'temp', 'pres') AS valueType
FROM numbers(48)
)
/*WHERE valueDateTime >= begin AND valueDateTime < end */
GROUP BY
deviceId,
valueType,
rounded_time
ORDER BY rounded_time
)
GROUP BY
deviceId,
valueType
ORDER BY
deviceId,
valueType
/*
┌─deviceId─┬─valueType─┬─values────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ 0 │ pres │ [('2021-02-12 06:00:00',4),('2021-02-12 06:20:00',4),('2021-02-12 06:40:00',4),('2021-02-12 07:00:00',0)] │
│ 1 │ temp │ [('2021-02-12 06:00:00',5),('2021-02-12 06:20:00',5),('2021-02-12 06:40:00',5),('2021-02-12 07:00:00',1)] │
│ 2 │ pres │ [('2021-02-12 06:00:00',4),('2021-02-12 06:20:00',4),('2021-02-12 06:40:00',4)] │
│ 3 │ temp │ [('2021-02-12 06:00:00',5),('2021-02-12 06:20:00',5),('2021-02-12 06:40:00',5)] │
└──────────┴───────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┘
*/
I would recommend using Grafana to visualize CH reports (see the Grafana ClickHouse datasource).
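A side note: the intDiv(toUInt32(EventTime), interval) * interval rounding used above can also be written with toStartOfInterval. A sketch of the inner SELECT against the playground dataset (same grouping, not benchmarked):
SELECT
Title,
JavaEnable,
toStartOfInterval(EventTime, INTERVAL 20 MINUTE) AS rounded_time,
avg(Age) AS avg_value
FROM `hits_100m_obfuscated`
WHERE EventTime BETWEEN toDateTime('2013-07-15 00:00:00')
AND toDateTime('2013-07-16 00:00:00')
GROUP BY Title, JavaEnable, rounded_time
ORDER BY rounded_time;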

Incorrect populating of materialized view

The 'test_sessions' table
CREATE TABLE IF NOT EXISTS test_sessions (
id UInt64,
name String,
created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY name;
The 'test_sessions' table data
INSERT INTO test_sessions(id, name, created_at) VALUES
(1, 'start', now()),
(1, 'stop', now() + INTERVAL 1 day),
(2, 'start', now() + INTERVAL 1 HOUR );
+----+-------+---------------------+
| id | name | created_at |
+----+-------+---------------------+
| 1 | start | 2020-11-10 07:58:19 |
+----+-------+---------------------+
| 2 | start | 2020-11-10 08:58:19 |
+----+-------+---------------------+
| 1 | stop | 2020-11-11 07:58:19 |
+----+-------+---------------------+
The 'finished_sessions' materialized view
CREATE MATERIALIZED VIEW finished_sessions (
id UInt64,
start_at DateTime,
end_at DateTime
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMM(start_at)
ORDER BY (id)
POPULATE AS
SELECT
id,
minIf(created_at, name = 'start') AS start_at,
maxIf(created_at, name = 'stop') AS end_at
FROM test_sessions
GROUP BY id
HAVING end_at <> '1970-01-01 00:00:00';
The 'finished_sessions' materialized view data
SELECT * FROM finished_sessions;
+----+---------------------+---------------------+
| id | start_at | end_at |
+----+---------------------+---------------------+
| 1 | 2020-11-10 07:58:19 | 2020-11-11 07:58:19 |
+----+---------------------+---------------------+
Up to this point everything works correctly: there is only one closed session.
After closing the second session:
INSERT INTO test_sessions(id, name, created_at) VALUES
(2, 'stop', now());
Incorrect population occurs:
SELECT * from finished_sessions ORDER BY id;
+----+-------------------------------+---------------------+
| id | start_at | end_at |
+----+-------------------------------+---------------------+
| 1 | 2020-11-10 07:58:19 | 2020-11-11 07:58:19 |
+----+-------------------------------+---------------------+
| 2 | ---> 1970-01-01 00:00:00 <--- | 2020-11-10 08:06:24 |
+----+-------------------------------+---------------------+
How to fix it?
You should use AggregateFunction or, better, SimpleAggregateFunction.
It's impossible to partition the table by an AggregateFunction column, because AggregateFunction values are computed during merges, and merges are executed within a partition.
An MV is an insert trigger. https://youtu.be/ckChUkC3Pns?list=PLO3lfQbpDVI-hyw4MyqxEk3rDHw95SzxJ https://den-crane.github.io/Everything_you_should_know_about_materialized_views_commented.pdf
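This is why the result above is wrong: the MV query runs only over each inserted block, not over the whole table. The second insert contains only the 'stop' row for id 2, so minIf(created_at, name = 'start') has nothing to aggregate and returns the DateTime default (1970-01-01 00:00:00). A minimal illustration of what the MV computes for that block alone:
SELECT
id,
minIf(created_at, name = 'start') AS start_at, -- no 'start' row in this block -> 1970-01-01 00:00:00
maxIf(created_at, name = 'stop') AS end_at
FROM (SELECT 2 AS id, 'stop' AS name, now() AS created_at) -- emulates the inserted block
GROUP BY id;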
CREATE TABLE IF NOT EXISTS test_sessions (
id UInt64,
name String,
created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY name;
INSERT INTO test_sessions(id, name, created_at) VALUES
(1, 'start', now()),
(1, 'stop', now() + INTERVAL 1 day),
(2, 'start', now() + INTERVAL 1 HOUR );
CREATE MATERIALIZED VIEW finished_sessions
ENGINE = AggregatingMergeTree
ORDER BY (id)
POPULATE AS
SELECT
id,
minStateIf(created_at, name = 'start') AS start_at,
maxStateIf(created_at, name = 'stop') AS end_at
FROM test_sessions
GROUP BY id;
INSERT INTO test_sessions(id, name, created_at) VALUES
(2, 'stop', now());
SELECT
id,
minMerge(start_at),
maxMerge(end_at)
FROM finished_sessions
GROUP BY id
┌─id─┬──minMerge(start_at)─┬────maxMerge(end_at)─┐
│ 2 │ 2020-11-10 15:18:19 │ 2020-11-10 14:21:54 │
│ 1 │ 2020-11-10 14:18:19 │ 2020-11-11 14:18:19 │
└────┴─────────────────────┴─────────────────────┘
CREATE TABLE IF NOT EXISTS test_sessions (
id UInt64,
name String,
created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY name;
INSERT INTO test_sessions(id, name, created_at) VALUES
(1, 'start', now()),
(1, 'stop', now() + INTERVAL 1 day),
(2, 'start', now() + INTERVAL 1 HOUR );
CREATE MATERIALIZED VIEW finished_sessions
(
id UInt64,
start_at SimpleAggregateFunction(min,DateTime),
end_at SimpleAggregateFunction(max,DateTime)
)
ENGINE = AggregatingMergeTree
ORDER BY (id)
POPULATE AS
SELECT
id,
minIf(created_at, name = 'start') AS start_at,
maxIf(created_at, name = 'stop') AS end_at
FROM test_sessions
GROUP BY id;
INSERT INTO test_sessions(id, name, created_at) VALUES
(2, 'stop', now());
OPTIMIZE TABLE finished_sessions FINAL;
SELECT
id,
min(start_at),
max(end_at)
FROM finished_sessions
GROUP BY id
┌─id─┬───────min(start_at)─┬─────────max(end_at)─┐
│ 2 │ 1970-01-01 00:00:00 │ 2020-11-10 14:29:30 │
│ 1 │ 2020-11-10 14:29:15 │ 2020-11-11 14:29:15 │
└────┴─────────────────────┴─────────────────────┘

How to pivot subgroups?

We have created a flat table for ClickHouse and are trying to get records from this table to create a materialized view. The logic is: if e_id is null, the record is 'TypeB'; if e_id is not null, the record is 'TypeA'. Both TypeA and TypeB records will have the same p_id and s_id. We want to create one record per p_id+s_id combination.
The query given below works well with the filter (p_id = 1 and s_id = 1), but without the filters it fails with the exception "DB::Exception: Scalar subquery returned more than one row".
Is it possible to do this in ClickHouse?
Would it be possible to create Materialized View with such a query?
select p_id,s_id,
groupArray(e_id),
groupArray(name),
(select groupArray(name)
from flat_table
where e_id is null and p_id =1 and s_id = 1
group by p_id,s_id) as typeB
from flat_table
where e_id is not null and p_id =1 and s_id = 1
group by p_id,s_id;
/*
This what the table looks like:
Flat_table
p_id s_id e_id name
1 1 1 Jake
1 1 2 Bob
1 1 null Barby
1 1 null Ella
This is expected result:
p_id s_id e_id typeA typeB
1 1 [1,2] [Jake,Bob] [Barby,Ella]
*/
Let's try this query:
SELECT p_id, s_id, e_ids, typeA, typeB
FROM (
SELECT
p_id,
s_id,
groupArray((e_id, name)) eid_names,
arrayMap(x -> x.1, arrayFilter(x -> not isNull(x.1), eid_names)) e_ids,
arrayMap(x -> x.2, arrayFilter(x -> not isNull(x.1), eid_names)) typeA,
arrayMap(x -> x.2, arrayFilter(x -> isNull(x.1), eid_names)) typeB
FROM test.test_006
GROUP BY p_id, s_id)
/* Result
┌─p_id─┬─s_id─┬─e_ids─┬─typeA────────────┬─typeB──────────────┐
│ 2 │ 2 │ [1,2] │ ['Jake2','Bob2'] │ ['Barby2','Ella2'] │
│ 1 │ 1 │ [1,2] │ ['Jake','Bob'] │ ['Barby','Ella'] │
└──────┴──────┴───────┴──────────────────┴────────────────────┘
*/
/* Data preparation queries */
CREATE TABLE test.test_006
(
`p_id` Int32,
`s_id` Int32,
`e_id` Nullable(Int32),
`name` String
)
ENGINE = Memory;
INSERT INTO test.test_006
VALUES (1, 1, 1, 'Jake'), (1, 1, 2, 'Bob'), (1, 1, null, 'Barby'), (1, 1, null, 'Ella'),
(2, 2, 1, 'Jake2'), (2, 2, 2, 'Bob2'), (2, 2, null, 'Barby2'), (2, 2, null, 'Ella2')
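The same pivot can also be sketched more compactly with the -If combinator instead of filtering the grouped arrays afterwards (untested, against the same test table):
SELECT
p_id,
s_id,
groupArrayIf(e_id, e_id IS NOT NULL) AS e_ids,
groupArrayIf(name, e_id IS NOT NULL) AS typeA,
groupArrayIf(name, e_id IS NULL) AS typeB
FROM test.test_006
GROUP BY p_id, s_id;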

Time comparison in ClickHouse

Maybe I'm missing something simple, but I could not make time filtering work.
Here is my sample query:
select toTimeZone(ts, 'Etc/GMT+2') as z
from (select toDateTime('2019-08-31 20:35:00') AS ts)
where z > '2019-08-31 20:34:00'
I would expect 0 results, but I am getting:
2019-08-31T18:35:00+00:00
Is it a bug, or am I misusing the toTimeZone() function?
Thanks!
ClickHouse stores DateTime as a Unix timestamp, in other words without a timezone.
But the timezone is taken into account when the SQL query is executed:
SELECT
toDateTime('2019-08-31 20:35:00', 'UTC') AS origin_date,
toTimeZone(origin_date, 'Etc/GMT+2') AS d1,
toTypeName(d1) AS type1,
toUnixTimestamp(d1) AS t1,
toTimeZone(origin_date, 'UTC') AS d2,
toTypeName(d2) AS type2,
toUnixTimestamp(d2) AS t2
FORMAT Vertical
Row 1:
──────
origin_date: 2019-08-31 20:35:00
d1: 2019-08-31 18:35:00
type1: DateTime('Etc/GMT+2')
t1: 1567283700 # <-- t1 == t2
d2: 2019-08-31 20:35:00
type2: DateTime('UTC')
t2: 1567283700 # <-- t1 == t2
Your query works correctly.
To 'reset the timezone' of the z value, it can be done this way:
SELECT toDateTime(toString(toTimeZone(ts, 'Etc/GMT+2'))) AS z
FROM
(
SELECT toDateTime('2019-08-31 20:35:00') AS ts
)
WHERE z > '2019-08-31 20:34:00'
TZ is a property of the type, not of the value.
DESCRIBE TABLE
(
SELECT
toTimeZone(toDateTime('2019-08-31 20:35:00'), 'Etc/GMT+2') AS x,
toDateTime('2019-08-31 20:35:00') AS y
)
┌─name─┬─type──────────────────┬─
│ x │ DateTime('Etc/GMT+2') │
│ y │ DateTime │
└──────┴───────────────────────┴─
SELECT toTimeZone(ts, 'Etc/GMT+2') AS z
FROM
(
SELECT toDateTime('2019-08-31 20:35:00') AS ts
)
WHERE z > toDateTime('2019-08-31 20:34:00', 'Etc/GMT+2')
Ok.
0 rows in set. Elapsed: 0.002 sec.

Oracle - Join 2 periods of time

I have 2 rows with 2 periods of time that intersect. For example:
---------------------------------------------
| START_DATE | END_DATE |
---------------------------------------------
| 01/01/2018 08:00:00 | 01/01/2018 09:30:00 |
| 01/01/2018 08:30:00 | 01/01/2018 10:00:00 |
---------------------------------------------
There are 30 minutes where both periods intersect, which I want to avoid. I would like to merge both rows into a single one, taking the start date as the older one and the end date as the newer one:
---------------------------------------------
| START_DATE | END_DATE |
---------------------------------------------
| 01/01/2018 08:00:00 | 01/01/2018 10:00:00 |
---------------------------------------------
Do you have any idea how I can get the result I want with a SQL statement?
For two rows just use greatest() and least() (see the sketch after the demo query below). But the problem is when you have many rows which may overlap in different ways. In that case you can:
add row numbers to each row,
assign groups to overlapping periods using a recursive query,
group the data by this value and find the min and max dates in each group.
dbfiddle demo
with
r(rn, start_date, end_date) as (
select row_number() over(order by start_date), start_date, end_date from t ),
c(rn, start_date, end_date, grp) as (
select rn, start_date, end_date, 1 from r where rn = 1
union all
select r.rn,
case when r.start_date <= c.end_date and c.start_date <= r.end_date
then least(r.start_date, c.start_date) else r.start_date end,
case when r.start_date <= c.end_date and c.start_date <= r.end_date
then greatest(r.end_date, c.end_date) else r.end_date end,
case when r.start_date <= c.end_date and c.start_date <= r.end_date
then grp else grp + 1 end
from c join r on r.rn = c.rn + 1)
select min(start_date), max(end_date) from c group by grp
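For the simple two-row case mentioned above, a minimal sketch (same table t as in the demo, assuming the two rows really do overlap):
select least(a.start_date, b.start_date) as start_date,
       greatest(a.end_date, b.end_date) as end_date
from t a
join t b
  on a.rowid < b.rowid             -- pair the two rows exactly once
 and a.start_date <= b.end_date
 and b.start_date <= a.end_date;   -- standard interval-overlap test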
If all you have is a set of date ranges, with no other correlating or constraining criteria, and you want to reduce that to a set of non-overlapping ranges, you can do that with a recursive query like this one:
with recur(start_date, end_date) as (
select * from yourdata yd
where not exists (select 1 from yourdata cyd
where yd.start_Date between cyd.start_date and cyd.end_date
and (yd.start_date <> cyd.start_date or yd.end_date <> cyd.end_date))
union all
select r.start_date
, yd.end_date
from recur r
join yourdata yd
on r.start_date < yd.start_date
and yd.start_date <= r.end_date
and r.end_date < yd.end_date
)
select start_date, max(end_date) end_date from recur group by start_date;
In this query the anchor (the part before the union all) selects all records whose start date is not contained in any other range.
The recursive part (the part after the union all) then selects ranges that extend the current range. In both halves the original start date is returned, while in the recursive part the new extended end date is returned. This results in a set of overlapping ranges with a common start date.
Finally the output query returns the start date and the max end date, grouped by start date.
