Clickhouse SQL Query: Average in intervals - clickhouse

I have a table:
deviceId, valueDateTime, value, valueType
where valueType is temperature, pressure, etc.
I have several parameters for the query: begin and end (the period), and a time interval (for example, 20 minutes).
I want to get charts for the period: for each deviceId and valueType, a series of average values, one per interval in the period.
EDIT:
The above is the final task. At the moment I am just experimenting, using https://play.clickhouse.tech/?file=playground, where I am trying to solve a similar task: I want to calculate the average Age per time interval, grouped by the Title field. My problem is: how do I add grouping by Title?
-- Parameters (hard-coded below):
--   begin    = 2013-07-15 00:00:00
--   end      = 2013-07-16 00:00:00
--   interval = 1200 seconds (20 minutes)
SELECT
    t,
    avg(Age) AS Age
FROM
(
    -- One row per 20-minute slot between begin and end, carrying a NULL Age,
    -- so that every slot appears in the final grouping.
    SELECT
        arrayJoin(
            arrayMap(
                x -> addSeconds(toDateTime('2013-07-15 00:00:00'), x * 1200),
                range(toUInt64(
                    dateDiff(
                        'second',
                        toDateTime('2013-07-15 00:00:00'),
                        toDateTime('2013-07-16 00:00:00')
                    ) / 1200
                ))
            )
        ) AS t,
        NULL AS Age

    UNION ALL

    -- Average Age of the actual rows, keyed by the start of their slot.
    SELECT
        addSeconds(
            toDateTime('2013-07-15 00:00:00'),
            1200 * intDivOrZero(
                dateDiff('second', toDateTime('2013-07-15 00:00:00'), EventTime),
                1200
            )
        ) AS t,
        avg(Age) AS Age
    FROM `hits_100m_obfuscated`
    WHERE EventTime BETWEEN toDateTime('2013-07-15 00:00:00')
                        AND toDateTime('2013-07-16 00:00:00')
    GROUP BY t
)
GROUP BY t
ORDER BY t;
EDITED 2
Correct answer from vladimir adapted to be used and tested on https://play.clickhouse.tech/?file=playground
-- Average Age per 20-minute bucket, collected into one time-ordered series
-- of (bucket start, average) tuples per (Title, JavaEnable) pair.
SELECT
    Title,       -- as deviceId
    JavaEnable,  -- as valueType
    -- groupArray() does not guarantee the order of collected elements, so the
    -- series is sorted explicitly by bucket start rather than relying on an
    -- ORDER BY inside the subquery.
    arraySort(p -> p.1, groupArray((rounded_time, avg_value))) AS values
FROM
(
    WITH 60 * 20 AS interval  -- bucket width in seconds (20 minutes)
    SELECT
        Title,
        JavaEnable,
        -- Round the timestamp down to the start of its bucket.
        toDateTime(intDiv(toUInt32(EventTime), interval) * interval)
            AS rounded_time,   -- EventTime as valueDateTime
        avg(Age) AS avg_value  -- Age as value
    FROM `hits_100m_obfuscated`
    -- Half-open range [begin, end): a closed BETWEEN would also match rows at
    -- exactly the end instant and emit an extra bucket starting at `end`.
    WHERE EventTime >= toDateTime('2013-07-15 00:00:00')
      AND EventTime <  toDateTime('2013-07-16 00:00:00')
    GROUP BY
        Title,
        JavaEnable,
        rounded_time
)
GROUP BY
    Title,
    JavaEnable
ORDER BY
    Title,
    JavaEnable

Try this query:
-- Average value per 20-minute bucket, collected into one time-ordered series
-- of (bucket start, average) tuples per (deviceId, valueType) pair.
SELECT
    deviceId,
    valueType,
    -- groupArray() does not guarantee the order of collected elements, so the
    -- series is sorted explicitly by bucket start rather than relying on an
    -- ORDER BY inside the subquery.
    arraySort(p -> p.1, groupArray((rounded_time, avg_value))) AS values
FROM
(
    WITH 60 * 20 AS interval  -- bucket width in seconds (20 minutes)
    SELECT
        deviceId,
        valueType,
        -- Round the timestamp down to the start of its bucket.
        toDateTime(intDiv(toUInt32(valueDateTime), interval) * interval) AS rounded_time,
        avg(value) AS avg_value
    FROM
    (
        /* emulate the test dataset */
        SELECT
            number % 4 AS deviceId,
            now() - (number * 60) AS valueDateTime,
            number % 10 AS value,
            if((number % 2) = 1, 'temp', 'pres') AS valueType
        FROM numbers(48)
    )
    /*WHERE valueDateTime >= begin AND valueDateTime < end */
    GROUP BY
        deviceId,
        valueType,
        rounded_time
)
GROUP BY
    deviceId,
    valueType
ORDER BY
    deviceId,
    valueType
/*
┌─deviceId─┬─valueType─┬─values────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ 0 │ pres │ [('2021-02-12 06:00:00',4),('2021-02-12 06:20:00',4),('2021-02-12 06:40:00',4),('2021-02-12 07:00:00',0)] │
│ 1 │ temp │ [('2021-02-12 06:00:00',5),('2021-02-12 06:20:00',5),('2021-02-12 06:40:00',5),('2021-02-12 07:00:00',1)] │
│ 2 │ pres │ [('2021-02-12 06:00:00',4),('2021-02-12 06:20:00',4),('2021-02-12 06:40:00',4)] │
│ 3 │ temp │ [('2021-02-12 06:00:00',5),('2021-02-12 06:20:00',5),('2021-02-12 06:40:00',5)] │
└──────────┴───────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┘
*/
I would recommend using Grafana to visualize CH report (see Grafana ClickHouse datasource).

Related

clickhouse sum arrays at same index [duplicate]

I am trying to add an array column element by element after a group by another column.
Having the table A below:
id units
1 [1,1,1]
2 [3,0,0]
1 [5,3,7]
3 [2,5,2]
2 [3,2,6]
I would like to query something like:
select id, sum(units) from A group by id
And get the following result:
id units
1 [6,4,8]
2 [6,2,6]
3 [2,5,2]
Where the units arrays in rows with the same id get added element by element.
Try this query:
-- Element-wise sum of the `units` arrays of all rows sharing the same id.
SELECT
    id,
    sumForEach(units) AS units
FROM
(
    /* emulate dataset */
    SELECT
        data.1 AS id,
        data.2 AS units
    FROM
    (
        SELECT arrayJoin([
            (1, [1,1,1]),
            (2, [3,0,0]),
            (1, [5,3,7]),
            (3, [2,5,2]),
            (2, [3,2,6])
        ]) AS data
    )
)
GROUP BY id
/* Result
┌─id─┬─units───┐
│ 1 │ [6,4,8] │
│ 2 │ [6,2,6] │
│ 3 │ [2,5,2] │
└────┴─────────┘
*/

Time series query based on another table

Initial data
-- Fixture: rows with a creation timestamp.
CREATE TABLE a_table
(
    id         UInt8,
    created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY tuple()
ORDER BY id;

-- Fixture: rows with a [started_at, stopped_at] time range.
CREATE TABLE b_table
(
    id         UInt8,
    started_at DateTime,
    stopped_at DateTime
)
ENGINE = MergeTree()
PARTITION BY tuple()
ORDER BY id;

INSERT INTO a_table (id, created_at) VALUES
    (1, '2020-01-01 00:00:00'),
    (2, '2020-01-02 00:00:00'),
    (3, '2020-01-03 00:00:00')
;

INSERT INTO b_table (id, started_at, stopped_at) VALUES
    (1, '2020-01-01 00:00:00', '2020-01-01 23:59:59'),
    (2, '2020-01-02 00:00:00', '2020-01-02 23:59:59'),
    (3, '2020-01-04 00:00:00', '2020-01-04 23:59:59')
;
Expected result: The 'a_table' rows by condition
b_table.started_at >= a_table.created_at AND
b_table.stopped_at <= a_table.created_at
+----+---------------------+
| id | created_at |
+----+---------------------+
| 1 | 2020-01-01 00:00:00 |
+----+---------------------+
| 2 | 2020-01-02 00:00:00 |
+----+---------------------+
What have i tried
-- Attempt 1: plain INNER JOIN on id with both range conditions in WHERE.
-- Runs without errors, but the result is empty.
SELECT a_table.*
FROM a_table
INNER JOIN b_table
    ON b_table.id = a_table.id
WHERE b_table.started_at >= a_table.created_at
  AND b_table.stopped_at <= a_table.created_at
;
-- Attempt 2: ASOF JOIN with a single inequality in the ON section.
SELECT a_table.*
FROM a_table
ASOF INNER JOIN
(
    SELECT *
    FROM b_table
) AS q
    ON  q.id = a_table.id
    AND q.started_at >= a_table.created_at
    -- Adding the second inequality (commented out below) fails with:
    --   Invalid expression for JOIN ON.
    --   ASOF JOIN expects exactly one inequality in ON section,
    --   unexpected stopped_at <= created_at.
    -- AND q.stopped_at <= a_table.created_at
;
WHERE b_table.started_at >= a_table.created_at
ANd b_table.stopped_at <= a_table.created_at
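The condition is wrong: the comparison operators are swapped. Instead of `started_at >= created_at AND stopped_at <= created_at`, use `started_at <= created_at AND stopped_at >= created_at`.
Checked on ClickHouse version 20.8.7.15: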
-- Rows of a_table whose created_at falls inside the [started_at, stopped_at]
-- range of the b_table row with the same id.
SELECT
    a_table.*,
    b_table.*
FROM a_table
INNER JOIN b_table
    ON b_table.id = a_table.id
WHERE (b_table.started_at <= a_table.created_at)
  AND (b_table.stopped_at >= a_table.created_at)
┌─id─┬──────────created_at─┬─b_table.id─┬──────────started_at─┬──────────stopped_at─┐
│ 1 │ 2020-01-01 00:00:00 │ 1 │ 2020-01-01 00:00:00 │ 2020-01-01 23:59:59 │
│ 2 │ 2020-01-02 00:00:00 │ 2 │ 2020-01-02 00:00:00 │ 2020-01-02 23:59:59 │
└────┴─────────────────────┴────────────┴─────────────────────┴─────────────────────┘
In real production such queries would not work well, because JOIN is very slow.
This needs a redesign. It is hard to say how without knowing why you have the second table. I would probably use a range_hashed external dictionary.

how can I calculated point of each user per day with sum all the points from beginning to that day in clickhouse

I have this data in clickhouse:
The final point total of each user on a given day is sum(point) from the beginning up to that day.
E.g. the point total of user 1 on 2020-07-02 is 800, and on 2020-07-03 it is 200.
I need this result: Point of each user per day:
-- Running total of points per user and day: for each (uid, date), the sum of
-- all points of that user from the first date up to and including that date.
SELECT
    uid,
    d,
    t
FROM
(
    SELECT
        uid,
        -- (date, daily sum) pairs sorted by date. groupArray() does not
        -- guarantee element order, so the sort is done explicitly instead of
        -- relying on an ORDER BY in the subquery. The dates are
        -- 'YYYY-MM-DD' strings, which sort chronologically.
        arraySort(p -> p.1, groupArray((date, spt))) AS daily
    FROM
    (
        -- Points summed per user and day.
        SELECT
            uid,
            date,
            sum(pt) AS spt
        FROM
        (
            /* emulate dataset */
            SELECT 1 AS tid, '2020-07-01' AS date, 1 AS uid, 500 AS pt
            UNION ALL
            SELECT 1 AS tid, '2020-07-02' AS date, 1 AS uid, 300 AS pt
            UNION ALL
            SELECT 1 AS tid, '2020-07-03' AS date, 1 AS uid, -600 AS pt
        )
        GROUP BY
            uid,
            date
    )
    GROUP BY uid
)
-- Unfold the sorted dates and their cumulative sums in parallel.
ARRAY JOIN
    arrayMap(p -> p.1, daily) AS d,
    arrayCumSum(arrayMap(p -> p.2, daily)) AS t
ORDER BY
    uid,
    d
┌─uid─┬─d──────────┬───t─┐
│ 1 │ 2020-07-01 │ 500 │
│ 1 │ 2020-07-02 │ 800 │
│ 1 │ 2020-07-03 │ 200 │
└─────┴────────────┴─────┘

How to sum arrays element by element after group by in clickhouse

I am trying to add an array column element by element after a group by another column.
Having the table A below:
id units
1 [1,1,1]
2 [3,0,0]
1 [5,3,7]
3 [2,5,2]
2 [3,2,6]
I would like to query something like:
select id, sum(units) from A group by id
And get the following result:
id units
1 [6,4,8]
2 [6,2,6]
3 [2,5,2]
Where the units arrays in rows with the same id get added element by element.
Try this query:
-- Element-wise sum of the `units` arrays of all rows sharing the same id.
SELECT
    id,
    sumForEach(units) AS units
FROM
(
    /* emulate dataset */
    SELECT
        data.1 AS id,
        data.2 AS units
    FROM
    (
        SELECT arrayJoin([
            (1, [1,1,1]),
            (2, [3,0,0]),
            (1, [5,3,7]),
            (3, [2,5,2]),
            (2, [3,2,6])
        ]) AS data
    )
)
GROUP BY id
/* Result
┌─id─┬─units───┐
│ 1 │ [6,4,8] │
│ 2 │ [6,2,6] │
│ 3 │ [2,5,2] │
└────┴─────────┘
*/

Time comparison in ClickHouse

Maybe I'm missing something simple, but I could not make time filtering to work.
Here is my sample query:
-- Sample query: convert ts to 'Etc/GMT+2' and filter on the converted value
-- using a string literal.
SELECT toTimeZone(ts, 'Etc/GMT+2') AS z
FROM
(
    SELECT toDateTime('2019-08-31 20:35:00') AS ts
)
WHERE z > '2019-08-31 20:34:00'
I would expect 0 results, but getting:
2019-08-31T18:35:00+00:00
Is it a bug, or do I misuse the toTimeZone() function?
Thanks!
ClickHouse stores DateTime as a Unix timestamp — in other words, without a timezone.
The timezone is, however, taken into account when the SQL query is executed:
-- Show the same instant in two timezones: the rendered value and the type
-- differ, while the underlying Unix timestamp is the same.
SELECT
    toDateTime('2019-08-31 20:35:00', 'UTC') AS origin_date,

    toTimeZone(origin_date, 'Etc/GMT+2')     AS d1,
    toTypeName(d1)                           AS type1,
    toUnixTimestamp(d1)                      AS t1,

    toTimeZone(origin_date, 'UTC')           AS d2,
    toTypeName(d2)                           AS type2,
    toUnixTimestamp(d2)                      AS t2
FORMAT Vertical
Row 1:
──────
origin_date: 2019-08-31 20:35:00
d1: 2019-08-31 18:35:00
type1: DateTime('Etc/GMT+2')
t1: 1567283700 # <-- t1 == t2
d2: 2019-08-31 20:35:00
type2: DateTime('UTC')
t2: 1567283700 # <-- t1 == t2
Your query works correctly.
To 'reset the timezone' of z, convert it to a string and back:
-- Convert to 'Etc/GMT+2', then round-trip through String so that z is a
-- plain DateTime again (the 'reset the timezone' trick).
SELECT
    toDateTime(
        toString(
            toTimeZone(ts, 'Etc/GMT+2')
        )
    ) AS z
FROM
(
    SELECT toDateTime('2019-08-31 20:35:00') AS ts
)
WHERE z > '2019-08-31 20:34:00'
The timezone is a property of the type, not of the value:
-- Inspect the column types: x carries the timezone in its type, y does not.
DESCRIBE TABLE
(
    SELECT
        toTimeZone(toDateTime('2019-08-31 20:35:00'), 'Etc/GMT+2') AS x,
        toDateTime('2019-08-31 20:35:00')                          AS y
)
┌─name─┬─type──────────────────┬─
│ x │ DateTime('Etc/GMT+2') │
│ y │ DateTime │
└──────┴───────────────────────┴─
-- Compare z against a DateTime literal declared in the same timezone
-- instead of a bare string.
SELECT
    toTimeZone(ts, 'Etc/GMT+2') AS z
FROM
(
    SELECT
        toDateTime('2019-08-31 20:35:00') AS ts
)
WHERE z > toDateTime('2019-08-31 20:34:00', 'Etc/GMT+2')
Ok.
0 rows in set. Elapsed: 0.002 sec.

Resources