Incorrect populating of materialized view - clickhouse

The 'test_sessions' table
-- Session events; `name` holds the event kind ('start' / 'stop' in the sample data).
CREATE TABLE IF NOT EXISTS test_sessions
(
    id         UInt64,
    name       String,
    created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY name;
The 'test_sessions' table data
-- Session 1 is started and stopped; session 2 is only started.
INSERT INTO test_sessions (id, name, created_at)
VALUES
    (1, 'start', now()),
    (1, 'stop',  now() + INTERVAL 1 DAY),
    (2, 'start', now() + INTERVAL 1 HOUR);
+----+-------+---------------------+
| id | name | created_at |
+----+-------+---------------------+
| 1 | start | 2020-11-10 07:58:19 |
+----+-------+---------------------+
| 2 | start | 2020-11-10 08:58:19 |
+----+-------+---------------------+
| 1 | stop | 2020-11-11 07:58:19 |
+----+-------+---------------------+
The 'finished_sessions' materialized view
-- Question's original view: per id, the earliest 'start' and the latest 'stop' timestamp.
-- The HAVING clause drops groups whose end_at equals '1970-01-01 00:00:00'.
-- NOTE(review): start_at / end_at are plain DateTime columns here; the result shown
-- after the second insert has start_at = 1970-01-01 00:00:00 for id 2.
CREATE MATERIALIZED VIEW finished_sessions (
id UInt64,
start_at DateTime,
end_at DateTime
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMM(start_at)
ORDER BY (id)
POPULATE AS
SELECT
id,
minIf(created_at, name = 'start') AS start_at,
maxIf(created_at, name = 'stop') AS end_at
FROM test_sessions
GROUP BY id
HAVING end_at <> '1970-01-01 00:00:00';
The 'finished_sessions' materialized view data
SELECT * FROM finished_sessions;
+----+---------------------+---------------------+
| id | start_at | end_at |
+----+---------------------+---------------------+
| 1 | 2020-11-10 07:58:19 | 2020-11-11 07:58:19 |
+----+---------------------+---------------------+
Until this moment, everything works correctly: there is only 1 closed session
After the close of the second session
-- The 'stop' event of session 2 arrives in its own insert.
INSERT INTO test_sessions (id, name, created_at)
VALUES
    (2, 'stop', now())
Incorrect populating occurs
SELECT * from finished_sessions ORDER BY id;
+----+-------------------------------+---------------------+
| id | start_at | end_at |
+----+-------------------------------+---------------------+
| 1 | 2020-11-10 07:58:19 | 2020-11-11 07:58:19 |
+----+-------------------------------+---------------------+
| 2 | ---> 1970-01-01 00:00:00 <--- | 2020-11-10 08:06:24 |
+----+-------------------------------+---------------------+
How to fix it?

You should use AggregateFunction or, better, SimpleAggregateFunction.
It's impossible to partition a table by an AggregateFunction column, because AggregateFunction values are computed during merges, and merges are executed within a partition.
An MV is an insert trigger. https://youtu.be/ckChUkC3Pns?list=PLO3lfQbpDVI-hyw4MyqxEk3rDHw95SzxJ https://den-crane.github.io/Everything_you_should_know_about_materialized_views_commented.pdf
-- Source table plus sample events (session 1 complete, session 2 only started).
CREATE TABLE IF NOT EXISTS test_sessions
(
    id         UInt64,
    name       String,
    created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY name;

INSERT INTO test_sessions (id, name, created_at)
VALUES
    (1, 'start', now()),
    (1, 'stop',  now() + INTERVAL 1 DAY),
    (2, 'start', now() + INTERVAL 1 HOUR);
-- Stores aggregate states per id so that rows written by separate inserts can be
-- combined at query time with minMerge / maxMerge.
-- The statement is terminated explicitly so it does not run into the INSERT that follows.
CREATE MATERIALIZED VIEW finished_sessions
ENGINE = AggregatingMergeTree
ORDER BY (id)
POPULATE AS
SELECT
    id,
    minStateIf(created_at, name = 'start') AS start_at,
    maxStateIf(created_at, name = 'stop')  AS end_at
FROM test_sessions
GROUP BY id;
-- Close session 2 in a separate insert.
INSERT INTO test_sessions (id, name, created_at)
VALUES
    (2, 'stop', now());

-- Finalize the stored states per id.
SELECT
    id,
    minMerge(start_at),
    maxMerge(end_at)
FROM finished_sessions
GROUP BY id
Query id: d797eee4-6088-40b8-aa12-b10da62b60c5
┌─id─┬──minMerge(start_at)─┬────maxMerge(end_at)─┐
│ 2 │ 2020-11-10 15:18:19 │ 2020-11-10 14:21:54 │
│ 1 │ 2020-11-10 14:18:19 │ 2020-11-11 14:18:19 │
└────┴─────────────────────┴─────────────────────┘
-- Source table plus sample events (session 1 complete, session 2 only started).
CREATE TABLE IF NOT EXISTS test_sessions
(
    id         UInt64,
    name       String,
    created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY name;

INSERT INTO test_sessions (id, name, created_at)
VALUES
    (1, 'start', now()),
    (1, 'stop',  now() + INTERVAL 1 DAY),
    (2, 'start', now() + INTERVAL 1 HOUR);
-- Variant with SimpleAggregateFunction columns: plain min / max values are stored
-- and read back with ordinary min() / max().
-- NOTE(review): in the result printed further down, id 2 ends up with
-- min(start_at) = 1970-01-01 00:00:00 once its 'stop' row is inserted separately;
-- presumably minIf yields the default DateTime when the inserted block has no
-- 'start' row — confirm before relying on start_at.
CREATE MATERIALIZED VIEW finished_sessions
(
id UInt64,
start_at SimpleAggregateFunction(min,DateTime),
end_at SimpleAggregateFunction(max,DateTime)
)
ENGINE = AggregatingMergeTree
ORDER BY (id)
POPULATE AS
SELECT
id,
minIf(created_at, name = 'start') AS start_at,
maxIf(created_at, name = 'stop') AS end_at
FROM test_sessions
GROUP BY id;
-- Close session 2 in a separate insert. The INSERT is terminated explicitly so the
-- OPTIMIZE statement below is not read as part of the VALUES data.
INSERT INTO test_sessions (id, name, created_at)
VALUES
    (2, 'stop', now());

-- Request a final merge of finished_sessions before it is queried.
OPTIMIZE TABLE finished_sessions FINAL;
-- Per id: smallest stored start_at and largest stored end_at.
SELECT
    id,
    min(start_at),
    max(end_at)
FROM finished_sessions
GROUP BY id
┌─id─┬───────min(start_at)─┬─────────max(end_at)─┐
│ 2 │ 1970-01-01 00:00:00 │ 2020-11-10 14:29:30 │
│ 1 │ 2020-11-10 14:29:15 │ 2020-11-11 14:29:15 │
└────┴─────────────────────┴─────────────────────┘

Related

Materialized view contains wrong result

I'm having trouble with the content of a materialized view in Oracle 19c (version 19.15). I've managed to distill the issues into a reproducible test with this script:
-- Parent table: one row per tsn.
create table b(
tsn varchar2(16) not null primary key,
fid varchar2(256) not null
);
-- Child table: rows keyed by (tsn, leg), each referencing a row in b.
create table bs(
tsn varchar2(16) not null constraint bet_stakes_fk references b,
leg number(1) not null,
amount number(10) not null,
primary key (tsn, leg) using index compress 1
);
-- Materialized view logs on both base tables, recording the listed columns.
create materialized view log on b
with primary key, rowid, sequence, commit scn (fid)
including new values;
create materialized view log on bs
with primary key, rowid, sequence, commit scn (amount)
including new values;
-- Aggregate over the join of bs and b, declared as fast-refreshable.
-- The next-refresh expression adds 1/14400 of a day, i.e. 6 seconds.
create materialized view bsd_mv
refresh fast start with (sysdate - 1) next (sysdate + 1/14400)
as
select fid, leg, sum(amount), count(*)
from bs inner join b using (tsn)
group by fid, leg;
-- First transaction: one parent row and one child row with amount 10.
insert into b values ('a', 'o');
insert into bs values ('a', 1, 10);
commit;
-- Second transaction: delete both rows, then re-insert the same keys with amount 5.
delete from bs where tsn = 'a';
delete from b where tsn = 'a';
insert into b values ('a', 'o');
insert into bs values ('a', 1, 5);
commit;
Wait 10 seconds or so before selecting
select * from bsd_mv;
The result will vary somewhat with different runs of the script, but usually the result will be
| Fid | Leg | Sum | Count |
| --- | --- | --- | ----- |
| o | 1 | 15 | 3 |
... where I would expect it to be...
| Fid | Leg | Sum | Count |
| --- | --- | --- | ----- |
| o | 1 | 5 | 1 |
If I run the query the view is based on, I always get the expected result.
Am I missing something in the setup, or do I have the wrong expectations, or have I triggered a bug in Oracle?
It took months with Oracle support, but eventually this was accepted as a bug...

Create materialized view based on aggregate materialized view

The base table
-- Base table of session events, sorted by session_id.
CREATE TABLE IF NOT EXISTS test_sessions (
    session_id   UInt64,
    session_name String,
    created_at   DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY session_id;
With the following data
-- Session 1 has both events; session 2 has only a 'start'.
INSERT INTO test_sessions (session_id, session_name, created_at)
VALUES
    (1, 'start', '2021-01-31 00:00:00'),
    (1, 'stop',  '2021-01-31 01:00:00'),
    (2, 'start', '2021-01-31 01:00:00');
Created a materialized view and a view on top of it to get closed sessions
-- Per-session aggregate states: earliest 'start' and latest 'stop' timestamp.
CREATE MATERIALIZED VIEW IF NOT EXISTS test_session_aggregate_states (
    session_id UInt64,
    started_at AggregateFunction(minIf, DateTime, UInt8),
    stopped_at AggregateFunction(maxIf, DateTime, UInt8)
)
ENGINE = AggregatingMergeTree
PARTITION BY tuple()
ORDER BY session_id
POPULATE AS
SELECT
    session_id,
    minIfState(created_at, session_name = 'start') AS started_at,
    maxIfState(created_at, session_name = 'stop')  AS stopped_at
FROM test_sessions
GROUP BY session_id;
-- Finalizes the stored states and keeps sessions where neither timestamp
-- equals '0000-00-00 00:00:00'.
CREATE VIEW IF NOT EXISTS test_session_completed (
    session_id UInt64,
    started_at DateTime,
    stopped_at DateTime
)
AS
SELECT
    session_id,
    minIfMerge(started_at) AS started_at,
    maxIfMerge(stopped_at) AS stopped_at
FROM test_session_aggregate_states
GROUP BY session_id
HAVING started_at != '0000-00-00 00:00:00'
   AND stopped_at != '0000-00-00 00:00:00';
It works normally: return 1 row with existing "start" and "stop"
SELECT * FROM test_session_completed;
-- 1,2021-01-31 00:00:00,2021-01-31 01:00:00
Trying to create a materialized view based on test_session_completed with joins to other tables (there are no joins in the example)
-- Attempt to materialize the session ids returned by the test_session_completed view.
-- NOTE(review): the test that follows shows that a session inserted later (id 3)
-- does not reach this table, and the answer states an MV cannot be built over a view.
CREATE MATERIALIZED VIEW IF NOT EXISTS test_mv
(
session_id UInt64
)
ENGINE = MergeTree
PARTITION BY tuple()
ORDER BY (session_id)
POPULATE AS
SELECT session_id
FROM test_session_completed
;
Writing test queries to check test_mv
-- Session 3: both events in a single insert.
INSERT INTO test_sessions (session_id, session_name, created_at)
VALUES
    (3, 'start', '2021-01-31 02:00:00'),
    (3, 'stop',  '2021-01-31 03:00:00');
SELECT * FROM test_session_completed;
-- SUCCESS
-- 3,2021-01-31 02:00:00,2021-01-31 03:00:00
-- 1,2021-01-31 00:00:00,2021-01-31 01:00:00
SELECT * FROM test_mv;
-- FAILURE
-- 1
-- EXPECTED RESULT
-- 3
-- 1
How to fill test_mv based on test_session_completed ?
ClickHouse version: 20.11.4.13
It is impossible to create an MV over a view.
An MV is an insert trigger, and it is impossible to get the 'completed' state without having the 'started' state in the same table. If you don't need to check that 'started' happened before 'completed', you can make a simpler MV and just filter on 'completed'.
You don't need minIfState; you can use min (SimpleAggregateFunction). It will reduce the stored data and improve performance.
I think the second MV is excessive.
Check this:
https://den-crane.github.io/Everything_you_should_know_about_materialized_views_commented.pdf
https://youtu.be/ckChUkC3Pns?list=PLO3lfQbpDVI-hyw4MyqxEk3rDHw95SzxJ&t=9371
I would do this:
-- Base table of session events, sorted by session_id.
CREATE TABLE IF NOT EXISTS test_sessions (
    session_id   UInt64,
    session_name String,
    created_at   DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY session_id;
-- Per-session earliest 'start' and latest 'stop' timestamp, stored as plain
-- min / max values.
-- The columns are Nullable and filled through the -OrNull combinator so that an
-- insert block containing only one kind of event writes NULL for the other
-- timestamp instead of the zero DateTime. With a zero value, a later min() over
-- the stored rows would return it as started_at for a session whose 'start' and
-- 'stop' events were inserted separately.
CREATE MATERIALIZED VIEW IF NOT EXISTS test_session_aggregate_states
(
    session_id UInt64,
    started_at SimpleAggregateFunction(min, Nullable(DateTime)),
    stopped_at SimpleAggregateFunction(max, Nullable(DateTime))
)
ENGINE = AggregatingMergeTree
PARTITION BY tuple()
ORDER BY (session_id)
POPULATE AS
SELECT
    session_id,
    minOrNullIf(created_at, session_name = 'start') AS started_at,
    maxOrNullIf(created_at, session_name = 'stop')  AS stopped_at
FROM test_sessions
GROUP BY session_id;
-- Session 3: both events in a single insert.
INSERT INTO test_sessions (session_id, session_name, created_at)
VALUES
    (3, 'start', '2021-01-31 02:00:00'),
    (3, 'stop',  '2021-01-31 03:00:00');
completed sessions:
-- Sessions for which neither aggregated timestamp equals '0000-00-00 00:00:00'.
SELECT
    session_id,
    min(started_at) AS started_at,
    max(stopped_at) AS stopped_at
FROM test_session_aggregate_states
GROUP BY session_id
HAVING started_at != '0000-00-00 00:00:00'
   AND stopped_at != '0000-00-00 00:00:00';
┌─session_id─┬──────────started_at─┬──────────stopped_at─┐
│ 1 │ 2021-01-31 00:00:00 │ 2021-01-31 01:00:00 │
└────────────┴─────────────────────┴─────────────────────┘
And by using argMaxState you can aggregate multiple start/stop pairs within one session_id.

Time series query based on another table

Initial data
-- Points in time, one per id.
CREATE TABLE a_table
(
    id         UInt8,
    created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY tuple()
ORDER BY id;

-- Time ranges [started_at, stopped_at], one per id.
CREATE TABLE b_table
(
    id         UInt8,
    started_at DateTime,
    stopped_at DateTime
)
ENGINE = MergeTree()
PARTITION BY tuple()
ORDER BY id;
INSERT INTO a_table (id, created_at)
VALUES
    (1, '2020-01-01 00:00:00'),
    (2, '2020-01-02 00:00:00'),
    (3, '2020-01-03 00:00:00');

-- The range for id 3 lies one day after a_table's created_at for id 3.
INSERT INTO b_table (id, started_at, stopped_at)
VALUES
    (1, '2020-01-01 00:00:00', '2020-01-01 23:59:59'),
    (2, '2020-01-02 00:00:00', '2020-01-02 23:59:59'),
    (3, '2020-01-04 00:00:00', '2020-01-04 23:59:59');
Expected result: The 'a_table' rows by condition
b_table.started_at >= a_table.created_at AND
b_table.stopped_at <= a_table.created_at
+----+---------------------+
| id | created_at |
+----+---------------------+
| 1 | 2020-01-01 00:00:00 |
+----+---------------------+
| 2 | 2020-01-02 00:00:00 |
+----+---------------------+
What have i tried
-- No errors, empty result
-- NOTE(review): the answer below points out that both comparison operators in the
-- WHERE clause are reversed.
SELECT a_table.*
FROM a_table
INNER JOIN b_table
ON b_table.id = a_table.id
WHERE b_table.started_at >= a_table.created_at
ANd b_table.stopped_at <= a_table.created_at
;
-- Second attempt: ASOF join carrying only the started_at inequality; adding the
-- commented-out stopped_at inequality produces the error quoted below.
SELECT a_table.*
FROM a_table
ASOF INNER JOIN (
SELECT * FROM b_table
) q
ON q.id = a_table.id
AND q.started_at >= a_table.created_at
-- Error:
-- Invalid expression for JOIN ON.
-- ASOF JOIN expects exactly one inequality in ON section,
-- unexpected stopped_at <= created_at.
-- AND q.stopped_at <= a_table.created_at
;
WHERE b_table.started_at >= a_table.created_at
ANd b_table.stopped_at <= a_table.created_at
The condition is wrong: the comparison operators are reversed (>= / <= should be <= / >=).
20.8.7.15
-- Rows of a_table whose created_at falls inside the matching b_table range.
SELECT
    a_table.*,
    b_table.*
FROM a_table
INNER JOIN b_table
    ON b_table.id = a_table.id
WHERE b_table.started_at <= a_table.created_at
  AND b_table.stopped_at >= a_table.created_at
┌─id─┬──────────created_at─┬─b_table.id─┬──────────started_at─┬──────────stopped_at─┐
│ 1 │ 2020-01-01 00:00:00 │ 1 │ 2020-01-01 00:00:00 │ 2020-01-01 23:59:59 │
│ 2 │ 2020-01-02 00:00:00 │ 2 │ 2020-01-02 00:00:00 │ 2020-01-02 23:59:59 │
└────┴─────────────────────┴────────────┴─────────────────────┴─────────────────────┘
In real production, such queries would not work because the JOIN is very slow.
It needs a redesign. It's hard to say how without knowing why you have the second table. I would probably use a rangeHashed external dictionary.

How to pivot subgroups?

We have created a flat table for Clickhouse and are trying to get records from this table to create a Materialized view. The logic is if e_id is null the record is 'TypeB', if e_id is not null then record is 'TypeA'. Both TypeA and TypeB records will have the same p_id and s_id. We want to create one record per p_id+s_id combination.
The query given below works well with filter (p_id =1 and s_id = 1) but without filters - the exception is "DB::Exception: Scalar subquery returned more than one row"
Is it possible to do this in ClickHouse?
Would it be possible to create Materialized View with such a query?
-- Question's attempt: typeB comes from a scalar subquery over the rows with
-- e_id is null, while the outer query aggregates the rows with e_id is not null.
-- Both parts are pinned to p_id = 1 and s_id = 1.
select p_id,s_id,
groupArray(e_id),
groupArray(name),
(select groupArray(name)
from flat_table
where e_id is null and p_id =1 and s_id = 1
group by p_id,s_id) as typeB
from flat_table
where e_id is not null and p_id =1 and s_id = 1
group by p_id,s_id;
/*
This what the table looks like:
Flat_table
p_id s_id e_id name
1 1 1 Jake
1 1 2 Bob
1 1 null Barby
1 1 null Ella
This is expected result:
p_id s_id e_id typeA typeB
1 1 [1,2] [Jake,Bob] [Barby,Ella]
*/
Let's try this query:
-- Collect (e_id, name) pairs once per (p_id, s_id), then split the pairs by
-- whether e_id is NULL.
SELECT
    p_id,
    s_id,
    e_ids,
    typeA,
    typeB
FROM
(
    SELECT
        p_id,
        s_id,
        groupArray((e_id, name)) AS eid_names,
        arrayMap(pair -> pair.1, arrayFilter(pair -> isNotNull(pair.1), eid_names)) AS e_ids,
        arrayMap(pair -> pair.2, arrayFilter(pair -> isNotNull(pair.1), eid_names)) AS typeA,
        arrayMap(pair -> pair.2, arrayFilter(pair -> isNull(pair.1), eid_names))    AS typeB
    FROM test.test_006
    GROUP BY
        p_id,
        s_id
)
/* Result
┌─p_id─┬─s_id─┬─e_ids─┬─typeA────────────┬─typeB──────────────┐
│ 2 │ 2 │ [1,2] │ ['Jake2','Bob2'] │ ['Barby2','Ella2'] │
│ 1 │ 1 │ [1,2] │ ['Jake','Bob'] │ ['Barby','Ella'] │
└──────┴──────┴───────┴──────────────────┴────────────────────┘
*/
/* Data preparation queries */
-- Sample table for the pivot query; e_id is Nullable because some rows carry no e_id.
-- Both statements are terminated explicitly so they can be run as one script.
CREATE TABLE test.test_006
(
    `p_id` Int32,
    `s_id` Int32,
    `e_id` Nullable(Int32),
    `name` String
)
ENGINE = Memory;

-- Explicit column list: the insert does not depend on the table's column order.
INSERT INTO test.test_006 (p_id, s_id, e_id, name)
VALUES
    (1, 1, 1, 'Jake'), (1, 1, 2, 'Bob'), (1, 1, NULL, 'Barby'), (1, 1, NULL, 'Ella'),
    (2, 2, 1, 'Jake2'), (2, 2, 2, 'Bob2'), (2, 2, NULL, 'Barby2'), (2, 2, NULL, 'Ella2');

ORACLE Query Get Last ID Using MIN Based On Quantity Consumed By ID

I have Incoming Stock transaction data using Oracle:
ID | DESCRIPTION | PART_NO | QUANTITY | DATEADDED
TR5 | FG | P0025 | 5 | 06-SEP-2017 08:20:33 <-- just now added
TR4 | Test | TEST1 | 8 | 05-SEP-2017 15:11:15
TR3 | FG | GSDFGSG | 10 | 31-AUG-2017 16:26:04
TR2 | FG | GSDFGSG | 2 | 31-AUG-2017 16:05:39
TR1 | FG | GSDFGSG | 2 | 30-AUG-2017 16:30:16
And now I'm grouping that data to be:
TR_ID | PART_NO | TOTAL
TR1 | GSDFGSG | 14
TR4 | TEST1 | 8
TR5 | P0025 | 5 <-- just now added
Query Code:
-- Per part number: smallest transaction id and total quantity.
SELECT
    MIN(T.TRANSACTION_EQUIPMENTID) AS TR_ID,
    T.PART_NO,
    SUM(T.QUANTITY)                AS TOTAL
FROM WA_II_TBL_TR_EQUIPMENT T
GROUP BY
    T.PART_NO
As you can see from that data and query, I show TR_ID using MIN to get the first ID of the first transaction.
And now I have Outgoing transaction data:
Assume I try to get quantity 8
ID_FK | QUANTITY
TR1 | 8
And now I want to get the last ID, because a quantity of 8 has been consumed
ID | DESCRIPTION | PART_NO | QUANTITY
TR3| FG | GSDFGSG | 10 <-- CONSUMED 4+2+2, TOTAL 8
TR2| FG | GSDFGSG | 2 <-- CONSUMED 2+2, TOTAL 4
TR1| FG | GSDFGSG | 2 <-- CONSUMED 2
As you can see above, TR1, TR2 has been consumed. Now I want the query
-- Per part number: smallest transaction id and total quantity.
SELECT
    MIN(T.TRANSACTION_EQUIPMENTID) AS TR_ID,
    T.PART_NO,
    SUM(T.QUANTITY)                AS TOTAL
FROM WA_II_TBL_TR_EQUIPMENT T
GROUP BY
    T.PART_NO
to return the last id, TR3, because TR1 and TR2 have been consumed.
How to do that in query?
Take the minimum id where the running sum is greater than or equal to 8. Use the analytic sum():
-- Running quantity per part (ordered by id), then the smallest id whose running
-- quantity has reached 8.
with running as (
    select
        t.*,
        sum(quantity) over (partition by part_no order by id) as sq
    from t
    where part_no = 'GSDFGSG'
)
select min(id) as id
from running
where sq >= 8
Test data, output:
-- Test table for the running-sum queries.
create table t (
    id          varchar2(3),
    description varchar2(5),
    part_no     varchar2(8),
    quantity    number(5),
    dateadded   date
);

-- Explicit column lists: the inserts do not depend on the table's column order.
insert into t (id, description, part_no, quantity, dateadded)
values ('TR4', 'Test', 'TEST1', 8, timestamp '2017-09-05 15:11:15');
insert into t (id, description, part_no, quantity, dateadded)
values ('TR3', 'FG', 'GSDFGSG', 10, timestamp '2017-08-31 16:26:04');
insert into t (id, description, part_no, quantity, dateadded)
values ('TR2', 'FG', 'GSDFGSG', 2, timestamp '2017-08-31 16:05:39');
insert into t (id, description, part_no, quantity, dateadded)
values ('TR1', 'FG', 'GSDFGSG', 2, timestamp '2017-08-30 16:30:16');
insert into t (id, description, part_no, quantity, dateadded)
values ('TR5', 'FG', 'GSDFGSG', 3, timestamp '2017-08-31 17:00:00');
Edit:
Add part_no and total columns and group by clause:
-- Running quantity per part (ordered by id); per part, the smallest id and the
-- smallest running quantity among rows whose running quantity has reached 8.
with running as (
    select
        t.*,
        sum(quantity) over (partition by part_no order by id) as sq
    from t
    where part_no = 'GSDFGSG'
)
select
    min(id) as id,
    part_no,
    min(sq) as total
from running
where sq >= 8
group by part_no
ID PART_NO TOTAL
--- -------- ----------
TR3 GSDFGSG 14

Resources