Create materialized view based on aggregate materialized view - ClickHouse

The base table
CREATE TABLE IF NOT EXISTS test_sessions
(
session_id UInt64,
session_name String,
created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY (session_id);
With the following data
INSERT INTO test_sessions (session_id, session_name, created_at) VALUES
(1, 'start', '2021-01-31 00:00:00'),
(1, 'stop', '2021-01-31 01:00:00'),
(2, 'start', '2021-01-31 01:00:00')
;
I created two views to get closed sessions: a materialized view holding the aggregate states, and a regular view on top of it.
CREATE MATERIALIZED VIEW IF NOT EXISTS test_session_aggregate_states
(
session_id UInt64,
started_at AggregateFunction(minIf, DateTime, UInt8),
stopped_at AggregateFunction(maxIf, DateTime, UInt8)
)
ENGINE = AggregatingMergeTree
PARTITION BY tuple()
ORDER BY (session_id)
POPULATE AS
SELECT session_id,
minIfState(created_at, session_name = 'start') AS started_at,
maxIfState(created_at, session_name = 'stop') AS stopped_at
FROM test_sessions
GROUP BY session_id;
CREATE VIEW IF NOT EXISTS test_session_completed
(
session_id UInt64,
started_at DateTime,
stopped_at DateTime
)
AS
SELECT session_id,
minIfMerge(started_at) AS started_at,
maxIfMerge(stopped_at) AS stopped_at
FROM test_session_aggregate_states
GROUP BY session_id
HAVING (started_at != '0000-00-00 00:00:00') AND
(stopped_at != '0000-00-00 00:00:00')
;
It works as expected: it returns 1 row, for the session that has both "start" and "stop".
SELECT * FROM test_session_completed;
-- 1,2021-01-31 00:00:00,2021-01-31 01:00:00
Now I am trying to create a materialized view based on test_session_completed, with joins to other tables (there are no joins in this example):
CREATE MATERIALIZED VIEW IF NOT EXISTS test_mv
(
session_id UInt64
)
ENGINE = MergeTree
PARTITION BY tuple()
ORDER BY (session_id)
POPULATE AS
SELECT session_id
FROM test_session_completed
;
Writing test queries to check test_mv:
INSERT INTO test_sessions (session_id, session_name, created_at) VALUES
(3, 'start', '2021-01-31 02:00:00'),
(3, 'stop', '2021-01-31 03:00:00');
SELECT * FROM test_session_completed;
-- SUCCESS
-- 3,2021-01-31 02:00:00,2021-01-31 03:00:00
-- 1,2021-01-31 00:00:00,2021-01-31 01:00:00
SELECT * FROM test_mv;
-- FAILURE
-- 1
-- EXPECTED RESULT
-- 3
-- 1
How can I fill test_mv based on test_session_completed?
ClickHouse version: 20.11.4.13

It is impossible to create an MV over a view.
An MV is an insert trigger, and it's impossible to get the 'stop' state without having the 'start' state in the same table. If you don't need to check that 'start' happened before 'stop', then you can make a simpler MV and just filter on the 'stop' events, as sketched below.
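A minimal sketch of that simpler variant, assuming you only care that a 'stop' event arrived (the view name test_sessions_stopped is made up here):

CREATE MATERIALIZED VIEW IF NOT EXISTS test_sessions_stopped
(
    session_id UInt64,
    stopped_at DateTime
)
ENGINE = MergeTree
PARTITION BY tuple()
ORDER BY (session_id)
POPULATE AS
SELECT session_id,
       created_at AS stopped_at
FROM test_sessions
WHERE session_name = 'stop';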
You don't need minIfState; you can use min with SimpleAggregateFunction. It reduces the stored data and improves performance.
I also think the second view is unnecessary.
Check this:
https://den-crane.github.io/Everything_you_should_know_about_materialized_views_commented.pdf
https://youtu.be/ckChUkC3Pns?list=PLO3lfQbpDVI-hyw4MyqxEk3rDHw95SzxJ&t=9371
I would do this:
CREATE TABLE IF NOT EXISTS test_sessions
(
session_id UInt64,
session_name String,
created_at DateTime
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(created_at)
ORDER BY (session_id);
CREATE MATERIALIZED VIEW IF NOT EXISTS test_session_aggregate_states
(
session_id UInt64,
started_at SimpleAggregateFunction(min, DateTime),
stopped_at SimpleAggregateFunction(max, DateTime)
)
ENGINE = AggregatingMergeTree
PARTITION BY tuple()
ORDER BY (session_id)
POPULATE AS
SELECT session_id,
minIf(created_at, session_name = 'start') AS started_at,
maxIf(created_at, session_name = 'stop') AS stopped_at
FROM test_sessions
GROUP BY session_id;
INSERT INTO test_sessions (session_id, session_name, created_at) VALUES
(3, 'start', '2021-01-31 02:00:00'),
(3, 'stop', '2021-01-31 03:00:00');
Completed sessions:
SELECT session_id,
min(started_at) AS started_at,
max(stopped_at) AS stopped_at
FROM test_session_aggregate_states
GROUP BY session_id
HAVING (started_at != '0000-00-00 00:00:00') AND
(stopped_at != '0000-00-00 00:00:00');
┌─session_id─┬──────────started_at─┬──────────stopped_at─┐
│ 1 │ 2021-01-31 00:00:00 │ 2021-01-31 01:00:00 │
└────────────┴─────────────────────┴─────────────────────┘
And using argMaxState you can aggregate multiple start/stop events within one session_id; for example:
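A minimal sketch of that idea, assuming the same test_sessions table (the view name test_session_last_event is made up here): store the most recent event name per session and treat a session as closed when that event is 'stop', no matter how many start/stop pairs it contains.

CREATE MATERIALIZED VIEW IF NOT EXISTS test_session_last_event
(
    session_id UInt64,
    last_event AggregateFunction(argMax, String, DateTime)
)
ENGINE = AggregatingMergeTree
PARTITION BY tuple()
ORDER BY (session_id)
POPULATE AS
SELECT session_id,
       argMaxState(session_name, created_at) AS last_event
FROM test_sessions
GROUP BY session_id;

-- a session is closed if its most recent event is 'stop'
SELECT session_id
FROM test_session_last_event
GROUP BY session_id
HAVING argMaxMerge(last_event) = 'stop';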

Related

Is it possible to partition by aggregation column with engine AggregatingMergeTree()

I created a materialized view with ENGINE = AggregatingMergeTree(); inserting into default.tbl crashes with an exception.
CREATE MATERIALIZED VIEW default.tbl_view ON CLUSTER test
(
id_key String,
uid String,
dt Date,
data_timeStamp AggregateFunction(min, Date)
)
ENGINE = AggregatingMergeTree()
PARTITION BY dt
ORDER BY (id_key , uid)
AS SELECT id_key as id_key,
toDate(data_timeStamp) as dt,
uid as uid,
minState(toDate(data_timeStamp)) as data_timeStamp
FROM `default`.tbl pe
GROUP BY id_key, uid
DB::Exception: Illegal type AggregateFunction(min, Date) of argument of function toDate: while pushing to view default.tbl_view (be181a81-ea4d-4118-9b0d-6fb31b48d93e). (ILLEGAL_TYPE_OF_ARGUMENT)
How can I create a view that aggregates data_timeStamp as AggregateFunction(min, Date), grouped by id_key, uid, and partitioned by data_timeStamp? (ClickHouse partitioning)
I tried to do it using SimpleAggregateFunction.
I created a table with the AggregatingMergeTree engine, intending to insert data into it through a materialized view:
CREATE TABLE `default`.tbl ON CLUSTER test
(
key_id String,
uid String,
data_timeStamp AggregateFunction(min, Date),
dt Date
)
Engine = AggregatingMergeTree
PARTITION BY dt
ORDER BY (key_id, uid)
CREATE MATERIALIZED VIEW `default`.tbl_view ON CLUSTER test
TO `default`.tbl
AS SELECT key_id as key_id,
toDate(data_timeStamp) as dt,
uid as uid,
minState(toDate(data_timeStamp)) as data_timeStamp
FROM `default`.tb2 pe
GROUP BY key_id, uid
The error message you got:
DB::Exception: Illegal type AggregateFunction(min, Date) of argument of function toDate: while pushing to view default.tbl_view (be181a81-ea4d-4118-9b0d-6fb31b48d93e). (ILLEGAL_TYPE_OF_ARGUMENT)
is because your MV query is wrong. Your FROM table contains AggregateFunction columns, so you must first finalize them with the -Merge combinator and only then apply the -State combinator to push the data into another AggregateFunction column. You can use SimpleAggregateFunction to create the partitioning key.
CREATE TABLE `default`.tbl
(
id_key String,
uid String,
data_timeStamp AggregateFunction(min, Date),
dt Date
)
Engine = AggregatingMergeTree
PARTITION BY dt
ORDER BY (id_key, uid);
CREATE MATERIALIZED VIEW default.tbl_view
(
id_key String,
uid String,
dt Date,
partition SimpleAggregateFunction(min, Date),
data_timeStamp_ AggregateFunction(min, Date)
)
ENGINE = AggregatingMergeTree()
PARTITION BY (dt, partition)
ORDER BY (id_key, uid)
AS
SELECT id_key, uid, dt, min(partition) as partition, minState(data_timeStamp_) as data_timeStamp_
FROM (SELECT id_key,
toDate(minMerge(data_timeStamp)) as dt,
uid as uid,
minMerge(data_timeStamp)::date partition,
minMerge(data_timeStamp)::date as data_timeStamp_
FROM `default`.tbl pe
GROUP BY id_key, uid
) GROUP BY id_key, uid, dt;
INSERT INTO default.tbl SELECT
CAST(number, 'String'),
'1',
minState(CAST(now(), 'date') - number),
CAST(now(), 'date') - number
FROM numbers(5)
GROUP BY
1,
2,
4
OPTIMIZE TABLE tbl_view FINAL;
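To read the result back, the AggregateFunction column has to be finalized with the -Merge combinator; a quick check (not part of the original answer):

SELECT id_key,
       uid,
       minMerge(data_timeStamp_) AS data_timeStamp
FROM default.tbl_view
GROUP BY id_key, uid;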

ClickHouse query takes a long time to execute with array joins and group by

I have a table student which has over 90 million records. The create table query is as follows:
CREATE TABLE student(
id integer,
student_id FixedString(15) NOT NULL,
teacher_array Nested(
teacher_id String,
teacher_name String,
teacher_role_id smallint
),
subject_array Nested(
subject_id String,
subject_name String,
subject_category_id smallint
),
year integer NOT NULL
)
ENGINE=MergeTree()
PRIMARY KEY id
PARTITION BY year
ORDER BY id
SETTINGS index_granularity = 8192
The following query takes 5 seconds to execute:
SELECT count(distinct id) as student_count,
(
SELECT count(distinct id)
FROM student
ARRAY JOIN teacher_array
WHERE hasAny(subject_array.subject_category_id, [1, 2]) AND (teacher_array.teacher_role_id NOT IN (1))
) AS total_student_count,
count(*) OVER () AS total_result_count,
teacher_array.teacher_role_id AS teacher_id
FROM
(
SELECT *
FROM student
ARRAY JOIN subject_array
)
ARRAY JOIN teacher_array
WHERE (subject_array.subject_category_id IN (1, 2)) AND (teacher_array.teacher_role_id NOT IN (1))
GROUP BY teacher_array.teacher_role_id
ORDER BY student_count DESC
LIMIT 0, 10
I expect the query to run within 500 milliseconds; is there any workaround for this? I tried using uniq and groupBitmap, but the execution time is still around 2 seconds.
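One direction worth trying (a sketch, untested against this schema and data volume): the inner ARRAY JOIN on subject_array followed by the outer ARRAY JOIN on teacher_array expands every row into the cross product of the two arrays. Since the subject condition is only a filter, it can be expressed with hasAny on the intact array, leaving a single ARRAY JOIN:

SELECT
    teacher_role_id AS teacher_id,
    uniqExact(id) AS student_count
FROM student
ARRAY JOIN teacher_array.teacher_role_id AS teacher_role_id
WHERE hasAny(subject_array.subject_category_id, [1, 2])
  AND teacher_role_id NOT IN (1)
GROUP BY teacher_role_id
ORDER BY student_count DESC
LIMIT 10;

This touches each base row once instead of |subjects| x |teachers| times, which is usually the dominant cost in double ARRAY JOIN queries.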

SQL query to filter out the overlapping dates

Version  start_date  end_date
1        2005-11-23  2005-11-23
2        2005-11-23  2005-11-23
3        2005-11-23  2008-10-23
4        2008-10-23  2010-05-18
5        2011-05-13  2012-05-19
In the above table, instead of keeping versions 1, 2, 3, 4 we can keep just version 1 running from '2005-11-23' to '2010-05-18', since all these versions overlap, and keep version 5 as it is.
Output needed:

Version  start_date  end_date
1        2005-11-23  2010-05-18
5        2011-05-13  2012-05-19
How can we frame a SQL query for this scenario? Hive or PostgreSQL.
CREATE TABLE my_dates (
"Version" INTEGER,
start_date date,
end_date date
);
INSERT INTO my_dates
("Version",start_date, end_date)
VALUES
('1', '2005-11-23', '2005-11-23'),
('2', '2005-11-23', '2005-11-23'),
('3', '2005-11-23', '2008-10-23'),
('4', '2008-10-23', '2010-05-18'),
('5', '2011-05-13', '2012-05-19');
Query #1
with my_overlaps AS (
select
*,
LAG(end_date) OVER (ORDER BY "Version") >= start_date as overlap
from my_dates
),
selected AS (
SELECT
"Version",
start_date,
end_date ,
LEAD("Version") OVER (ORDER BY "Version") AS next_version
FROM
my_overlaps
where overlap=false or
overlap is null
)
select
s."Version",
s.start_date,
CASE
WHEN md.end_date IS NULL THEN s.end_date
ELSE md.end_date
END as end_date
FROM
selected s
LEFT JOIN
my_dates md on s.next_version -1 = md."Version";
Version  start_date                end_date
1        2005-11-23T00:00:00.000Z  2010-05-18T00:00:00.000Z
5        2011-05-13T00:00:00.000Z  2012-05-19T00:00:00.000Z
View on DB Fiddle
The same schema and query again, this time with the dates cast to text for cleaner output.
Schema (PostgreSQL v13)
CREATE TABLE my_dates (
"Version" INTEGER,
start_date date,
end_date date
);
INSERT INTO my_dates
("Version",start_date, end_date)
VALUES
('1', '2005-11-23', '2005-11-23'),
('2', '2005-11-23', '2005-11-23'),
('3', '2005-11-23', '2008-10-23'),
('4', '2008-10-23', '2010-05-18'),
('5', '2011-05-13', '2012-05-19');
Query #1
with my_overlaps AS (
select
*,
LAG(end_date) OVER (ORDER BY "Version") >= start_date as overlap
from my_dates
),
selected AS (
SELECT
"Version",
start_date,
end_date ,
LEAD("Version") OVER (ORDER BY "Version") AS next_version
FROM
my_overlaps
where overlap=false or
overlap is null
)
select
s."Version",
s.start_date::text,
CASE
WHEN md.end_date IS NULL THEN s.end_date::text
ELSE md.end_date::text
END as end_date
FROM
selected s
LEFT JOIN
my_dates md on s.next_version -1 = md."Version";
Version  start_date  end_date
1        2005-11-23  2010-05-18
5        2011-05-13  2012-05-19
View on DB Fiddle
Update 1
The LAG/LEAD functions are now given explicit default values.
Schema (PostgreSQL v13)
CREATE TABLE my_dates (
"Version" INTEGER,
start_date date,
end_date date
);
INSERT INTO my_dates
("Version",start_date, end_date)
VALUES
('1', '2005-11-23', '2005-11-23'),
('2', '2005-11-23', '2012-05-19');
Query #1
with my_overlaps AS (
select
*,
LAG(end_date,1,null) OVER (ORDER BY "Version") >= start_date as overlap
from my_dates
),
selected AS (
SELECT
"Version",
start_date,
end_date ,
LEAD("Version",1,3) OVER (ORDER BY "Version") AS next_version
FROM
my_overlaps
where overlap=false or
overlap is null
)
select
s."Version",
s.start_date::text,
CASE
WHEN md.end_date IS NULL THEN s.end_date::text
ELSE md.end_date::text
END as end_date
FROM
selected s
LEFT JOIN
my_dates md on s.next_version -1 = md."Version"
ORDER BY
s."Version";
Version  start_date  end_date
1        2005-11-23  2012-05-19
View on DB Fiddle
With the original dataset:
Schema (PostgreSQL v13)
CREATE TABLE my_dates (
"Version" INTEGER,
start_date date,
end_date date
);
INSERT INTO my_dates
("Version",start_date, end_date)
VALUES
('1', '2005-11-23', '2005-11-23'),
('2', '2005-11-23', '2005-11-23'),
('3', '2005-11-23', '2008-10-23'),
('4', '2008-10-23', '2010-05-18'),
('5', '2011-05-13', '2012-05-19');
Query #1
with my_overlaps AS (
select
*,
LAG(end_date,1,null) OVER (ORDER BY "Version") >= start_date as overlap
from my_dates
),
selected AS (
SELECT
"Version",
start_date,
end_date ,
LEAD("Version",1,3) OVER (ORDER BY "Version") AS next_version
FROM
my_overlaps
where overlap=false or
overlap is null
)
select
s."Version",
s.start_date::text,
CASE
WHEN md.end_date IS NULL THEN s.end_date::text
ELSE md.end_date::text
END as end_date
FROM
selected s
LEFT JOIN
my_dates md on s.next_version -1 = md."Version"
ORDER BY
s."Version";
Version  start_date  end_date
1        2005-11-23  2010-05-18
5        2011-05-13  2005-11-23
View on DB Fiddle
The safest way to handle this -- assuming that you can create a stable sort on the rows (which Version provides) -- uses a cumulative maximum instead of lag().
select min("Version") as "Version", min(start_date) as start_date, max(end_date) as end_date
from (select t.*,
             sum(case when prev_max_end_date >= start_date then 0 else 1 end) over
                 (order by start_date, "Version") as grp
      from (select t.*,
                   max(end_date) over (order by start_date, "Version"
                                       rows between unbounded preceding and 1 preceding
                                      ) as prev_max_end_date
            from my_dates t
           ) t
     ) t
group by grp;
This should work in any (reasonable) database. Here is a db<>fiddle that happens to use Postgres.
The issue with lag()/lead() approaches is that the overlap with earlier rows may not be on the "previous" row. For instance, consider this diagram (where lower case means start and upper case means end):
---a----b--B----c--C----d--D--e---A--E--
E overlaps with A. However, by any reasonable definition of "previous", A is not the previous row for E.
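A concrete (hypothetical) dataset for the my_dates table above that shows this failure mode:

-- version 2 lies entirely inside version 1, so for version 3
-- LAG(end_date) sees version 2's 2005-12-01, misses version 1's
-- 2010-01-01, and wrongly starts a new group; the cumulative
-- MAX(end_date) sees 2010-01-01 and keeps version 3 in the group.
INSERT INTO my_dates ("Version", start_date, end_date) VALUES
('1', '2005-01-01', '2010-01-01'),
('2', '2005-06-01', '2005-12-01'),
('3', '2006-01-01', '2006-06-01');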

In an Oracle SQL DB the same primary ID is present more than once with different batch_ids. How can I find the batch ID just before the current batch ID?

I am working on an Oracle database.
We load customer data into a source table, which eventually migrates to a target table.
Every time customer data is loaded into the source table, it gets a unique batch_id.
If we want to update some field in the customer table, we load the same customer into the source table again, but this time with a different batch_id.
Now I want to know the batch_id of the customer just before the latest batch_id.
The batch_id we use is usually the current date.
Use the ROW_NUMBER analytic function.
Your sample data:
select * from tab
order by 1,2
CUSTOMER_ID BATCH_ID
----------- -------------------
1 09.12.2019 00:00:00
1 10.12.2019 00:00:00
2 10.12.2019 00:00:00
ROW_NUMBER assigns a sequence number starting from 1 for each customer, ordered descending on BATCH_ID; you are interested in the one before the latest, i.e. the rows with number 2.
with cust as (
select
customer_id, batch_id,
row_number() over (partition by customer_id order by batch_id desc) rn
from tab)
select CUSTOMER_ID, BATCH_ID
from cust
where rn = 2;
CUSTOMER_ID BATCH_ID
----------- -------------------
1 09.12.2019 00:00:00
It seems that you're basically looking for the second biggest value in the SOURCE table.
In this example code the SOURCE_TABLE represents the table containing the same CUSTOMER_NO with different BATCH_NO values:
create table source_table (customer_no integer, batch_no date);
insert into source_table values ('1', SYSDATE-2);
insert into source_table values ('1', SYSDATE-1);
insert into source_table values ('1', SYSDATE);
SELECT batch_no
FROM (
SELECT batch_no, row_number() over (order by batch_no desc) as row_num
FROM source_table
) t
WHERE row_num = 2
Where row_num = 2 represents the second biggest value in the table.
The query returns SYSDATE-1.
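If you need the previous batch per customer rather than globally, the same query with a PARTITION BY clause (mirroring the first answer) should do; a sketch:

SELECT customer_no, batch_no
FROM (
    SELECT customer_no, batch_no,
           row_number() OVER (PARTITION BY customer_no ORDER BY batch_no DESC) AS row_num
    FROM source_table
) t
WHERE row_num = 2;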

Alternative to window function in MariaDB 10.1

I have a window function (OVER ... PARTITION BY) in my code:
FROM (SELECT wp_posts.id,
wp_postmeta.post_id,
post_title,
post_type,
meta_value,
Row_number()
OVER(
partition BY post_title
ORDER BY wp_postmeta.meta_value) rn
but apparently this isn't supported on MariaDB before 10.2 (I am using 10.1). Could someone please suggest alternative code which is both efficient and works on MariaDB 10.1?
A dbfiddle is provided below, unfortunately with MariaDB 10.2 as the oldest available version, so 10.1 can't be tested directly here.
create table wp_posts (
ID integer primary key auto_increment,
post_title varchar(30),
post_type varchar(30)
);
create table wp_postmeta (
ID integer primary key auto_increment,
post_id integer,
meta_key varchar(30) not null default '_regular_price',
meta_value integer not null
);
insert into wp_posts (post_title, post_type) values
('Apple Pie','Product'),
('French Toast','Product'),
('Shepards Pie','Product'),
('Jam Pie','Product'),
('Jam Pie','Product'),
('Plate','Not a Product'),
('Bucket','Not a Product'),
('Chequebook','Not a Product'),
('French Toast','Product'),
('French Toast','Product'),
('Banana','Product'),
('Banana','Product'),
('Banana','Product');
insert into wp_postmeta (post_id, meta_value) values
(1,10),
(2,5),
(3,9),
(4,8),
(5,11),
(6,12),
(7,10),
(8,6),
(9,1),
(10,1),
(11,7),
(12,2),
(13,2);
-- Deleting all duplicate products in wp_posts table
DELETE FROM wp_posts
WHERE id IN (SELECT id
FROM (SELECT id,
post_title,
post_type,
meta_value
FROM (SELECT wp_posts.id,
wp_postmeta.post_id,
post_title,
post_type,
meta_value,
Row_number()
OVER(
partition BY post_title
ORDER BY wp_postmeta.meta_value) rn
FROM wp_postmeta
JOIN wp_posts
ON wp_postmeta.post_id = wp_posts.id
WHERE wp_posts.post_type = 'Product'
AND wp_postmeta.meta_key = '_regular_price'
) t
WHERE t.rn <> 1) AS aliasx);
db<>fiddle here
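For MariaDB 10.1 itself, a classic (if fragile) workaround is emulating ROW_NUMBER with user variables; a sketch, with the usual caveat that the evaluation order of user variables inside a SELECT is not guaranteed:

SELECT id, post_title, meta_value,
       @rn := IF(@prev = post_title, @rn + 1, 1) AS rn,
       @prev := post_title AS prev_title
FROM (
    SELECT wp_posts.id, post_title, meta_value
    FROM wp_postmeta
    JOIN wp_posts ON wp_postmeta.post_id = wp_posts.id
    WHERE wp_posts.post_type = 'Product'
      AND wp_postmeta.meta_key = '_regular_price'
    ORDER BY post_title, meta_value
) t
CROSS JOIN (SELECT @rn := 0, @prev := '') vars;

The rows where rn <> 1 are the duplicates to delete, matching what the window-function version computes.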
