ClickHouse: groupArray inside groupArray - clickhouse

table
-- Example table for the question; rows are sorted by (agc, oci).
CREATE TABLE test
(
    uid UUID,
    agc Int64,
    stc Int8,
    oci Int32,
    sci Int32,
    fcd String,
    prc Float64
)
ENGINE = MergeTree()
ORDER BY (agc, oci);
base query
-- One row per fcd: every source row becomes a tuple whose last member
-- is the nested pair (uid, prc).
SELECT
    fcd,
    groupArray(tuple(agc, stc, oci, sci, tuple(uid, prc))) AS arr
FROM test
GROUP BY fcd;
Next, I want to group the groupArray result by the first four values, like this (I know that groupArray cannot be nested inside groupArray):
SELECT fcd, groupArray(groupArray(agc, stc, oci, sci)), (uid, prc))) as arr
example output
fcd
groupArray(groupArray(agc, stc, oci, sci)), (uid, prc)))
'str'
[(1, 1, 1, 2, [(id1, 10), (id2, 15)]), (1, 1, 1, 2, [(id3, 13), (id3, 11)])]

Try this query:
/* Demonstration on generated data: attach the group's inner array to every outer tuple. */
WITH
    /* Four synthetic rows (number = 0..3), split into two groups by parity. */
    source_rows AS
    (
        SELECT
            number % 2 AS id,
            number AS v2,
            number AS v3,
            number AS v4,
            number AS v5
        FROM numbers(4)
    ),
    /* One row per id: arr1 holds the outer tuples, arr2 the inner tuples. */
    grouped AS
    (
        SELECT
            id,
            groupArray((v2, v3)) AS arr1,
            groupArray((v4, v5)) AS arr2,
            /* untuple() flattens each (v2, v3) so that arr2 becomes the third tuple element. */
            arrayMap(outer_pair -> (untuple(outer_pair), arr2), arr1) AS arr_result
        FROM source_rows
        GROUP BY id
    )
/* arrayJoin() emits one output row per element of arr_result. */
SELECT arrayJoin(arr_result) AS result
FROM grouped
/*
┌─result──────────────┐
│ (0,0,[(0,0),(2,2)]) │
│ (2,2,[(0,0),(2,2)]) │
│ (1,1,[(1,1),(3,3)]) │
│ (3,3,[(1,1),(3,3)]) │
└─────────────────────┘
*/

Related

argMax of two columns in clickhouse

Is it possible to get the id for the maximum value of timestamp and duration? I am looking for a query like:
SELECT name, argMax(id, (timestamp, duration)) FROM tables GROUP BY name
It's unclear what you mean by the maximum.
ClickHouse compares tuples element by element, from left to right:
https://clickhouse.com/docs/en/sql-reference/data-types/tuple/
select (2022, 1, 1) > (2021, 12, 31);
┌─greater((2022, 1, 1), (2021, 12, 31))─┐
│ 1 │
└───────────────────────────────────────┘
In this case you should use
-- Tuples compare left to right: the largest timestamp wins and
-- duration only breaks ties between equal timestamps.
SELECT
    name,
    argMax(id, (timestamp, duration))
FROM tables
GROUP BY name
And Clickhouse has a function greatest https://clickhouse.com/docs/en/sql-reference/functions/other-functions/#greatesta-b
select greatest(2021, 2023);
┌─greatest(2021, 2023)─┐
│ 2023 │
└──────────────────────┘
Then you should use
-- Rank each row by whichever of its two values (timestamp or duration) is larger.
SELECT
    name,
    argMax(id, greatest(timestamp, duration))
FROM tables
GROUP BY name

Clickhouse. Get value from json

I use a ClickHouse database. There is a table with a string column (data). All rows contain data like:
'[{"a":23, "b":1}]'
'[{"a":7, "b":15}]'
I wanna get all values of key "b".
1
15
Next query:
Select JSONExtractInt('data', 0, 'b') from table
returns 0 every time. How can I get the values of key "b"?
-- JSONExtractInt('data', 0, 'b') returns 0 because 'data' in quotes is a string
-- literal rather than the column, and JSON array indices start at 1, not 0.
-- Here the whole array is parsed into typed tuples instead.
WITH samples AS
(
    SELECT '[{"a":23, "b":1}]' AS j
    UNION ALL
    SELECT '[{"a":7, "b":15}]'
)
SELECT
    -- tupleElement over the parsed array yields the "b" values; [1] takes the first one.
    tupleElement(
        JSONExtract(j, 'Array(Tuple(a Int64, b Int64))'),
        'b'
    )[1] AS res
FROM samples
┌─res─┐
│ 1 │
└─────┘
┌─res─┐
│ 15 │
└─────┘

Will ordering of multiple groupArray aggregations be consistent with each other?

The docs for the groupArray function warn that
Values can be added to the array in any (indeterminate) order.... In
some cases, you can still rely on the order of execution. This applies
to cases when SELECT comes from a subquery that uses ORDER BY.
Does this just mean that the array will not necessarily be in the order specified in the ORDER BY? Can I depend on the order of multiple groupArrays in the same query being consistent with each other?
For instance given the records:
{commonField:"common", fieldA: "1a", fieldB:"1b"}
{commonField:"common", fieldA: "2a", fieldB:"2b"}
{commonField:"common", fieldA: "3a", fieldB:"3b"}
Can I depend on the query
SELECT commonField, groupArray(fieldA), groupArray(fieldB) FROM myTable GROUP BY commonField
to return
{
commonField:"common",
groupedA:[
"2a", "3a", "1a"
],
groupedB:[
"2b", "3b", "1b"
]
}
multiple groupArrays in the same query being consistent with each other?
Yes. They will be consistent.
In any case, you can use a Tuple with a single groupArray. A Tuple is also useful if you have NULLs, because all aggregate functions skip NULLs.
-- Two rows; the second has B = NULL to show how NULLs affect groupArray.
CREATE TABLE test
(
    K Int64,
    A Nullable(String),
    B Nullable(String)
)
ENGINE = Memory;

INSERT INTO test (K, A, B) VALUES (1, '1A', '1B'), (2, '2A', NULL);

-- Each groupArray skips its own NULLs, so the two arrays can differ in length.
SELECT
    groupArray(A),
    groupArray(B)
FROM test;
┌─groupArray(A)─┬─groupArray(B)─┐
│ ['1A','2A'] │ ['1B'] │
└───────────────┴───────────────┘
-- A single groupArray over the tuple (A, B) keeps each row's values paired, NULLs included.
SELECT groupArray((A, B))
FROM test;
┌─groupArray(tuple(A, B))───┐
│ [('1A','1B'),('2A',NULL)] │
└───────────────────────────┘
-- Alias the aggregated array of tuples as ga, then split it back into per-column arrays.
SELECT
    (groupArray((A, B)) AS ga).1 AS _A,
    ga.2 AS _B
FROM test;
┌─_A──────────┬─_B──────────┐
│ ['1A','2A'] │ ['1B',NULL] │
└─────────────┴─────────────┘
-- Another tuple trick: a tuple holding NULL is itself not NULL, so groupArray does not skip it.
SELECT
    groupArray(tuple(A)).1 AS _A,
    groupArray(tuple(B)).1 AS _B
FROM test;
┌─_A──────────┬─_B──────────┐
│ ['1A','2A'] │ ['1B',NULL] │
└─────────────┴─────────────┘
-- Another tuple trick: tuple(*) packs every column of the row into the tuple.
SELECT groupArray(tuple(*))
FROM test;
┌─groupArray(tuple(K, A, B))────┐
│ [(1,'1A','1B'),(2,'2A',NULL)] │
└───────────────────────────────┘

How to create 'table in table' in clickhouse?

e.g.
In clickhouse, I want to create one table like the following structure.
create table (
time DateTime,
visits array(unit)
)
Engine=memory
the unit struct {
a string,
btime int64,
c string,
e string
}
How to create the table?
You need to use the Nested data structure:
/* Each field of the Nested structure is exposed as its own array column (visits.a, visits.btime, ...). */
CREATE TABLE visits
(
    time DateTime,
    visits Nested
    (
        a String,
        btime Int64,
        c String,
        e String
    )
)
ENGINE = Memory;

/* insert test data: one array per nested field, in declaration order */
INSERT INTO visits VALUES
    (now(), ['a1', 'a2'], [1, 2], ['c1', 'c2'], ['e1', 'e2']),
    (now(), ['a11', 'a12'], [11, 12], ['c11', 'c12'], ['e11', 'e12']);

/* SELECT * is kept on purpose: it shows how the nested fields come back as separate columns. */
SELECT * FROM visits;
/* results
┌────────────────time─┬─visits.a──────┬─visits.btime─┬─visits.c──────┬─visits.e──────┐
│ 2020-06-12 08:14:07 │ ['a1','a2'] │ [1,2] │ ['c1','c2'] │ ['e1','e2'] │
│ 2020-06-12 08:14:07 │ ['a11','a12'] │ [11,12] │ ['c11','c12'] │ ['e11','e12'] │
└─────────────────────┴───────────────┴──────────────┴───────────────┴───────────────┘
*/
Additionally, see the article Nested Data Structures in ClickHouse.

clickhouse: How do I find the least date in array that is above date in another column?

Basically I have the table with the following data-structure:
id_level1: Int32
id_level2: Int32
event_date: Date
arr_object_ids: Array of Int32 - sorted by next column
arr_object_dates: Array of Date - sorted ascending
What I need is the least object_date that is above event_date for each pair of (id_level1, id_level2). How is that possible in ClickHouse?
Then I would use arrayElement(arr_object_ids, indexOf(arr_object_dates, solution)) to get the corresponding object_id.
Try this query:
/*
 * For each row, find the first object date strictly after event_date and
 * return the object id stored at the same position (-1 when no such date exists).
 * Relies on arr_object_dates being sorted ascending, so the first match is also the least one.
 */
SELECT
    id_level1,
    id_level2,
    /* arrayFirst(d -> d > event_date, arr_object_dates) would return the date itself instead of its index. */
    arrayFirstIndex(d -> d > event_date, arr_object_dates) AS least_date_index,
    /* An index of 0 means no date matched (array positions start at 1); report -1 in that case. */
    if(least_date_index = 0, -1, arrayElement(arr_object_ids, least_date_index)) AS object_id
FROM
(
    /*
     * Emulate the original table. Real Date values are used (not strings) so the
     * comparison is a date comparison, matching the schema described in the question.
     */
    SELECT
        1 AS id_level1,
        2 AS id_level2,
        toDate('2020-01-03') AS event_date,
        [4, 5, 6, 7] AS arr_object_ids,
        [toDate('2020-01-01'), toDate('2020-01-03'), toDate('2020-01-06'), toDate('2020-01-11')] AS arr_object_dates
    UNION ALL
    SELECT
        3 AS id_level1,
        4 AS id_level2,
        toDate('2020-05-03') AS event_date,
        [4, 5, 6, 7] AS arr_object_ids,
        [toDate('2020-01-01'), toDate('2020-01-03'), toDate('2020-01-06'), toDate('2020-01-11')] AS arr_object_dates
)
ORDER BY event_date
/* result
┌─id_level1─┬─id_level2─┬─least_date_index─┬─object_id─┐
│ 1 │ 2 │ 3 │ 6 │
│ 3 │ 4 │ 0 │ -1 │
└───────────┴───────────┴──────────────────┴───────────┘
*/

Resources