Group based on date diff without using recursive - vertica

How can I convert the SQL Server recursive query below to Vertica? I know that Vertica does not support recursive queries. I tried using SUM() OVER with LAG, but I am still not able to achieve the final expected output.
-- Sample rows: one product with seven start/end date ranges.
WITH Product AS (
    SELECT a1.*
    FROM (
        VALUES
            (1, '2018-12-25', '2019-01-05'),
            (1, '2019-03-01', '2019-03-10'),
            (1, '2019-03-15', '2019-03-19'),
            (1, '2019-03-22', '2019-03-28'),
            (1, '2019-03-30', '2019-04-02'),
            (1, '2019-04-10', '2019-04-15'),
            (1, '2019-04-18', '2019-04-25')
    ) AS a1 (ProductId, ProductStartDt, ProductEndDt)
),
-- Number the ranges by start date so each row can find its successor.
OrderedProduct AS (
    SELECT
        Product.*,
        ROW_NUMBER() OVER (ORDER BY ProductStartDt) AS RowNum
    FROM Product
),
-- Walk the rows in RowNum order, carrying the current group number and the
-- group's anchor date (the end date of the row that opened the group).
-- A row stays in the current group when it starts no later than 15 days
-- after the anchor; otherwise it opens a new group and becomes the anchor.
DateGroupsInterim (RowNum, GroupNum, GrpStartDt, Indx) AS (
    SELECT
        RowNum,
        1,
        ProductEndDt,
        1
    FROM OrderedProduct
    WHERE RowNum = 1

    UNION ALL

    SELECT
        nxt.RowNum,
        -- group number: unchanged inside the window, incremented otherwise
        CASE
            WHEN nxt.ProductStartDt <= DATEADD(day, 15, dgi.GrpStartDt) THEN dgi.GroupNum
            ELSE dgi.GroupNum + 1
        END,
        -- anchor date: carried forward inside the window, reset otherwise
        CASE
            WHEN nxt.ProductStartDt <= DATEADD(day, 15, dgi.GrpStartDt) THEN dgi.GrpStartDt
            ELSE nxt.ProductEndDt
        END,
        -- Indx: 1 marks the row that opens a group, 0 marks a follower
        CASE
            WHEN nxt.ProductStartDt <= DATEADD(day, 15, dgi.GrpStartDt) THEN 0
            ELSE 1
        END
    FROM DateGroupsInterim AS dgi
    INNER JOIN OrderedProduct AS nxt
        ON nxt.RowNum = dgi.RowNum + 1
)
SELECT
    op.ProductId,
    op.ProductStartDt,
    op.ProductEndDt,
    grp.GrpStartDt,
    grp.GroupNum,
    grp.Indx
FROM DateGroupsInterim AS grp
INNER JOIN OrderedProduct AS op
    ON op.RowNum = grp.RowNum
ORDER BY op.ProductStartDt
Below is how the expected output looks like.

The operation you want to do is also called "sessionization" - which is the operation of splitting a time series into groups/ sub time series that have a certain meaning together.
The way you describe it, it does not seem to be possible:
The next group relies exactly on both the start of its previous group (15 days later than the start of the first row of the previous group) and the end of the previous group's last row. This needs to be a loop or a recursion, which is not offered by Vertica.
I managed to join the table with itself and get a session id for consecutive rows within 15 days. But, as of now, the groups overlap, and I have found no way to determine which groups I want to keep...
Like so:
-- Sample rows: one product with seven start/end date ranges.
WITH product(productid, productstartdt, productenddt) AS (
              SELECT 1, DATE '2018-12-25', DATE '2019-01-05'
    UNION ALL SELECT 1, DATE '2019-03-01', DATE '2019-03-10'
    UNION ALL SELECT 1, DATE '2019-03-15', DATE '2019-03-19'
    UNION ALL SELECT 1, DATE '2019-03-22', DATE '2019-03-28'
    UNION ALL SELECT 1, DATE '2019-03-30', DATE '2019-04-02'
    UNION ALL SELECT 1, DATE '2019-04-10', DATE '2019-04-15'
    UNION ALL SELECT 1, DATE '2019-04-18', DATE '2019-04-25'
),
-- Pair every row (the anchor) with itself and with each later-starting row
-- of the same product that starts no more than 15 days after the anchor's
-- end date. grp increments each time the anchor's start date changes.
groups AS (
    SELECT
        anchor.productstartdt AS in_productstartdt,
        cand.*,
        CONDITIONAL_CHANGE_EVENT(anchor.productstartdt) OVER (
            PARTITION BY anchor.productid
            ORDER BY anchor.productstartdt
        ) AS grp
    FROM product anchor
    LEFT JOIN product cand
        ON anchor.productid = cand.productid
        AND anchor.productstartdt <= cand.productstartdt
        AND (
            anchor.productstartdt = cand.productstartdt
            OR cand.productstartdt <= anchor.productenddt + 15
        )
)
SELECT * FROM groups;
-- out in_productstartdt | productid | productstartdt | productenddt | grp
-- out -------------------+-----------+----------------+--------------+-----
-- out 2018-12-25 | 1 | 2018-12-25 | 2019-01-05 | 0
-- out 2019-03-01 | 1 | 2019-03-01 | 2019-03-10 | 1
-- out 2019-03-01 | 1 | 2019-03-22 | 2019-03-28 | 1
-- out 2019-03-01 | 1 | 2019-03-15 | 2019-03-19 | 1
-- out 2019-03-15 | 1 | 2019-03-15 | 2019-03-19 | 2
-- out 2019-03-15 | 1 | 2019-03-22 | 2019-03-28 | 2
-- out 2019-03-15 | 1 | 2019-03-30 | 2019-04-02 | 2
-- out 2019-03-22 | 1 | 2019-03-22 | 2019-03-28 | 3
-- out 2019-03-22 | 1 | 2019-03-30 | 2019-04-02 | 3
-- out 2019-03-22 | 1 | 2019-04-10 | 2019-04-15 | 3
-- out 2019-03-30 | 1 | 2019-04-10 | 2019-04-15 | 4
-- out 2019-03-30 | 1 | 2019-03-30 | 2019-04-02 | 4
-- out 2019-04-10 | 1 | 2019-04-10 | 2019-04-15 | 5
-- out 2019-04-10 | 1 | 2019-04-18 | 2019-04-25 | 5
-- out 2019-04-18 | 1 | 2019-04-18 | 2019-04-25 | 6
-- out (15 rows)
-- out
-- out Time: First fetch (15 rows): 35.454 ms. All rows formatted: 35.503 ms
The next difficulty is how to get rid of groups 2, 3, and 5 ...

Related

How to count total amount of pending tickets for each day this week in oracle-sql?

I want to count the total amount of pending tickets for each day in this week. I was only able to get it for one day at a time. I have this query right now:
-- (all pending tickets - pending tickets updated today) + tickets closed yesterday
SELECT (pending_all.TOTAL - pending_today.TODAY) + closed_yday.GISTER AS GISTER
FROM (
    -- Tickets updated yesterday whose status is 4 (resolved) or 5 (closed);
    -- both statuses count as closed
    SELECT COUNT(ID) AS Gister
    FROM FRESHDESK_API
    WHERE STATUS IN (4, 5)
        AND TRUNC(UPDATED_AT) = TRUNC(SYSDATE - 1)
) closed_yday
CROSS JOIN (
    -- Every ticket whose status is 3 (pending)
    SELECT COUNT(ID) AS TOTAL
    FROM FRESHDESK_API
    WHERE STATUS IN (3)
) pending_all
CROSS JOIN (
    -- Pending (status 3) tickets that were updated today
    SELECT COUNT(ID) AS TODAY
    FROM FRESHDESK_API
    WHERE STATUS IN (3)
        AND TRUNC(UPDATED_AT) = TRUNC(SYSDATE)
) pending_today
I want to get a result like this:
+----------------------------------+---------+----------+
| day | pending_tickets |
+----------------------------------+---------+----------+
| Monday | 20 |
| Tuesday | 22 |
| Wednesday | 25 |
| Thursday | 24 |
| Friday | 19 |
+----------------------------------+---------+----------+
The table is something like this (unused columns left out):
+----------------------------------+---------+----------+---------+-----------+----------+----------+
| id | created_at | updated_at | status |
+----------------------------------+---------+----------+----------+----------+----------+----------+
| | | | |
| | | | |
| | | | |
| | | | |
| | | | |
+----------------------------------+---------+----------+---------+-----------+---------+-----------+
You can use left join and group by as follows:
-- One row per day of the current ISO week that has pending (status 3) tickets.
-- Each pending row is left-joined to closed (status 4/5) rows whose
-- UPDATED_AT, shifted back one day, falls on the same calendar day.
SELECT
    TO_CHAR(pend.updated_at, 'day') AS updated_at,
    COUNT(pend.id) - COUNT(closed.id) AS pending_tickets
FROM FRESHDESK_API pend
LEFT JOIN FRESHDESK_API closed
    ON TRUNC(pend.UPDATED_AT) = TRUNC(closed.UPDATED_AT - 1)
    AND TRUNC(closed.UPDATED_AT + 1, 'iw') = TRUNC(SYSDATE, 'iw')
    AND closed.status IN (4, 5)
WHERE TRUNC(pend.UPDATED_AT, 'iw') = TRUNC(SYSDATE, 'iw')
    AND pend.status = 3
GROUP BY
    TO_CHAR(pend.updated_at, 'day'),
    TRUNC(pend.updated_at)
ORDER BY TRUNC(pend.updated_at);

Sets From a Single Table, Grouped By a Column

I have a table:
+-------+-------+----------+
| GROUP | State | Priority |
+-------+-------+----------+
| 1 | MI | 1 |
| 1 | IA | 2 |
| 1 | CA | 3 |
| 1 | ND | 4 |
| 1 | AZ | 5 |
| 2 | IA | 2 |
| 2 | NJ | 1 |
| 2 | NH | 3 |
And so on...
How do I write a query that makes all the sets of the states by group, in priority order? Like so:
+-------+--------------------+
| GROUP | SET |
+-------+--------------------+
| 1 | MI |
| 1 | MI, IA |
| 1 | MI, IA, CA |
| 1 | MI, IA, CA, ND |
| 1 | MI, IA, CA, ND, AZ |
| 2 | NJ |
| 2 | NJ, IA |
| 2 | NJ, IA, NH |
+-------+--------------------+
This is similar to my question here and I've tried to modify that solution but, I'm just a forty watt bulb and it's a sixty watt problem...
This problem actually looks simpler than the answer to the question you linked, which is an excellent solution to that problem. Nevertheless, this uses the same hierarchical queries, with connect by
If it is the case that priority is always a continuous sequence of numbers, this will work
-- Builds the growing comma-separated state list per group by walking from
-- priority 1 to each next consecutive priority within the same group.
SELECT
    t.grp,
    LEVEL,
    LTRIM(SYS_CONNECT_BY_PATH(state, ','), ',') AS "set"
FROM t
START WITH priority = 1
CONNECT BY priority = PRIOR priority + 1
    AND grp = PRIOR grp
However, if that's not always true, we would require row_number() to define the sequence based on the order of priority ( which need not be consecutive integer)
-- Number each group's rows 1..n in priority order so the hierarchy walk does
-- not depend on priority values being consecutive or starting at 1.
WITH t2 AS (
    SELECT
        t.*,
        ROW_NUMBER() OVER (PARTITION BY grp ORDER BY priority) AS rn
    FROM t
)
SELECT
    t2.grp,
    LTRIM(SYS_CONNECT_BY_PATH(state, ','), ',') AS "set"
FROM t2
-- Root on rn rather than priority: a group whose lowest priority is not 1
-- would otherwise have no starting row and be missing from the result.
START WITH rn = 1
CONNECT BY rn = PRIOR rn + 1
    AND grp = PRIOR grp
DEMO
I realize this has already been answered, but I wanted to see if I could do this using ANSI standard syntax. "connect by" is an Oracle only feature, the following will work on multiple databases:
-- Produces, per grp, one row for each prefix of the priority-ordered state
-- list (first state, first two states, ...), using a recursive CTE instead of
-- CONNECT BY.
WITH
-- ASET is just setting up the sample dataset
aset AS
(SELECT 1 AS grp, 'MI' AS state, 1 AS priority FROM DUAL
UNION ALL
SELECT 1 AS grp, 'IA', 2 FROM DUAL
UNION ALL
SELECT 1 AS grp, 'CA', 3 FROM DUAL
UNION ALL
SELECT 1 AS grp, 'ND', 4 FROM DUAL
UNION ALL
SELECT 1 AS grp, 'AZ', 5 FROM DUAL
UNION ALL
SELECT 2 AS grp, 'IA', 2 FROM DUAL
UNION ALL
SELECT 2 AS grp, 'NJ', 1 FROM DUAL
UNION ALL
SELECT 2 AS grp, 'NH', 3 FROM DUAL),
bset AS
-- In BSET we convert the ASET records into comma separated values
-- (one row per grp, states ordered by priority)
( SELECT grp, LISTAGG( state, ',' ) WITHIN GROUP (ORDER BY priority) AS set1
FROM aset
GROUP BY grp),
cset ( grp
, set1
, set2
, pos ) AS
-- CSET breaks our comma separated values up into multiple rows
-- Each row adding the next CSV value
-- Anchor member: set2 is the text before the first comma; a trailing comma is
-- appended to set1 so the last value is also followed by a delimiter.
(SELECT grp AS grp
, set1 AS set1
, SUBSTR( set1 || ',', 1, INSTR( set1 || ',', ',' ) - 1 ) AS set2
, 1 AS pos
FROM bset
UNION ALL
-- Recursive member: set2 is the text before the (pos + 1)-th comma,
-- i.e. the previous prefix plus one more value.
SELECT grp AS grp
, set1 AS set1
, SUBSTR( set1 || ','
, 1
, INSTR( set1 || ','
, ','
, 1
, pos + 1 )
- 1 ) AS set2
, pos + 1 AS pos
FROM cset
-- Stop once there is no (pos + 1)-th comma left in the list.
WHERE INSTR( set1 || ','
, ','
, 1
, pos + 1 ) > 0)
SELECT grp, set2
FROM cset
ORDER BY grp, pos;

Lag and Lead to next month

TABLE: HIST
CUSTOMER MONTH PLAN
1 1 A
1 2 B
1 2 C
1 3 D
If I query:
-- np is the plan of the next row (not the next month) in month order.
SELECT
    h.*,
    LEAD(plan) OVER (PARTITION BY customer ORDER BY month) np
FROM HIST h
I get:
CUSTOMER MONTH PLAN np
1 1 A B
1 2 B C
1 2 C D
1 3 D (null)
But I wanted
CUSTOMER MONTH PLAN np
1 1 A B
1 2 B D
1 2 C D
1 3 D (null)
Reason being, next month to 2 is 3, with D. I'm guessing partition by customer order by month doesn't work the way I thought.
Is there a way to achieve this in Oracle 12c?
One way to do it is to use a RANGE window frame with the MIN analytic function. Like this:
-- np is the smallest plan among the rows whose month equals this row's
-- month + 1, within the same customer.
SELECT
    h.*,
    MIN(plan) OVER (
        PARTITION BY customer
        ORDER BY month
        RANGE BETWEEN 1 FOLLOWING AND 1 FOLLOWING
    ) np
FROM HIST h;
+----------+-------+------+----+
| CUSTOMER | MONTH | PLAN | NP |
+----------+-------+------+----+
| 1 | 1 | A | B |
| 1 | 2 | B | D |
| 1 | 2 | C | D |
| 1 | 3 | D | |
+----------+-------+------+----+
When you use a RANGE window frame, you are telling Oracle to build each window from the values of the column you are ordering by, rather than from row positions.
So, e.g.,
ROWS BETWEEN 1 following and 1 following
... will make a window containing the next row.
RANGE BETWEEN 1 following and 1 following
... will make a window containing all the rows having the next value for month.
UPDATE
If it is possible that some values for MONTH might be skipped for a given customer, you can use this variant:
-- np is the plan of the first row, in month order, whose month is at least
-- one greater than this row's month, so skipped months are tolerated.
-- NOTE(review): when that next month holds several rows, ORDER BY month alone
-- does not decide which of their plans FIRST_VALUE returns.
SELECT
    h.*,
    FIRST_VALUE(plan) OVER (
        PARTITION BY customer
        ORDER BY month
        RANGE BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING
    ) np
-- Read from HIST (aliased h), as the other queries in this thread do; the
-- original "from h" referred to a table literally named h.
FROM HIST h
+----------+-------+------+----+
| CUSTOMER | MONTH | PLAN | NP |
+----------+-------+------+----+
| 1 | 1 | A | B |
| 1 | 3 | B | D |
| 1 | 3 | C | D |
| 1 | 4 | D | |
+----------+-------+------+----+
You can use LAG/LEAD twice. The first time to check for duplicate months and to set the value to NULL in those months and the second time use IGNORE NULLS to get the next monthly value.
It has the additional benefit that if months are skipped then it will still find the next value.
SQL Fiddle
Oracle 11g R2 Schema Setup:
-- Fixture: customer 1 has a duplicated month (2); customer 2 has a duplicated
-- month (1) and skipped months (2 and 4).
CREATE TABLE HIST (CUSTOMER, MONTH, PLAN) AS
              SELECT 1, 1, 'A' FROM DUAL
    UNION ALL SELECT 1, 2, 'B' FROM DUAL
    UNION ALL SELECT 1, 2, 'C' FROM DUAL
    UNION ALL SELECT 1, 3, 'D' FROM DUAL
    UNION ALL SELECT 2, 1, 'E' FROM DUAL
    UNION ALL SELECT 2, 1, 'F' FROM DUAL
    UNION ALL SELECT 2, 3, 'G' FROM DUAL
    UNION ALL SELECT 2, 5, 'H' FROM DUAL;
Query 1:
-- Step 1 (inner): keep PLAN only on the first row of each month and blank it
--                 (NULL) on the remaining rows of that month.
-- Step 2 (outer): LEAD ... IGNORE NULLS then skips the blanked rows and lands
--                 on the first row of the following month.
SELECT
    customer,
    month,
    plan,
    LEAD(np) IGNORE NULLS OVER (
        PARTITION BY customer
        ORDER BY month, plan, ROWNUM
    ) AS np
FROM (
    SELECT
        h.*,
        CASE
            WHEN month = LAG(month) OVER (
                     PARTITION BY customer
                     ORDER BY month, plan, ROWNUM
                 )
                THEN NULL
            ELSE plan
        END AS np
    FROM hist h
)
Results:
| CUSTOMER | MONTH | PLAN | NP |
|----------|-------|------|--------|
| 1 | 1 | A | B |
| 1 | 2 | B | D |
| 1 | 2 | C | D |
| 1 | 3 | D | (null) |
| 2 | 1 | E | G |
| 2 | 1 | F | G |
| 2 | 3 | G | H |
| 2 | 5 | H | (null) |
Just so that it is listed here as an option for Oracle 12c (onward), you can use an apply operator for this style of problem
-- For every HIST row, look up the plan of the same customer's next month
-- (the lowest month greater than the current one); np is NULL when the
-- customer has no later month.
SELECT
    h.customer,
    h.month,
    h.plan,
    oa.np
FROM hist h
OUTER APPLY (
    SELECT
        h2.plan AS np
    FROM hist h2
    -- Correlate on the customer: the original compared h.customer with
    -- itself, which is always true and let other customers' rows match.
    WHERE h2.customer = h.customer
        AND h2.month > h.month
    -- h2.plan breaks ties when the next month has several rows, so the row
    -- that is picked is deterministic.
    ORDER BY
        h2.month,
        h2.plan
    FETCH FIRST 1 ROWS ONLY
) oa
ORDER BY
    h.customer,
    h.month,
    h.plan
I don't know of any Oracle 12c public fiddles so, an example in SQL Server can be found here: http://sqlfiddle.com/#!18/cd95e/1
| customer | month | plan | np |
|----------|-------|------|--------|
| 1 | 1 | A | C |
| 1 | 2 | B | D |
| 1 | 2 | C | D |
| 1 | 3 | D | (null) |

Not able to aggregate in case statement in hive query

I have data like below:
-- Merchant/card pairs for September-2017 transactions where the customer's
-- age at transaction time (days / 365, rounded) is below 30.
SELECT
    mtrans.merch_num,
    mtrans.card_num
FROM a_sbp_db.merch_trans_daily mtrans
INNER JOIN a_sbp_db.product_holding ph
    ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo cdemo
    ON cdemo.cust_id = ph.cust_id
WHERE mtrans.transaction_date LIKE '2017-09%'
    AND person_org_code = 'P'
    AND ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30;
+-----------+----------------------------+
| merch_num | card_num |
+-----------+----------------------------+
| 1 | 4658XXXXXXXXXXXXXXXXXXURMX |
| 2 | 4658XXXXXXXXXXXXXXXXXXIE6X |
| 2 | 4658XXXXXXXXXXXXXXXXXXDA8X |
| 2 | 4658XXXXXXXXXXXXXXXXXX7D1X |
| 2 | 4658XXXXXXXXXXXXXXXXXXTJ2X |
| 2 | 4658XXXXXXXXXXXXXXXXXXQQWX |
| 2 | 4659XXXXXXXXXXXXXXXXXXY4EX |
| 2 | 4658XXXXXXXXXXXXXXXXXXRDOX |
| 2 | 4658XXXXXXXXXXXXXXXXXX0O3X |
| 2 | 4658XXXXXXXXXXXXXXXXXXNVBX |
+-----------+----------------------------+
I want to aggregate trans_amt by merch_num only if I get unique card_num more than 1.
In simple Query I can do it:
-- Under-30 transaction total per merchant, kept only for merchants with more
-- than one distinct card among the filtered rows.
SELECT
    mtrans.merch_num,
    FROM_UNIXTIME(UNIX_TIMESTAMP(), 'MMM-yyyy') AS process_month,
    SUM(mtrans.trans_amt) AS total_age_less_30_1
FROM a_sbp_db.merch_trans_daily mtrans
INNER JOIN a_sbp_db.product_holding ph
    ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo cdemo
    ON cdemo.cust_id = ph.cust_id
WHERE mtrans.transaction_date LIKE '2017-09%'
    AND person_org_code = 'P'
    AND ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
GROUP BY mtrans.merch_num
HAVING COUNT(DISTINCT mtrans.card_num) > 1;
+-----------+---------------+---------------------+
| merch_num | process_month | total_age_less_30_1 |
+-----------+---------------+---------------------+
| 2 | Nov-2017 | 2147.5 |
+-----------+---------------+---------------------+
Here I am able to skip merchant - 5493036 as it doesn't have unique cards more than 1.
But I have multiple conditions in where & want to write 1 query only.
Using case statement I am able to do it like below:
-- September-2017 transaction totals per merchant, split into two customer-age
-- buckets (age = days between birth and transaction / 365, rounded).
SELECT
    mtrans.merch_num,
    FROM_UNIXTIME(UNIX_TIMESTAMP(), 'MMM-yyyy') AS process_month,
    -- bucket 1: age below 30
    NVL(
        SUM(
            CASE
                WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
                    THEN mtrans.trans_amt
                ELSE 0
            END
        ),
        NULL
    ) AS total_age_less_30_1,
    -- bucket 2: age from 30 up to, but not including, 40
    NVL(
        SUM(
            CASE
                WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) >= 30
                     AND ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 40
                    THEN mtrans.trans_amt
                ELSE 0
            END
        ),
        NULL
    ) AS total_age_30_40_1
FROM a_sbp_db.merch_trans_daily mtrans
INNER JOIN a_sbp_db.product_holding ph
    ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo cdemo
    ON cdemo.cust_id = ph.cust_id
WHERE mtrans.transaction_date LIKE '2017-09%'
    AND person_org_code = 'P'
GROUP BY mtrans.merch_num
+-----------+---------------+---------------------+-------------------+
| merch_num | process_month | total_age_less_30_1 | total_age_30_40_1 |
+-----------+---------------+---------------------+-------------------+
| 3 | Nov-2017 | 0 | 0 |
| 4 | Nov-2017 | 0 | 0 |
| 1 | Nov-2017 | 2.49 | 203.68 |
| 2 | Nov-2017 | 2147.5 | 4907 |
| 5 | Nov-2017 | 0 | 0 |
+-----------+---------------+---------------------+-------------------+
I want 2.49 to be shown as NULL, because that merchant does not have more than 1 unique card.
I am not able to apply a HAVING-style condition so that sum(trans_amt) is shown only when the number of unique cards is more than 1.
When I add that condition inside the CASE expression, I get the error below:
-- Failing attempt, kept as written: COUNT(DISTINCT ...) is used inside the
-- argument of SUM(...), i.e. an aggregate nested in another aggregate, which
-- the engine rejects with the AnalysisException quoted after this query.
SELECT
mtrans.merch_num,
FROM_UNIXTIME(UNIX_TIMESTAMP(),'MMM-yyyy') AS process_month,
NVL(SUM(CASE
WHEN (ROUND(DATEDIFF(mtrans.transaction_date,cdemo.date_birth)/365) < 30 and count(distinct mtrans.card_num) > 1)
THEN mtrans.trans_amt ELSE 0 END), NULL)
AS total_age_less_30_1,
NVL(SUM(CASE
WHEN (ROUND(DATEDIFF(mtrans.transaction_date,cdemo.date_birth)/365) >= 30
AND ROUND(DATEDIFF(mtrans.transaction_date,cdemo.date_birth)/365) < 40 and count(distinct mtrans.card_num) > 1)
THEN mtrans.trans_amt ELSE 0 END), NULL)
AS total_age_30_40_1
FROM a_sbp_db.merch_trans_daily mtrans
INNER JOIN a_sbp_db.product_holding ph ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo cdemo ON cdemo.cust_id = ph.cust_id
WHERE mtrans.transaction_date LIKE '2017-09%'
AND person_org_code='P'
GROUP BY
mtrans.merch_num;
ERROR: AnalysisException: aggregate function must not contain aggregate parameters: sum(CASE WHEN (round(datediff(mtrans.transaction_date, cdemo.date_birth) / 365) < 30 AND count(DISTINCT mtrans.card_num) > 1) THEN mtrans.trans_amt ELSE 0 END)
Can someone help?
The error seems to be because you have count inside the SUM statement. This is what you must try, Let me know how it goes :
-- September-2017 transaction totals per merchant and customer-age bucket.
-- A bucket's total is shown only when more than one distinct card contributed
-- to that bucket; otherwise the column is NULL (CASE without ELSE).
--
-- The per-row age test sits inside the aggregates, so every expression in the
-- select list is either the grouping column, a constant, or built purely from
-- aggregates. The earlier form compared ungrouped row values next to
-- COUNT/SUM in one CASE, which is not valid under GROUP BY merch_num.
SELECT
    mtrans.merch_num,
    FROM_UNIXTIME(UNIX_TIMESTAMP(), 'MMM-yyyy') AS process_month,
    -- bucket 1: age below 30
    CASE
        WHEN COUNT(DISTINCT
                 CASE
                     WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
                         THEN mtrans.card_num
                 END) > 1
            THEN SUM(
                     CASE
                         WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
                             THEN mtrans.trans_amt
                         ELSE 0
                     END)
    END AS total_age_less_30_1,
    -- bucket 2: age from 30 up to, but not including, 40
    CASE
        WHEN COUNT(DISTINCT
                 CASE
                     WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) >= 30
                          AND ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 40
                         THEN mtrans.card_num
                 END) > 1
            THEN SUM(
                     CASE
                         WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) >= 30
                              AND ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 40
                             THEN mtrans.trans_amt
                         ELSE 0
                     END)
    END AS total_age_30_40_1
FROM a_sbp_db.merch_trans_daily mtrans
INNER JOIN a_sbp_db.product_holding ph
    ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo cdemo
    ON cdemo.cust_id = ph.cust_id
WHERE mtrans.transaction_date LIKE '2017-09%'
    AND person_org_code = 'P'
GROUP BY mtrans.merch_num;
I would suggest doing it in a better way as follows.
(PS: I didn't have any hive access, so I am doing this using Postgresql using regular SQL. So, it should be easier to adapt to Hive SQL).
Here is my SQL Table and records inserted in the table.
-- Scratch table holding one row per transaction, with the customer's date of
-- birth stored alongside so no joins are needed for the demo.
CREATE TEMP TABLE hivetest (
    merchant_id     INTEGER,
    card_number     TEXT,
    customer_dob    TIMESTAMP,
    transaction_dt  TIMESTAMP,
    transaction_amt DECIMAL
);
-- Sample transactions. Columns are named explicitly so the statement does not
-- depend on the table's column order.
INSERT INTO hivetest (merchant_id, card_number, customer_dob, transaction_dt, transaction_amt)
VALUES
    (1, 'A', '1997-12-01', '2017-11-01', 10.0),
    (2, 'A', '1997-12-01', '2017-11-01', 11.0),
    (2, 'B', '1980-12-01', '2017-11-01', 12.0),
    (3, 'A', '1997-12-01', '2017-11-01', 13.0),
    (3, 'A', '1997-12-01', '2017-11-01', 14.0),
    (4, 'A', '1997-12-01', '2017-11-01', 15.0),
    (4, 'C', '1980-12-01', '2017-11-01', 16.0);
First, you need to join the tables and generate a dataset that gives you the transaction_age (transaction_dt - customer_dob). I have most of the data for date subtraction in this single table, but simple INNER JOIN(s) should suffice to achieve this. Anyways, here is the query for the same.
-- Lists every transaction with transaction_age, computed as the difference
-- between the calendar years of the transaction date and the date of birth.
SELECT
    merchant_id,
    card_number,
    DATE(customer_dob) AS customer_dob,
    DATE(transaction_dt) AS transaction_dt,
    DATE_PART('year', DATE(transaction_dt))
        - DATE_PART('year', DATE(customer_dob)) AS transaction_age,
    transaction_amt
FROM hivetest
ORDER BY merchant_id;
This results in the data as follows.
+-------------+-------------+--------------+----------------+-----------------+----------------+
| merchant_id | card_number | customer_dob | transaction_dt | transaction_age |transaction_amt |
+-------------+-------------+--------------+----------------+-----------------+----------------+
| 1 | A | 1997-12-01 | 2017-11-01 | 20 | 10.0 |
| 2 | A | 1997-12-01 | 2017-11-01 | 20 | 11.0 |
| 2 | B | 1980-12-01 | 2017-11-01 | 37 | 12.0 |
| 3 | A | 1997-12-01 | 2017-11-01 | 20 | 13.0 |
| 3 | A | 1997-12-01 | 2017-11-01 | 20 | 14.0 |
| 4 | A | 1997-12-01 | 2017-11-01 | 20 | 15.0 |
| 4 | C | 1980-12-01 | 2017-11-01 | 37 | 16.0 |
+-------------+-------------+--------------+----------------+-----------------+----------------+
The above dataset will allow you to categorise the SUM of transaction amounts based on the transaction_age as you want. The trick is to have the above query in a sub-query and use the results of this subquery to categorize. Here is the query to do the same.
-- Per-merchant transaction counts and amounts for two transaction_age buckets.
SELECT
    merchant_id,
    -- Bucket 1: transaction_age of 30 or less
    SUM(CASE WHEN transaction_age <= 30 THEN 1 ELSE 0 END) AS count_30,
    SUM(CASE WHEN transaction_age <= 30 THEN transaction_amt ELSE 0 END) AS sum_30,
    -- Bucket 2: transaction_age above 30 and up to 40
    SUM(CASE WHEN transaction_age > 30 AND transaction_age <= 40 THEN 1 ELSE 0 END) AS case_30_40,
    SUM(CASE WHEN transaction_age > 30 AND transaction_age <= 40 THEN transaction_amt ELSE 0 END) AS sum_30_40
FROM (
    -- transaction_age = calendar-year difference between transaction and birth
    SELECT
        merchant_id,
        transaction_amt,
        DATE_PART('year', DATE(transaction_dt))
            - DATE_PART('year', DATE(customer_dob)) AS transaction_age
    FROM hivetest
) AS aged
GROUP BY merchant_id
ORDER BY merchant_id;
This results in the categorised output as below which gives you the count of transactions and sum of transaction amounts for each category for each merchant:
+-------------+----------+--------+------------+-----------+
| merchant_id | count_30 | sum_30 | case_30_40 | sum_30_40 |
+-------------+----------+--------+------------+-----------+
| 1 | 1 | 10.0 | 0 | 0 |
| 2 | 1 | 11.0 | 1 | 12.0 |
| 3 | 2 | 27.0 | 0 | 0 |
| 4 | 1 | 15.0 | 1 | 16.0 |
+-------------+----------+--------+------------+-----------+
Now, this is our dataset which is more or less the final result. However, as per your requirement, you are only interested in merchants which have more than 1 unique cards (COUNT(DISTINCT card_number) > 1).
So, lets write another query which gives us this. Below is the query which calculates this and based on the criteria, it marks the flag as TRUE or FALSE indicating whether or not we are interested in that merchant or not.
-- Flags each merchant with whether it has more than one distinct card.
-- COUNT never yields NULL, so the bare comparison is always TRUE or FALSE.
SELECT
    merchant_id,
    COUNT(DISTINCT card_number) > 1 AS has_distinct_cards_gt_1
FROM hivetest
GROUP BY merchant_id
ORDER BY merchant_id
This gives the output as below.
+-------------+-------------------------+
| merchant_id | has_distinct_cards_gt_1 |
+-------------+-------------------------+
| 1 | false |
| 2 | true |
| 3 | false |
| 4 | true |
+-------------+-------------------------+
Now, we are almost done. We just need to join these two tables and then based on the has_distinct_cards_gt_1, display the columns accordingly from the dataset generated previously.
Here is the final join query and resultset data generated.
-- Per-merchant bucket totals, shown only for merchants with more than one
-- distinct card; every other merchant gets 0 in both columns.
WITH
-- transaction_age = calendar-year difference between transaction and birth
aged AS (
    SELECT
        merchant_id,
        DATE_PART('year', DATE(transaction_dt))
            - DATE_PART('year', DATE(customer_dob)) AS transaction_age,
        transaction_amt
    FROM hivetest
),
-- amounts per merchant for the two age buckets
merchants_all AS (
    SELECT
        merchant_id,
        SUM(CASE WHEN transaction_age <= 30 THEN transaction_amt ELSE 0 END) AS sum_30,
        SUM(CASE WHEN transaction_age > 30 AND transaction_age <= 40 THEN transaction_amt ELSE 0 END) AS sum_30_40
    FROM aged
    GROUP BY merchant_id
),
-- whether each merchant has more than one distinct card
merchants_cards AS (
    SELECT
        merchant_id,
        CASE WHEN COUNT(DISTINCT card_number) > 1 THEN TRUE ELSE FALSE END AS has_distinct_cards_gt_1
    FROM hivetest
    GROUP BY merchant_id
    ORDER BY merchant_id
)
SELECT
    merchants_all.merchant_id,
    -- Age <= 30
    CASE
        WHEN merchants_cards.has_distinct_cards_gt_1 THEN sum_30
        ELSE 0
    END AS total_sum_30,
    -- Age above 30, up to 40
    CASE
        WHEN merchants_cards.has_distinct_cards_gt_1 THEN sum_30_40
        ELSE 0
    END AS total_sum_30_40
FROM merchants_all
INNER JOIN merchants_cards
    ON merchants_all.merchant_id = merchants_cards.merchant_id;
And this generates your final data, which you need.
+-------------+--------------+-----------------+
| merchant_id | total_sum_30 | total_sum_30_40 |
+-------------+--------------+-----------------+
| 1 | 0 | 0 |
| 2 | 11.0 | 12.0 |
| 3 | 0 | 0 |
| 4 | 15.0 | 16.0 |
+-------------+--------------+-----------------+
Let me know if this helps.
COUNT inside SUM is the problem.
Here is a solution. I haven't tested it though.
It's not obvious which table person_org_code belongs to. If it is in merch_trans_daily, then add person_org_code = 'P' to the where clause in the view. Let's know whether it works!
-- Number of distinct cards each merchant saw in September 2017.
-- Fixes relative to the earlier draft: the CTE no longer references the
-- alias "mtrans" (not defined inside the CTE), it groups by merch_num so cnt
-- is per merchant, and it counts distinct cards rather than rows, matching
-- the "more than 1 unique card" requirement.
WITH mtrans_count AS (
    SELECT
        merch_num,
        COUNT(DISTINCT card_num) AS cnt
    FROM a_sbp_db.merch_trans_daily
    WHERE transaction_date LIKE '2017-09%'
    GROUP BY merch_num
)
-- Totals per merchant and customer-age bucket; rows only contribute when the
-- merchant has more than one distinct card, otherwise the total is 0.
SELECT
    mtrans.merch_num,
    FROM_UNIXTIME(UNIX_TIMESTAMP(), 'MMM-yyyy') AS process_month,
    -- bucket 1: age below 30
    SUM(
        CASE
            WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
                 AND mtrans_count.cnt > 1
                THEN mtrans.trans_amt
            ELSE 0
        END
    ) AS total_age_less_30_1,
    -- bucket 2: age from 30 up to, but not including, 40
    SUM(
        CASE
            WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) >= 30
                 AND ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 40
                 AND mtrans_count.cnt > 1
                THEN mtrans.trans_amt
            ELSE 0
        END
    ) AS total_age_30_40_1
FROM a_sbp_db.merch_trans_daily mtrans
INNER JOIN a_sbp_db.product_holding ph
    ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo cdemo
    ON cdemo.cust_id = ph.cust_id
INNER JOIN mtrans_count
    ON mtrans_count.merch_num = mtrans.merch_num
WHERE mtrans.transaction_date LIKE '2017-09%'
    -- NOTE(review): person_org_code is unqualified; confirm which table owns it
    AND person_org_code = 'P'
GROUP BY mtrans.merch_num;

Oracle 11g hierarchical query needs some inherited data

table looks kind of like:
-- Self-referencing hierarchy: taco_prntid points at the parent row's taco_id
-- (NULL for a root).
CREATE TABLE taco (
    taco_id     INT PRIMARY KEY NOT NULL,
    taco_name   VARCHAR(255),
    taco_prntid INT,
    meat_id     INT,
    meat_inht   CHAR(1)  -- inherit meat
)
data looks like:
-- Sample hierarchy. Columns are named explicitly so the statements do not
-- depend on the table's column order.
insert into taco (taco_id, taco_name, taco_prntid, meat_id, meat_inht) values (1, '1', null, 1, 'N');
insert into taco (taco_id, taco_name, taco_prntid, meat_id, meat_inht) values (2, '1.1', 1, null, 'Y');
insert into taco (taco_id, taco_name, taco_prntid, meat_id, meat_inht) values (3, '1.1.1', 2, null, 'N');
insert into taco (taco_id, taco_name, taco_prntid, meat_id, meat_inht) values (4, '1.2', 1, 2, 'N');
insert into taco (taco_id, taco_name, taco_prntid, meat_id, meat_inht) values (5, '1.2.1', 4, null, 'Y');
insert into taco (taco_id, taco_name, taco_prntid, meat_id, meat_inht) values (6, '1.1.2', 2, null, 'Y');
or...
- 1 has a meat_id=1
- 1.1 has a meat_id=1 because it inherits from its parent via taco_prntid=1
- 1.1.1 has a meat_id of null because it does NOT inherit from its parent
- 1.2 has a meat_id=2 and it does not inherit from its parent
- 1.2.1 has a meat_id=2 because it does inherit from its parent via taco_prntid=4
- 1.1.2 has a meat_id=1 because it does inherit from its parent via taco_prntid=2
Now... how in the world do I query what the meat_id is for each taco_id? What is below did work until I realized that I wasn't using the inheritance flag and some of my data was messing up.
-- Walks the hierarchy from the roots, collecting meat_id values along the
-- path, then keeps the text after the last space of the right-trimmed path.
-- The meat_inht flag is not consulted here.
SELECT
    x.taco_id,
    x.taco_name,
    TO_NUMBER(SUBSTR(meat_id, INSTR(RTRIM(meat_id), ' ', -1) + 1)) AS meat_id
FROM (
    SELECT
        taco_id,
        taco_name,
        LEVEL - 1 "level",
        SYS_CONNECT_BY_PATH(meat_id, ' ') meat_id
    FROM taco
    START WITH taco_prntid IS NULL
    CONNECT BY PRIOR taco_id = taco_prntid
) x
I can post some failed attempts to modify my query above but they're rather embarrassing failures. I haven't worked with hierarchical queries at all before beyond the basics so I'm hoping there is some keyword or concept I'm not aware I should be searching for.
I posted an answer myself down at the bottom to show what I ended up with ultimately. I'm leaving the other answer as accepted because they were able to make the data more clear for me and without it, I wouldn't have gotten anywhere.
Your inner query is correct. All you need is to pick only the rightmost number from the meat_id column of inner query, when flag is Y.
I have used REGEXP_SUBSTR function to get the rightmost number and CASE statement to check the flag.
SQL Fiddle
Query 1:
-- Resolves meat_id per taco: rows flagged 'N' keep their own meat_id, rows
-- flagged 'Y' take the last number found in the root-to-row meat_id path.
SELECT
    taco_id,
    taco_name,
    taco_prntid,
    CASE
        WHEN meat_inht = 'N' THEN meat_id
        WHEN meat_inht = 'Y' THEN TO_NUMBER(REGEXP_SUBSTR(meat_id2, '\d+\s*$'))
    END meat_id,
    meat_inht
FROM (
    -- meat_id2 is the space-separated list of meat_id values from root to row
    SELECT
        taco_id,
        taco_name,
        taco_prntid,
        meat_id,
        meat_inht,
        LEVEL - 1 "level",
        SYS_CONNECT_BY_PATH(meat_id, ' ') meat_id2
    FROM taco
    START WITH taco_prntid IS NULL
    CONNECT BY PRIOR taco_id = taco_prntid
)
ORDER BY taco_id
Results:
| TACO_ID | TACO_NAME | TACO_PRNTID | MEAT_ID | MEAT_INHT |
|---------|-----------|-------------|---------|-----------|
| 1 | 1 | (null) | 1 | N |
| 2 | 1.1 | 1 | 1 | Y |
| 3 | 1.1.1 | 2 | (null) | N |
| 4 | 1.2 | 1 | 2 | N |
| 5 | 1.2.1 | 4 | 2 | Y |
| 6 | 1.1.2 | 2 | 1 | Y |
Query 2:
-- Raw hierarchy walk: every taco with its depth below the root and the
-- space-separated meat_id values collected from the root down to it.
SELECT
    taco_id,
    taco_name,
    taco_prntid,
    meat_id,
    meat_inht,
    LEVEL - 1 "level",
    SYS_CONNECT_BY_PATH(meat_id, ' ') meat_id2
FROM taco
START WITH taco_prntid IS NULL
CONNECT BY PRIOR taco_id = taco_prntid
Results:
| TACO_ID | TACO_NAME | TACO_PRNTID | MEAT_ID | MEAT_INHT | LEVEL | MEAT_ID2 |
|---------|-----------|-------------|---------|-----------|-------|----------|
| 1 | 1 | (null) | 1 | N | 0 | 1 |
| 2 | 1.1 | 1 | (null) | Y | 1 | 1 |
| 3 | 1.1.1 | 2 | (null) | N | 2 | 1 |
| 6 | 1.1.2 | 2 | (null) | Y | 2 | 1 |
| 4 | 1.2 | 1 | 2 | N | 1 | 1 2 |
| 5 | 1.2.1 | 4 | (null) | Y | 2 | 1 2 |
This is what I've ended up with so far... after applying the logic in the accepted answer. I added a few more things so that I can join the result up against my meat table. the upper case could be optimized a little bit but I am so over this part of the query so.... it's going to have to stay for now.
-- Resolves the effective meat_id per taco from a root-to-row path in which
-- only non-inheriting rows contribute a value.
-- NOTE(review): jobdtl, jobdtl_id and taco_prntname are not part of the taco
-- table shown earlier; they come from the asker's real schema.
select x.taco_id,
x.taco_name,
x.taco_prntname,
-- raw path text as built by the inner query
meat_id
-- last number in the path; the '0' placeholder is turned back into NULL
,case when to_number(regexp_substr(meat_id,'\d+\s*$'))=0 then null else
to_number(regexp_substr(meat_id,'\d+\s*$')) end as meat_id
from ( select taco_id,
taco_name,
taco_prntname,
level-1 "level",
sys_connect_by_path(
-- rows with meat_inht='N' add their own meat_id ('0' when it is null);
-- every other row adds an empty string
case when meat_inht='N' then nvl(to_char(meat_id),'0') else '' end
,' ') meat_id
from taco join jobdtl on jobdtl.jobdtl_id=taco.jobdtl_id
start with taco_prntid is null
connect by prior taco_id = taco_prntid
) x
(do you ever wonder, when you read questions like this, what the real schema is? obviously I am not working on a taco project. or does it even matter as long as the general relationships and concept is preserved?)

Resources