I have data like below:
-- Lists one (merchant, card) row per September-2017 transaction where
-- person_org_code is 'P' and the rounded age at transaction time
-- (days since date_birth / 365) is below 30.
SELECT
    mtrans.merch_num,
    mtrans.card_num
FROM a_sbp_db.merch_trans_daily AS mtrans
INNER JOIN a_sbp_db.product_holding AS ph
    ON ph.acc_num = mtrans.card_num
INNER JOIN a_sbp_db.cust_demo AS cdemo
    ON ph.cust_id = cdemo.cust_id
WHERE ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
  -- person_org_code is left unqualified: the owning table is not shown here
  AND person_org_code = 'P'
  AND mtrans.transaction_date LIKE '2017-09%';
+-----------+----------------------------+
| merch_num | card_num |
+-----------+----------------------------+
| 1 | 4658XXXXXXXXXXXXXXXXXXURMX |
| 2 | 4658XXXXXXXXXXXXXXXXXXIE6X |
| 2 | 4658XXXXXXXXXXXXXXXXXXDA8X |
| 2 | 4658XXXXXXXXXXXXXXXXXX7D1X |
| 2 | 4658XXXXXXXXXXXXXXXXXXTJ2X |
| 2 | 4658XXXXXXXXXXXXXXXXXXQQWX |
| 2 | 4659XXXXXXXXXXXXXXXXXXY4EX |
| 2 | 4658XXXXXXXXXXXXXXXXXXRDOX |
| 2 | 4658XXXXXXXXXXXXXXXXXX0O3X |
| 2 | 4658XXXXXXXXXXXXXXXXXXNVBX |
+-----------+----------------------------+
I want to aggregate trans_amt by merch_num, but only for merchants that have more than one unique card_num.
In simple Query I can do it:
-- September-2017 spend per merchant for customers whose rounded age is below 30.
-- The HAVING clause drops merchants with only one distinct card.
SELECT
    mtrans.merch_num,
    -- label built from the current clock time, not from transaction_date
    FROM_UNIXTIME(UNIX_TIMESTAMP(), 'MMM-yyyy') AS process_month,
    SUM(mtrans.trans_amt) AS total_age_less_30_1
FROM a_sbp_db.merch_trans_daily AS mtrans
INNER JOIN a_sbp_db.product_holding AS ph
    ON ph.acc_num = mtrans.card_num
INNER JOIN a_sbp_db.cust_demo AS cdemo
    ON ph.cust_id = cdemo.cust_id
WHERE ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
  AND person_org_code = 'P'
  AND mtrans.transaction_date LIKE '2017-09%'
GROUP BY mtrans.merch_num
HAVING COUNT(DISTINCT mtrans.card_num) > 1;
+-----------+---------------+---------------------+
| merch_num | process_month | total_age_less_30_1 |
+-----------+---------------+---------------------+
| 2 | Nov-2017 | 2147.5 |
+-----------+---------------+---------------------+
Here I am able to skip merchant 5493036, as it does not have more than one unique card.
However, I have several such conditions in the WHERE clause and want to cover all of them in a single query.
Using CASE expressions I can get the per-band totals as shown below:
-- September-2017 spend per merchant, split into two age bands using
-- conditional aggregation. Age is approximated as ROUND(days / 365).
-- No distinct-card rule is applied here.
-- The original wrapped each SUM in NVL(..., NULL), which returns its first
-- argument unchanged, so the wrapper is dropped.
SELECT
    mtrans.merch_num,
    -- label built from the current clock time, not from transaction_date
    FROM_UNIXTIME(UNIX_TIMESTAMP(), 'MMM-yyyy') AS process_month,
    SUM(CASE
            WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
                THEN mtrans.trans_amt
            ELSE 0
        END) AS total_age_less_30_1,
    SUM(CASE
            WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) >= 30
             AND ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 40
                THEN mtrans.trans_amt
            ELSE 0
        END) AS total_age_30_40_1
FROM a_sbp_db.merch_trans_daily AS mtrans
INNER JOIN a_sbp_db.product_holding AS ph
    ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo AS cdemo
    ON cdemo.cust_id = ph.cust_id
WHERE mtrans.transaction_date LIKE '2017-09%'
  -- person_org_code is left unqualified: the owning table is not shown here
  AND person_org_code = 'P'
GROUP BY mtrans.merch_num;
+-----------+---------------+---------------------+-------------------+
| merch_num | process_month | total_age_less_30_1 | total_age_30_40_1 |
+-----------+---------------+---------------------+-------------------+
| 3 | Nov-2017 | 0 | 0 |
| 4 | Nov-2017 | 0 | 0 |
| 1 | Nov-2017 | 2.49 | 203.68 |
| 2 | Nov-2017 | 2147.5 | 4907 |
| 5 | Nov-2017 | 0 | 0 |
+-----------+---------------+---------------------+-------------------+
I want 2.49 to be NULL, because that merchant does not have more than one unique card in that age band.
I cannot use a HAVING condition here, since sum(trans_amt) should be shown only for the bands where the number of unique cards is greater than 1.
When I add that condition inside the CASE expression, I get the error below:
-- Failing attempt, kept verbatim because it reproduces the AnalysisException
-- quoted right after it.
-- The CASE conditions call count(distinct ...) while themselves sitting inside
-- SUM(...), i.e. an aggregate nested in another aggregate's argument.
SELECT
mtrans.merch_num,
FROM_UNIXTIME(UNIX_TIMESTAMP(),'MMM-yyyy') AS process_month,
NVL(SUM(CASE
-- count(distinct ...) below is evaluated inside SUM(...): this is what the engine rejects
WHEN (ROUND(DATEDIFF(mtrans.transaction_date,cdemo.date_birth)/365) < 30 and count(distinct mtrans.card_num) > 1)
THEN mtrans.trans_amt ELSE 0 END), NULL)
AS total_age_less_30_1,
NVL(SUM(CASE
WHEN (ROUND(DATEDIFF(mtrans.transaction_date,cdemo.date_birth)/365) >= 30
-- same nested-aggregate problem for the 30-40 band
AND ROUND(DATEDIFF(mtrans.transaction_date,cdemo.date_birth)/365) < 40 and count(distinct mtrans.card_num) > 1)
THEN mtrans.trans_amt ELSE 0 END), NULL)
AS total_age_30_40_1
FROM a_sbp_db.merch_trans_daily mtrans
INNER JOIN a_sbp_db.product_holding ph ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo cdemo ON cdemo.cust_id = ph.cust_id
WHERE mtrans.transaction_date LIKE '2017-09%'
AND person_org_code='P'
GROUP BY
mtrans.merch_num;
ERROR: AnalysisException: aggregate function must not contain aggregate parameters: sum(CASE WHEN (round(datediff(mtrans.transaction_date, cdemo.date_birth) / 365) < 30 AND count(DISTINCT mtrans.card_num) > 1) THEN mtrans.trans_amt ELSE 0 END)
Can someone help?
The error occurs because COUNT is used inside the SUM aggregate. Try the following instead, and let me know how it goes:
-- Per-merchant September-2017 spend in two age bands, where a band's total is
-- shown only if that band has more than one distinct card; otherwise NULL.
--
-- Why two levels of aggregation:
--   * A CASE placed outside the aggregates cannot test transaction_date or
--     date_birth, because those are row-level columns and the query groups by
--     merch_num only.
--   * The distinct-card rule is per age band (it mirrors the single-band
--     "WHERE age < 30 ... HAVING COUNT(DISTINCT card_num) > 1" query), so the
--     inner query aggregates per (merchant, band) with a single COUNT(DISTINCT),
--     and the outer query pivots the bands back into columns.
SELECT
    per_band.merch_num,
    -- label built from the current clock time, not from transaction_date
    FROM_UNIXTIME(UNIX_TIMESTAMP(), 'MMM-yyyy') AS process_month,
    -- Each (merchant, band) pair is one inner row, so MAX just picks that row's
    -- total. With no ELSE, a band failing the card rule (or absent) gives NULL.
    MAX(CASE
            WHEN per_band.age_band = 'LT_30' AND per_band.card_cnt > 1
                THEN per_band.band_amt
        END) AS total_age_less_30_1,
    MAX(CASE
            WHEN per_band.age_band = '30_40' AND per_band.card_cnt > 1
                THEN per_band.band_amt
        END) AS total_age_30_40_1
FROM (
    SELECT
        aged.merch_num,
        aged.age_band,
        COUNT(DISTINCT aged.card_num) AS card_cnt,
        SUM(aged.trans_amt)           AS band_amt
    FROM (
        SELECT
            mtrans.merch_num,
            mtrans.card_num,
            mtrans.trans_amt,
            -- Rows outside both bands (or with an unknown age) are kept as
            -- 'OTHER' so that their merchant still appears in the result.
            CASE
                WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
                    THEN 'LT_30'
                WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 40
                    THEN '30_40'
                ELSE 'OTHER'
            END AS age_band
        FROM a_sbp_db.merch_trans_daily AS mtrans
        INNER JOIN a_sbp_db.product_holding AS ph
            ON mtrans.card_num = ph.acc_num
        INNER JOIN a_sbp_db.cust_demo AS cdemo
            ON cdemo.cust_id = ph.cust_id
        WHERE mtrans.transaction_date LIKE '2017-09%'
          -- person_org_code is left unqualified: the owning table is not shown here
          AND person_org_code = 'P'
    ) AS aged
    GROUP BY
        aged.merch_num,
        aged.age_band
) AS per_band
GROUP BY per_band.merch_num;
I would suggest doing it in a better way as follows.
(PS: I didn't have any hive access, so I am doing this using Postgresql using regular SQL. So, it should be easier to adapt to Hive SQL).
Here is my SQL Table and records inserted in the table.
-- Scratch fixture for the walkthrough: one row per card transaction, with the
-- customer's date of birth denormalised onto the row.
CREATE TEMPORARY TABLE hivetest (
    merchant_id     INTEGER,
    card_number     TEXT,
    customer_dob    TIMESTAMP,
    transaction_dt  TIMESTAMP,
    transaction_amt DECIMAL
);

-- Explicit column list so the insert does not depend on column order.
-- Merchants 1 and 3 use a single card; merchants 2 and 4 use two cards each.
INSERT INTO hivetest
    (merchant_id, card_number, customer_dob, transaction_dt, transaction_amt)
VALUES
    (1, 'A', '1997-12-01', '2017-11-01', 10.0),
    (2, 'A', '1997-12-01', '2017-11-01', 11.0),
    (2, 'B', '1980-12-01', '2017-11-01', 12.0),
    (3, 'A', '1997-12-01', '2017-11-01', 13.0),
    (3, 'A', '1997-12-01', '2017-11-01', 14.0),
    (4, 'A', '1997-12-01', '2017-11-01', 15.0),
    (4, 'C', '1980-12-01', '2017-11-01', 16.0);
First, you need to join the tables and generate a dataset that gives you the transaction_age (transaction_dt - customer_dob). I have most of the data for date subtraction in this single table, but simple INNER JOIN(s) should suffice to achieve this. Anyways, here is the query for the same.
-- Row-level view of the fixture with a derived transaction_age.
-- transaction_age is the difference between the two calendar years, so it can
-- be one higher than the exact age (1997-12-01 to 2017-11-01 yields 20).
SELECT
    merchant_id,
    card_number,
    DATE(customer_dob)   AS customer_dob,
    DATE(transaction_dt) AS transaction_dt,
    DATE_PART('year', DATE(transaction_dt))
        - DATE_PART('year', DATE(customer_dob)) AS transaction_age,
    transaction_amt
FROM hivetest
ORDER BY merchant_id;  -- ordered by name instead of by select-list position
This results in the data as follows.
+-------------+-------------+--------------+----------------+-----------------+----------------+
| merchant_id | card_number | customer_dob | transaction_dt | transaction_age |transaction_amt |
+-------------+-------------+--------------+----------------+-----------------+----------------+
| 1 | A | 1997-12-01 | 2017-11-01 | 20 | 10.0 |
| 2 | A | 1997-12-01 | 2017-11-01 | 20 | 11.0 |
| 2 | B | 1980-12-01 | 2017-11-01 | 37 | 12.0 |
| 3 | A | 1997-12-01 | 2017-11-01 | 20 | 13.0 |
| 3 | A | 1997-12-01 | 2017-11-01 | 20 | 14.0 |
| 4 | A | 1997-12-01 | 2017-11-01 | 20 | 15.0 |
| 4 | C | 1980-12-01 | 2017-11-01 | 37 | 16.0 |
+-------------+-------------+--------------+----------------+-----------------+----------------+
The above dataset will allow you to categorise the SUM of transaction amounts based on the transaction_age as you want. The trick is to have the above query in a sub-query and use the results of this subquery to categorize. Here is the query to do the same.
-- Transaction count and amount per merchant in two age bands.
-- Band edges follow the question: under 30, and 30 up to but excluding 40.
-- The previous "<= 30" / "> 30 AND <= 40" tests put age 30 in the wrong band
-- and let age 40 into the second one.
SELECT
    merchant_id,
    -- Transaction age less than 30
    SUM(CASE WHEN transaction_age < 30 THEN 1 ELSE 0 END)               AS count_30,
    SUM(CASE WHEN transaction_age < 30 THEN transaction_amt ELSE 0 END) AS sum_30,
    -- Transaction age from 30 up to (not including) 40
    SUM(CASE WHEN transaction_age >= 30 AND transaction_age < 40
             THEN 1 ELSE 0 END)                                         AS case_30_40,
    SUM(CASE WHEN transaction_age >= 30 AND transaction_age < 40
             THEN transaction_amt ELSE 0 END)                           AS sum_30_40
FROM (
    -- transaction_age is the calendar-year difference between the two dates
    SELECT
        merchant_id,
        transaction_amt,
        DATE_PART('year', DATE(transaction_dt))
            - DATE_PART('year', DATE(customer_dob)) AS transaction_age
    FROM hivetest
) AS m
GROUP BY merchant_id
ORDER BY merchant_id;
This results in the categorised output as below which gives you the count of transactions and sum of transaction amounts for each category for each merchant:
+-------------+----------+--------+------------+-----------+
| merchant_id | count_30 | sum_30 | case_30_40 | sum_30_40 |
+-------------+----------+--------+------------+-----------+
| 1 | 1 | 10.0 | 0 | 0 |
| 2 | 1 | 11.0 | 1 | 12.0 |
| 3 | 2 | 27.0 | 0 | 0 |
| 4 | 1 | 15.0 | 1 | 16.0 |
+-------------+----------+--------+------------+-----------+
This dataset is more or less the final result. However, as per your requirement, you are only interested in merchants that have more than one unique card (COUNT(DISTINCT card_number) > 1).
So, let's write another query that gives us this. The query below computes, for each merchant, a flag that is TRUE when the merchant meets this criterion and FALSE otherwise.
-- One row per merchant with a boolean flag: TRUE when the merchant has more
-- than one distinct card across all of its rows, regardless of age.
-- The comparison already yields a boolean, so no CASE wrapper is needed.
SELECT
    merchant_id,
    COUNT(DISTINCT card_number) > 1 AS has_distinct_cards_gt_1
FROM hivetest
GROUP BY merchant_id
ORDER BY merchant_id;
This gives the output as below.
+-------------+-------------------------+
| merchant_id | has_distinct_cards_gt_1 |
+-------------+-------------------------+
| 1 | false |
| 2 | true |
| 3 | false |
| 4 | true |
+-------------+-------------------------+
Now, we are almost done. We just need to join these two tables and then based on the has_distinct_cards_gt_1, display the columns accordingly from the dataset generated previously.
Here is the final join query and resultset data generated.
-- Final result: per-merchant band totals, zeroed for merchants that do not
-- have more than one distinct card.
-- The card flag is computed per merchant across all ages, not per age band.
-- Band edges follow the question (< 30, and >= 30 AND < 40) rather than the
-- previous "<= 30" / "> 30 AND <= 40".
SELECT
    merchants_all.merchant_id,
    -- Age < 30
    CASE
        WHEN merchants_cards.has_distinct_cards_gt_1 THEN merchants_all.sum_30
        ELSE 0
    END AS total_sum_30,
    -- Age from 30 up to (not including) 40
    CASE
        WHEN merchants_cards.has_distinct_cards_gt_1 THEN merchants_all.sum_30_40
        ELSE 0
    END AS total_sum_30_40
FROM (
    -- Band totals per merchant
    SELECT
        merchant_id,
        SUM(CASE WHEN transaction_age < 30 THEN transaction_amt ELSE 0 END) AS sum_30,
        SUM(CASE WHEN transaction_age >= 30 AND transaction_age < 40
                 THEN transaction_amt ELSE 0 END)                           AS sum_30_40
    FROM (
        SELECT
            merchant_id,
            DATE_PART('year', DATE(transaction_dt))
                - DATE_PART('year', DATE(customer_dob)) AS transaction_age,
            transaction_amt
        FROM hivetest
    ) AS m
    GROUP BY merchant_id
) AS merchants_all
INNER JOIN (
    -- Distinct-card flag per merchant. An ORDER BY inside a derived table does
    -- not determine the final row order, so it is omitted here.
    SELECT
        merchant_id,
        COUNT(DISTINCT card_number) > 1 AS has_distinct_cards_gt_1
    FROM hivetest
    GROUP BY merchant_id
) AS merchants_cards
    ON merchants_all.merchant_id = merchants_cards.merchant_id
-- Explicit ordering makes the output deterministic.
ORDER BY merchants_all.merchant_id;
And this generates your final data, which you need.
+-------------+--------------+-----------------+
| merchant_id | total_sum_30 | total_sum_30_40 |
+-------------+--------------+-----------------+
| 1 | 0 | 0 |
| 2 | 11.0 | 12.0 |
| 3 | 0 | 0 |
| 4 | 15.0 | 16.0 |
+-------------+--------------+-----------------+
Let me know if this helps.
COUNT inside SUM is the problem.
Here is a solution. I haven't tested it though.
It's not obvious which table person_org_code belongs to. If it is in merch_trans_daily, then add person_org_code = 'P' to the WHERE clause of the mtrans_count common table expression as well. Let us know whether it works!
-- Per-merchant September-2017 spend in two age bands, counted only for
-- merchants with more than one distinct card in that month.
--
-- Fixes to the CTE: the table now carries the alias the WHERE clause uses,
-- the aggregate has its GROUP BY, and it counts distinct cards rather than
-- transaction rows.
-- NOTE(review): the card count is per merchant for the month. It ignores the
-- age band and the person_org_code filter. Confirm that this is acceptable.
WITH mtrans_count AS (
    SELECT
        mt.merch_num,
        COUNT(DISTINCT mt.card_num) AS cnt
    FROM a_sbp_db.merch_trans_daily AS mt
    WHERE mt.transaction_date LIKE '2017-09%'
    GROUP BY mt.merch_num
)
SELECT
    mtrans.merch_num,
    -- label built from the current clock time, not from transaction_date
    FROM_UNIXTIME(UNIX_TIMESTAMP(), 'MMM-yyyy') AS process_month,
    -- NVL(x, NULL) returned x unchanged, so the wrapper is dropped.
    SUM(CASE
            WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 30
             AND mtrans_count.cnt > 1
                THEN mtrans.trans_amt
            ELSE 0
        END) AS total_age_less_30_1,
    SUM(CASE
            WHEN ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) >= 30
             AND ROUND(DATEDIFF(mtrans.transaction_date, cdemo.date_birth) / 365) < 40
             AND mtrans_count.cnt > 1
                THEN mtrans.trans_amt
            ELSE 0
        END) AS total_age_30_40_1
FROM a_sbp_db.merch_trans_daily AS mtrans
INNER JOIN a_sbp_db.product_holding AS ph
    ON mtrans.card_num = ph.acc_num
INNER JOIN a_sbp_db.cust_demo AS cdemo
    ON cdemo.cust_id = ph.cust_id
INNER JOIN mtrans_count
    ON mtrans_count.merch_num = mtrans.merch_num
WHERE mtrans.transaction_date LIKE '2017-09%'
  AND person_org_code = 'P'
GROUP BY mtrans.merch_num;
Related
I want to count the total amount of pending tickets for each day in this week. I was only able to get it for one day at a time. I have this query right now:
-- Single-day backlog figure:
--   (all pending tickets - pending tickets updated today)
--   + tickets closed yesterday
-- Computed in one pass over FRESHDESK_API with conditional counts instead of
-- three separate scans glued together by CROSS JOINs. COUNT ignores the NULL
-- produced by a non-matching CASE, and an aggregate without GROUP BY always
-- returns exactly one row, so the result is the same.
-- Status codes: 3 = pending, 4 = resolved, 5 = closed (4 and 5 both count as closed).
-- The output column keeps its original name GISTER (Dutch for "yesterday").
SELECT
    (
        -- Total pending
        COUNT(CASE WHEN status = 3 THEN id END)
        -- Pending tickets today
        - COUNT(CASE
                    WHEN status = 3
                     AND TRUNC(updated_at) = TRUNC(SYSDATE)
                        THEN id
                END)
    )
    -- Closed yesterday
    + COUNT(CASE
                WHEN status IN (4, 5)
                 AND TRUNC(updated_at) = TRUNC(SYSDATE - 1)
                    THEN id
            END) AS gister
FROM freshdesk_api
-- Only these three statuses contribute to any of the counts.
WHERE status IN (3, 4, 5)
I want to get a result like this:
+-----------+-----------------+
| day       | pending_tickets |
+-----------+-----------------+
| Monday    | 20              |
| Tuesday   | 22              |
| Wednesday | 25              |
| Thursday  | 24              |
| Friday    | 19              |
+-----------+-----------------+
The table is something like this (unused columns left out):
+----+------------+------------+--------+
| id | created_at | updated_at | status |
+----+------------+------------+--------+
|    |            |            |        |
|    |            |            |        |
|    |            |            |        |
|    |            |            |        |
|    |            |            |        |
+----+------------+------------+--------+
You can use left join and group by as follows:
-- Per day of the current ISO week: pending tickets updated that day minus
-- tickets closed (status 4 or 5) on the day before.
--
-- Fixes:
--   * The self-join pairs every pending ticket with every matching closed
--     ticket, so plain COUNTs were inflated by the fan-out. COUNT(DISTINCT id)
--     counts each ticket once (assumes id identifies one ticket -- TODO confirm).
--   * The day offset was reversed: it matched closed tickets from the day
--     after, although the alias and the original week test (updated_at + 1)
--     both treat yday as the day before. With the corrected equality that
--     week test is implied by the WHERE clause, so it is omitted.
--   * 'fmDay' yields unpadded, capitalised day names (Monday, Tuesday, ...).
-- NOTE(review): the question's single-day formula adds closed tickets rather
-- than subtracting them. Confirm which sign is wanted.
SELECT
    TO_CHAR(tday.updated_at, 'fmDay') AS updated_at,
    COUNT(DISTINCT tday.id) - COUNT(DISTINCT yday.id) AS pending_tickets
FROM freshdesk_api tday
LEFT JOIN freshdesk_api yday
    ON TRUNC(yday.updated_at) = TRUNC(tday.updated_at) - 1
   AND yday.status IN (4, 5)
WHERE TRUNC(tday.updated_at, 'iw') = TRUNC(SYSDATE, 'iw')
  AND tday.status = 3
GROUP BY
    TO_CHAR(tday.updated_at, 'fmDay'),
    TRUNC(tday.updated_at)
ORDER BY TRUNC(tday.updated_at);
How can I convert the SQL Server recursive query below to Vertica? I know that Vertica does not support recursive queries. I tried using SUM() OVER with LAG, but I am still not able to achieve the final expected output.
-- Walks the product periods in start-date order and assigns a group number.
-- A row stays in the current group while its start date is no later than
-- 15 days after the group's anchor date; otherwise it opens a new group.
-- Despite its name, GrpStartDt carries the ProductEndDt of the row that opened
-- the group. Indx is 1 on a row that opens a group and 0 otherwise.
WITH Product AS (
    SELECT ProductId, ProductStartDt, ProductEndDt
    FROM (
        VALUES
            (1, '2018-12-25', '2019-01-05'),
            (1, '2019-03-01', '2019-03-10'),
            (1, '2019-03-15', '2019-03-19'),
            (1, '2019-03-22', '2019-03-28'),
            (1, '2019-03-30', '2019-04-02'),
            (1, '2019-04-10', '2019-04-15'),
            (1, '2019-04-18', '2019-04-25')
    ) AS a1 (ProductId, ProductStartDt, ProductEndDt)
),
OrderedProduct AS (
    -- Explicit columns instead of SELECT *; RowNum drives the recursion.
    SELECT
        ProductId,
        ProductStartDt,
        ProductEndDt,
        ROW_NUMBER() OVER (ORDER BY ProductStartDt) AS RowNum
    FROM Product
),
DateGroupsInterim (RowNum, GroupNum, GrpStartDt, Indx) AS (
    -- Anchor: the first row opens group 1.
    SELECT RowNum, 1, ProductEndDt, 1
    FROM OrderedProduct
    WHERE RowNum = 1

    UNION ALL

    -- Recursive step: visit the next row and compare it with the current anchor.
    SELECT
        op.RowNum,
        CASE WHEN op.ProductStartDt <= DATEADD(day, 15, dgi.GrpStartDt)
             THEN dgi.GroupNum
             ELSE dgi.GroupNum + 1
        END,
        CASE WHEN op.ProductStartDt <= DATEADD(day, 15, dgi.GrpStartDt)
             THEN dgi.GrpStartDt
             ELSE op.ProductEndDt
        END,
        CASE WHEN op.ProductStartDt <= DATEADD(day, 15, dgi.GrpStartDt)
             THEN 0
             ELSE 1
        END
    FROM DateGroupsInterim AS dgi
    INNER JOIN OrderedProduct AS op
        ON op.RowNum = dgi.RowNum + 1
)
SELECT
    op.ProductId,
    op.ProductStartDt,
    op.ProductEndDt,
    dgi.GrpStartDt,
    dgi.GroupNum,
    dgi.Indx
FROM DateGroupsInterim AS dgi
INNER JOIN OrderedProduct AS op
    ON op.RowNum = dgi.RowNum
-- ordered by name; this is the column the positional "order by 2" referred to
ORDER BY op.ProductStartDt
Below is how the expected output looks like.
The operation you want to do is also called "sessionization" - which is the operation of splitting a time series into groups/ sub time series that have a certain meaning together.
The way you describe it, it does not seem to be possible:
The next group depends both on the anchor of its previous group (15 days after the anchor date carried from the first row of the previous group) and on the end of the previous group's last row. This needs a loop or a recursion, which is not offered by Vertica.
I managed to join the table with itself and get a session id for consecutive rows within 15 days of each other. But, as of now, the groups overlap, and I found no way to determine which ones to keep...
Like so:
-- Pairs every product row (a) with itself and with each later row (b) that
-- starts within 15 days after a's end date, then numbers the rows by a's start
-- date. The resulting groups overlap; this is a partial step, not a solution.
WITH product (productid, productstartdt, productenddt) AS (
              SELECT 1, DATE '2018-12-25', DATE '2019-01-05'
    UNION ALL SELECT 1, DATE '2019-03-01', DATE '2019-03-10'
    UNION ALL SELECT 1, DATE '2019-03-15', DATE '2019-03-19'
    UNION ALL SELECT 1, DATE '2019-03-22', DATE '2019-03-28'
    UNION ALL SELECT 1, DATE '2019-03-30', DATE '2019-04-02'
    UNION ALL SELECT 1, DATE '2019-04-10', DATE '2019-04-15'
    UNION ALL SELECT 1, DATE '2019-04-18', DATE '2019-04-25'
),
groups AS (
    SELECT
        a.productstartdt AS in_productstartdt,
        b.productid,
        b.productstartdt,
        b.productenddt,
        -- grp changes whenever a's start date changes within the product
        CONDITIONAL_CHANGE_EVENT(a.productstartdt)
            OVER (PARTITION BY a.productid ORDER BY a.productstartdt) AS grp
    FROM product a
    LEFT JOIN product b
        ON b.productid = a.productid
       AND b.productstartdt >= a.productstartdt
       AND (
               b.productstartdt = a.productstartdt
            OR b.productstartdt <= a.productenddt + 15
           )
)
SELECT
    in_productstartdt,
    productid,
    productstartdt,
    productenddt,
    grp
FROM groups;
-- out in_productstartdt | productid | productstartdt | productenddt | grp
-- out -------------------+-----------+----------------+--------------+-----
-- out 2018-12-25 | 1 | 2018-12-25 | 2019-01-05 | 0
-- out 2019-03-01 | 1 | 2019-03-01 | 2019-03-10 | 1
-- out 2019-03-01 | 1 | 2019-03-22 | 2019-03-28 | 1
-- out 2019-03-01 | 1 | 2019-03-15 | 2019-03-19 | 1
-- out 2019-03-15 | 1 | 2019-03-15 | 2019-03-19 | 2
-- out 2019-03-15 | 1 | 2019-03-22 | 2019-03-28 | 2
-- out 2019-03-15 | 1 | 2019-03-30 | 2019-04-02 | 2
-- out 2019-03-22 | 1 | 2019-03-22 | 2019-03-28 | 3
-- out 2019-03-22 | 1 | 2019-03-30 | 2019-04-02 | 3
-- out 2019-03-22 | 1 | 2019-04-10 | 2019-04-15 | 3
-- out 2019-03-30 | 1 | 2019-04-10 | 2019-04-15 | 4
-- out 2019-03-30 | 1 | 2019-03-30 | 2019-04-02 | 4
-- out 2019-04-10 | 1 | 2019-04-10 | 2019-04-15 | 5
-- out 2019-04-10 | 1 | 2019-04-18 | 2019-04-25 | 5
-- out 2019-04-18 | 1 | 2019-04-18 | 2019-04-25 | 6
-- out (15 rows)
-- out
-- out Time: First fetch (15 rows): 35.454 ms. All rows formatted: 35.503 ms
The next difficulty is how to get rid of groups 2, 3, and 5...
I'm trying to do some transformations on my input flat file. The real problem I am facing is that my input file consists of 111 fields, so how can I do the transformation for that many fields?
I have the option of using UDFs, but how could I pass those 111 fields to my UDF? Is there any way to pass all the fields of my table to a UDF?
This is my input file
A|Adding||Testing|DV005| |7425478987|10 | |Jayendran | |Arumugam |V| |MALE|19711028|101 |N|01| |Candy| |1312 WEST 10TH STREET | |AUSTIN |TX| |78703 |840 | |5127768623| |8009238-12345678912|A|B|H|01500|03000|Chocalates |8009238||RAPID 7 LLC |20130501|00000000| |000| | | | | | | | | | | |N |BUS|20150901|20160831|0000000000|0000000001| |8009238-999940185-002348025-CAR|960230702-CAR-002348025-20150901|Y |CAR|20160531|20160730|0000000011|0000001321|8009238-999940185-002348025-TRAIN|960230702-TRAIN-002348025-20150901|N |TRAIN|20150901|20160831|0000000000|0000000000| | |N |VAN|20150901|20160831| |0000000000|0000000000| | | |N |TRUCK|20150101|20991231| | |N |JEEP| | |0000000000|0000000000| | |Y |PLANE|20150901|20160831| |20160319002530000001
Here's my sample output
Testing DV005 JayendranArumugam MALE
CAR2016053120160730
TRAIN0000000000000000
VAN0000000000000000
TRUCK0000000000000000
JEEP0000000000000000
PLANE2015090120160831
Please help me here to find my solution
Thanks in advance
Jay
-- External table exposing the raw flat file through a single STRING column,
-- so that each record can be split into fields at query time.
-- NOTE(review): 'serialization.last.column.takes.rest' presumably keeps the
-- whole line in rec -- confirm against the Hive SerDe documentation.
CREATE EXTERNAL TABLE mytable (
    rec STRING
)
LOCATION '/... put the location here ...'
TBLPROPERTIES ('serialization.last.column.takes.rest' = 'true');
-- Turns each input record into seven output lines: one header line followed
-- by six lines that each start with the value of one field. Such a line
-- carries the two fields after its flag when the flag field is 'Y', and
-- sixteen zeros otherwise. f is the record split on '|' with the surrounding
-- whitespace removed, and its indexes are zero-based.
SELECT
    EXPLODE(
        ARRAY(
            CONCAT_WS(' ', f[3], f[4], CONCAT(f[9], f[11]), f[14]),
            CONCAT(f[67],  CASE WHEN f[66]  = 'Y' THEN CONCAT(f[68],  f[69])  ELSE '0000000000000000' END),
            CONCAT(f[75],  CASE WHEN f[74]  = 'Y' THEN CONCAT(f[76],  f[77])  ELSE '0000000000000000' END),
            CONCAT(f[83],  CASE WHEN f[82]  = 'Y' THEN CONCAT(f[84],  f[85])  ELSE '0000000000000000' END),
            CONCAT(f[93],  CASE WHEN f[92]  = 'Y' THEN CONCAT(f[94],  f[95])  ELSE '0000000000000000' END),
            CONCAT(f[99],  CASE WHEN f[98]  = 'Y' THEN CONCAT(f[100], f[101]) ELSE '0000000000000000' END),
            CONCAT(f[107], CASE WHEN f[106] = 'Y' THEN CONCAT(f[108], f[109]) ELSE '0000000000000000' END)
        )
    )
FROM (
    SELECT SPLIT(rec, '\\s*\\|\\s*') AS f
    FROM mytable
) t
;
+--------------------------------------+
| col |
+--------------------------------------+
| Testing DV005 JayendranArumugam MALE |
| CAR2016053120160730 |
| TRAIN0000000000000000 |
| VAN0000000000000000 |
| TRUCK0000000000000000 |
| JEEP0000000000000000 |
| PLANE2015090120160831 |
+--------------------------------------+
Having a difficult time phrasing this question. Let me know if there's a better title.
I have a query that produces data like this:
+----------+----------+----------+----------+----------+
| KEY | FEB_GRP1 | JAN_GRP1 | FEB_GRP2 | JAN_GRP2 |
+----------+----------+----------+----------+----------+
| 50840992 | 1 | 1 | 0 | 0 |
| 50840921 | 0 | 1 | 1 | 0 |
| 50848995 | 0 | 0 | 0 | 0 |
+----------+----------+----------+----------+----------+
Alternatively, I can produce data like this:
+----------+------+------+
| KEY | JAN | FEB |
+----------+------+------+
| 50840992 | <50 | ~<50 |
| 50840921 | <50 | <50 |
| 50848995 | ~<50 | ~<50 |
| 50840885 | <50 | <50 |
+----------+------+------+
Where <50 should be counted as "group 1" and ~<50 should be counted as "group 2".
And I want it to be like this:
+-------+------+------+
| MONTH | GRP1 | GRP2 |
+-------+------+------+
| JAN | 2 | 0 |
| FEB | 1 | 1 |
+-------+------+------+
I can already get JAN_GRP1_SUM just by summing JAN_GRP1, but I want that to just be a data point, not a column itself.
My query (generates the first diagram):
-- One row per key with a 0/1 flag per (month, group):
-- group 1 when the month's value is exactly '<50', group 2 for any other
-- non-NULL value. A NULL value yields 0 in both flags.
SELECT
    key,
    CASE WHEN "FEB-1-2016" =  '<50' THEN 1 ELSE 0 END AS feb_grp1,
    CASE WHEN "FEB-1-2016" <> '<50' THEN 1 ELSE 0 END AS feb_grp2,
    CASE WHEN "JAN-1-2016" =  '<50' THEN 1 ELSE 0 END AS jan_grp1,
    CASE WHEN "JAN-1-2016" <> '<50' THEN 1 ELSE 0 END AS jan_grp2
FROM my_table;
Your data model doesn't make much sense, but from what you've shown you can do:
-- One output row per month, built as one UNION ALL branch per month column.
-- COUNT skips the NULL produced when a CASE has no matching branch, so each
-- COUNT is the number of rows meeting its condition.
SELECT
    'JAN' AS month,
    COUNT(CASE WHEN "JAN-1-2016" =  '<50' THEN 1 END) AS grp1,
    COUNT(CASE WHEN "JAN-1-2016" <> '<50' THEN 1 END) AS grp2
FROM my_table

UNION ALL

SELECT
    'FEB' AS month,
    COUNT(CASE WHEN "FEB-1-2016" =  '<50' THEN 1 END) AS grp1,
    COUNT(CASE WHEN "FEB-1-2016" <> '<50' THEN 1 END) AS grp2
FROM my_table;
That doesn't scale well - if you have more months you need to add another union branch for each one.
If your query is based on a view or a previously calculated summary then it will probably be much easier to go back to the original data.
If you are stuck with this then another possible approach, which might be more manageable if you actually have more than two months to look at, could be to unpivot the data:
-- Rotates the per-month columns into rows: each source row yields one row per
-- listed month column, with the cell in "value" and a DATE tag in "month".
SELECT *
FROM my_table
UNPIVOT (
    value FOR month IN (
        "JAN-1-2016" AS DATE '2016-01-01',
        "FEB-1-2016" AS DATE '2016-02-01'  --, etc. for other months
    )
);
and then aggregate that:
-- Unpivots the month columns into rows, then counts per month how many values
-- are '<50' (grp1) and how many are anything else (grp2).
WITH unpivoted AS (
    SELECT *
    FROM my_table
    UNPIVOT (
        value FOR month IN (
            "JAN-1-2016" AS DATE '2016-01-01',
            "FEB-1-2016" AS DATE '2016-02-01'  --, etc. for other months
        )
    )
)
SELECT
    TO_CHAR(month, 'MON', 'NLS_DATE_LANGUAGE=ENGLISH') AS month,
    COUNT(CASE WHEN value =  '<50' THEN 1 END)         AS grp1,
    COUNT(CASE WHEN value <> '<50' THEN 1 END)         AS grp2
FROM unpivoted
GROUP BY month;
Still not pretty, and I think Oracle is doing pretty much the same thing under the hood, but there are fewer case expressions to create and maintain - the drudge part is the unpivot pairs. You might need to include the year in the 'month' field, depending on the range of data you have.
Let's say I have the following hive table as input, let's call it connections:
userid | timestamp
--------|-------------
1 | 1433258019
1 | 1433258020
2 | 1433258080
2 | 1433258083
2 | 1433258088
2 | 1433258170
[...] | [...]
With the following query:
-- Flags each connection as starting a new session when more than 60 seconds
-- have passed since the same user's previous connection.
-- A user's first row has no predecessor, so LAG falls back to 0 and timediff
-- equals the raw timestamp, which marks that row as 'new_session'.
--
-- timediff is computed in an inner query because a select-list alias cannot be
-- referenced by another expression at the same level. The missing comma after
-- the timediff column and the WINDOW clause syntax (w AS (...)) are fixed too.
SELECT
    userid,
    timestamp,
    timediff,
    CASE
        WHEN timediff > 60 THEN 'new_session'
        ELSE 'same_session'
    END AS session_state
FROM (
    SELECT
        userid,
        timestamp,
        timestamp - LAG(timestamp, 1, 0) OVER w AS timediff
    FROM connections
    WINDOW w AS (
        PARTITION BY userid
        ORDER BY timestamp ASC
    )
) diffs;
I'm generating the following output:
userid | timestamp | timediff | session_state
--------|-------------|------------|---------------
1 | 1433258019 | 1433258019 | new_session
1 | 1433258020 | 1 | same_session
2 | 1433258080 | 1433258080 | new_session
2 | 1433258083 | 3 | same_session
2 | 1433258088 | 5 | same_session
2 | 1433258170 | 82 | new_session
[...] | [...] | [...] | [...]
How would I do to generate that:
userid | timestamp | timediff | sessionid
--------|-------------|------------------------------
1 | 1433258019 | 1433258019 | user1-session-1
1 | 1433258020 | 1 | user1-session-1
2 | 1433258080 | 1433258080 | user2-session-1
2 | 1433258083 | 3 | user2-session-1
2 | 1433258088 | 5 | user2-session-1
2 | 1433258170 | 82 | user2-session-2
[...] | [...] | [...] | [...]
Is that possible using only HQL and "famous" UDFs (I'd rather not use custom UDFs or reducer scripts) ?
Interesting question. Per your comment to @Madhu, I added the line 2 1433258172 to your example. What you need is a counter that increments every time timediff > 60 is satisfied. The easiest way to do this is to flag those rows and then take a cumulative sum of the flag over the window.
Query:
-- Builds 'user<userid>-session-<n>' where n is the running count, per user in
-- timestamp order, of rows whose gap to the previous row exceeds 60 seconds.
-- A user's first row is compared against 0, so it always counts and n starts at 1.
SELECT
    userid,
    timestamp,
    CONCAT('user', userid, '-session-', s_sum) AS sessionid
FROM (
    -- running total of the session-start flags
    SELECT
        userid,
        timestamp,
        SUM(counter) OVER (
            PARTITION BY userid
            ORDER BY timestamp ASC
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS s_sum
    FROM (
        -- 1 marks the first row of a session
        SELECT
            userid,
            timestamp,
            CASE WHEN timediff > 60 THEN 1 ELSE 0 END AS counter
        FROM (
            -- gap to the same user's previous connection
            SELECT
                userid,
                timestamp,
                timestamp - LAG(timestamp, 1, 0) OVER (
                    PARTITION BY userid
                    ORDER BY timestamp ASC
                ) AS timediff
            FROM connections
        ) x
    ) y
) z
Output:
1 1433258019 user1-session-1
1 1433258020 user1-session-1
2 1433258080 user2-session-1
2 1433258083 user2-session-1
2 1433258088 user2-session-1
2 1433258170 user2-session-2
2 1433258172 user2-session-2
Use the following
select concat_ws('-',name, city) from employee; The first parameter of concat_ws is the separator; name and city are column names in the employee table. Make sure that they are of type string. You can look here for more.
This works:
-- Assigns 'user<userid>-session-<n>' per connection, where a gap of more than
-- 60 seconds since the same user's previous connection starts a new session.
--
-- The session number must count how many such gaps have occurred so far.
-- Deriving it from a single gap (CAST(timediff / 60 AS INT) + 1) drops back to
-- session 1 on the next short gap and skips numbers after long gaps.
-- Here each gap is flagged and the flags are accumulated per user.
SELECT
    userid,
    timestamp,
    timediff,
    CONCAT(
        'user',
        CAST(userid AS STRING),
        '-',
        'session-',
        CAST(
            -- the first row has timediff 0 and no flag, hence the leading 1
            1 + SUM(is_new_session) OVER (
                    PARTITION BY userid
                    ORDER BY timestamp ASC
                    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
                )
            AS STRING
        )
    ) AS session_id
FROM (
    SELECT
        userid,
        timestamp,
        timediff,
        CASE WHEN timediff > 60 THEN 1 ELSE 0 END AS is_new_session
    FROM (
        SELECT
            userid,
            timestamp,
            -- LAG defaults to the row's own timestamp, so a user's first
            -- connection gets timediff 0
            timestamp - LAG(timestamp, 1, timestamp) OVER w AS timediff
        FROM connections
        WINDOW w AS (
            PARTITION BY userid
            ORDER BY timestamp ASC
        )
    ) diffs
) flagged;
OUTPUT:
userid timestamp timediff session_state
1 1433258019 0.0 user1-session-1
1 1433258020 1.0 user1-session-1
2 1433258080 0.0 user2-session-1
2 1433258083 3.0 user2-session-1
2 1433258088 5.0 user2-session-1
2 1433258170 82.0 user2-session-2
3 1433258270 0.0 user3-session-1
you can try something like this if timediff is not required:
select userid,timestamp ,session_count+ concat('user',userid,'-','session-',cast(LAG(session_count-1,1,0) over w1 as string)) AS session_state
--LAG(session_count-1,1,0) over w1 AS session_count_new
FROM
(select
userid,
timestamp,
timediff,
cast (timediff/60 as int)+1 as session_count