Trying to query a json hive table built on top of json data. Using json2Hive was able to generate DDL and was able to create table after removing unnecessary fields.
create external table user_tables.sample_json_table (
`apps` struct<
`app`: array<struct<
`id`: string,
`queue`: string,
`finalstatus`: string,
`trackingurl`: string,
`applicationtype`: string,
`applicationtags`: string,
`startedtime`: string,
`launchtime`: string,
`finishedtime`: string,
`memoryseconds`: string,
`vcoreseconds`: string,
`resourcesecondsmap`: struct<
`entry`: struct<
`key`: string,
`value`: string
>
>
>
>
>
)
row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'
location '/xyz/location/;
Now, stuck trying to figure out how to query each field from the below schema ?
checked several articles but all of them are case specific, and need a generic explanation or example how to query each field under array/struct :)
I only care about the multiple 'app' subsection entries and would like them to be imported onto another table with separate fields for each fields.
Sample json data:
{"apps":{"app":[{"id":"application_282828282828_12717","user":"xyz","name":"xyz-4b6bdae2-1a0c-4772-bd8e-0d7454268b82","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12717/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"ABC,xyz_20221107070124_2beb5d90-24c7-4b1b-b977-3c9af1397195,userid=dummy","priority":0,"startedtime":1667822485626,"launchtime":1667822485767,"finishedtime":1667822553365,"elapsedtime":67739,"amcontainerlogs":"http://dingdong:8042/node/containerlogs/container_e65_282828282828_12717_01_000001/xyz","amhosthttpaddress":"dingdong:8042","amrpcaddress":"dingdong:46457","masternodeid":"dingdong:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":1264304,"vcoreseconds":79,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"1264304"},"entry":{"key":"vcores","value":"79"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12724","user":"xyz","name":"xyz-94962a3e-d230-4fd0-b68b-01b59dd3299d","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12724/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"ZZZ_,xyz_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy","priority":0,"startedtime":1667822585231,"launchtime":1667822585437,"finishedtime":1667822631435,"elapsedtime":46204,"amcontainerlogs":"http://ding:8042/node/containerlogs/container_e65_282828282828_12724_01_000002/xyz","amhosthttpaddress":"ding:8042","amrpcaddress":"ding:46648","masternodeid":"ding:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":5603339,"vcoreseconds":430,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"5603339"},"entry":{"key":"vcores","value":"430"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"time_out","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12736","user":"xyz","name":"xyz-1a9c73ef-2992-40a5-aaad-9f0688bb04f4","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12736/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"BLAHBLAH,xyz_20221107070609_8d261352-3efa-46c5-a5a0-8a3cd745d180,userid=dummy","priority":0,"startedtime":1667822771170,"launchtime":1667822773663,"finishedtime":1667822820351,"elapsedtime":49181,"amcontainerlogs":"http://dong:8042/node/containerlogs/container_e65_282828282828_12736_01_000001/xyz","amhosthttpaddress":"dong:8042","amrpcaddress":"dong:34266","masternodeid":"dong:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":1300011,"vcoreseconds":89,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"1300011"},"entry":{"key":"vcores","value":"89"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12735","user":"xyz","name":"xyz-d5f56a0a-9c6b-4651-8f88-6eaff5953777","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12735/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"HAHAHA_,xyz_20221107070605_a082d9d8-912f-4278-a2ef-5dfe66089fd7,userid=dummy","priority":0,"startedtime":1667822766897,"launchtime":1667822766999,"finishedtime":1667822796759,"elapsedtime":29862,"amcontainerlogs":"http://dung:8042/node/containerlogs/container_e65_282828282828_12735_01_000001/xyz","amhosthttpaddress":"dung:8042","amrpcaddress":"dung:42765","masternodeid":"dung:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":669695,"vcoreseconds":44,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"669695"},"entry":{"key":"vcores","value":"44"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}}]}}
sample query output :
id | queue | finalStatus | trackingurl |....
-----------------------------------------------------------
application_282828282828_12717 | root.users.dummy | succeeded | ...
application_282828282828_12724 | root.users.dummy2 | failed | ....
For anyone looking to perform something similar ,I found this article very helpful with clear explanation: https://community.cloudera.com/t5/Support-Questions/Complex-Json-transformation-using-Hive-functions/m-p/236476
Below is the query to parse using LATERAL VIEW EXPLODE in case people on the same boat:
select ex1.* from user_tables.sample_json_table cym LATERAL VIEW OUTER inline(cym.apps.app) ex1;
| id | queue | finalstatus | trackingurl | applicationtype | applicationtags | startedtime | launchtime | finishedtime | memoryseconds | vcoreseconds | resourcesecondsmap |
| ------------------------------- | ----------------- | ----------- | ------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------- | ------------- | ------------- | ------------- | ------------ | ---------------------------------------- |
| application_1667627410794_12717 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12717/ | tez | \_xyz,test-app-24c7-4b1b-b977-3c9af1397195,userid=dummy1 | 1667822485626 | 1667822485767 | 1667822553365 | 1264304 | 79 | {"entry":{"key":"vcores","value":"79"}} |
| application_1667627410794_12724 | root.users.dummy3 | succeeded | http://dang:8088/proxy/application_1667627410794_12724/ | tez | \_generate_stuff,hive_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy3 | 1667822585231 | 1667822585437 | 1667822631435 | 5603339 | 430 | {"entry":{"key":"vcores","value":"430"}} |
| application_1667627410794_12736 | root.users.dummy1 | succeeded | http://dang:8088/proxy/application_1667627410794_12736/ | tez | \_sample_job,test-zzz-3efa-46c5-a5a0-8a3cd745d180,userid=dummy1 | 1667822771170 | 1667822773663 | 1667822820351 | 1300011 | 89 | {"entry":{"key":"vcores","value":"89"}} |
| application_1667627410794_12735 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12735/ | tez | \_mixed_article,placebo_2-912f-4278-a2ef-5dfe66089fd7,userid=dummy2 | 1667822766897 | 1667822766999 | 1667822796759 | 669695 | 44 | {"entry":{"key":"vcores","value":"44"}} |
Add. Note: Although my requirement no longer needs it, but If anyone can suggest how to further parse the last field resourcesecondsmap to populate map key value would be great to know! basically use key value as field and value as actual value in field:
Desired Output:
| id | queue | finalstatus | trackingurl | applicationtype | applicationtags | startedtime | launchtime | finishedtime | memoryseconds | vcoreseconds | vcores-value |
| ------------------------------- | ----------------- | ----------- | ------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------- | ------------- | ------------- | ------------- | ------------ | ------------ |
| application_1667627410794_12717 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12717/ | tez | \_xyz,test-app-24c7-4b1b-b977-3c9af1397195,userid=dummy1 | 1667822485626 | 1667822485767 | 1667822553365 | 1264304 | 79 | 79 |
| application_1667627410794_12724 | root.users.dummy3 | succeeded | http://dang:8088/proxy/application_1667627410794_12724/ | tez | \_generate_stuff,hive_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy3 | 1667822585231 | 1667822585437 | 1667822631435 | 5603339 | 430 | 430 |
I have a set of data ("My Data") shown below, how to shift the data from rows to columns in Power Query?
("My Preferred Answer") would be the final output.
My Data:
| FruitName | Price | Quantity |
| --------- | ----- | -------- |
| Apple | 1 | 1 |
| Banana | 2 | 1 |
| Orange | 3 | 1 |
| Colour | *null* | *null* |
| Apple | Red | *null* |
| Banana | Yellow | *null* |
| Orange | Orange | *null* |
My Preferred Answer:
| FruitName | Price | Quantity | Colour |
| --------- | ----- | -------- | ------ |
| Apple | 1 | 1 | Red |
| Banana | 2 | 1 | Yellow |
| Orange | 3 | 1 | Orange |
Read the code comments to better understand the algorithm.
Split the table at the "colour" line
Join the colours with the inventory table
M Code
let
//Change Table name in next line to the actual table name in your workbook
Source = Excel.CurrentWorkbook(){[Name="Table28"]}[Content],
//Separate table for Color and Inventory
splitTable = Table.SplitAt(Source, List.PositionOf(Source[FruitName],"Colour")),
//Remove the row with the table split word "colour"
//Remove the Quantity column
//Rename the Price column
//Set the data types
colourTable = Table.TransformColumnTypes(
Table.RenameColumns(
Table.RemoveColumns(
Table.RemoveFirstN(splitTable{1},1),"Quantity"),{"Price","Colour"}),
{{"FruitName",type text},{"Colour",type text}}),
//Set the data types
inventoryTable = Table.TransformColumnTypes(splitTable{0},{
{"FruitName", type text},
{"Price",Currency.Type},
{"Quantity", Int64.Type}
}),
//Join the colour column
joined = Table.NestedJoin(inventoryTable,"FruitName",colourTable,"FruitName","Joined",JoinKind.LeftOuter),
#"Expanded Joined" = Table.ExpandTableColumn(joined, "Joined", {"Colour"}, {"Colour"})
in
#"Expanded Joined"