Related
Trying to query a json hive table built on top of json data. Using json2Hive was able to generate DDL and was able to create table after removing unnecessary fields.
create external table user_tables.sample_json_table (
`apps` struct<
`app`: array<struct<
`id`: string,
`queue`: string,
`finalstatus`: string,
`trackingurl`: string,
`applicationtype`: string,
`applicationtags`: string,
`startedtime`: string,
`launchtime`: string,
`finishedtime`: string,
`memoryseconds`: string,
`vcoreseconds`: string,
`resourcesecondsmap`: struct<
`entry`: struct<
`key`: string,
`value`: string
>
>
>
>
>
)
row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'
location '/xyz/location/;
Now, stuck trying to figure out how to query each field from the below schema ?
checked several articles but all of them are case specific, and need a generic explanation or example how to query each field under array/struct :)
I only care about the multiple 'app' subsection entries and would like them to be imported onto another table with separate fields for each fields.
Sample json data:
{"apps":{"app":[{"id":"application_282828282828_12717","user":"xyz","name":"xyz-4b6bdae2-1a0c-4772-bd8e-0d7454268b82","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12717/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"ABC,xyz_20221107070124_2beb5d90-24c7-4b1b-b977-3c9af1397195,userid=dummy","priority":0,"startedtime":1667822485626,"launchtime":1667822485767,"finishedtime":1667822553365,"elapsedtime":67739,"amcontainerlogs":"http://dingdong:8042/node/containerlogs/container_e65_282828282828_12717_01_000001/xyz","amhosthttpaddress":"dingdong:8042","amrpcaddress":"dingdong:46457","masternodeid":"dingdong:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":1264304,"vcoreseconds":79,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"1264304"},"entry":{"key":"vcores","value":"79"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12724","user":"xyz","name":"xyz-94962a3e-d230-4fd0-b68b-01b59dd3299d","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12724/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"ZZZ_,xyz_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy","priority":0,"startedtime":1667822585231,"launchtime":1667822585437,"finishedtime":1667822631435,"elapsedtime":46204,"amcontainerlogs":"http://ding:8042/node/containerlogs/container_e65_282828282828_12724_01_000002/xyz","amhosthttpaddress":"ding:8042","amrpcaddress":"ding:46648","masternodeid":"ding:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":5603339,"vcoreseconds":430,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"5603339"},"entry":{"key":"vcores","value":"430"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"time_out","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12736","user":"xyz","name":"xyz-1a9c73ef-2992-40a5-aaad-9f0688bb04f4","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12736/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"BLAHBLAH,xyz_20221107070609_8d261352-3efa-46c5-a5a0-8a3cd745d180,userid=dummy","priority":0,"startedtime":1667822771170,"launchtime":1667822773663,"finishedtime":1667822820351,"elapsedtime":49181,"amcontainerlogs":"http://dong:8042/node/containerlogs/container_e65_282828282828_12736_01_000001/xyz","amhosthttpaddress":"dong:8042","amrpcaddress":"dong:34266","masternodeid":"dong:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":1300011,"vcoreseconds":89,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"1300011"},"entry":{"key":"vcores","value":"89"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12735","user":"xyz","name":"xyz-d5f56a0a-9c6b-4651-8f88-6eaff5953777","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12735/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"HAHAHA_,xyz_20221107070605_a082d9d8-912f-4278-a2ef-5dfe66089fd7,userid=dummy","priority":0,"startedtime":1667822766897,"launchtime":1667822766999,"finishedtime":1667822796759,"elapsedtime":29862,"amcontainerlogs":"http://dung:8042/node/containerlogs/container_e65_282828282828_12735_01_000001/xyz","amhosthttpaddress":"dung:8042","amrpcaddress":"dung:42765","masternodeid":"dung:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":669695,"vcoreseconds":44,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"669695"},"entry":{"key":"vcores","value":"44"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}}]}}
sample query output :
id | queue | finalStatus | trackingurl |....
-----------------------------------------------------------
application_282828282828_12717 | root.users.dummy | succeeded | ...
application_282828282828_12724 | root.users.dummy2 | failed | ....
For anyone looking to perform something similar ,I found this article very helpful with clear explanation: https://community.cloudera.com/t5/Support-Questions/Complex-Json-transformation-using-Hive-functions/m-p/236476
Below is the query to parse using LATERAL VIEW EXPLODE in case people on the same boat:
select ex1.* from user_tables.sample_json_table cym LATERAL VIEW OUTER inline(cym.apps.app) ex1;
| id | queue | finalstatus | trackingurl | applicationtype | applicationtags | startedtime | launchtime | finishedtime | memoryseconds | vcoreseconds | resourcesecondsmap |
| ------------------------------- | ----------------- | ----------- | ------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------- | ------------- | ------------- | ------------- | ------------ | ---------------------------------------- |
| application_1667627410794_12717 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12717/ | tez | \_xyz,test-app-24c7-4b1b-b977-3c9af1397195,userid=dummy1 | 1667822485626 | 1667822485767 | 1667822553365 | 1264304 | 79 | {"entry":{"key":"vcores","value":"79"}} |
| application_1667627410794_12724 | root.users.dummy3 | succeeded | http://dang:8088/proxy/application_1667627410794_12724/ | tez | \_generate_stuff,hive_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy3 | 1667822585231 | 1667822585437 | 1667822631435 | 5603339 | 430 | {"entry":{"key":"vcores","value":"430"}} |
| application_1667627410794_12736 | root.users.dummy1 | succeeded | http://dang:8088/proxy/application_1667627410794_12736/ | tez | \_sample_job,test-zzz-3efa-46c5-a5a0-8a3cd745d180,userid=dummy1 | 1667822771170 | 1667822773663 | 1667822820351 | 1300011 | 89 | {"entry":{"key":"vcores","value":"89"}} |
| application_1667627410794_12735 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12735/ | tez | \_mixed_article,placebo_2-912f-4278-a2ef-5dfe66089fd7,userid=dummy2 | 1667822766897 | 1667822766999 | 1667822796759 | 669695 | 44 | {"entry":{"key":"vcores","value":"44"}} |
Add. Note: Although my requirement no longer needs it, but If anyone can suggest how to further parse the last field resourcesecondsmap to populate map key value would be great to know! basically use key value as field and value as actual value in field:
Desired Output:
| id | queue | finalstatus | trackingurl | applicationtype | applicationtags | startedtime | launchtime | finishedtime | memoryseconds | vcoreseconds | vcores-value |
| ------------------------------- | ----------------- | ----------- | ------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------- | ------------- | ------------- | ------------- | ------------ | ------------ |
| application_1667627410794_12717 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12717/ | tez | \_xyz,test-app-24c7-4b1b-b977-3c9af1397195,userid=dummy1 | 1667822485626 | 1667822485767 | 1667822553365 | 1264304 | 79 | 79 |
| application_1667627410794_12724 | root.users.dummy3 | succeeded | http://dang:8088/proxy/application_1667627410794_12724/ | tez | \_generate_stuff,hive_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy3 | 1667822585231 | 1667822585437 | 1667822631435 | 5603339 | 430 | 430 |
Yesterday, I was create my first datasource Druid from Hive. Today, I'm not sure that works...
First, I ran the following code for create my Db :
SET hive.druid.broker.address.default = 10.20.173.30:8082;
SET hive.druid.metadata.username = druid;
SET hive.druid.metadata.password = druid_password;
SET hive.druid.metadata.db.type = postgresql;
SET hive.druid.metadata.uri = jdbc:postgresql://10.20.173.31:5432/druid;
CREATE EXTERNAL TABLE test (
`__time` TIMESTAMP,
`userId` STRING,
`lang` STRING,
`location` STRING,
`name` STRING
)
STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
I can see this datasource on my Hive architecture. How can I know that this datasource is a Druid Datasource and not a Hive table.
I tested this but I don't know if it's a Druid datasource.
DESCRIBE FORMATTED test;
Result
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
| col_name | data_type | comment |
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
| # col_name | data_type | comment |
| __time | timestamp | from deserializer |
| userid | string | from deserializer |
| lang | string | from deserializer |
| location | string | from deserializer |
| name | string | from deserializer |
| # Detailed Table Information | NULL | NULL |
| Database: | druid_datasources | NULL |
| OwnerType: | USER | NULL |
| Owner: | hive | NULL |
| CreateTime: | Tue Oct 15 12:42:22 CEST 2019 | NULL |
| LastAccessTime: | UNKNOWN | NULL |
| Retention: | 0 | NULL |
| Location: | hdfs://10.20.173.30:8020/warehouse/tablespace/external/hive/druid_datasources.db/test | NULL |
| Table Type: | EXTERNAL_TABLE | NULL |
| Table Parameters: | NULL | NULL |
| | COLUMN_STATS_ACCURATE | {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"__time\":\"true\",\"lang\":\"true\",\"location\":\"true\",\"name\":\"true\",\"userid\":\"true\"}} |
| | EXTERNAL | TRUE |
| | bucketing_version | 2 |
| | druid.datasource | druid_datasources.test ||
| | numFiles | 0 |
| | numRows | 0 |
| | rawDataSize | 0 |
| | storage_handler | org.apache.hadoop.hive.druid.DruidStorageHandler |
| | totalSize | 0 |
| | transient_lastDdlTime | 1571136142 |
| | NULL | NULL |
| # Storage Information | NULL | NULL |
| SerDe Library: | org.apache.hadoop.hive.druid.serde.DruidSerDe | NULL |
| InputFormat: | null | NULL |
| OutputFormat: | null | NULL |
| Compressed: | No | NULL |
| Num Buckets: | -1 | NULL |
| Bucket Columns: | [] | NULL |
| Sort Columns: | [] | NULL |
| Storage Desc Params: | NULL | NULL |
| | serialization.format | 1 |
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
I did well or it's a Hive table with Druid parameters ?
Someone can explain me more about Hive/Druid interactions ?
Thanks :D
I think you registered your druid datasource in hive. Now you can run your queries using hive server on top of this table.
Your table definition look correct to me I think you managed to integrate druid datasoruce with hive. You can see druid related properties in table.
Now when you query the table it will use processing engine depending on the query it will use hive server along with druid. It can use combination of both or one of them on standalone basis to execute query. It depends whether that query can be converted to druid query or not.
You can refer to this doc for more info on Hive/Druid interactions : https://cwiki.apache.org/confluence/display/Hive/Druid+Integration (refer:Querying Druid from Hive)
I'm considering whether I should format a table in my sqlite database in "wide or "long" format. Examples of these formats are included at the end of the question.
I anticipate that the majority of my requests will be of the form:
SELECT * FROM table
WHERE
series in (series1, series100);
or the analog for selecting by columns in wide format.
I also anticipate that there will be a large number of columns, even enough to need to increase the column limit.
Are there any general guidelines for selecting a table layout that will optimize query performance for this sort of case?
(Examples of each)
"Wide" format:
| date | series1 | series2 | ... | seriesN |
| ---------- | ------- | ------- | ---- | ------- |
| "1/1/1900" | 15 | 24 | 43 | 23 |
| "1/2/1900" | 15 | null | null | 23 |
| ... | 15 | null | null | 23 |
| "1/2/2019" | 12 | 12 | 4 | null |
"Long" format:
| date | series | value |
| ---------- | ------- | ----- |
| "1/1/1900" | series1 | 15 |
| "1/2/1900" | series1 | 15 |
| ... | series1 | 43 |
| "1/2/2019" | series1 | 12 |
| "1/1/1900" | series2 | 15 |
| "1/2/1900" | series2 | 15 |
| ... | series2 | 43 |
| "1/2/2019" | series2 | 12 |
| ... | ... | ... |
| "1/1/1900" | seriesN | 15 |
| "1/2/1900" | seriesN | 15 |
| ... | seriesN | 43 |
| "1/2/2019" | seriesN | 12 |
The "long" format is the preferred way to go here, for so many reasons. First, if you use the "wide" format and there is ever a need to add more series, then you would have to add new columns to the database table. While this is not too much of a hassle, in general once you put a schema into production, you want to avoid further schema changes.
Second, the "long" format makes reporting and querying much easier. For example, suppose you wanted to get a count of rows/data points for each series. Then you would only need something like:
SELECT series, COUNT(*) AS cnt
FROM yourTable
GROUP BY series;
To get this report with the "wide" format, you would need a lot more code, and it would be as verbose as your sample data above.
The thing to keep in mind here is that SQL databases are built to operate on sets of records (read: across rows). They can also process things column wise, but they are not generally setup to do this.
I am learning hive and read an article about when to use HIVE external table and mentioned the statement below.
To query data stored in external system such as amazon s3
- Avoid brining in that data into HDFS
Can anyone elaborate above statement. "Avoid brining in that data into HDFS"? Load data local command will help to load local file into HDFS and HIVE is applying the format on the top.
Is it possible to access the data which is out of HDFS?
is it possible to access the data which is out of HDFS?
HIve can read data on any Hadoop Compatible filesystem, not only HDFS.
Can someone elaborate above statement. "Avoid brining in that data into HDFS "?
With the example of S3, you can create an external table with a location of s3a://bucket/path, there's no need to bring it to HDFS unless you really needed the speed of reading HDFS compared to S3. However, to persist a dataset in an ephemeral cloud cluster, results should be written back to whatever long-term storage is provided.
It is possible. You can try this yourself. On CDH, I have a file extn\t.txt
[cloudera#quickstart ~]$ pwd
/home/cloudera
[cloudera#quickstart ~]$ cat extn/t.txt
something
[cloudera#quickstart ~]$
I can now create an external table to access this file as follows
create external table tbl(line string)
location 'file:///home/cloudera/extn'
Describe table
INFO : OK
+-----------+------------+----------+--+
| col_name | data_type | comment |
+-----------+------------+----------+--+
| line | string | |
+-----------+------------+----------+--+
1 row selected (0.152 seconds)
0: jdbc:hive2://localhost:10000>
Select
INFO : OK
+------------+--+
| tbl.line |
+------------+--+
| something |
+------------+--+
1 row selected (0.134 seconds)
0: jdbc:hive2://localhost:10000>
Describe formatted
+-------------------------------+----------------------------------------------------+-----------------------+--+
| col_name | data_type | comment |
+-------------------------------+----------------------------------------------------+-----------------------+--+
| # col_name | data_type | comment |
| | NULL | NULL |
| line | string | |
| | NULL | NULL |
| # Detailed Table Information | NULL | NULL |
| Database: | default | NULL |
| Owner: | cloudera | NULL |
| CreateTime: | Tue Feb 20 12:49:25 PST 2018 | NULL |
| LastAccessTime: | UNKNOWN | NULL |
| Protect Mode: | None | NULL |
| Retention: | 0 | NULL |
| Location: | file:/home/cloudera/extn | NULL |
| Table Type: | EXTERNAL_TABLE | NULL |
| Table Parameters: | NULL | NULL |
| | COLUMN_STATS_ACCURATE | false |
| | EXTERNAL | TRUE |
| | numFiles | 0 |
| | numRows | -1 |
| | rawDataSize | -1 |
| | totalSize | 0 |
| | transient_lastDdlTime | 1519159765 |
| | NULL | NULL |
| # Storage Information | NULL | NULL |
| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
| InputFormat: | org.apache.hadoop.mapred.TextInputFormat | NULL |
| OutputFormat: | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL |
| Compressed: | No | NULL |
| Num Buckets: | -1 | NULL |
| Bucket Columns: | [] | NULL |
| Sort Columns: | [] | NULL |
| Storage Desc Params: | NULL | NULL |
| | serialization.format | 1 |
+-------------------------------+----------------------------------------------------+-----------------------+
Load data is different. Please check this External Table vs Load Data
I have huge dataset with 1000 columns stored on HDFS. I want to create a hive table to filter and work on the data.
CREATE EXTERNAL TABLE IF NOT EXISTS tablename(
var1 INT,var2 STRING, var2 STRING)
COMMENT 'testbykasa'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/folder1/';
For smaller no. of columns(~ 5-10), I manually specify the column name and column type. Is there a way to get hive create the table by inferring the column name and datatype, without manually specifying it.
Demo
mydata.csv
2,2,8,1,5,1,8,1,4,1,3,4,9,2,8,2,6,5,3,1,5,5,8,0,1,6,0,7,1,4
2,6,8,7,7,9,9,3,8,7,3,1,9,1,7,5,9,7,1,2,5,7,0,5,1,2,6,4,0,4
0,0,1,3,6,5,6,2,4,2,4,9,0,4,9,8,1,0,2,8,4,7,8,3,9,7,8,9,5,5
3,4,9,1,8,7,4,2,1,0,4,3,1,4,6,6,7,4,9,9,6,7,9,5,2,2,8,0,2,9
3,4,8,9,9,1,5,2,7,4,7,1,4,9,8,9,3,3,2,3,3,5,4,8,6,5,8,8,6,4
4,0,6,9,3,2,4,2,9,4,6,8,8,2,6,7,1,7,3,1,6,6,5,2,9,9,4,6,9,7
7,0,9,3,7,6,5,5,7,2,4,2,7,4,6,1,0,9,8,2,5,7,1,4,0,4,3,9,4,3
2,8,3,7,7,3,3,6,9,3,5,5,0,7,5,3,6,2,9,0,8,2,3,0,6,2,4,3,2,6
3,2,0,8,8,8,1,8,4,0,5,2,5,0,2,0,4,1,2,2,1,0,2,8,6,7,2,2,7,0
0,5,9,1,0,3,1,9,3,6,2,1,5,0,6,6,3,8,2,8,0,0,1,9,1,5,5,2,4,8
create external table mycsv (rec string)
row format delimited
stored as textfile
tblproperties ('serialization.last.column.takes.rest'='true')
;
select pe.pos + 1 as col
,count(distinct pe.val) as count_distinct_val
from mycsv
lateral view posexplode(split(rec,',')) pe
group by pe.pos
;
+------+---------------------+
| col | count_distinct_val |
+------+---------------------+
| 1 | 5 |
| 2 | 6 |
| 3 | 6 |
| 4 | 5 |
| 5 | 7 |
| 6 | 8 |
| 7 | 7 |
| 8 | 7 |
| 9 | 6 |
| 10 | 7 |
| 11 | 6 |
| 12 | 7 |
| 13 | 7 |
| 14 | 6 |
| 15 | 6 |
| 16 | 9 |
| 17 | 7 |
| 18 | 9 |
| 19 | 5 |
| 20 | 6 |
| 21 | 7 |
| 22 | 5 |
| 23 | 8 |
| 24 | 7 |
| 25 | 5 |
| 26 | 6 |
| 27 | 7 |
| 28 | 8 |
| 29 | 8 |
| 30 | 8 |
+------+---------------------+
Yes, it is possible, but not with SQL script. To do this I use a Python script that read the first line of the csv file and create a script dynamically sending to Hive using the pyhive library (and erasing the first line of the csv). To identify the types, is just use Python functions to discovery if is a String, a Number etc.
The problem with Python is that it just work on Python 2.7, so I recommend you considere try to do the same code on Scala.