query hive json serde table having nested ARRAY & STRUCT combination

query hive json serde table having nested ARRAY & STRUCT combination - hadoop

I am trying to query a Hive table built on top of JSON data. Using json2Hive, I generated the DDL and created the table after removing the fields I did not need.
-- External Hive table over raw JSON files, read through the JsonSerDe.
-- Shape: apps (struct) -> app (array of struct), one struct per application.
-- Only the fields of interest are declared; the sample JSON carries more keys
-- (e.g. user, name, state) that have no column here.
-- Every leaf is declared as string, although the sample JSON holds several of
-- them (startedtime, memoryseconds, ...) as numbers.
-- NOTE(review): the sample JSON repeats the "entry" key inside resourcesecondsmap;
-- a struct has room for a single entry, and the query output shows only the last
-- one (vcores) surviving.
create external table user_tables.sample_json_table (
  `apps` struct<
    `app`: array<struct<
      `id`: string,
      `queue`: string,
      `finalstatus`: string,
      `trackingurl`: string,
      `applicationtype`: string,
      `applicationtags`: string,
      `startedtime`: string,
      `launchtime`: string,
      `finishedtime`: string,
      `memoryseconds`: string,
      `vcoreseconds`: string,
      `resourcesecondsmap`: struct<
        `entry`: struct<
          `key`: string,
          `value`: string
        >
      >
    >
    >
  >
)
row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'
location '/xyz/location/';
Now I am stuck trying to figure out how to query each field of the schema above.
I checked several articles, but all of them are case-specific; I need a generic explanation or example of how to query each field nested under an array/struct :)
I only care about the multiple 'app' entries and would like to import them into another table with a separate column for each field.
Sample json data:
{"apps":{"app":[{"id":"application_282828282828_12717","user":"xyz","name":"xyz-4b6bdae2-1a0c-4772-bd8e-0d7454268b82","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12717/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"ABC,xyz_20221107070124_2beb5d90-24c7-4b1b-b977-3c9af1397195,userid=dummy","priority":0,"startedtime":1667822485626,"launchtime":1667822485767,"finishedtime":1667822553365,"elapsedtime":67739,"amcontainerlogs":"http://dingdong:8042/node/containerlogs/container_e65_282828282828_12717_01_000001/xyz","amhosthttpaddress":"dingdong:8042","amrpcaddress":"dingdong:46457","masternodeid":"dingdong:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":1264304,"vcoreseconds":79,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"1264304"},"entry":{"key":"vcores","value":"79"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12724","user":"xyz","name":"xyz-94962a3e-d230-4fd0-b68b-01b59dd3299d","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12724/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, 
killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"ZZZ_,xyz_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy","priority":0,"startedtime":1667822585231,"launchtime":1667822585437,"finishedtime":1667822631435,"elapsedtime":46204,"amcontainerlogs":"http://ding:8042/node/containerlogs/container_e65_282828282828_12724_01_000002/xyz","amhosthttpaddress":"ding:8042","amrpcaddress":"ding:46648","masternodeid":"ding:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":5603339,"vcoreseconds":430,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"5603339"},"entry":{"key":"vcores","value":"430"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"time_out","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12736","user":"xyz","name":"xyz-1a9c73ef-2992-40a5-aaad-9f0688bb04f4","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12736/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, 
killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"BLAHBLAH,xyz_20221107070609_8d261352-3efa-46c5-a5a0-8a3cd745d180,userid=dummy","priority":0,"startedtime":1667822771170,"launchtime":1667822773663,"finishedtime":1667822820351,"elapsedtime":49181,"amcontainerlogs":"http://dong:8042/node/containerlogs/container_e65_282828282828_12736_01_000001/xyz","amhosthttpaddress":"dong:8042","amrpcaddress":"dong:34266","masternodeid":"dong:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":1300011,"vcoreseconds":89,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"1300011"},"entry":{"key":"vcores","value":"89"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}},{"id":"application_282828282828_12735","user":"xyz","name":"xyz-d5f56a0a-9c6b-4651-8f88-6eaff5953777","queue":"root.users.dummy","state":"finished","finalstatus":"succeeded","progress":100.0,"trackingui":"history","trackingurl":"http://dang:8088/proxy/application_282828282828_12735/","diagnostics":"session stats:submitteddags=1, successfuldags=1, faileddags=0, 
killeddags=0\n","clusterid":282828282828,"applicationtype":"aquaman","applicationtags":"HAHAHA_,xyz_20221107070605_a082d9d8-912f-4278-a2ef-5dfe66089fd7,userid=dummy","priority":0,"startedtime":1667822766897,"launchtime":1667822766999,"finishedtime":1667822796759,"elapsedtime":29862,"amcontainerlogs":"http://dung:8042/node/containerlogs/container_e65_282828282828_12735_01_000001/xyz","amhosthttpaddress":"dung:8042","amrpcaddress":"dung:42765","masternodeid":"dung:8041","allocatedmb":-1,"allocatedvcores":-1,"reservedmb":-1,"reservedvcores":-1,"runningcontainers":-1,"memoryseconds":669695,"vcoreseconds":44,"queueusagepercentage":0.0,"clusterusagepercentage":0.0,"resourcesecondsmap":{"entry":{"key":"memory-mb","value":"669695"},"entry":{"key":"vcores","value":"44"}},"preemptedresourcemb":0,"preemptedresourcevcores":0,"numnonamcontainerpreempted":0,"numamcontainerpreempted":0,"preemptedmemoryseconds":0,"preemptedvcoreseconds":0,"preemptedresourcesecondsmap":{},"logaggregationstatus":"succeeded","unmanagedapplication":false,"amnodelabelexpression":"","timeouts":{"timeout":[{"type":"lifetime","expirytime":"unlimited","remainingtimeinseconds":-1}]}}]}}
sample query output :
id | queue | finalStatus | trackingurl |....
-----------------------------------------------------------
application_282828282828_12717 | root.users.dummy | succeeded | ...
application_282828282828_12724 | root.users.dummy2 | failed | ....

For anyone looking to do something similar, I found this article very helpful, with a clear explanation: https://community.cloudera.com/t5/Support-Questions/Complex-Json-transformation-using-Hive-functions/m-p/236476
Below is the query I used, which flattens the array with LATERAL VIEW OUTER inline() (the array-of-struct counterpart of LATERAL VIEW EXPLODE), in case others are in the same boat:
-- Flatten apps.app: inline() turns the array of structs into one row per
-- application with one column per struct field; OUTER is the LATERAL VIEW
-- variant that still emits a row when the function produces none.
SELECT
    ex1.*
FROM user_tables.sample_json_table cym
LATERAL VIEW OUTER inline(cym.apps.app) ex1;
| id | queue | finalstatus | trackingurl | applicationtype | applicationtags | startedtime | launchtime | finishedtime | memoryseconds | vcoreseconds | resourcesecondsmap |
| ------------------------------- | ----------------- | ----------- | ------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------- | ------------- | ------------- | ------------- | ------------ | ---------------------------------------- |
| application_1667627410794_12717 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12717/ | tez | \_xyz,test-app-24c7-4b1b-b977-3c9af1397195,userid=dummy1 | 1667822485626 | 1667822485767 | 1667822553365 | 1264304 | 79 | {"entry":{"key":"vcores","value":"79"}} |
| application_1667627410794_12724 | root.users.dummy3 | succeeded | http://dang:8088/proxy/application_1667627410794_12724/ | tez | \_generate_stuff,hive_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy3 | 1667822585231 | 1667822585437 | 1667822631435 | 5603339 | 430 | {"entry":{"key":"vcores","value":"430"}} |
| application_1667627410794_12736 | root.users.dummy1 | succeeded | http://dang:8088/proxy/application_1667627410794_12736/ | tez | \_sample_job,test-zzz-3efa-46c5-a5a0-8a3cd745d180,userid=dummy1 | 1667822771170 | 1667822773663 | 1667822820351 | 1300011 | 89 | {"entry":{"key":"vcores","value":"89"}} |
| application_1667627410794_12735 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12735/ | tez | \_mixed_article,placebo_2-912f-4278-a2ef-5dfe66089fd7,userid=dummy2 | 1667822766897 | 1667822766999 | 1667822796759 | 669695 | 44 | {"entry":{"key":"vcores","value":"44"}} |
Additional note: although my requirement no longer needs it, if anyone can suggest how to further parse the last field, resourcesecondsmap, into its key/value pairs, that would be great to know! Basically, use each key as a column name and its value as the value of that column:
Desired Output:
| id | queue | finalstatus | trackingurl | applicationtype | applicationtags | startedtime | launchtime | finishedtime | memoryseconds | vcoreseconds | vcores-value |
| ------------------------------- | ----------------- | ----------- | ------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------- | ------------- | ------------- | ------------- | ------------ | ------------ |
| application_1667627410794_12717 | root.users.dummy2 | succeeded | http://dang:8088/proxy/application_1667627410794_12717/ | tez | \_xyz,test-app-24c7-4b1b-b977-3c9af1397195,userid=dummy1 | 1667822485626 | 1667822485767 | 1667822553365 | 1264304 | 79 | 79 |
| application_1667627410794_12724 | root.users.dummy3 | succeeded | http://dang:8088/proxy/application_1667627410794_12724/ | tez | \_generate_stuff,hive_20221107070301_e6f788db-e39c-49b6-97d5-6a02ff994c00,userid=dummy3 | 1667822585231 | 1667822585437 | 1667822631435 | 5603339 | 430 | 430 |

Related

how can a table of all databases be sent to elasticsearch?

Here's my situation.
food_database is a MySQL database.
There are 130 tables in food_database.
I would like to send all 130 tables to Elasticsearch via the Logstash JDBC input plugin.
-> How can all the tables of a database be sent to Elasticsearch?
my conf file (attempt)
# Logstash pipeline (attempt): read rows from MySQL through the jdbc input and
# send them to Elasticsearch, echoing each event to stdout for debugging.
input {
jdbc {
clean_run => true
jdbc_driver_library => "C:\ElasticSearch\mysql-connector-java-8.0.23\mysql-connector-java-8.0.23.jar"
# NOTE(review): the jar above is Connector/J 8.x; its documented driver class is
# "com.mysql.cj.jdbc.Driver" and this name looks like the legacy one - confirm.
jdbc_driver_class => "com.mysql.jdbc.Driver"
# NOTE(review): user/password are given twice - in this URL and in
# jdbc_user/jdbc_password below; presumably one place is enough.
jdbc_connection_string => "jdbc:mysql://localhost:3306/food_database?useSSL=false&user=root&password=1234"
jdbc_user => "root"
jdbc_password => "1234"
# Cron-style schedule; this expression fires once every minute.
schedule => "* * * * *"
# Placeholder: the table name is the open question - a single statement names
# one source, while the goal is to ship every table of food_database.
statement => "select * from ??????"
#use_column_value => true
#tracking_column => "jobid"
}
}
output{
# Every event goes to the single index named below on the local node.
elasticsearch {
hosts => "localhost:9200"
index => "test_indexfile"
}
# Also print each event to the console.
stdout {
codec => rubydebug
}
}
But I don't know how to send all 130 tables in food_database.
I found a similar question through googling, but I couldn't solve it.
-> save whole database to elasticsearch using logstash
-> https://dzone.com/articles/migrating-mysql-data-to-elasticsearch-using-logsta
Please help me.
update posting (tables in food_database)
+--------------------------------------+
| Tables_in_food_database |
+--------------------------------------+
| access_token |
| activity |
| address |
| answer_abuse_reason |
| answer_report_abuse |
| attribute |
| attribute_group |
| banner |
| banner_group |
| banner_image |
| banner_image_description |
| blog |
| blog_related |
| category |
| category_commission |
| category_description |
| category_path |
| contact |
| country |
| coupon |
| coupon_product_category |
| coupon_usage |
| coupon_usage_product |
| currency |
| customer |
| customer_activity |
| customer_cart |
| customer_document |
| customer_group |
| customer_ip |
| customer_transaction |
| customer_wishlist |
| delivery_allocation |
| delivery_location |
| delivery_location_to_location |
| delivery_person |
| delivery_person_to_location |
| delivery_status |
| email_template |
| geo_zone |
| jobs |
| language |
| login_log |
| manufacturer |
| migrations |
| order |
| order_cancel_reason |
| order_history |
| order_log |
| order_product |
| order_product_log |
| order_status |
| order_total |
| page |
| page_group |
| payment |
| payment_archive |
| payment_items |
| payment_items_archive |
| paypal_order |
| paypal_order_transaction |
| permission_module |
| permission_module_group |
| plugins |
| price_update_file_log |
| product |
| product_answer |
| product_answer_like_dislike |
| product_attribute |
| product_description |
| product_discount |
| product_image |
| product_price_log |
| product_question |
| product_rating |
| product_related |
| product_special |
| product_stock_alert |
| product_tag |
| product_tire_price |
| product_to_category |
| product_varient |
| product_varient_option |
| product_varient_option_details |
| product_varient_option_image |
| product_view_log |
| quotation |
| razorpay_order |
| razorpay_order_transaction |
| service |
| service_category |
| service_category_path |
| service_enquiry |
| service_image |
| service_to_category |
| sessions |
| settings |
| settlement |
| settlement_item |
| site_filter |
| site_filter_category |
| site_filter_section |
| site_filter_section_item |
| sku |
| stock_log |
| stock_status |
| stripe_order |
| stripe_order_transaction |
| tax |
| trend |
| trend_image |
| trend_recommend |
| user_group |
| users |
| varients |
| varients_value |
| vendor |
| vendor_category |
| vendor_coupon |
| vendor_coupon_product_category |
| vendor_global_setting |
| vendor_invoice |
| vendor_invoice_item |
| vendor_order_archive |
| vendor_order_archive_log |
| vendor_order_products |
| vendor_order_status |
| vendor_orders |
| vendor_orders_log |
| vendor_payment |
| vendor_payment_archive |
| vendor_product |
| widget |
| widget_item |
| zone |
| zone_to_geo_zone |
+--------------------------------------+
136 rows in set (0.00 sec)
My goal is to send all the rows of these 136 tables to Elasticsearch via Logstash.

If running a script alongside Logstash is an option, I would go for the following approach:
Create a bash script (or use whatever language you prefer) and run it from cron. It does a simple 'show tables' and uses the output to create one config file per table (130 files), each containing only the INPUT part for Logstash, with a naming convention like 'INPUT_tablename.conf'. The script should generate the input config shown above for each table that exists.
Make sure it lists the INPUT_* files in the directory and deletes the ones whose tables no longer exist.
Make sure that when a file already exists, the script does not touch it.
Keep your FILTER.conf and OUTPUT.conf in the same directory.
Put Logstash in automatic config reload mode.
Doing it this way separates out the part you are struggling with and lets the database change over time: new tables can be added, and old ones deleted or renamed.
I've learned to do it this way on clusters that I know will become very large, where I need to learn when the maximum I/O is being hit so that I know when to add new nodes to which layer without killing the complete setup.

NiFi CaptureChangeMySQL converts varchar columns to nulls

I have a problem with Apache NiFi 1.12.1. For some reason unknown to me, CaptureChangeMySQL returns many nulls. Basically, only the int columns return correct values. I'm new to NiFi, so I might be missing something obvious in the configuration.
I have following table:
-- Table from the question: auto-increment integer key, two required
-- varchar name columns and a required integer age.
CREATE TABLE inventory.abc (
    id         INT          AUTO_INCREMENT PRIMARY KEY,
    first_name VARCHAR(100) NOT NULL,
    last_name  VARCHAR(100) NOT NULL,
    age        INT          NOT NULL
);
Processor config:
Bin logs settings:
mysql> show variables like '%bin%';
+--------------------------------------------+--------------------------------+
| Variable_name | Value |
+--------------------------------------------+--------------------------------+
| bind_address | * |
| binlog_cache_size | 32768 |
| binlog_checksum | CRC32 |
| binlog_direct_non_transactional_updates | OFF |
| binlog_error_action | ABORT_SERVER |
| binlog_format | ROW |
| binlog_group_commit_sync_delay | 0 |
| binlog_group_commit_sync_no_delay_count | 0 |
| binlog_gtid_simple_recovery | ON |
| binlog_max_flush_queue_time | 0 |
| binlog_order_commits | ON |
| binlog_row_image | FULL |
| binlog_rows_query_log_events | OFF |
| binlog_stmt_cache_size | 32768 |
| binlog_transaction_dependency_history_size | 25000 |
| binlog_transaction_dependency_tracking | COMMIT_ORDER |
| innodb_api_enable_binlog | OFF |
| innodb_locks_unsafe_for_binlog | OFF |
| log_bin | ON |
| log_bin_basename | /var/lib/mysql/mysql-bin |
| log_bin_index | /var/lib/mysql/mysql-bin.index |
| log_bin_trust_function_creators | OFF |
| log_bin_use_v1_row_events | OFF |
| log_statements_unsafe_for_binlog | ON |
| max_binlog_cache_size | 18446744073709547520 |
| max_binlog_size | 1073741824 |
| max_binlog_stmt_cache_size | 18446744073709547520 |
| sql_log_bin | ON |
| sync_binlog | 1 |
+--------------------------------------------+--------------------------------+
29 rows in set (0.00 sec)
And I get results like this:
Any idea why I get so many nulls in the output? I thought it might be related to the Distributed Map Cache Client, but since that option is not mandatory, I don't think that's the problem.

How inspect Druid datasources with Hive

Yesterday, I created my first Druid datasource from Hive. Today, I'm not sure that it works...
First, I ran the following code to create my datasource:
-- Session settings: address of the Druid broker and connection details of the
-- PostgreSQL database used as Druid's metadata store.
-- NOTE(review): the metadata-store password is written here in plain text.
SET hive.druid.broker.address.default = 10.20.173.30:8082;
SET hive.druid.metadata.username = druid;
SET hive.druid.metadata.password = druid_password;
SET hive.druid.metadata.db.type = postgresql;
SET hive.druid.metadata.uri = jdbc:postgresql://10.20.173.31:5432/druid;

-- External table handled by the Druid storage handler. `__time` is the
-- timestamp column; the remaining columns are plain strings.
-- The statement is terminated with ';' so it runs when pasted into a script.
CREATE EXTERNAL TABLE test (
  `__time` TIMESTAMP,
  `userId` STRING,
  `lang` STRING,
  `location` STRING,
  `name` STRING
)
STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler';
I can see this datasource in Hive. How can I tell that it is a Druid datasource and not a plain Hive table?
I tried the following, but I still can't tell whether it is a Druid datasource.
-- Print the columns of `test` together with its detailed table and storage metadata.
DESCRIBE FORMATTED test;
Result
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
| col_name | data_type | comment |
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
| # col_name | data_type | comment |
| __time | timestamp | from deserializer |
| userid | string | from deserializer |
| lang | string | from deserializer |
| location | string | from deserializer |
| name | string | from deserializer |
| # Detailed Table Information | NULL | NULL |
| Database: | druid_datasources | NULL |
| OwnerType: | USER | NULL |
| Owner: | hive | NULL |
| CreateTime: | Tue Oct 15 12:42:22 CEST 2019 | NULL |
| LastAccessTime: | UNKNOWN | NULL |
| Retention: | 0 | NULL |
| Location: | hdfs://10.20.173.30:8020/warehouse/tablespace/external/hive/druid_datasources.db/test | NULL |
| Table Type: | EXTERNAL_TABLE | NULL |
| Table Parameters: | NULL | NULL |
| | COLUMN_STATS_ACCURATE | {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"__time\":\"true\",\"lang\":\"true\",\"location\":\"true\",\"name\":\"true\",\"userid\":\"true\"}} |
| | EXTERNAL | TRUE |
| | bucketing_version | 2 |
| | druid.datasource | druid_datasources.test ||
| | numFiles | 0 |
| | numRows | 0 |
| | rawDataSize | 0 |
| | storage_handler | org.apache.hadoop.hive.druid.DruidStorageHandler |
| | totalSize | 0 |
| | transient_lastDdlTime | 1571136142 |
| | NULL | NULL |
| # Storage Information | NULL | NULL |
| SerDe Library: | org.apache.hadoop.hive.druid.serde.DruidSerDe | NULL |
| InputFormat: | null | NULL |
| OutputFormat: | null | NULL |
| Compressed: | No | NULL |
| Num Buckets: | -1 | NULL |
| Bucket Columns: | [] | NULL |
| Sort Columns: | [] | NULL |
| Storage Desc Params: | NULL | NULL |
| | serialization.format | 1 |
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
Did I do this correctly, or is it just a Hive table with Druid parameters?
Can someone explain more about how Hive and Druid interact?
Thanks :D

I think you registered your Druid datasource in Hive. Now you can run your queries through HiveServer on top of this table.
Your table definition looks correct to me; I think you managed to integrate the Druid datasource with Hive. You can see the Druid-related properties in the table.
Now, when you query the table, the processing engine used depends on the query: it can be executed by Hive alone, by Druid alone, or by a combination of both, depending on whether the query can be converted to a Druid query.
You can refer to this doc for more info on Hive/Druid interactions : https://cwiki.apache.org/confluence/display/Hive/Druid+Integration (refer:Querying Druid from Hive)

Splitting a table into multiple tables in Power BI

I have the following data in a single table. I need to split this table into multiple tables based on the Year_Month column. Is there a way to automate this task?
+------------+-----------+
| Year_Month | Part# |
+------------+-----------+
| 2014-03 | CCH057169 |
| 2014-03 | CCH057276 |
| 2014-03 | CCH057303 |
| 2014-03 | CCH057430 |
| 2014-04 | CCH057409 |
| 2014-04 | CCH057497 |
| 2014-04 | CCH057570 |
| 2014-04 | CCH057583 |
| 2014-04 | CCH057650 |
| 2014-04 | CCH057696 |
| 2014-04 | CCH057707 |
| 2014-04 | CCH057798 |
| 2014-05 | CCH057701 |
| 2014-06 | CCH057235 |
| 2014-06 | CCH057280 |
| 2014-06 | CCH057693 |
| 2014-06 | CCH057707 |
| 2014-06 | CCH057721 |
| 2014-07 | CCH057235 |
| 2014-07 | CCH057427 |
| 2014-08 | CCH057650 |
| 2014-08 | CCH057696 |
| 2014-08 | CCH057798 |
| 2014-09 | CCH057303 |
| 2014-09 | CCH057482 |
| 2014-09 | CCH057668 |
| 2014-09 | CCH057744 |
| 2014-09 | CCH057776 |
| 2014-10 | CCH057668 |
| 2014-10 | CCH057696 |
| 2014-11 | CCH057390 |
| 2014-11 | CCH057409 |
| 2014-11 | CCH057679 |
| 2014-11 | CCH057700 |
| 2014-11 | CCH057721 |
| 2014-11 | CCH057749 |
| 2014-11 | CCH057896 |
| 2014-12 | CCH057169 |
| 2014-12 | CCH057693 |
| 2014-12 | CCH057696 |
| 2014-12 | CCH057708 |
| 2014-12 | CCH057876 |
| 2014-12 | CCH057896 |
| 2015-01 | CCH057630 |
| 2015-01 | CCH057679 |
| 2015-01 | CCH057700 |
| 2015-01 | CCH057776 |
| 2015-02 | CCH057409 |
| 2015-02 | CCH057482 |
+------------+-----------+
More Information:
I am getting the data from an Oracle database. The purpose of this data is to compare two given dates and show the new records. Is there a way to select two dates on the form (slicer) and have the query fetch the data based on that date selection?

Might be splitting hairs, but from a database (RDBMS) perspective, slicers change what is summarized and displayed, not what is queried. So, you might want to take some steps so you don't do something like getting the introduction date of every product Amazon has ever offered.
It sounds like you want to use a slicer with a range slider, which only works on numbers and dates. So, first add a column computed from the Year_Month column for a date, say the end of the month, and call it Month.
Month = EOMONTH(DATEVALUE([Year_Month] & "-01"), 0) // calculated column: append "-01" to the Year_Month text, parse it as a date, then take the end of that same month (offset 0)
Then all you need to do is create a grid with the part# field and create a slicer for the Month field, which is initially configured with a range slider.

Hive - External table creation

I am learning Hive and read an article about when to use a Hive external table. It mentioned the statement below:
To query data stored in an external system such as Amazon S3
- Avoid bringing that data into HDFS
Can anyone elaborate on the statement "Avoid bringing that data into HDFS"? The LOAD DATA LOCAL command loads a local file into HDFS, and Hive applies the format on top of it.
Is it possible to access data that is outside HDFS?

Is it possible to access data that is outside HDFS?
Hive can read data on any Hadoop-compatible filesystem, not only HDFS.
Can someone elaborate on the statement "Avoid bringing that data into HDFS"?
With the example of S3, you can create an external table with a location of s3a://bucket/path; there's no need to bring the data into HDFS unless you really need the read speed of HDFS compared to S3. However, to persist a dataset in an ephemeral cloud cluster, results should be written back to whatever long-term storage is provided.

It is possible. You can try this yourself. On CDH, I have a file extn/t.txt
[cloudera@quickstart ~]$ pwd
/home/cloudera
[cloudera@quickstart ~]$ cat extn/t.txt
something
[cloudera@quickstart ~]$
I can now create an external table to access this file as follows
-- External table whose location is a directory on the local filesystem
-- (file:// scheme) rather than HDFS; the single string column `line` exposes
-- the text of the files found there.
-- The statement is terminated with ';' so it runs when pasted into a script.
create external table tbl(line string)
location 'file:///home/cloudera/extn';
Describe table
INFO : OK
+-----------+------------+----------+--+
| col_name | data_type | comment |
+-----------+------------+----------+--+
| line | string | |
+-----------+------------+----------+--+
1 row selected (0.152 seconds)
0: jdbc:hive2://localhost:10000>
Select
INFO : OK
+------------+--+
| tbl.line |
+------------+--+
| something |
+------------+--+
1 row selected (0.134 seconds)
0: jdbc:hive2://localhost:10000>
Describe formatted
+-------------------------------+----------------------------------------------------+-----------------------+--+
| col_name | data_type | comment |
+-------------------------------+----------------------------------------------------+-----------------------+--+
| # col_name | data_type | comment |
| | NULL | NULL |
| line | string | |
| | NULL | NULL |
| # Detailed Table Information | NULL | NULL |
| Database: | default | NULL |
| Owner: | cloudera | NULL |
| CreateTime: | Tue Feb 20 12:49:25 PST 2018 | NULL |
| LastAccessTime: | UNKNOWN | NULL |
| Protect Mode: | None | NULL |
| Retention: | 0 | NULL |
| Location: | file:/home/cloudera/extn | NULL |
| Table Type: | EXTERNAL_TABLE | NULL |
| Table Parameters: | NULL | NULL |
| | COLUMN_STATS_ACCURATE | false |
| | EXTERNAL | TRUE |
| | numFiles | 0 |
| | numRows | -1 |
| | rawDataSize | -1 |
| | totalSize | 0 |
| | transient_lastDdlTime | 1519159765 |
| | NULL | NULL |
| # Storage Information | NULL | NULL |
| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
| InputFormat: | org.apache.hadoop.mapred.TextInputFormat | NULL |
| OutputFormat: | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL |
| Compressed: | No | NULL |
| Num Buckets: | -1 | NULL |
| Bucket Columns: | [] | NULL |
| Sort Columns: | [] | NULL |
| Storage Desc Params: | NULL | NULL |
| | serialization.format | 1 |
+-------------------------------+----------------------------------------------------+-----------------------+
LOAD DATA is different. Please see the related question "External Table vs Load Data".

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

query hive json serde table having nested ARRAY & STRUCT combination - hadoop

Related

how can a table of all databases be sent to elasticsearch?

NiFi CaptureChangeMySQL converts varchar columns to nulls

How inspect Druid datasources with Hive

Splitting a table into multiple tables in Power BI

Hive - External table creation

Categories

Resources