Hbase:Need suitable jar files for cloudera-quickstart-vm-5.4.2-0 - hadoop

I am trying to load data from flat file to Hbase through API.But I am getting following error
========================================================
java.lang.NumberFormatException.forInputString
(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:492)
at java.lang.Integer.parseInt(Integer.java:527)
at org.apache.hadoop.hbase.HServerAddress.(HServerAddress.java:63)
at org.apache.hadoop.hbase.MasterAddressTracker.getMasterAddress(MasterAddressTracker.java:63)
at org.apache.hadoop.hbase.client.HConnectionManager$HConnectionImplementation.getMaster(HConnectionManager.java:354)
at org.apache.hadoop.hbase.client.HBaseAdmin.(HBaseAdmin.java:94)
at Hbase.readFromFile.main(readFromFile.java:16)
Code :
package Hbase;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
public class readFromFile {
public static void main(String[] args) throws IOException{
if(args.length==1)
{
Configuration conf = HBaseConfiguration.create(new Configuration());
HBaseAdmin hba = new HBaseAdmin(conf);
if(!hba.tableExists(args[0])){
HTableDescriptor ht = new HTableDescriptor(args[0]);
ht.addFamily(new HColumnDescriptor("sample"));
ht.addFamily(new HColumnDescriptor("region"));
ht.addFamily(new HColumnDescriptor("time"));
ht.addFamily(new HColumnDescriptor("product"));
ht.addFamily(new HColumnDescriptor("sale"));
ht.addFamily(new HColumnDescriptor("profit"));
hba.createTable(ht);
System.out.println("New Table Created");
HTable table = new HTable(conf,args[0]);
File f = new File("/home/training/Desktop/data");
BufferedReader br = new BufferedReader(new FileReader(f));
String line = br.readLine();
int i =1;
String rowname="row";
while(line!=null && line.length()!=0){
System.out.println("Ok till here");
StringTokenizer tokens = new StringTokenizer(line,",");
rowname = "row"+i;
Put p = new Put(Bytes.toBytes(rowname));
p.add(Bytes.toBytes("sample"),Bytes.toBytes("sampleNo."),
Bytes.toBytes(Integer.parseInt(tokens.nextToken())));
p.add(Bytes.toBytes("region"),Bytes.toBytes("country"),
Bytes.toBytes(tokens.nextToken()));
p.add(Bytes.toBytes("region"),Bytes.toBytes("state"),
Bytes.toBytes(tokens.nextToken()));
p.add(Bytes.toBytes("region"),Bytes.toBytes("city"),
Bytes.toBytes(tokens.nextToken()));
p.add(Bytes.toBytes("time"),Bytes.toBytes("year"),
Bytes.toBytes(Integer.parseInt(tokens.nextToken())));
p.add(Bytes.toBytes("time"),Bytes.toBytes("month"),
Bytes.toBytes(tokens.nextToken()));
p.add(Bytes.toBytes("product"),Bytes.toBytes("productNo."),
Bytes.toBytes(tokens.nextToken()));
p.add(Bytes.toBytes("sale"),Bytes.toBytes("quantity"),
Bytes.toBytes(Integer.parseInt(tokens.nextToken())));
p.add(Bytes.toBytes("profit"),Bytes.toBytes("earnings"),
Bytes.toBytes(tokens.nextToken()));
i++;
table.put(p);
line = br.readLine();
}
br.close();
table.close();
}
else
System.out.println("Table Already exists.
Please enter another table name");
}
else
System.out.println("Please Enter the table
name through command line");
}
}
Please let me know whether we need to add any suitable jars ..I am using cloudera cloudera-quickstart-vm-5.4.2-0
Thanks,
VJ

If you read the error, it says that the Integer.parseInt method raised a NumberFormatException. This means that you attempted to convert a String of invalid format into an Integer. In your code, you call that method in this line:
Bytes.toBytes(Integer.parseInt(tokens.nextToken())));
You need look at the tokens you're passing into this method via tokens.nextToken() and ensure that each can be converted to an Integer.

I think the problem is with the cloudera jar versions used. Please check on it, that should work.

Related

Call BigQuery stored procedure(Routine) using spring boot

I'm trying to call a Google BigQuery stored procedure (Routine) using Spring boot. I tried all the methods of the routines to extract data. However, it didn't help.
Has anyone ever created and called a BigQuery stored procedure (Routine) through the Spring boot? If so, how?
public static Boolean executeInsertQuery(String query, TableId tableId, String jobName) {
log.info("Starting {} truncate query", jobName);
BigQuery bigquery = GCPConfig.getBigQuery(); // bqClient
// query configuration
QueryJobConfiguration queryConfig = QueryJobConfiguration.newBuilder(query)
.setUseLegacySql(false)
.setAllowLargeResults(true)
.setDestinationTable(tableId) .setWriteDisposition(JobInfo.WriteDisposition.WRITE_TRUNCATE).build();
try {
// build the query job.
QueryJob queryJob = new QueryJob.Builder(queryConfig).bigQuery(bigquery).jobName(jobName).build();
QueryJob.Result result = queryJob.execute();
} catch (JobException e) {
log.error("{} unsuccessful. job id: {}, job name: {}. exception: {}", jobName, e.getJobId(),
e.getJobName(), e.toString());
return false;
}
}
package ops.google.com;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryError;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.EncryptionConfiguration;
import com.google.cloud.bigquery.InsertAllRequest;
import com.google.cloud.bigquery.InsertAllResponse;
import com.google.cloud.bigquery.QueryJobConfiguration;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TableResult;
import com.google.common.collect.ImmutableList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.auth.oauth2.ServiceAccountCredentials;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
public class SelectFromBigQueryFunction {
private static final Logger logger = LogManager.getLogger(SelectFromBigQueryFunction.class);
public boolean tableSelectFromJoin(String key_path) {
String projectID = "ProjectID";
String datasetName = "DataSetName";
String tableName1 = "sample_attribute_type";
String tableName2 = "sample_attribute_value";
boolean status = false;
try {
//Call BQ Function/Routines, functinon name->bq_function_name
//String query = String.format("SELECT DataSetName.bq_function_name(1, 1)");
//Call BQ Stored Procedure, procedure name-> bq_stored_procedure_name
String query = String.format("CALL DataSetName.bq_stored_procedure_name()");
File credentialsPath = new File(key_path);
FileInputStream serviceAccountStream = new FileInputStream(credentialsPath);
GoogleCredentials credentials = ServiceAccountCredentials.fromStream(serviceAccountStream);
// Initialize client that will be used to send requests. This client only needs to be created
BigQuery bigquery = BigQueryOptions.newBuilder()
.setProjectId(projectID)
.setCredentials(credentials)
.build().getService();
QueryJobConfiguration queryConfig = QueryJobConfiguration.newBuilder(query).build();
TableResult results = bigquery.query(queryConfig);
results.iterateAll().forEach(row -> row.forEach(val -> System.out.printf("%s,", val.toString())));
logger.info("Query performed successfully with encryption key.");
status = true;
} catch (BigQueryException | InterruptedException e) {
logger.error("Query not performed \n" + e.toString());
}catch(Exception e){
logger.error("Some Exception \n" + e.toString());
}return status;
}
}

Access Data from REST API in HIVE

Is there a way to create a hive table where the location for that hive table will be a http JSON REST API? I don't want to import the data every time in HDFS.
I had encountered similar situation in a project couple of years ago. This is the sort of low-key way of ingesting data from Restful to HDFS and then you use Hive analytics to implement the business logic.I hope you are familiar with core Java, Map Reduce (if not you might look into Hortonworks Data Flow, HDF which is a product of Hortonworks).
Step 1: Your data ingestion workflow should not be tied to your Hive workflow that contains business logic. This should be executed independently in timely manner based on your requirement (volume & velocity of data flow) and monitored regularly. I am writing this code on a text editor. WARN: It's not compiled or tested!!
The code below is using a Mapper which would take in the url or tweak it to accept the list of urls from the FS. The payload or requested data is stored as text file in the specified job output directory (forget the structure of data this time).
Mapper Class:
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class HadoopHttpClientMap extends Mapper<LongWritable, Text, Text, Text> {
private int file = 0;
private String jobOutDir;
private String taskId;
#Override
protected void setup(Context context) throws IOException,InterruptedException {
super.setup(context);
jobOutDir = context.getOutputValueClass().getName();
taskId = context.getJobID().toString();
}
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{
Path httpDest = new Path(jobOutDir, taskId + "_http_" + (file++));
InputStream is = null;
OutputStream os = null;
URLConnection connection;
try {
connection = new URL(value.toString()).openConnection();
//implement connection timeout logics
//authenticate.. etc
is = connection.getInputStream();
os = FileSystem.getLocal(context.getConfiguration()).create(httpDest,true);
IOUtils.copyBytes(is, os, context.getConfiguration(), true);
} catch(Throwable t){
t.printStackTrace();
}finally {
IOUtils.closeStream(is);
IOUtils.closeStream(os);
}
context.write(value, null);
//context.write(new Text (httpDest.getName()), new Text (os.toString()));
}
}
Mapper Only Job:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class HadoopHttpClientJob {
private static final String data_input_directory = “YOUR_INPUT_DIR”;
private static final String data_output_directory = “YOUR_OUTPUT_DIR”;
public HadoopHttpClientJob() {
}
public static void main(String... args) {
try {
Configuration conf = new Configuration();
Path test_data_in = new Path(data_input_directory, "urls.txt");
Path test_data_out = new Path(data_output_directory);
#SuppressWarnings("deprecation")
Job job = new Job(conf, "HadoopHttpClientMap" + System.currentTimeMillis());
job.setJarByClass(HadoopHttpClientJob.class);
FileSystem fs = FileSystem.get(conf);
fs.delete(test_data_out, true);
job.setMapperClass(HadoopHttpClientMap.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setNumReduceTasks(0);
FileInputFormat.addInputPath(job, test_data_in);
FileOutputFormat.setOutputPath(job, test_data_out);
job.waitForCompletion(true);
}catch (Throwable t){
t.printStackTrace();
}
}
}
Step 2: Create external table in Hive based on the HDFS directory. Remember to use Hive SerDe for the JSON data (in your case) then you can copy the data from external table into managed master tables. This is the step where you implement your incremental logics, compression..
Step 3: Point your hive queries (which you might have already created) to the master table to implement your business needs.
Note: If you are supposedly referring to realtime analysis or streaming api, you might have to change your application's architecture. Since you have asked architectural question, I am using my best educated guess to support you. Please go through this once. If you feel you can implement this in your application then you can ask the specific question, I will try my best to address them.

Hbase mapside join- One of the tables is not getting read? read from hbase and right result into hbase

I am trying to do mapside join of two tables located in Hbase. My aim is to keep record of the small table in hashmap and compare with the big table, and once matched, write record in a table in hbase again. I wrote the similar code for join operation using both Mapper and Reducer and it worked well and both tables are scanned in mapper class. But since reduce side join is not efficient at all, I want to join the tables in mapper side only. In the following code "commented if block" is just to see that it returns false always and first table (small one) is not getting read. Any hints helps are appreciated. I am using sandbox of HDP.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
//import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.util.Tool;
import com.sun.tools.javac.util.Log;
import java.io.IOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapred.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
public class JoinDriver extends Configured implements Tool {
static int row_index = 0;
public static class JoinJobMapper extends TableMapper<ImmutableBytesWritable, Put> {
private static byte[] big_table_bytarr = Bytes.toBytes("big_table");
private static byte[] small_table_bytarr = Bytes.toBytes("small_table");
HashMap<String,String> myHashMap = new HashMap<String, String>();
byte[] c1_value;
byte[] c2_value;
String big_table;
String small_table;
String big_table_c1;
String big_table_c2;
String small_table_c1;
String small_table_c2;
Text mapperKeyS;
Text mapperValueS;
Text mapperKeyB;
Text mapperValueB;
public void map(ImmutableBytesWritable rowKey, Result columns, Context context) {
TableSplit currentSplit = (TableSplit) context.getInputSplit();
byte[] tableName = currentSplit.getTableName();
try {
Put put = new Put(Bytes.toBytes(++row_index));
// put small table into hashmap - myhashMap
if (Arrays.equals(tableName, small_table_bytarr)) {
c1_value = columns.getValue(Bytes.toBytes("s_cf"), Bytes.toBytes("s_cf_c1"));
c2_value = columns.getValue(Bytes.toBytes("s_cf"), Bytes.toBytes("s_cf_c2"));
small_table_c1 = new String(c1_value);
small_table_c2 = new String(c2_value);
mapperKeyS = new Text(small_table_c1);
mapperValueS = new Text(small_table_c2);
myHashMap.put(small_table_c1,small_table_c2);
} else if (Arrays.equals(tableName, big_table_bytarr)) {
c1_value = columns.getValue(Bytes.toBytes("b_cf"), Bytes.toBytes("b_cf_c1"));
c2_value = columns.getValue(Bytes.toBytes("b_cf"), Bytes.toBytes("b_cf_c2"));
big_table_c1 = new String(c1_value);
big_table_c2 = new String(c2_value);
mapperKeyB = new Text(big_table_c1);
mapperValueB = new Text(big_table_c2);
// if (set.containsKey(big_table_c1)){
put.addColumn(Bytes.toBytes("join"), Bytes.toBytes("join_c1"), Bytes.toBytes(big_table_c1));
context.write(new ImmutableBytesWritable(mapperKeyB.getBytes()), put );
put.addColumn(Bytes.toBytes("join"), Bytes.toBytes("join_c2"), Bytes.toBytes(big_table_c2));
context.write(new ImmutableBytesWritable(mapperKeyB.getBytes()), put );
put.addColumn(Bytes.toBytes("join"), Bytes.toBytes("join_c3"),Bytes.toBytes((myHashMap.get(big_table_c1))));
context.write(new ImmutableBytesWritable(mapperKeyB.getBytes()), put );
// }
}
} catch (Exception e) {
// TODO : exception handling logic
e.printStackTrace();
}
}
}
public int run(String[] args) throws Exception {
List<Scan> scans = new ArrayList<Scan>();
Scan scan1 = new Scan();
scan1.setAttribute("scan.attributes.table.name", Bytes.toBytes("small_table"));
System.out.println(scan1.getAttribute("scan.attributes.table.name"));
scans.add(scan1);
Scan scan2 = new Scan();
scan2.setAttribute("scan.attributes.table.name", Bytes.toBytes("big_table"));
System.out.println(scan2.getAttribute("scan.attributes.table.name"));
scans.add(scan2);
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJar("MSJJ.jar");
job.setJarByClass(JoinDriver.class);
TableMapReduceUtil.initTableMapperJob(scans, JoinJobMapper.class, ImmutableBytesWritable.class, Put.class, job);
TableMapReduceUtil.initTableReducerJob("joined_table", null, job);
job.setNumReduceTasks(0);
job.waitForCompletion(true);
return 0;
}
public static void main(String[] args) throws Exception {
JoinDriver runJob = new JoinDriver();
runJob.run(args);
}
}
By reading your problem statement I believe you have got some wrong idea about uses of Multiple HBase table input.
I suggest you load small table in a HashMap, in setup method of mapper class. Then use map only job on big table, in map method you can fetch corresponding values from the HashMap which you loaded earlier.
Let me know how this works out.

Load Data into Hbase outside Client Node

Thanks in advance.
We are loading data into Hbase using Java. It's pretty straight and works fine when we run the program on the client node (edge node). But we want to run this program remotely (outside the hadoop cluster) within our network to load the data.
Is there anything required to do this in terms of security on the hadoop cluster? When I run the program outside the cluster it's hanging..
Please advise. Greatly appreciate your help.
Thanks
Code here
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import com.dev.stp.cvsLoadEventConfig;
import com.google.protobuf.ServiceException;
public class LoadData {
static String ZKHost;
static String ZKPort;
private static Configuration config = null;
private static String tableName;
public LoadData (){
//Set Application Config
LoadDataConfig conn = new LoadDataConfig();
ZKHost = conn.getZKHost();
ZKPort = conn.getZKPort();
config = HBaseConfiguration.create();
config.set("hbase.zookeeper.quorum", ZKHost);
config.set("hbase.zookeeper.property.clientPort", ZKPort);
config.set("zookeeper.znode.parent", "/hbase-unsecure");
tableName = "E_DATA";
}
//Insert Record
try {
HTable table = new HTable(config, tableName);
Put put = new Put(Bytes.toBytes(eventId));
put.add(Bytes.toBytes("E_DETAILS"), Bytes.toBytes("E_NAME"),Bytes.toBytes("test data 1"));
put.add(Bytes.toBytes("E_DETAILS"), Bytes.toBytes("E_TIMESTAMP"),Bytes.toBytes("test data 2"));
table.put(put);
table.close();
} catch (IOException e) {
e.printStackTrace();
}
}

How to resolve ORA-01460 Error?

I am uploading photos to oracle database.The code for my AddPhotoServlet class is as follows.
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.fileupload.servlet.ServletFileUpload;
public class AddPhotoServlet extends HttpServlet {
protected void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
response.setContentType("text/html;charset=UTF-8");
PrintWriter out = response.getWriter();
try {
// Apache Commons-Fileupload library classes
DiskFileItemFactory factory = new DiskFileItemFactory();
ServletFileUpload sfu = new ServletFileUpload(factory);
/*if (! ServletFileUpload.isMultipartContent(request)) {
System.out.println("sorry. No file uploaded");
return;
}*/
// parse request
List items = sfu.parseRequest(request);
FileItem id = (FileItem) items.get(0);
String photoid = id.getString();
FileItem title = (FileItem) items.get(1);
String phototitle = title.getString();
// get uploaded file
FileItem file = (FileItem) items.get(2);
// Connect to Oracle
Class.forName("oracle.jdbc.driver.OracleDriver");
Connection con =DriverManager.getConnection("jdbc:oracle:thin:#localhost:1521:xe", "cloud", "cloud");
con.setAutoCommit(false);
PreparedStatement ps = con.prepareStatement("insert into photos values(?,?,?)");
ps.setString(1, photoid);
ps.setString(2, phototitle);
// size must be converted to int otherwise it results in error
ps.setBinaryStream(3, file.getInputStream(), (int) file.getSize());
ps.executeUpdate();
con.commit();
con.close();
out.println("Proto Added Successfully. <p> <a href='listphotos'>List Photos </a>");
}
When i run the html page in the server while uploading it throws the error :
ORA-01460:unimplemented or unreasonable conversion requested.
Can anyone please help me with this ?
I have created my table as :
create table photos
(
id number(5) primary key,
title varchar2(50),
photo blob
);
Check out the oracle jdbc developer's guide for working with streams and lobs. There are quite a few options on how the data is handled that has different performance/memory implications.
One portion of the second doc (on lobs) covers the use of temporary lob objects by using Connection.createBlob() and writing the content to that object and then setting that blob object on the statement.

Resources