I am trying to set up a connection between PySpark and Oracle SQL so that I can load tables this way.
I am using the following code:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
import os
spark_config = SparkConf().setMaster("local").setAppName("Project_SQL")
sc = SparkContext(conf = spark_config)
sqlctx = SQLContext(sc)
os.environ['SPARK_CLASSPATH'] = r"C:\Program Files (x86)\Oracle\SQL Developer 4.0.1\jdbc\lib.jdbc6.jar"
df = sqlctx.read.format("jdbc").options(url="jdbc:oracle:thin:#<>:<>:<>"
, driver = "oracle.jdbc.driver.OracleDriver"
, dbtable = "account"
, user="...."
, password="...").load()
But I get the following error.
An error occurred while calling o29.load.:
java.sql.SQLRecoverableException: IO Error: The Network Adapter could not establish the connection
Could anyone help me to fix this? Do you think it is because of the firewall?
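For comparison, here is a minimal sketch that registers the driver jar through spark.jars before the context is created and uses the @host:port:SID form of the thin URL; the jar path, host, port, and SID below are placeholders, not values taken from this question.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

spark_config = (SparkConf()
                .setMaster("local")
                .setAppName("Project_SQL")
                # equivalent to spark-submit --jars; the path is a placeholder
                .set("spark.jars", r"C:\path\to\ojdbc6.jar"))
sc = SparkContext(conf=spark_config)
sqlctx = SQLContext(sc)

df = (sqlctx.read.format("jdbc")
      .options(url="jdbc:oracle:thin:@dbhost:1521:ORCL",  # thin URL form: @host:port:SID
               driver="oracle.jdbc.driver.OracleDriver",
               dbtable="account",
               user="....",
               password="....")
      .load())
If a correct URL and driver still give "The Network Adapter could not establish the connection", the host/port is usually unreachable from the machine running Spark (listener not running, wrong port, or indeed a firewall).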
env
spark 3.1.2
hive 3.1.2
hadoop 3.2.1
problem
I am using Spark SQL with a Hive metastore connection.
For example, I created table A whose data is cached in memory through HDFS caching,
and table B whose data sits in HDFS without caching (the data content is the same).
I executed the same query against both tables.
I expected the query against table A to be faster than the one against table B, but it was not: the execution times were almost identical.
Is there anything I need to do to benefit from HDFS caching in Spark SQL through the Hive metastore?
example code
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as Fd
from pyspark.sql.window import Window as W
spark_executor_cores = 8
spark_executor_memory = '3g'
spark_instances = 50
spark_shuffle_partitions = 1000
spark_default_parallelism = 1000
conf = SparkConf()
conf.setAppName("test application")
conf.set('spark.yarn.queue', 'default')
conf.set('spark.executor.memory', str(spark_executor_memory))
conf.set('spark.executor.instances', str(spark_instances))
conf.set('spark.shuffle.sort.bypassMergeThreshold', spark_instances * int(spark_executor_cores))
conf.set("spark.dynamicAllocation.enabled", "false")
conf.set("spark.sql.shuffle.partitions", str(spark_shuffle_partitions))
conf.set("spark.default.parallelism", str(spark_default_parallelism))
conf.set('spark.sql.adaptive.enabled', 'true')
conf.set('spark.sql.adaptive.coalescePartitions.enabled', 'true')
conf.set('spark.sql.adaptive.localShuffleReader.enabled', 'true')
conf.set('spark.sql.adaptive.skewJoin.enabled', 'true')
conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
conf.set('spark.sql.execution.arrow.pyspark.fallback.enabled', 'true')
# metastore_dir and metastore_url are defined elsewhere in the job
conf.set('spark.sql.warehouse.dir', metastore_dir)
conf.set('spark.hadoop.javax.jdo.option.ConnectionURL', metastore_url)
ss = SparkSession.builder.enableHiveSupport().config(conf=conf).getOrCreate()
sc = SparkContext.getOrCreate()
spark = ss
sql = spark.sql
# example query
# partition = hour, logtype
sql("""
SELECT <columns>,
...
...
FROM <table name>
WHERE hour = <hour>
AND logtype = <logtype>
GROUP BY logtype
""").show()
I am going to read tables from a MariaDB database using PySpark, and an error occurs while running the code below:
jdbcHostname = "localhost"
jdbcDatabase = "pucsl"
jdbcPort = 3307
jdbcUrl = "jdbc:mariadb://{0}:{1}/{2}?user={3}&password={4}".format(jdbcHostname, jdbcPort, jdbcDatabase, "root", "ravi")
df = spark.read.jdbc(url=jdbcUrl, table="m00_02_lic_lic_reln",properties={"driver": 'com.mariadb.jdbc.Driver'})
Currently Spark does not correctly recognize MariaDB-specific JDBC connection strings, so the jdbc:mysql syntax must be used. The following shows a simple PySpark script that queries the results from the ColumnStore UM server columnstore_1 into a Spark DataFrame:
from pyspark import SparkContext
from pyspark.sql import DataFrameReader, SQLContext
url = 'jdbc:mysql://columnstore_1:3306/test'
properties = {'user': 'root', 'driver': 'org.mariadb.jdbc.Driver'}
sc = SparkContext("local", "ColumnStore Simple Query Demo")
sqlContext = SQLContext(sc)
df = DataFrameReader(sqlContext).jdbc(url='%s' % url, table='results', properties=properties)
df.show()
P.S. I believe you have already added the MariaDB JAR in place (something like /spark3.1.2/lib/mariadbjar...).
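If it is not in place yet, here is a minimal sketch of supplying the connector JAR and reading the table from the question; the JAR path is a placeholder, and the URL uses the jdbc:mysql scheme as explained above.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("mariadb read")
         # placeholder path: point this at your MariaDB Connector/J jar
         .config("spark.jars", "/path/to/mariadb-java-client.jar")
         .getOrCreate())

jdbcUrl = "jdbc:mysql://localhost:3307/pucsl"
df = (spark.read.format("jdbc")
      .option("url", jdbcUrl)
      .option("driver", "org.mariadb.jdbc.Driver")
      .option("dbtable", "m00_02_lic_lic_reln")
      .option("user", "root")
      .option("password", "ravi")
      .load())
df.show()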
I wrote a Lambda function that connects to a MySQL database and fetches data, i.e. the number of users, but any real-time update is not picked up unless the connection is re-established.
And closing the connection inside lambda_handler before returning results in a connection error on the next call.
The query I am using is -> select count(*) from users
import os
import pymysql
import json
import logging
endpoint = os.environ.get('DBMS_endpoint')
username = os.environ.get('DBMS_username')
password = os.environ.get('DBMS_password')
database_name = os.environ.get('DBMS_name')
DBport = int(os.environ.get('DBMS_port'))
logger = logging.getLogger()
logger.setLevel(logging.INFO)
try:
    connection = pymysql.connect(host=endpoint, user=username, passwd=password, db=database_name, port=DBport)
    logger.info("SUCCESS: Connection to RDS mysql instance succeeded")
except:
    logger.error("ERROR: Unexpected error: Could not connect to MySql instance.")
def lambda_handler(event, context):
    try:
        cursor = connection.cursor()
        # ............some.work..........
        # ............work.saved..........
        cursor.close()
        connection.close()
        return .....
    except:
        print("ERROR")
The above code results in a connection error the second time it is used:
the first time it works fine and gives the output, but the second time I run the Lambda function it fails with a connection error.
If I remove this line ->
connection.close()
the code works fine, but the real-time data inserted into the DB is not fetched by the Lambda;
however, if I leave the Lambda unused for about 2 minutes and then call it again, the new value is fetched.
So, in order to fix this problem,
I placed the connect() inside lambda_handler, and the problem is solved; it now also fetches real-time data as soon as it is inserted.
import os
import pymysql
import json
import logging
endpoint = os.environ.get('DBMS_endpoint')
username = os.environ.get('DBMS_username')
password = os.environ.get('DBMS_password')
database_name = os.environ.get('DBMS_name')
DBport = int(os.environ.get('DBMS_port'))
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def lambda_handler(event, context):
    try:
        try:
            connection = pymysql.connect(host=endpoint, user=username, passwd=password, db=database_name, port=DBport)
        except:
            logger.error("ERROR: Unexpected error: Could not connect to MySql instance.")
        cursor = connection.cursor()
        # ............some.work..........
        # ............work.saved..........
        cursor.close()
        connection.close()
        return .....
    except:
        print("ERROR")
So, I want to know whether it is right to do this, or whether there is a better way to solve the problem. I have been trying to solve this for a few days; this solution works, but I am not sure whether it is good practice.
Will any problems occur if the number of connections to the database increases?
Or any other kind of resource problem?
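For what it is worth, below is a middle-ground sketch (an assumption on my part, not something tested against your setup): it keeps the module-level connection so warm invocations can reuse it, but validates it with pymysql's ping(reconnect=True) and commits after the read so each call sees freshly inserted rows.
import os
import logging
import pymysql

logger = logging.getLogger()
logger.setLevel(logging.INFO)

endpoint = os.environ.get('DBMS_endpoint')
username = os.environ.get('DBMS_username')
password = os.environ.get('DBMS_password')
database_name = os.environ.get('DBMS_name')
DBport = int(os.environ.get('DBMS_port'))

connection = None  # reused across warm invocations of the same container


def get_connection():
    """Return a live connection, reconnecting if the cached one went stale."""
    global connection
    if connection is None:
        connection = pymysql.connect(host=endpoint, user=username, passwd=password,
                                     db=database_name, port=DBport)
    else:
        connection.ping(reconnect=True)  # re-open the socket if MySQL closed it
    return connection


def lambda_handler(event, context):
    conn = get_connection()
    with conn.cursor() as cursor:
        cursor.execute("select count(*) from users")
        (count,) = cursor.fetchone()
    conn.commit()  # end the read transaction so the next call sees new rows
    return {"user_count": count}
The commit (or autocommit=True at connect time) is what would let a long-lived connection see rows inserted after it was opened; a connection left inside an open transaction keeps returning the same snapshot, which would also explain why your count only updated after a reconnect or a couple of idle minutes.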
I am using PySpark for Spark Streaming. I am able to stream and create the DataFrame properly with no issues. I was also able to insert data into an Impala table created with only a few (5) of the 72 columns present in the Kafka message. But when I create a new table with the proper data types and all of the columns, so that the DataFrame now has every column mentioned in the Kafka message, I get the exception below.
java.sql.SQLFeatureNotSupportedException: [Cloudera]JDBC Driver does not support this optional feature.
at com.cloudera.impala.exceptions.ExceptionConverter.toSQLException(Unknown Source)
at com.cloudera.impala.jdbc.common.SPreparedStatement.checkTypeSupported(Unknown Source)
at com.cloudera.impala.jdbc.common.SPreparedStatement.setNull(Unknown Source)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:627)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:782)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:782)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2064)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2064)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I have searched a lot on this but could not find any solution. I enabled debug logs as well, but they still do not say which feature the driver does not support.
Any help or proper guidance would be appreciated.
Thank you
Version details :
pyspark : 2.2.0
Kafka : 0.10.2
Cloudera : 5.15.0
Cloudera Impala : 2.12.0-cdh5.15.0
Cloudera Impala JDBC driver : 2.6.4
The code I have used:
import json
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import lit
from pyspark.sql.types import *
conf = SparkConf().setAppName("testkafkarecvstream")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)
spark = SparkSession.builder.appName("testkafkarecvstream").getOrCreate()
jdbcUrl = "jdbc:impala://hostname:21050/dbName;AuthMech=0;"
fields = [
StructField("column_name01", StringType(), True),
StructField("column_name02", StringType(), True),
StructField("column_name03", DoubleType(), True),
StructField("column_name04", StringType(), True),
StructField("column_name05", IntegerType(), True),
StructField("column_name06", StringType(), True),
# .....................
StructField("column_name72", StringType(), True),
]
schema = StructType(fields)
def make_rows(parts):
    customRow = Row(column_name01=datatype(parts['column_name01']),
                    # .....,
                    column_name72=datatype(parts['column_name72'])
                    )
    return customRow
def createDFToParquet(rdd):
    try:
        df = spark.createDataFrame(rdd, schema)
        df.show()
        df.write.jdbc(jdbcUrl,
                      table="table_name",
                      mode="append")
    except Exception as e:
        print(str(e))
zkNode = "zkNode_name:2181"
topic = "topic_name"
# Receiver method
kvs = KafkaUtils.createStream(ssc,
zkNode,
"consumer-group-id",
{topic:5},
{"auto.offset.reset" : "smallest"})
lines = kvs.map(lambda x: x[1])
conv = lines.map(lambda x: json.loads(x))
table = conv.map(make_rows)
table.foreachRDD(createDFToParquet)
table.pprint()
ssc.start()
ssc.awaitTermination()
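Since the trace dies inside SPreparedStatement.setNull, one probe (an assumption on my part, not a confirmed fix) is to fill the NULLs before the JDBC write and see whether the error disappears; a rough sketch reusing the df built in createDFToParquet:
def createDFToParquet(rdd):
    try:
        df = spark.createDataFrame(rdd, schema)
        # Probe only: replace NULLs before the JDBC write to check whether
        # PreparedStatement.setNull is the "optional feature" the driver rejects.
        probe_df = df.na.fill("").na.fill(0)
        probe_df.write.jdbc(jdbcUrl,
                            table="table_name",
                            mode="append")
    except Exception as e:
        print(str(e))
If the write succeeds with the filled copy, the unsupported feature is likely NULL handling for one of the 72 columns, which narrows down what to work around or report.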
I was trying to work with an Oracle database from Haskell and ran into the following problem.
Here is the code.
module Main where
import Database.HDBC
import Database.HDBC.ODBC
main :: IO ()
main = do
    let connectionString = "Driver={Microsoft ODBC for Oracle};Server=127.0.0.1;Uid=valera;Pwd=2562525;"
    let ioconn = connectODBC connectionString
    conn <- ioconn
    vals <- quickQuery conn "SELECT * FROM PERSONS_TEST" []
    print vals
    return ()
Pretty simple, huh? But that won't work. With this connection string the error is
*** Exception: SqlError {seState = "[\"HY090\"]", seNativeError = -1, seErrorMsg = "sqlGetInfo SQL_TXN_CAPABLE: [\"0: [Microsoft][ODBC driver for Oracle]\\65533...
and then \65533 repeats many times. And with this connection string
Provider=msdaora;Data Source=127.0.0.1;User Id=valera;Password=2562525;
the error is
*** Exception: SqlError {seState = "[\"IM002\"]", seNativeError = -1, seErrorMsg = "connectODBC/sqlDriverConnect: [\"0: [Microsoft][\\65533...
and \65533 repeats again until the end.
I suppose the problem is in the connection string, but I have tried a whole bunch of them (I used http://www.connectionstrings.com/).
I'm using Haskell Platform 2011.4.0.0, GHC 7.0.4, and Oracle Database XE 11.2 on Windows 7 64-bit. The Microsoft MDAC SDK is installed.
\65533 and so on are the characters of the ODBC driver's error message rendered in your locale (RU?). I find it best to develop on a system with an English locale, so error messages shown in the ghci console appear in English and can actually be read.