Unable to read json file from S3 using local PySpark - hadoop

I am trying to read a JSON file from S3 using PySpark locally. Here is the code:
import os
import configparser
from pyspark.sql import SparkSession

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell'

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
access_key = config.get("default", "aws_access_key_id")
secret_key = config.get("default", "aws_secret_access_key")
session_token = config.get("default", "aws_session_token")

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("ReadGoogleTrendsData") \
        .master("local[1]") \
        .getOrCreate()

    sc = spark.sparkContext

    # hadoop_conf = sc._jsc.hadoopConfiguration()
    # hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
    # hadoop_conf.set("fs.s3n.awsAccessKeyId", access_key)
    # hadoop_conf.set("fs.s3n.awsSecretAccessKey", secret_key)
    # hadoop_conf.set("fs.s3n.awsSessionToken", session_token)

    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
    hadoop_conf.set("fs.s3a.access.key", access_key)
    hadoop_conf.set("fs.s3a.secret.key", secret_key)
    hadoop_conf.set("fs.s3a.session.token", session_token)
    hadoop_conf.set("fs.s3a.connection.ssl.enabled", "true")
    hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

    df = spark.read.json("s3a://gamma-rp/trends-data/trends-2021-07-01.json")
    df.printSchema()
    df.show(5)
I get the following error when I run this
File "/Users/rppatwa/Downloads/Spark/spark-2.4.5-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o37.json.
: com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 403, AWS Service: Amazon S3, AWS Request ID: NB1WXDBQJ5S848AM, AWS Error Code: null, AWS Error Message: Forbidden, S3 Extended Request ID: HeI/luU5fMpN1dehMo9+yDzMAXin2j7AZPo3STRqbvx56rDcUotMdze08cJz08s7P581ATdRmck=
at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:798)
at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:421)
at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:232)
at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3528)
at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:976)
at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:956)
at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:892)
at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:77)
at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1426)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:557)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:392)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:834)
I looked up this error, and it seems I do have permission to the bucket, since I am able to use the AWS CLI cp command to download a file to my local machine.
Can you please let me know what I am missing here? Thanks so much!

It seems the issue is with your configuration. I had a similar issue before, but the following job is able to save data to and read data from an S3 bucket successfully.
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, TimestampType

spark = SparkSession \
    .builder \
    .appName("SparkExample") \
    .getOrCreate()

spark_context = spark.sparkContext
spark_context._jsc.hadoopConfiguration().set("fs.s3a.access.key", '<KEY>')
spark_context._jsc.hadoopConfiguration().set("fs.s3a.secret.key", 'SECRET_KEY')
spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MILLIS")

schema = StructType().add("_id", StringType()) \
    .add("employer", StringType()) \
    .add("created_at", TimestampType()) \
    .add("name", StringType())

employees = [{'_id': 1,
              'employer': 'Microsoft',
              'created_at': datetime.now(),
              'name': 'Noel'},
             {'_id': 2,
              'employer': 'Apple',
              'created_at': datetime.now(),
              'name': 'Steve'},
             ]

df = spark.createDataFrame(employees, schema=schema)

df.write \
    .format("json") \
    .mode("append") \
    .save("s3a://<YOUR BUCKET>/employeesjson")

collect = spark.read.format("json").load(
    "s3a://<YOUR BUCKET>/employeesjson").collect()
print(len(collect))

I was able to resolve this issue by updating Spark from the Hadoop 2.7 build to the Hadoop 3.2 build.
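For anyone who lands here with the same 403 while using temporary credentials: below is a minimal sketch of the equivalent setup on a Hadoop 3.x build (the package version, placeholders, and provider class are my additions, not part of the original answer). As far as I know, the S3A connector in Hadoop 2.7 ignores fs.s3a.session.token, while Hadoop 2.8+/3.x supports session tokens via TemporaryAWSCredentialsProvider, which would explain why switching builds fixed it.

import os
from pyspark.sql import SparkSession

# hadoop-aws must match the Hadoop version bundled with the Spark build (assumed 3.2.x here)
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:3.2.0" pyspark-shell'

spark = SparkSession.builder.appName("ReadGoogleTrendsData").master("local[1]").getOrCreate()

hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", "<ACCESS_KEY>")
hadoop_conf.set("fs.s3a.secret.key", "<SECRET_KEY>")
hadoop_conf.set("fs.s3a.session.token", "<SESSION_TOKEN>")
# Tell S3A to use the temporary (STS) credentials provider
hadoop_conf.set("fs.s3a.aws.credentials.provider",
                "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")

df = spark.read.json("s3a://gamma-rp/trends-data/trends-2021-07-01.json")
df.show(5)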

Related

PySpark with io.github.spark-redshift-community: BasicAWSCredentialsProvider not found

I'm trying to load data from my Redshift database using PySpark.
I'm using "io.github.spark-redshift-community" as the connector. It requires a "tempdir" parameter pointing to an S3 location. My code looks like the following:
import findspark
findspark.add_packages("io.github.spark-redshift-community:spark-redshift_2.12:5.0.3")
findspark.add_packages("com.amazonaws:aws-java-sdk-bundle:1.12.262")
findspark.add_packages("org.apache.hadoop:hadoop-aws:3.3.4")
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[8]").appName("Dim_Customer").getOrCreate()

spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", S3_ACCESS_KEY)
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", S3_SECRET_KEY)
spark._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark._jsc.hadoopConfiguration().set("com.amazonaws.services.s3a.enableV4", "true")
spark._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider")
spark._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "true")

df_read_1 = spark.read \
    .format("io.github.spark_redshift_community.spark.redshift") \
    .option("url", "jdbc:redshift://IP/DATABASE?user=USER&password=PASS") \
    .option("dbtable", "table") \
    .option("tempdir", "s3a://url/") \
    .option("forward_spark_s3_credentials", "true") \
    .load()
But I'm getting an error: Class org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider not found
I've found some sources saying to change BasicAWSCredentialsProvider to SimpleAWSCredentialsProvider, but then I get another error: NoSuchMethodError.
Could someone help me, please?
Is there a problem with the Hadoop and aws-java-sdk versions?
Thank you in advance!
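For what it's worth, here is a minimal sketch of the S3A credentials setup using the static-key provider class that actually ships in hadoop-aws 3.x (org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider). This is an illustration rather than a verified fix, and a NoSuchMethodError usually points at mismatched hadoop-aws and aws-java-sdk-bundle versions, so the versions below (which pair with Hadoop 3.3.4) are only an assumption.

import findspark
# hadoop-aws 3.3.4 was built against aws-java-sdk-bundle 1.12.262; the two must match
findspark.add_packages("org.apache.hadoop:hadoop-aws:3.3.4")
findspark.add_packages("com.amazonaws:aws-java-sdk-bundle:1.12.262")
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[8]").appName("Dim_Customer").getOrCreate()

conf = spark._jsc.hadoopConfiguration()
conf.set("fs.s3a.access.key", "<S3_ACCESS_KEY>")
conf.set("fs.s3a.secret.key", "<S3_SECRET_KEY>")
# In hadoop-aws 3.x the static-key provider is SimpleAWSCredentialsProvider,
# not BasicAWSCredentialsProvider
conf.set("fs.s3a.aws.credentials.provider",
         "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")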

Problem connecting remote PrestoDB from local machine using pyspark

I'm trying to connect to a Presto DB installed on a remote server from my local Mac using PySpark; below is my code. I have downloaded the Presto driver and placed it under /user/name//Hadoop/spark-2.3.1-bin-hadoop2.7/jars (I guess this is where I'm making a mistake, but I'm not sure).
from pyspark.sql import SparkSession, HiveContext
from pyhive import presto, hive

def main():
    spark = SparkSession.builder\
        .appName("tests")\
        .enableHiveSupport()\
        .getOrCreate()

    df_presto = spark.read.format("jdbc") \
        .option("driver", "io.prestosql.jdbc.PrestoDriver")\
        .option("url", "jdbc:presto://host.com:443/hive") \
        .option("user", "user_name")\
        .option("password", "password") \
        .option("dbtable", "(select column from table_name limit 10) tmp") \
        .load()
Presto driver: presto-jdbc-340.jar
When I try to execute the code, I get the error below:
Traceback (most recent call last):
File "/Users/user_name/Hadoop/spark-2.3.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/Users/user_name/Hadoop/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o38.load.
: org.apache.spark.sql.AnalysisException: java.lang.RuntimeException: java.lang.IllegalArgumentException: java.net.UnknownHostException: ip-10-120-99-149.ec2.internal;
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:106)
at org.apache.spark.sql.hive.HiveExternalCatalog.databaseExists(HiveExternalCatalog.scala:194)
Any idea how I can fix this?
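Not a verified fix, but the stack trace fails inside HiveExternalCatalog, which is only involved because of enableHiveSupport(); in other words, Spark is trying to reach a Hive metastore host before it ever touches Presto. A sketch of what I would try (the jar path is an assumption): attach the Presto JDBC jar explicitly and drop Hive support.

from pyspark.sql import SparkSession

# Hypothetical local path to the downloaded driver jar
presto_jar = "/Users/name/Hadoop/presto-jdbc-340.jar"

spark = SparkSession.builder \
    .appName("tests") \
    .config("spark.jars", presto_jar) \
    .getOrCreate()

df_presto = spark.read.format("jdbc") \
    .option("driver", "io.prestosql.jdbc.PrestoDriver") \
    .option("url", "jdbc:presto://host.com:443/hive") \
    .option("user", "user_name") \
    .option("password", "password") \
    .option("dbtable", "(select column from table_name limit 10) tmp") \
    .load()

df_presto.show()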

PySpark: How do I solve 'python worker failed to connect back' error when using pyproj package in Pandas UDF? (Converting lat/long to UTM coordinates)

I have a json file with lat/long coordinates, which I try to convert to UTM ("x", "y") in PySpark.
The .json file looks like this:
{"positionmessage":{"latitude": 51.822872161865234,"longitude": 4.905614852905273}}
{"positionmessage":{"latitude": 51.819644927978516, "longitude": 4.961687088012695}}
I read the JSON file in PySpark and try to convert to UTM ('x', 'y' coordinates) with the following script:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType, DateType, FloatType, TimestampType, DoubleType
from pyspark.sql.functions import *

appName = "PySpark"
master = "local"
file_name = "lat_lon.JSON"

# Create Spark session
spark = SparkSession.builder \
    .appName(appName) \
    .master(master) \
    .getOrCreate()

schema = StructType([
    StructField("positionmessage",
                StructType([
                    StructField('latitude', DoubleType(), True),
                    StructField('longitude', DoubleType(), True),
                ]))])

df = spark.read.schema(schema).json(file_name).select("positionmessage.*")
Up to here there is no problem; the problem arises when I try to convert to UTM coordinates using the pyproj package (which worked in Pandas).
from pyspark.sql.functions import array, pandas_udf, PandasUDFType
from pyproj import Proj
from pandas import Series

# using the 'pandas_udf' decorator to wrap the function
@pandas_udf('array<double>', PandasUDFType.SCALAR)
def get_utm(x):
    pp = Proj(proj='utm', zone=31, ellps='WGS84', preserve_units=False)
    return Series([pp(e[0], e[1]) for e in x])

df = df.withColumn('utm', get_utm(array('longitude', 'latitude'))) \
    .selectExpr("*", "utm[0] as X", "utm[1] as Y")
df.show()
I get the error "python worker failed to connect back", but there does not seem to be a problem with the code itself. What could the problem be?
You can use a plain UDF rather than a Pandas UDF:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

@udf(returnType=ArrayType(DoubleType()))
def get_utm(long, lat):
    pp = Proj(proj='utm', zone=31, ellps='WGS84', preserve_units=False)
    return pp(long, lat)

result = df.withColumn('utm', get_utm('longitude', 'latitude')).selectExpr("*", "utm[0] as X", "utm[1] as Y")
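A quick sanity check of the plain-UDF version, assuming the same two-row sample file from the question:

result.select("longitude", "latitude", "X", "Y").show(truncate=False)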

Passing external yml file in my spark-job/code not working throwing "Can't construct a java object for tag:yaml.org,2002"

I am using Spark 2.4.1 and Java 8. I am trying to load an external property file while submitting my Spark job using spark-submit.
I am using the Typesafe config dependency below to load my property file:
<dependency>
    <groupId>com.typesafe</groupId>
    <artifactId>config</artifactId>
    <version>1.3.1</version>
</dependency>
In my spark driver class MyDriver.java I am loading the YML file as below
String ymlFilename = args[1].toString();
Optional<QueryEntities> entities = InputYamlProcessor.process(ymlFilename);
I have all code here including InputYamlProcessor.java
https://gist.github.com/BdLearnerr/e4c47c5f1dded951b18844b278ea3441
This works fine locally, but when I run it on the cluster I get the error below.
Error:
Can't construct a java object for tag:yaml.org,2002:com.snp.yml.QueryEntities; exception=Class not found: com.snp.yml.QueryEntities
in 'reader', line 1, column 1:
entities:
^
at org.yaml.snakeyaml.constructor.Constructor$ConstructYamlObject.construct(Constructor.java:345)
at org.yaml.snakeyaml.constructor.BaseConstructor.getSingleData(BaseConstructor.java:127)
at org.yaml.snakeyaml.Yaml.loadFromReader(Yaml.java:450)
at org.yaml.snakeyaml.Yaml.loadAs(Yaml.java:444)
at com.snp.yml.InputYamlProcessor.process(InputYamlProcessor.java:62)
Caused by: org.yaml.snakeyaml.error.YAMLException: Class not found: com.snp.yml.QueryEntities
at org.yaml.snakeyaml.constructor.Constructor.getClassForNode(Constructor.java:650)
at org.yaml.snakeyaml.constructor.Constructor$ConstructYamlObject.getConstructor(Constructor.java:331)
at org.yaml.snakeyaml.constructor.Constructor$ConstructYamlObject.construct(Constructor.java:341)
... 12 more
My spark job script is
$SPARK_HOME/bin/spark-submit \
--master yarn \
--deploy-mode cluster \
--name MyDriver \
--jars "/local/jars/*.jar" \
--files hdfs://files/application-cloud-dev.properties,hdfs://files/column_family_condition.yml \
--class com.sp.MyDriver \
--executor-cores 3 \
--executor-memory 9g \
--num-executors 5 \
--driver-cores 2 \
--driver-memory 4g \
--driver-java-options -Dconfig.file=./application-cloud-dev.properties \
--conf spark.executor.extraJavaOptions=-Dconfig.file=./application-cloud-dev.properties \
--conf spark.driver.extraClassPath=. \
--driver-class-path . \
ca-datamigration-0.0.1.jar application-cloud-dev.properties column_family_condition.yml
What am I doing wrong here? How can I fix this issue?
Any help is highly appreciated.
Tested:
I printed something like the following inside the class, just before the line where the error above occurs, to check whether the issue really is a missing class.
public static void printTest() {
    QueryEntity e1 = new QueryEntity();
    e1.setTableName("tab1");
    List<QueryEntity> li = new ArrayList<QueryEntity>();
    li.add(e1);
    QueryEntities ll = new QueryEntities();
    ll.setEntitiesList(li);
    ll.getEntitiesList().stream().forEach(e -> logger.error("e1 Name :" + e.getTableName()));
    return;
}
Output :
19/09/18 04:40:33 ERROR yml.InputYamlProcessor: e1 Name :tab1
Can't construct a java object for tag:yaml.org,2002:com.snp.helpers.QueryEntities; exception=Class not found: com.snp.helpers.QueryEntities
in 'reader', line 1, column 1:
entitiesList:
at org.yaml.snakeyaml.constructor.Constructor$ConstructYamlObject.construct(Constructor.java:345)
What is wrong here?
This has nothing to do with QueryEntities itself; the YAMLException: Class not found: com.snp.yml.QueryEntities is a SnakeYAML constructor/classloader issue.
I changed from
/* Constructor constructor = new Constructor(com.snp.helpers.QueryEntities.class);
Yaml yaml = new Yaml(constructor); */
to
Yaml yaml = new Yaml(new CustomClassLoaderConstructor(com.snp.helpers.QueryEntities.class.getClassLoader()));

How to load Impala table directly to Spark using JDBC?

I am trying to write a Spark job in Python that opens a JDBC connection to Impala and loads a VIEW directly from Impala into a DataFrame. This question is pretty close, but it is in Scala: Calling JDBC to impala/hive from within a spark job and creating a table
How do I do this? There are plenty of examples for other datasources such as MySQL, PostgreSQL, etc. but I haven't seen one for Impala + Python + Kerberos. An example would be of great help. Thank you!
I tried this with information from the web, but it didn't work.
Spark notebook:
#!/bin/bash
export PYSPARK_PYTHON=/home/anave/anaconda2/bin/python
export HADOOP_CONF_DIR=/etc/hive/conf
export PYSPARK_DRIVER_PYTHON=/home/anave/anaconda2/bin/ipython
export PYSPARK_DRIVER_PYTHON_OPTS='notebook --ip=* --no-browser'
# use Java8
export JAVA_HOME=/usr/java/latest
export PATH=$JAVA_HOME/bin:$PATH
# JDBC Drivers for Impala
export CLASSPATH=/home/anave/impala_jdbc_2.5.30.1049/Cloudera_ImpalaJDBC41_2.5.30/*.jar:$CLASSPATH
export JDBC_PATH=/home/anave/impala_jdbc_2.5.30.1049/Cloudera_ImpalaJDBC41_2.5.30
# --jars $SRCDIR/spark-csv-assembly-1.4.0-SNAPSHOT.jar \
# --conf spark.sql.parquet.binaryAsString=true \
# --conf spark.sql.hive.convertMetastoreParquet=false
pyspark --master yarn-client \
--driver-memory 4G \
--executor-memory 2G \
# --num-executors 10 \
--jars /home/anave/spark-csv_2.11-1.4.0.jar $JDBC_PATH/*.jar
--driver-class-path $JDBC_PATH/*.jar
Python Code
properties = {
    "driver": "com.cloudera.impala.jdbc41.Driver",
    "AuthMech": "1",
    # "KrbRealm": "EXAMPLE.COM",
    # "KrbHostFQDN": "impala.example.com",
    "KrbServiceName": "impala"
}

# imp_env is the hostname of the db; works with other Impala queries run inside Python
url = "jdbc:impala:imp_env;auth=noSasl"
db_df = sqlContext.read.jdbc(url=url, table='summary', properties=properties)
I received this error msg (Full Error Log):
Py4JJavaError: An error occurred while calling o42.jdbc.
: java.lang.ClassNotFoundException: com.cloudera.impala.jdbc41.Driver
You can use
--jars $(echo /dir/of/jars/*.jar | tr ' ' ',')
instead of
--jars /home/anave/spark-csv_2.11-1.4.0.jar $JDBC_PATH/*.jar
since --jars expects a comma-separated list rather than space-separated paths. For another approach, please see my answer.
The first approach is to run the impala_jdbc_connection.py script below with spark-submit, for example:
spark-submit --driver-class-path /opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/jars/ImpalaJDBC41.jar --jars /opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/jars/ImpalaJDBC41.jar --class com.cloudera.impala.jdbc41.Driver impala_jdbc_connection.py
impala_jdbc_connection.py:
from pyspark.sql import SparkSession

properties = {
    "driver": "com.cloudera.impala.jdbc41.Driver"
}

# initialize the Spark session
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "jar-packages-list")
    .config("spark.sql.warehouse.dir", "hdfs://dwh-hdp-node01.dev.ergo.liferunoffinsuranceplatform.com:8020/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

db_df = spark.read.jdbc(url='jdbc:impala://host_ip_address:21050/database_name', table='table_name', properties=properties)
db_df.show()
The second approach is not a direct import from Impala into Spark, but rather a conversion of the query results to a Spark DataFrame.
pip install impyla (source: https://github.com/cloudera/impyla)
Connect to Impala, fetch the results from the Impala database, and convert the result to a Spark DataFrame:
from impala.dbapi import connect

conn = connect(host='IP_ADDRESS_OF_HOST', port=21050)
cursor = conn.cursor()
cursor.execute('select * from database.table')
res = cursor.fetchall()  # convert res to spark dataframe
for data in res:
    print(data)
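To finish the second approach, here is a minimal sketch (not part of the original answer) of turning the fetched rows into a Spark DataFrame, taking the column names from the DBAPI cursor metadata:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ImpylaToSpark").getOrCreate()

# cursor.description is standard DBAPI metadata: one tuple per column, name first
columns = [desc[0] for desc in cursor.description]
spark_df = spark.createDataFrame(res, schema=columns)
spark_df.show()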
I did this in an Azure Databricks notebook after setting up the jar in the cluster libraries. I generally followed the previous post, except that the "d" is upper case in the "Driver" config key. Worked great.
properties = {
    "Driver": "com.cloudera.impala.jdbc41.Driver"
}
db_df = spark.read.jdbc(url='jdbc:impala://hostname.domain.net:21050/dbname;AuthMech=3;UID=xxxx;PWD=xxxx', table='product', properties=properties)
db_df.show()
This works for me:
spark-shell --driver-class-path ImpalaJDBC41.jar --jars ImpalaJDBC41.jar
val jdbcURL = s"jdbc:impala://192.168.56.101:21050;AuthMech=0"
val connectionProperties = new java.util.Properties()
val hbaseDF = sqlContext.read.jdbc(jdbcURL, "impala_table", connectionProperties)
