IllegalArgumentException: u'Wrong FS: file://spark-warehouse, expected: file:///' - jdbc

I am trying to load my Postgres database into Spark using PySpark:
from pyspark import SparkContext
from pyspark import SparkConf
from random import random
#spark conf
conf = SparkConf()
conf.setMaster("spark://spark-master:7077")
conf.setAppName('pyspark')
sc = SparkContext(conf=conf)
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
properties = {
"user": "postgres",
"password": "password123",
"driver": "org.postgresql.Driver"
}
url = "jdbc.postgresql://<POSTGRES_IP>/DB_NAME"
df = sqlContext.read.jdbc(url=url, table='myTable', properties=properties)
I get the following error, and I have no idea what it means.
/opt/spark/python/pyspark/sql/readwriter.pyc in jdbc(self, url, table, column, lowerBound, upperBound, numPartitions, predicates, properties)
420 jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
421 return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
--> 422 return self._df(self._jreader.jdbc(url, table, jprop))
423
424
/usr/local/lib/python2.7/dist-packages/py4j/java_gateway.pyc in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/opt/spark/python/pyspark/sql/utils.pyc in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: u'Wrong FS: file://spark-warehouse, expected: file:///'

Specifying an existing directory for the sql warehouse directory setting should solve your issue. For example at job launch time:
./bin/spark-submit --conf spark.sql.warehouse.dir=/tmp/ \
... # other options
your_file.py \
[application-arguments]

Related

Getting error for basic Eland example (loading index from a locally installed ELK docker container)

We installed ELK in docker based on this example. Like:
docker run -d --name elasticsearchdb --net es-stack-network -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:6.8.13
docker run -d --name kibana-es-ui --net es-stack-network -e "ELASTICSEARCH_URL=http://elasticsearchdb:9200" -p 5601:5601 kibana:6.8.13
We then set up Elastic with the basic built-in data sets, including the flights dataset offered by default.
Then we tried using Eland to pull that data into a dataframe, and I think we're following the documentation correctly.
But with the code:
import eland as ed
index_name = 'flights'
ed_df = ed.DataFrame('localhost:9200', index_name)
we get this error:
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\elastic_transport\client_utils.py:198, in url_to_node_config(url)
192 raise ValueError(f"Could not parse URL {url!r}") from None
194 if any(
195 component in (None, "")
196 for component in (parsed_url.scheme, parsed_url.host, parsed_url.port)
197 ):
--> 198 raise ValueError(
199 "URL must include a 'scheme', 'host', and 'port' component (ie 'https://localhost:9200')"
200 )
202 headers = {}
203 if parsed_url.auth:
ValueError: URL must include a 'scheme', 'host', and 'port' component (ie 'https://localhost:9200')
So when we add http://, like so:
import eland as ed
index_name = 'flights'
ed_df = ed.DataFrame('http://localhost:9200', index_name)
We get this error:
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\elastic_transport\_node\_http_urllib3.py:199, in Urllib3HttpNode.perform_request(self, method, target, body, headers, request_timeout)
191 err = ConnectionError(str(e), errors=(e,))
192 self._log_request(
193 method=method,
194 target=target,
(...)
197 exception=err,
198 )
--> 199 raise err from None
201 meta = ApiResponseMeta(
202 node=self.config,
203 duration=duration,
(...)
206 headers=response_headers,
207 )
208 self._log_request(
209 method=method,
210 target=target,
(...)
214 response=data,
215 )
ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: ProtocolError(('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))))
So I thought, well, maybe it's serving on HTTPS by default for some reason. It may not be related, but in the logs I saw:
05T17:17:04.734Z", "log.level": "WARN", "message":"received plaintext http traffic on an https channel, closing connection Netty4HttpChannel{localAddress=/172.18.0.3:9200, remoteAddress=/172.18.0.1:59788}", "ecs.version": "1.2.0","service.name":"ES_ECS","event.dataset":"elasticsearch.server","process.thread.name":"elasticsearch[21683dc12cff][transport_worker][T#14]","log.logger":"org.elasticsearch.xpack.security.transport.netty4.SecurityNetty4HttpServerTransport","elasticsearch.cluster.uuid":"XuzqXMk_QgShA3L5HnfXgw","elasticsearch.node.id":"H1CsKboeTyaFFjk2-1nw2w","elasticsearch.node.name":"21683dc12cff","elasticsearch.cluster.name":"docker-cluster"}
so I try replacing http with https and get this error:
TlsError: TLS error caused by: TlsError(TLS error caused by: SSLError([SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1129)))
So I look up this error and find this thread which says do something like:
import ssl
from elasticsearch.connection import create_ssl_context
ssl_context = create_ssl_context(<use `cafile`, or `cadata` or `capath` to set your CA or CAs)
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
es = Elasticsearch('localhost', ssl_context=context, timeout=60
But this isn't helpful because Eland handles elasticsearch instancing internally, I'm not controlling that.
This is a very basic scenario, so I'm sure the solution must be much simpler than all this. What can I do to make this work?
For whoever is still struggling, the following worked for me with a local Elastic cluster using Docker/docker-compose:
Following this guide you'd have the http_ca.crt file locally, using the command:
docker cp es01:/usr/share/elasticsearch/config/certs/http_ca.crt .
You can use the http_ca.crt file when creating your es_client:
from elasticsearch import Elasticsearch
es_client = Elasticsearch("https://localhost:9200",
ca_certs="/path/to/http_ca.crt",
basic_auth=("[elastic username]",
"[elastic password]"))
And then use the es_client to connect to eland:
import eland as ed
df = ed.DataFrame(es_client=es_client, es_index_pattern="[Your index]")
df.head()

Sagemaker Studio UnknownServiceError for Session

When I run the following code in Sagemaker Studio, I get the UnknownServiceError:
import boto3
import sagemaker
sagemaker_session = sagemaker.Session()
role = get_execution_role()
Any advice to overcome this would be greatly appreciated.
The error message is the following:
---------------------------------------------------------------------------
UnknownServiceError Traceback (most recent call last)
<ipython-input-47-2d2ae2d1e577> in <module>
1 import boto3
2 import sagemaker
----> 3 sagemaker_session = sagemaker.Session()
4 role = get_execution_role()
/opt/conda/lib/python3.7/site-packages/sagemaker/session.py in __init__(self, boto_session, sagemaker_client, sagemaker_runtime_client, sagemaker_featurestore_runtime_client, default_bucket)
124 sagemaker_client=sagemaker_client,
125 sagemaker_runtime_client=sagemaker_runtime_client,
--> 126 sagemaker_featurestore_runtime_client=sagemaker_featurestore_runtime_client,
127 )
128
/opt/conda/lib/python3.7/site-packages/sagemaker/session.py in _initialize(self, boto_session, sagemaker_client, sagemaker_runtime_client, sagemaker_featurestore_runtime_client)
164 else:
165 self.sagemaker_featurestore_runtime_client = self.boto_session.client(
--> 166 "sagemaker-featurestore-runtime"
167 )
168
/opt/conda/lib/python3.7/site-packages/boto3/session.py in client(self, service_name, region_name, api_version, use_ssl, verify, endpoint_url, aws_access_key_id, aws_secret_access_key, aws_session_token, config)
261 aws_access_key_id=aws_access_key_id,
262 aws_secret_access_key=aws_secret_access_key,
--> 263 aws_session_token=aws_session_token, config=config)
264
265 def resource(self, service_name, region_name=None, api_version=None,
/opt/conda/lib/python3.7/site-packages/botocore/session.py in create_client(self, service_name, region_name, api_version, use_ssl, verify, endpoint_url, aws_access_key_id, aws_secret_access_key, aws_session_token, config)
833 is_secure=use_ssl, endpoint_url=endpoint_url, verify=verify,
834 credentials=credentials, scoped_config=self.get_scoped_config(),
--> 835 client_config=config, api_version=api_version)
836 monitor = self._get_internal_component('monitor')
837 if monitor is not None:
/opt/conda/lib/python3.7/site-packages/botocore/client.py in create_client(self, service_name, region_name, is_secure, endpoint_url, verify, credentials, scoped_config, api_version, client_config)
76 'choose-service-name', service_name=service_name)
77 service_name = first_non_none_response(responses, default=service_name)
---> 78 service_model = self._load_service_model(service_name, api_version)
79 cls = self._create_client_class(service_name, service_model)
80 endpoint_bridge = ClientEndpointBridge(
/opt/conda/lib/python3.7/site-packages/botocore/client.py in _load_service_model(self, service_name, api_version)
114 def _load_service_model(self, service_name, api_version=None):
115 json_model = self._loader.load_service_model(service_name, 'service-2',
--> 116 api_version=api_version)
117 service_model = ServiceModel(json_model, service_name=service_name)
118 return service_model
/opt/conda/lib/python3.7/site-packages/botocore/loaders.py in _wrapper(self, *args, **kwargs)
130 if key in self._cache:
131 return self._cache[key]
--> 132 data = func(self, *args, **kwargs)
133 self._cache[key] = data
134 return data
/opt/conda/lib/python3.7/site-packages/botocore/loaders.py in load_service_model(self, service_name, type_name, api_version)
376 raise UnknownServiceError(
377 service_name=service_name,
--> 378 known_service_names=', '.join(sorted(known_services)))
379 if api_version is None:
380 api_version = self.determine_latest_version(
UnknownServiceError: Unknown service: 'sagemaker-featurestore-runtime'. Valid service names are: accessanalyzer, acm, acm-pca, alexaforbusiness, amplify, apigateway, apigatewaymanagementapi, apigatewayv2, appconfig, application-autoscaling, application-insights, appmesh, appstream, appsync, athena, autoscaling, autoscaling-plans, backup, batch, braket, budgets, ce, chime, cloud9, clouddirectory, cloudformation, cloudfront, cloudhsm, cloudhsmv2, cloudsearch, cloudsearchdomain, cloudtrail, cloudwatch, codeartifact, codebuild, codecommit, codedeploy, codeguru-reviewer, codeguruprofiler, codepipeline, codestar, codestar-connections, codestar-notifications, cognito-identity, cognito-idp, cognito-sync, comprehend, comprehendmedical, compute-optimizer, config, connect, connectparticipant, cur, dataexchange, datapipeline, datasync, dax, detective, devicefarm, directconnect, discovery, dlm, dms, docdb, ds, dynamodb, dynamodbstreams, ebs, ec2, ec2-instance-connect, ecr, ecs, efs, eks, elastic-inference, elasticache, elasticbeanstalk, elastictranscoder, elb, elbv2, emr, es, events, firehose, fms, forecast, forecastquery, frauddetector, fsx, gamelift, glacier, globalaccelerator, glue, greengrass, groundstation, guardduty, health, honeycode, iam, imagebuilder, importexport, inspector, iot, iot-data, iot-jobs-data, iot1click-devices, iot1click-projects, iotanalytics, iotevents, iotevents-data, iotsecuretunneling, iotsitewise, iotthingsgraph, ivs, kafka, kendra, kinesis, kinesis-video-archived-media, kinesis-video-media, kinesis-video-signaling, kinesisanalytics, kinesisanalyticsv2, kinesisvideo, kms, lakeformation, lambda, lex-models, lex-runtime, license-manager, lightsail, logs, machinelearning, macie, macie2, managedblockchain, marketplace-catalog, marketplace-entitlement, marketplacecommerceanalytics, mediaconnect, mediaconvert, medialive, mediapackage, mediapackage-vod, mediastore, mediastore-data, mediatailor, meteringmarketplace, mgh, migrationhub-config, mobile, mq, 
mturk, neptune, networkmanager, opsworks, opsworkscm, organizations, outposts, personalize, personalize-events, personalize-runtime, pi, pinpoint, pinpoint-email, pinpoint-sms-voice, polly, pricing, qldb, qldb-session, quicksight, ram, rds, rds-data, redshift, rekognition, resource-groups, resourcegroupstaggingapi, robomaker, route53, route53domains, route53resolver, s3, s3control, sagemaker, sagemaker-a2i-runtime, sagemaker-runtime, savingsplans, schemas, sdb, secretsmanager, securityhub, serverlessrepo, service-quotas, servicecatalog, servicediscovery, ses, sesv2, shield, signer, sms, sms-voice, snowball, sns, sqs, ssm, sso, sso-oidc, stepfunctions, storagegateway, sts, support, swf, synthetics, textract, transcribe, transfer, translate, waf, waf-regional, wafv2, workdocs, worklink, workmail, workmailmessageflow, workspaces, xray
I encountered the same issue and pip install sagemaker -U (to 2.20.0) resolved it for me, hopefully that's all you need to get around this.

How do I access SparkContext in Dataproc?

My goal is to use the elasticsearch-hadoop connector to load data directly into ES with pySpark.
I'm quite new to dataproc and pySpark and got stuck quite early.
I run a single-node cluster (Image 1.3, Debian 9, Hadoop 2.9, Spark 2.3)
and this is my code. I assume I need to install Java.
Thanks!
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit
import os
from pyspark.sql import SparkSession
def install_java():
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null #install openjdk
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" #set environment
variable
!java -version #check java version
install_java()
conf = SparkConf().setAppName("testing").setMaster('ip-address')
sc = SparkContext.getOrCreate()
ExceptionTraceback (most recent call last)
<ipython-input-18-df37a24b7514> in <module>()
----> 1 sc = SparkContext.getOrCreate()
/usr/lib/spark/python/pyspark/context.pyc in getOrCreate(cls, conf)
361 with SparkContext._lock:
362 if SparkContext._active_spark_context is None:
--> 363 SparkContext(conf=conf or SparkConf())
364 return SparkContext._active_spark_context
365
/usr/lib/spark/python/pyspark/context.pyc in __init__(self, master, appName, sparkHome,
pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
127 " note this option will be removed in Spark 3.0")
128
--> 129 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
130 try:
131 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize,
serializer,
/usr/lib/spark/python/pyspark/context.pyc in _ensure_initialized(cls, instance, gateway,
conf)
310 with SparkContext._lock:
311 if not SparkContext._gateway:
--> 312 SparkContext._gateway = gateway or launch_gateway(conf)
313 SparkContext._jvm = SparkContext._gateway.jvm
314
/usr/lib/spark/python/pyspark/java_gateway.pyc in launch_gateway(conf)
44 :return: a JVM gateway
45 """
---> 46 return _launch_gateway(conf)
47
48
/usr/lib/spark/python/pyspark/java_gateway.pyc in _launch_gateway(conf, insecure)
106
107 if not os.path.isfile(conn_info_file):
--> 108 raise Exception("Java gateway process exited before sending its port
number")
109
110 with open(conn_info_file, "rb") as info:
Exception: Java gateway process exited before sending its port number
Ok, solved, I needed to stop the current context before I create my new SparkContext.
sc.stop()

AttributeError: 'NoneType' object has no attribute 'ReadAsArray' when loading an image, what could be the cause?

I'm trying to build a convolutional neural network for image classification in Python.
I run my code on CoLab and have loaded my data on Google Drive.
I can see all the files and folders in my google drive from python, but when I try to actually load an image it gives me the error in the title.
I'm using the skimage.io package, I'm actually just running a notebook I found on kaggle so the code should run fine, only difference I noticed is that the kaggle user was probably not working on CoLab with his data in GoogleDrive so I think maybe that's the problem, anyway here's my code:
from skimage.io import imread
img=imread('/content/drive/My Drive/CoLab/Data/chest_xray/train/PNEUMONIA/person53_bacteria_255.jpeg')
Which gives me the following error:
AttributeError: 'NoneType' object has no attribute 'ReadAsArray'
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-12-4a64aebb8504> in <module>()
----> 1 img=imread('/content/drive/My Drive/CoLab/Data/chest_xray/train/PNEUMONIA/person53_bacteria_255.jpeg')
4 frames
/usr/local/lib/python3.6/dist-packages/skimage/io/_io.py in imread(fname, as_gray, plugin, flatten, **plugin_args)
59
60 with file_or_url_context(fname) as fname:
---> 61 img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
62
63 if not hasattr(img, 'ndim'):
/usr/local/lib/python3.6/dist-packages/skimage/io/manage_plugins.py in call_plugin(kind, *args, **kwargs)
208 (plugin, kind))
209
--> 210 return func(*args, **kwargs)
211
212
/usr/local/lib/python3.6/dist-packages/imageio/core/functions.py in imread(uri, format, **kwargs)
221 reader = read(uri, format, "i", **kwargs)
222 with reader:
--> 223 return reader.get_data(0)
224
225
/usr/local/lib/python3.6/dist-packages/imageio/core/format.py in get_data(self, index, **kwargs)
345 self._checkClosed()
346 self._BaseReaderWriter_last_index = index
--> 347 im, meta = self._get_data(index, **kwargs)
348 return Array(im, meta) # Array tests im and meta
349
/usr/local/lib/python3.6/dist-packages/imageio/plugins/gdal.py in _get_data(self, index)
64 if index != 0:
65 raise IndexError("Gdal file contains only one dataset")
---> 66 return self._ds.ReadAsArray(), self._get_meta_data(index)
67
68 def _get_meta_data(self, index):
AttributeError: 'NoneType' object has no attribute 'ReadAsArray'
First, instead of My Drive it should be MyDrive (no space).
If it still doesn't work, you can try the following:
%cd /content/drive/MyDrive/CoLab/Data/chest_xray/train/PNEUMONIA
img=imread('person53_bacteria_255.jpeg')

Geopandas save shapefile in memory using bytesIO or equivalent (python3X)

Imagine that I am manipulating a shapefile in geopandas. I then want to load it using another library (like networkx), but since my file is large I don't want to have to save and reload it. Is there a way I can save it in memory? I imagine it would look something like this:
import geopandas
from io import BytesIO
writeBytes = BytesIO()
### load the demo
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
### do something trivial to the demo
world['geometry'] = world['geometry'].buffer(0.05)
### save to bytes IO so that I can do something else with it without having to save and read a file
world.to_file(writeBytes)
Running the above yields a TypeError: expected str, bytes or os.PathLike object, not _io.BytesIO
This is the full traceback:
TypeError Traceback (most recent call last)
<ipython-input-1-1ba22f23181a> in <module>
8 world['geometry'] = world['geometry'].buffer(0.05)
9 ### save to bytes IO so that I can do something else with it without having to save and read a file
---> 10 world.to_file(writeBytes)
~/.conda/envs/geopandas/lib/python3.7/site-packages/geopandas/geodataframe.py in to_file(self, filename, driver, schema, **kwargs)
427 """
428 from geopandas.io.file import to_file
--> 429 to_file(self, filename, driver, schema, **kwargs)
430
431 def to_crs(self, crs=None, epsg=None, inplace=False):
~/.conda/envs/geopandas/lib/python3.7/site-packages/geopandas/io/file.py in to_file(df, filename, driver, schema, **kwargs)
125 if schema is None:
126 schema = infer_schema(df)
--> 127 filename = os.path.abspath(os.path.expanduser(filename))
128 with fiona_env():
129 with fiona.open(filename, 'w', driver=driver, crs=df.crs,
~/.conda/envs/geopandas/lib/python3.7/posixpath.py in expanduser(path)
233 """Expand ~ and ~user constructions. If user or $HOME is unknown,
234 do nothing."""
--> 235 path = os.fspath(path)
236 if isinstance(path, bytes):
237 tilde = b'~'
Any assistance is appreciated, Thank You
geopandas.to_file() requires a file path, not a BytesIO object.
Use a temporary file (or folder for shape files)
For example: https://stackoverflow.com/a/70254174/2023941

Resources