How do I access SparkContext in Dataproc? - elasticsearch

My goal is to use the elasticsearch-hadoop connector to load data directly into ES with pySpark.
I'm quite new to dataproc and pySpark and got stuck quite early.
I run a single node cluster (Image 1.3 ,Debian 9,Hadoop 2.9,Spark 2.3)
and this my code. I assume I need to install Java.
Thanks!
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit
import os
from pyspark.sql import SparkSession
def install_java():
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null #install openjdk
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" #set environment
variable
!java -version #check java version
install_java()
conf = SparkConf().setAppName("testing").setMaster('ip-address')
sc = SparkContext.getOrCreate()
ExceptionTraceback (most recent call last)
<ipython-input-18-df37a24b7514> in <module>()
----> 1 sc = SparkContext.getOrCreate()
/usr/lib/spark/python/pyspark/context.pyc in getOrCreate(cls, conf)
361 with SparkContext._lock:
362 if SparkContext._active_spark_context is None:
--> 363 SparkContext(conf=conf or SparkConf())
364 return SparkContext._active_spark_context
365
/usr/lib/spark/python/pyspark/context.pyc in __init__(self, master, appName, sparkHome,
pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
127 " note this option will be removed in Spark 3.0")
128
--> 129 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
130 try:
131 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize,
serializer,
/usr/lib/spark/python/pyspark/context.pyc in _ensure_initialized(cls, instance, gateway,
conf)
310 with SparkContext._lock:
311 if not SparkContext._gateway:
--> 312 SparkContext._gateway = gateway or launch_gateway(conf)
313 SparkContext._jvm = SparkContext._gateway.jvm
314
/usr/lib/spark/python/pyspark/java_gateway.pyc in launch_gateway(conf)
44 :return: a JVM gateway
45 """
---> 46 return _launch_gateway(conf)
47
48
/usr/lib/spark/python/pyspark/java_gateway.pyc in _launch_gateway(conf, insecure)
106
107 if not os.path.isfile(conn_info_file):
--> 108 raise Exception("Java gateway process exited before sending its port
number")
109
110 with open(conn_info_file, "rb") as info:
Exception: Java gateway process exited before sending its port number

Ok, solved, I needed to stop the current context before I create my new SparkContext.
sc.stop()

Related

How to fix ModuleNotFoundError: No module named 'binance'?

From Jupyter Notebook I ran pip install binance. Running from binance.client import Client gives the error above. I have renamed the binance.py file as mentioned in similar questions however I'm still getting the error. I haven't installed for one version of python while trying to run my code with another as mentioned in another question. Trying pip uninstall gives "WARNING: Skipping binance as it is not installed.".
How can I get the python-binance package to work?
Edit: Following Wayne's comment I tried %conda install -c conda-forge python-binance and encounter a new error when trying to import: No module named 'importlib.readers'
Edit 2: conda list and pip list both run without errors.
My traceback:
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[2], line 1
----> 1 from binance.client import Client
File ~\anaconda3\envs\py3\lib\site-packages\binance\__init__.py:9
1 """An unofficial Python wrapper for the Binance exchange API v3
2
3 .. moduleauthor:: Sam McHardy
4
5 """
7 __version__ = '1.0.16'
----> 9 from binance.client import Client, AsyncClient # noqa
10 from binance.depthcache import DepthCacheManager, OptionsDepthCacheManager, ThreadedDepthCacheManager # noqa
11 from binance.streams import BinanceSocketManager, ThreadedWebsocketManager # noqa
File ~\anaconda3\envs\py3\lib\site-packages\binance\client.py:7
5 import hashlib
6 import hmac
----> 7 import requests
8 import time
9 from operator import itemgetter
File ~\anaconda3\envs\py3\lib\site-packages\requests\__init__.py:147
144 import logging
145 from logging import NullHandler
--> 147 from . import packages, utils
148 from .__version__ import (
149 __author__,
150 __author_email__,
(...)
158 __version__,
159 )
160 from .api import delete, get, head, options, patch, post, put, request
File ~\anaconda3\envs\py3\lib\site-packages\requests\utils.py:58
54 from .structures import CaseInsensitiveDict
56 NETRC_FILES = (".netrc", "_netrc")
---> 58 DEFAULT_CA_BUNDLE_PATH = certs.where()
60 DEFAULT_PORTS = {"http": 80, "https": 443}
62 # Ensure that ', ' is used to preserve previous delimiter behavior.
File ~\anaconda3\envs\py3\lib\site-packages\certifi\core.py:71, in where()
58 global _CACERT_PATH
59 if _CACERT_PATH is None:
60 # This is slightly janky, the importlib.resources API wants you
61 # to manage the cleanup of this file, so it doesn't actually
(...)
69 # it will do the cleanup whenever it gets garbage collected, so
70 # we will also store that at the global level as well.
---> 71 _CACERT_CTX = get_path("certifi", "cacert.pem")
72 _CACERT_PATH = str(_CACERT_CTX.__enter__())
74 return _CACERT_PATH
File ~\anaconda3\envs\py3\lib\importlib\resources.py:119, in path(package, resource)
112 else:
113 return BytesIO(data)
116 def open_text(package: Package,
117 resource: Resource,
118 encoding: str = 'utf-8',
--> 119 errors: str = 'strict') -> TextIO:
120 """Return a file-like object opened for text reading of the resource."""
121 resource = _normalize_path(resource)
File ~\anaconda3\envs\py3\lib\importlib\_common.py:52, in get_resource_reader(package)
ModuleNotFoundError: No module named 'importlib.readers'
As suggested in the question comments, my problem was inconsistency of installed packages due to using pip instead of conda. Uninstalling and reinstalling Anaconda fixed the module not found error.

trainer.train() in Kaggle: StdinNotImplementedError: getpass was called, but this frontend does not support input requests

When saving a version in Kaggle, I get StdinNotImplementedError: getpass was called, but this frontend does not support input requests whenever I use the Transformers.Trainer class. The general code I use:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(params)
trainer = Trainer(params)
trainer.train()
And the specific cell I am running now:
from transformers import Trainer, TrainingArguments,EarlyStoppingCallback
early_stopping = EarlyStoppingCallback()
training_args = TrainingArguments(
output_dir=OUT_FINETUNED_MODEL_PATH,
num_train_epochs=20,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
warmup_steps=0,
weight_decay=0.01,
logging_dir='./logs',
logging_steps=100,
evaluation_strategy="steps",
eval_steps=100,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=val_dataset,
callbacks=[early_stopping]
)
trainer.train()
When trainer.train() is called, I get the error below, which I do not get if I train with native PyTorch. I understood that the error arises since I am asked to input a password, but no password is asked when using native PyTorch code, nor when using the same code with trainer.train() on Google Colab.
Any solution would be ok, like:
Avoid being asked the password.
Enable input requests when saving a notebook on Kaggle. After that, if I understood correctly, I would need to go to https://wandb.ai/authorize (after having created an account) and copy the generated key to console. However, I do not understand why wandb should be necessary since I never explicitly used it so far.
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_init.py", line 741, in init
wi.setup(kwargs)
File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_init.py", line 155, in setup
wandb_login._login(anonymous=anonymous, force=force, _disable_warning=True)
File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_login.py", line 210, in _login
wlogin.prompt_api_key()
File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_login.py", line 144, in prompt_api_key
no_create=self._settings.force,
File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/lib/apikey.py", line 135, in prompt_api_key
key = input_callback(api_ask).strip()
File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 825, in getpass
"getpass was called, but this frontend does not support input requests."
IPython.core.error.StdinNotImplementedError: getpass was called, but this frontend does not support input requests.
wandb: ERROR Abnormal program exit
---------------------------------------------------------------------------
StdinNotImplementedError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_init.py in init(job_type, dir, config, project, entity, reinit, tags, group, name, notes, magic, config_exclude_keys, config_include_keys, anonymous, mode, allow_val_change, resume, force, tensorboard, sync_tensorboard, monitor_gym, save_code, id, settings)
740 wi = _WandbInit()
--> 741 wi.setup(kwargs)
742 except_exit = wi.settings._except_exit
/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_init.py in setup(self, kwargs)
154 if not settings._offline and not settings._noop:
--> 155 wandb_login._login(anonymous=anonymous, force=force, _disable_warning=True)
156
/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_login.py in _login(anonymous, key, relogin, host, force, _backend, _silent, _disable_warning)
209 if not key:
--> 210 wlogin.prompt_api_key()
211
/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_login.py in prompt_api_key(self)
143 no_offline=self._settings.force,
--> 144 no_create=self._settings.force,
145 )
/opt/conda/lib/python3.7/site-packages/wandb/sdk/lib/apikey.py in prompt_api_key(settings, api, input_callback, browser_callback, no_offline, no_create, local)
134 )
--> 135 key = input_callback(api_ask).strip()
136 write_key(settings, key, api=api)
/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py in getpass(self, prompt, stream)
824 raise StdinNotImplementedError(
--> 825 "getpass was called, but this frontend does not support input requests."
826 )
StdinNotImplementedError: getpass was called, but this frontend does not support input requests.
The above exception was the direct cause of the following exception:
Exception Traceback (most recent call last)
<ipython-input-82-4d1046ab80b8> in <module>
42 )
43
---> 44 trainer.train()
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1067 model.zero_grad()
1068
-> 1069 self.control = self.callback_handler.on_train_begin(self.args, self.state, self.control)
1070
1071 # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
/opt/conda/lib/python3.7/site-packages/transformers/trainer_callback.py in on_train_begin(self, args, state, control)
338 def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
339 control.should_training_stop = False
--> 340 return self.call_event("on_train_begin", args, state, control)
341
342 def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
/opt/conda/lib/python3.7/site-packages/transformers/trainer_callback.py in call_event(self, event, args, state, control, **kwargs)
386 train_dataloader=self.train_dataloader,
387 eval_dataloader=self.eval_dataloader,
--> 388 **kwargs,
389 )
390 # A Callback can skip the return of `control` if it doesn't change it.
/opt/conda/lib/python3.7/site-packages/transformers/integrations.py in on_train_begin(self, args, state, control, model, **kwargs)
627 self._wandb.finish()
628 if not self._initialized:
--> 629 self.setup(args, state, model, **kwargs)
630
631 def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwargs):
/opt/conda/lib/python3.7/site-packages/transformers/integrations.py in setup(self, args, state, model, **kwargs)
604 project=os.getenv("WANDB_PROJECT", "huggingface"),
605 name=run_name,
--> 606 **init_args,
607 )
608 # add config parameters (run may have been created manually)
/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_init.py in init(job_type, dir, config, project, entity, reinit, tags, group, name, notes, magic, config_exclude_keys, config_include_keys, anonymous, mode, allow_val_change, resume, force, tensorboard, sync_tensorboard, monitor_gym, save_code, id, settings)
779 if except_exit:
780 os._exit(-1)
--> 781 six.raise_from(Exception("problem"), error_seen)
782 return run
/opt/conda/lib/python3.7/site-packages/six.py in raise_from(value, from_value)
Exception: problem
You may want to try adding report_to="tensorboard" or any other reasonable string array in your TrainingArguments
https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments
If you have multiple logger that you want to use report_to="all" (the default value)
try os.environ["WANDB_DISABLED"] = "true" such that wandb is always disabled.
see: https://huggingface.co/transformers/main_classes/trainer.html#transformers.TFTrainer.setup_wandb

Sagemaker Studio UnkownServiceError for Session

When I run the following code in Sagemaker Studio, I get the UnknownServiceError:
import boto3
import sagemaker
sagemaker_session = sagemaker.Session()
role = get_execution_role()
Any advice to overcome this would be greatly appreciated.
The error message in the following:
---------------------------------------------------------------------------
UnknownServiceError Traceback (most recent call last)
<ipython-input-47-2d2ae2d1e577> in <module>
1 import boto3
2 import sagemaker
----> 3 sagemaker_session = sagemaker.Session()
4 role = get_execution_role()
/opt/conda/lib/python3.7/site-packages/sagemaker/session.py in __init__(self, boto_session, sagemaker_client, sagemaker_runtime_client, sagemaker_featurestore_runtime_client, default_bucket)
124 sagemaker_client=sagemaker_client,
125 sagemaker_runtime_client=sagemaker_runtime_client,
--> 126 sagemaker_featurestore_runtime_client=sagemaker_featurestore_runtime_client,
127 )
128
/opt/conda/lib/python3.7/site-packages/sagemaker/session.py in _initialize(self, boto_session, sagemaker_client, sagemaker_runtime_client, sagemaker_featurestore_runtime_client)
164 else:
165 self.sagemaker_featurestore_runtime_client = self.boto_session.client(
--> 166 "sagemaker-featurestore-runtime"
167 )
168
/opt/conda/lib/python3.7/site-packages/boto3/session.py in client(self, service_name, region_name, api_version, use_ssl, verify, endpoint_url, aws_access_key_id, aws_secret_access_key, aws_session_token, config)
261 aws_access_key_id=aws_access_key_id,
262 aws_secret_access_key=aws_secret_access_key,
--> 263 aws_session_token=aws_session_token, config=config)
264
265 def resource(self, service_name, region_name=None, api_version=None,
/opt/conda/lib/python3.7/site-packages/botocore/session.py in create_client(self, service_name, region_name, api_version, use_ssl, verify, endpoint_url, aws_access_key_id, aws_secret_access_key, aws_session_token, config)
833 is_secure=use_ssl, endpoint_url=endpoint_url, verify=verify,
834 credentials=credentials, scoped_config=self.get_scoped_config(),
--> 835 client_config=config, api_version=api_version)
836 monitor = self._get_internal_component('monitor')
837 if monitor is not None:
/opt/conda/lib/python3.7/site-packages/botocore/client.py in create_client(self, service_name, region_name, is_secure, endpoint_url, verify, credentials, scoped_config, api_version, client_config)
76 'choose-service-name', service_name=service_name)
77 service_name = first_non_none_response(responses, default=service_name)
---> 78 service_model = self._load_service_model(service_name, api_version)
79 cls = self._create_client_class(service_name, service_model)
80 endpoint_bridge = ClientEndpointBridge(
/opt/conda/lib/python3.7/site-packages/botocore/client.py in _load_service_model(self, service_name, api_version)
114 def _load_service_model(self, service_name, api_version=None):
115 json_model = self._loader.load_service_model(service_name, 'service-2',
--> 116 api_version=api_version)
117 service_model = ServiceModel(json_model, service_name=service_name)
118 return service_model
/opt/conda/lib/python3.7/site-packages/botocore/loaders.py in _wrapper(self, *args, **kwargs)
130 if key in self._cache:
131 return self._cache[key]
--> 132 data = func(self, *args, **kwargs)
133 self._cache[key] = data
134 return data
/opt/conda/lib/python3.7/site-packages/botocore/loaders.py in load_service_model(self, service_name, type_name, api_version)
376 raise UnknownServiceError(
377 service_name=service_name,
--> 378 known_service_names=', '.join(sorted(known_services)))
379 if api_version is None:
380 api_version = self.determine_latest_version(
UnknownServiceError: Unknown service: 'sagemaker-featurestore-runtime'. Valid service names are: accessanalyzer, acm, acm-pca, alexaforbusiness, amplify, apigateway, apigatewaymanagementapi, apigatewayv2, appconfig, application-autoscaling, application-insights, appmesh, appstream, appsync, athena, autoscaling, autoscaling-plans, backup, batch, braket, budgets, ce, chime, cloud9, clouddirectory, cloudformation, cloudfront, cloudhsm, cloudhsmv2, cloudsearch, cloudsearchdomain, cloudtrail, cloudwatch, codeartifact, codebuild, codecommit, codedeploy, codeguru-reviewer, codeguruprofiler, codepipeline, codestar, codestar-connections, codestar-notifications, cognito-identity, cognito-idp, cognito-sync, comprehend, comprehendmedical, compute-optimizer, config, connect, connectparticipant, cur, dataexchange, datapipeline, datasync, dax, detective, devicefarm, directconnect, discovery, dlm, dms, docdb, ds, dynamodb, dynamodbstreams, ebs, ec2, ec2-instance-connect, ecr, ecs, efs, eks, elastic-inference, elasticache, elasticbeanstalk, elastictranscoder, elb, elbv2, emr, es, events, firehose, fms, forecast, forecastquery, frauddetector, fsx, gamelift, glacier, globalaccelerator, glue, greengrass, groundstation, guardduty, health, honeycode, iam, imagebuilder, importexport, inspector, iot, iot-data, iot-jobs-data, iot1click-devices, iot1click-projects, iotanalytics, iotevents, iotevents-data, iotsecuretunneling, iotsitewise, iotthingsgraph, ivs, kafka, kendra, kinesis, kinesis-video-archived-media, kinesis-video-media, kinesis-video-signaling, kinesisanalytics, kinesisanalyticsv2, kinesisvideo, kms, lakeformation, lambda, lex-models, lex-runtime, license-manager, lightsail, logs, machinelearning, macie, macie2, managedblockchain, marketplace-catalog, marketplace-entitlement, marketplacecommerceanalytics, mediaconnect, mediaconvert, medialive, mediapackage, mediapackage-vod, mediastore, mediastore-data, mediatailor, meteringmarketplace, mgh, migrationhub-config, mobile, mq, mturk, neptune, networkmanager, opsworks, opsworkscm, organizations, outposts, personalize, personalize-events, personalize-runtime, pi, pinpoint, pinpoint-email, pinpoint-sms-voice, polly, pricing, qldb, qldb-session, quicksight, ram, rds, rds-data, redshift, rekognition, resource-groups, resourcegroupstaggingapi, robomaker, route53, route53domains, route53resolver, s3, s3control, sagemaker, sagemaker-a2i-runtime, sagemaker-runtime, savingsplans, schemas, sdb, secretsmanager, securityhub, serverlessrepo, service-quotas, servicecatalog, servicediscovery, ses, sesv2, shield, signer, sms, sms-voice, snowball, sns, sqs, ssm, sso, sso-oidc, stepfunctions, storagegateway, sts, support, swf, synthetics, textract, transcribe, transfer, translate, waf, waf-regional, wafv2, workdocs, worklink, workmail, workmailmessageflow, workspaces, xray
I encountered the same issue and pip install sagemaker -U (to 2.20.0) resolved it for me, hopefully that's all you need to get around this.

Compatibility issues with H2O.ai Hadoop on MapR 6.0 via python API?

Having apparent compatibility issues running H2O (via the 3.18.0.2 MapR 5.2 driver (trying with the latest driver (3.20.0.7) as recommended in another SO post did not help the problem)) on MapR 6.0.
While able to start an H2O cluster on MapR 6.0 (via something like hadoop jar h2odriver.jar -nodes 3 -mapperXmx 6g -output hdfsOutputDirName
) and seem to be able to access h2o Flow UI, having problems accessing the cluster via python API (pip show h2o confirms matching package version to driver being used).
Is the MapR 5.2 driver (currently the latest MapR driver version offered by H2O) incompatible with MapR 6.0 (would not be asking if not for the fact that seem to be able to use the H2O Flow UI on cluster instance started on MapR 6.0)? Any workaround other than standalone driver version (would like to still be able to leverage YARN on hadoop cluster)?
The code and error being seen when trying to connect to the running H2O using the python APIis shown below.
# connect to h2o service
h2o.init(ip=h2o_cnxn_ip)
where the h2o_cnxn_ip is the IP and port generated after starting the h2o cluster on the MapR 6.0 system. Produces error
Checking whether there is an H2O instance running at http://172.18.0.123:54321...
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-5-1728877a03a2> in <module>()
1 # connect to h2o service
----> 2 h2o.init(ip=h2o_cnxn_ip)
/home/me/projects/myproject/lib/python2.7/site-packages/h2o/h2o.pyc in init(url, ip, port, https, insecure, username, password, cookies, proxy, start_h2o, nthreads, ice_root, enable_assertions, max_mem_size, min_mem_size, strict_version_check, ignore_config, extra_classpath, **kwargs)
250 auth=auth, proxy=proxy,cookies=cookies, verbose=True,
251 _msgs=("Checking whether there is an H2O instance running at {url}",
--> 252 "connected.", "not found."))
253 except H2OConnectionError:
254 # Backward compatibility: in init() port parameter really meant "baseport" when starting a local server...
/home/me/projects/myproject/lib/python2.7/site-packages/h2o/backend/connection.pyc in open(server, url, ip, port, https, auth, verify_ssl_certificates, proxy, cookies, verbose, _msgs)
316 conn._stage = 1
317 conn._timeout = 3.0
--> 318 conn._cluster = conn._test_connection(retries, messages=_msgs)
319 # If a server is unable to respond within 1s, it should be considered a bug. However we disable this
320 # setting for now, for no good reason other than to ignore all those bugs :(
/home/me/projects/myproject/lib/python2.7/site-packages/h2o/backend/connection.pyc in _test_connection(self, max_retries, messages)
558 raise H2OServerError("Local server was unable to start")
559 try:
--> 560 cld = self.request("GET /3/Cloud")
561 if cld.consensus and cld.cloud_healthy:
562 self._print(" " + messages[1])
/home/me/projects/myproject/lib/python2.7/site-packages/h2o/backend/connection.pyc in request(self, endpoint, data, json, filename, save_to)
400 auth=self._auth, verify=self._verify_ssl_cert, proxies=self._proxies)
401 self._log_end_transaction(start_time, resp)
--> 402 return self._process_response(resp, save_to)
403
404 except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e:
/home/me/projects/myproject/lib/python2.7/site-packages/h2o/backend/connection.pyc in _process_response(response, save_to)
711 if content_type == "application/json":
712 try:
--> 713 data = response.json(object_pairs_hook=H2OResponse)
714 except (JSONDecodeError, requests.exceptions.ContentDecodingError) as e:
715 raise H2OServerError("Malformed JSON from server (%s):\n%s" % (str(e), response.text))
/home/me/projects/myproject/lib/python2.7/site-packages/requests/models.pyc in json(self, **kwargs)
882 try:
883 return complexjson.loads(
--> 884 self.content.decode(encoding), **kwargs
885 )
886 except UnicodeDecodeError:
/usr/lib64/python2.7/json/__init__.pyc in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
349 if parse_constant is not None:
350 kw['parse_constant'] = parse_constant
--> 351 return cls(encoding=encoding, **kw).decode(s)
/usr/lib64/python2.7/json/decoder.pyc in decode(self, s, _w)
364
365 """
--> 366 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
367 end = _w(s, end).end()
368 if end != len(s):
/usr/lib64/python2.7/json/decoder.pyc in raw_decode(self, s, idx)
380 """
381 try:
--> 382 obj, end = self.scan_once(s, idx)
383 except StopIteration:
384 raise ValueError("No JSON object could be decoded")
/home/me/projects/myproject/lib/python2.7/site-packages/h2o/backend/connection.pyc in __new__(cls, keyvals)
823 for k, v in keyvals:
824 if k == "__meta" and isinstance(v, dict):
--> 825 schema = v["schema_name"]
826 break
827 if k == "__schema" and is_type(v, str):
KeyError: u'schema_name'
MapR 6 is not currently supported by H2O. Currently H2O supports up to MapR 5.2.
Please see the downloads page for supported Hadoop versions.

IllegalArgumentException: u'Wrong FS: file://spark-warehouse, expected: file:///'

I am trying to load my Postgres database into Spark using PySpark:
from pyspark import SparkContext
from pyspark import SparkConf
from random import random
#spark conf
conf = SparkConf()
conf.setMaster("spark://spark-master:7077")
conf.setAppName('pyspark')
sc = SparkContext(conf=conf)
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
properties = {
"user": "postgres",
"password": "password123",
"driver": "org.postgresql.Driver"
}
url = "jdbc.postgresql://<POSTGRES_IP>/DB_NAME"
df = sqlContext.read.jdbc(url=url, table='myTable', properties=properties)
I get the following error, which I have no idea what it means.
/opt/spark/python/pyspark/sql/readwriter.pyc in jdbc(self, url, table, column, lowerBound, upperBound, numPartitions, predicates, properties)
420 jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
421 return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
--> 422 return self._df(self._jreader.jdbc(url, table, jprop))
423
424
/usr/local/lib/python2.7/dist-packages/py4j/java_gateway.pyc in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/opt/spark/python/pyspark/sql/utils.pyc in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: u'Wrong FS: file://spark-warehouse, expected: file:///'
Specifying an existing directory for the sql warehouse directory setting should solve your issue. For example at job launch time:
./bin/spark-submit --conf spark.sql.warehouse.dir=/tmp/ \
... # other options
your_file.py \
[application-arguments]

Resources