Reading data from another disk with PySpark (Windows 10) [duplicate] - windows

This question already has answers here:
How to access local files in Spark on Windows?
(5 answers)
Closed 11 days ago.
I'm new to Spark and PySpark.
I downloaded data from here (a 1.75 GB archive of multiple .csv files) and stored it on my D: drive, separately from my Spark installation and my PySpark script, which are on my C: drive.
When I try to read the files, I get the following error:
---------------------------------------------------------------------------
AnalysisException Traceback (most recent call last)
Cell In[12], line 3
1 df = spark.read.option("header", True) \
2 .option("inferSchema", True) \
----> 3 .csv("\airport_delay")
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\sql\readwriter.py:535, in DataFrameReader.csv(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine, charToEscapeQuoteEscaping, samplingRatio, enforceSchema, emptyValue, locale, lineSep, pathGlobFilter, recursiveFileLookup, modifiedBefore, modifiedAfter, unescapedQuoteHandling)
533 if type(path) == list:
534 assert self._spark._sc._jvm is not None
--> 535 return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
536 elif isinstance(path, RDD):
538 def func(iterator):
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\sql\utils.py:196, in capture_sql_exception.<locals>.deco(*a, **kw)
192 converted = convert_exception(e.java_exception)
193 if not isinstance(converted, UnknownException):
194 # Hide where the exception came from that shows a non-Pythonic
195 # JVM exception message.
--> 196 raise converted from None
197 else:
198 raise
AnalysisException: Path does not exist: file:/C:/Users/Travail/Documents/PySpark/irport_delay
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.master("local[1]") \
.appName("Test1") \
.getOrCreate()
df = spark.read.option("header", True) \
.option("inferSchema", True) \
.csv("file:\\\D:\Dataset\airport_delay")
How can I read data from another disk with PySpark? Or does it make no sense to do so?
I tried :
- add/remove "file:\"
- read Spark Configuration Doc and looked for something similar to "spark.sql.warehouse.dir"

I tried to change all the "\" to "/" and it worked.
df = spark.read.option("header", True) \
.option("inferSchema", True) \
.csv("file:///D:/Dataset/airport_delay")

Related

Could ruamel.yaml support type descriptor like "num: !!float 4"?

I am learning to use ruamel.yaml, and I am wondering whether it supports type descriptors as in original YAML, such as "num: !!float 4".
The file is like:
num: !!float 4
I tried to load a file like this, but got an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [22], in <cell line: 2>()
1 from ruamel import yaml
2 with open("net.yaml", "r", encoding="utf-8") as yaml_file:
----> 3 yaml_dict = yaml.round_trip_load(yaml_file)
4 yaml_dict
...
File ~/software/python/anaconda/anaconda3/envs/conda-general/lib/python3.10/site-packages/ruamel/yaml/constructor.py:1469, in RoundTripConstructor.construct_mapping(self, node, maptyp, deep)
1462 if not isinstance(key, Hashable):
1463 raise ConstructorError(
1464 'while constructing a mapping',
1465 node.start_mark,
1466 'found unhashable key',
1467 key_node.start_mark,
1468 )
-> 1469 value = self.construct_object(value_node, deep=deep)
1470 if self.check_mapping_key(node, key_node, maptyp, key, value):
1471 if key_node.comment and len(key_node.comment) > 4 and key_node.comment[4]:
File ~/software/python/anaconda/anaconda3/envs/conda-general/lib/python3.10/site-packages/ruamel/yaml/constructor.py:146, in BaseConstructor.construct_object(self, node, deep)
142 # raise ConstructorError(
143 # None, None, 'found unconstructable recursive node', node.start_mark
144 # )
145 self.recursive_objects[node] = None
--> 146 data = self.construct_non_recursive_object(node)
148 self.constructed_objects[node] = data
149 del self.recursive_objects[node]
File ~/software/python/anaconda/anaconda3/envs/conda-general/lib/python3.10/site-packages/ruamel/yaml/constructor.py:181, in BaseConstructor.construct_non_recursive_object(self, node, tag)
179 constructor = self.__class__.construct_mapping
180 if tag_suffix is None:
--> 181 data = constructor(self, node)
182 else:
183 data = constructor(self, tag_suffix, node)
File ~/software/python/anaconda/anaconda3/envs/conda-general/lib/python3.10/site-packages/ruamel/yaml/constructor.py:1271, in RoundTripConstructor.construct_yaml_float(self, node)
1259 return ScalarFloat(
1260 sign * float(value_s),
1261 width=width,
(...)
1268 anchor=node.anchor,
1269 )
1270 width = len(value_so)
-> 1271 prec = value_so.index('.') # you can use index, this would not be float without dot
1272 lead0 = leading_zeros(value_so)
1273 return ScalarFloat(
1274 sign * float(value_s),
1275 width=width,
(...)
1279 anchor=node.anchor,
1280 )
ValueError: substring not found
Why do I get this error, and how do I get rid of it?
That is a bug in ruamel.yaml<=0.17.21. The comment on the offending line (1271) says
# you can use index, this would not be float without dot
Obviously the author of that comment didn't know what he was talking about, as in your case, when using !!float 4 you have a float without a dot...
It is trivial to "fix" that by replacing index with find in line 1271, and when doing so that will load your document and you can dump the data.
But the corresponding representer for dumping doesn't cope with that and outputs the float as 4.0, dropping the tag.
You could temporarily fix this by registering a simpler float constructor (e.g. the simple one from the SafeLoader), although this will affect all floats:
import sys
import ruamel.yaml
yaml_str = """\
num: !!float 4
"""
yaml = ruamel.yaml.YAML()
yaml.constructor.add_constructor(
'tag:yaml.org,2002:float', ruamel.yaml.constructor.SafeConstructor.construct_yaml_float
)
data = yaml.load(yaml_str)
yaml.dump(data, sys.stdout)
which gives:
num: 4.0

Missing ',' in line when Biopython reads a nexus tree

I want to edit a tree that I got from BEAST2 treeannotator in nexus-format.
Usually I use the Phylo module from Biopython for such work, but Phylo.read(r"filename.tree", "nexus") raised the following exception:
---------------------------------------------------------------------------
NexusError Traceback (most recent call last)
Input In [29], in <cell line: 1>()
----> 1 Phylo.read(r"filename.tree", "nexus")
File ~\miniconda3\lib\site-packages\Bio\Phylo\_io.py:60, in read(file, format, **kwargs)
58 try:
59 tree_gen = parse(file, format, **kwargs)
---> 60 tree = next(tree_gen)
61 except StopIteration:
62 raise ValueError("There are no trees in this file.") from None
File ~\miniconda3\lib\site-packages\Bio\Phylo\_io.py:49, in parse(file, format, **kwargs)
34 """Parse a file iteratively, and yield each of the trees it contains.
35
36 If a file only contains one tree, this still returns an iterable object that
(...)
46
47 """
48 with File.as_handle(file) as fp:
---> 49 yield from getattr(supported_formats[format], "parse")(fp, **kwargs)
File ~\miniconda3\lib\site-packages\Bio\Phylo\NexusIO.py:40, in parse(handle)
32 def parse(handle):
33 """Parse the trees in a Nexus file.
34
35 Uses the old Nexus.Trees parser to extract the trees, converts them back to
(...)
38 eventually change Nexus to use the new NewickIO parser directly.)
39 """
---> 40 nex = Nexus.Nexus(handle)
42 # NB: Once Nexus.Trees is modified to use Tree.Newick objects, do this:
43 # return iter(nex.trees)
44 # Until then, convert the Nexus.Trees.Tree object hierarchy:
45 def node2clade(nxtree, node):
File ~\miniconda3\lib\site-packages\Bio\Nexus\Nexus.py:668, in Nexus.__init__(self, input)
665 self.options["gapmode"] = "missing"
667 if input:
--> 668 self.read(input)
669 else:
670 self.read(DEFAULTNEXUS)
File ~\miniconda3\lib\site-packages\Bio\Nexus\Nexus.py:718, in Nexus.read(self, input)
716 break
717 if title in KNOWN_NEXUS_BLOCKS:
--> 718 self._parse_nexus_block(title, contents)
719 else:
720 self._unknown_nexus_block(title, contents)
File ~\miniconda3\lib\site-packages\Bio\Nexus\Nexus.py:759, in Nexus._parse_nexus_block(self, title, contents)
757 for line in block.commandlines:
758 try:
--> 759 getattr(self, "_" + line.command)(line.options)
760 except AttributeError:
761 raise NexusError("Unknown command: %s " % line.command) from None
File ~\miniconda3\lib\site-packages\Bio\Nexus\Nexus.py:1144, in Nexus._translate(self, options)
1142 break
1143 elif c != ",":
-> 1144 raise NexusError("Missing ',' in line %s." % options)
1145 except NexusError:
1146 raise
NexusError: Missing ',' in line 1 AB298157.1_2015_-7.9133750332192605_114.8086828279248, 2 AB298158.1_2007_-8.41698974207…
Using Nexus.read(Nexus(), input=r"filename.tree") gave the same result. Could anyone please help with this? I cannot understand the reason for this error, because the nexus file looks correct.
The reason is that Biopython cannot read nexus trees that use links, i.e. files made up of a translation table plus a newick tree that refers to its entries. So you first need to convert the file to a form in which the full names appear directly in the tree (as shown below).
Begin
tree TREE1 = (((your,tree),(in,(the, newick))),format);
End;
P.S. The newick format allows labels to be surrounded with quotes, and some programs or scripts add them to names that contain ambiguous characters. But this can lead to exceptions during subsequent phylogenetic analysis, for instance in BEAST, so please be careful with this.

How to install libspatialindex without sudo rights

My code is
from geopandas.tools import sjoin
ref_bts_adm = sjoin(Airport, ref_adm, how='left', op = 'within')
The error message
--------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-10-5c31e9843a8a> in <module>
1 from geopandas.tools import sjoin
----> 2 ref_bts_adm = sjoin(Airport, ref_adm, how='left', op = 'within')
~/.local/lib/python3.6/site-packages/geopandas/tools/sjoin.py in sjoin(left_df, right_df, how, op, lsuffix, rsuffix)
87 _basic_checks(left_df, right_df, how, lsuffix, rsuffix)
88
---> 89 indices = _geom_predicate_query(left_df, right_df, op)
90
91 joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)
~/.local/lib/python3.6/site-packages/geopandas/tools/sjoin.py in _geom_predicate_query(left_df, right_df, op)
171 # see discussion at https://github.com/geopandas/geopandas/pull/1421
172 predicate = "contains"
--> 173 sindex = left_df.sindex
174 input_geoms = right_df.geometry
175 else:
~/.local/lib/python3.6/site-packages/geopandas/base.py in sindex(self)
2628 [2]])
2629 """
-> 2630 return self.geometry.values.sindex
2631
2632 @property
~/.local/lib/python3.6/site-packages/geopandas/array.py in sindex(self)
307 def sindex(self):
308 if self._sindex is None:
--> 309 self._sindex = _get_sindex_class()(self.data)
310 return self._sindex
311
~/.local/lib/python3.6/site-packages/geopandas/sindex.py in _get_sindex_class()
20 return RTreeIndex
21 raise ImportError(
---> 22 "Spatial indexes require either `rtree` or `pygeos`. "
23 "See installation instructions at https://geopandas.org/install.html"
24 )
ImportError: Spatial indexes require either `rtree` or `pygeos`. See installation instructions at https://geopandas.org/install.html
Basically, I have an error similar to the one in this question:
ImportError: Spatial indexes require either `rtree` or `pygeos` in geopanda but rtree is installed
But I don't have sudo rights.
I had a similar issue and solved it by installing libspatialindex via spack, e.g., spack install libspatialindex (note: this installs the library in your homedir and does not require sudo privileges). Then you can pip install pygeos and rtree, as indicated in the other answers.

JSONDecodeError: Unexpected UTF-8 BOM (decode using utf-8-sig): line 1 column 1 (char 0) ---While Tuning gpt2.finetune

Hope you are all doing well.
I am working on fine-tuning a GPT-2 model to generate a title based on the content. While working on it, I created a simple CSV file containing only the titles to train the model, but when feeding this file to GPT-2 for fine-tuning I get the following error:
JSONDecodeError Traceback (most recent call last)
in ()
10 steps=1000,
11 save_every=200,
---> 12 sample_every=25) # steps is max number of training steps
13
14 # gpt2.generate(sess)
3 frames
/usr/lib/python3.7/json/__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
336 if s.startswith('\ufeff'):
337 s = s.encode('utf8')[3:].decode('utf8')
--> 338 # raise JSONDecodeError("Unexpected UTF-8 BOM (decode using utf-8-sig)",
339 # s, 0)
340 else:
JSONDecodeError: Unexpected UTF-8 BOM (decode using utf-8-sig): line 1 column 1 (char 0)
Below is my code for the above :
import gpt_2_simple as gpt2
model_name = "120M" # "355M" for larger model (it's 1.4 GB)
gpt2.download_gpt2(model_name=model_name) # model is saved into current directory under /models/117M/
sess = gpt2.start_tf_sess()
gpt2.finetune(sess,
'titles.csv',
model_name=model_name,
steps=1000,
save_every=200,
sample_every=25) # steps is max number of training steps
I have tried all the basic mechanisms for handling a UTF-8 BOM but did not have any luck, hence I am requesting your help. Any help would be greatly appreciated.
Try changing the model name, because I see you entered 120M, whereas the GPT-2 model is called 124M.

`read_csv` doesn't see my file in Windows

I am new to using Pandas on Windows and I'm not sure what I am doing wrong here.
My data is located at 'C:\Users\me\data\lending_club\loan.csv'
path = 'C:\\Users\\me\\data\\lending_club\\loan.csv'
pd.read_csv(path)
And I get this error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-107-b5792b17a3c3> in <module>()
1 path = 'C:\\Users\\me\\data\\lending_club\\loan.csv'
----> 2 pd.read_csv(path)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
707 skip_blank_lines=skip_blank_lines)
708
--> 709 return _read(filepath_or_buffer, kwds)
710
711 parser_f.__name__ = name
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
447
448 # Create the parser.
--> 449 parser = TextFileReader(filepath_or_buffer, **kwds)
450
451 if chunksize or iterator:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
816 self.options['has_index_names'] = kwds['has_index_names']
817
--> 818 self._make_engine(self.engine)
819
820 def close(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1047 def _make_engine(self, engine='c'):
1048 if engine == 'c':
-> 1049 self._engine = CParserWrapper(self.f, **self.options)
1050 else:
1051 if engine == 'python':
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
1693 kwds['allow_leading_cols'] = self.index_col is not False
1694
-> 1695 self._reader = parsers.TextReader(src, **kwds)
1696
1697 # XXX
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: File b'C:\\Users\\me\\data\\lending_club\\loan.csv' does not exist
EDIT:
I re-installed Anaconda and the error went away. Not sure exactly what was going on but could potentially have been related to the initial install being global vs. user specific in my second install. Thanks for the help everybody!
Just use forward slash('/') instead of backslash('\')
path = 'C:/Users/me/data/lending_club/loan.csv'
Python only has access to the files in the current folder.
If you want to access files from another folder, try this:
import sys
sys.path.insert(0, 'C:/Users/myFolder')
These lines allow your script to access another folder. Be careful: you should use slashes /, not backslashes \

Resources