Is it possible to specify the compression when using pyarrow write_dataset?

I would like to be able to control the type of compression used when partitioning (default is snappy).
import numpy.random
import pyarrow as pa
import pyarrow.dataset as ds

data = pa.table(
    {
        "day": numpy.random.randint(1, 31, size=100),
        "month": numpy.random.randint(1, 12, size=100),
        "year": [2000 + x // 10 for x in range(100)],
    }
)

ds.write_dataset(
    data,
    "./tmp/partitioned",
    format="parquet",
    existing_data_behavior="delete_matching",
    partitioning=ds.partitioning(
        pa.schema(
            [
                ("year", pa.int16()),
            ]
        ),
    ),
)
It is not clear to me from the docs whether that's actually possible.

There is an option to specify the file options:

file_options : pyarrow.dataset.FileWriteOptions, optional
    FileFormat specific write options, created using the FileFormat.make_write_options() function.

You can use any of the compression codecs mentioned in the docs: snappy, gzip, brotli, zstd, lz4, none.
The code below writes the dataset using brotli compression.
import numpy.random
import pyarrow as pa
import pyarrow.dataset as ds

data = pa.table(
    {
        "day": numpy.random.randint(1, 31, size=100),
        "month": numpy.random.randint(1, 12, size=100),
        "year": [2000 + x // 10 for x in range(100)],
    }
)

file_options = ds.ParquetFileFormat().make_write_options(compression='brotli')

ds.write_dataset(
    data,
    "./tmp/partitioned",
    format="parquet",
    existing_data_behavior="delete_matching",
    file_options=file_options,
    partitioning=ds.partitioning(
        pa.schema(
            [
                ("year", pa.int16()),
            ]
        ),
    ),
)
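If you want to double-check which codec ended up in the files, one option is to inspect the column-chunk metadata of a written file, e.g. (a small sketch, assuming the output path used above):

import glob
import pyarrow.parquet as pq

# The exact file names depend on the partitioning layout, so just grab
# any parquet file under the output directory.
sample_file = glob.glob("./tmp/partitioned/**/*.parquet", recursive=True)[0]
metadata = pq.ParquetFile(sample_file).metadata
print(metadata.row_group(0).column(0).compression)  # expected: BROTLI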

PyArrow: Writing a parquet file with a particular schema

For testing purposes, I am trying to generate a file with dummy data, but with the following schema (schema of the real data):
pa.schema([
    pa.field('field1', pa.int64()),
    pa.field('field2', pa.list_(pa.field('element', pa.int64()))),
    pa.field('field3', pa.list_(pa.field('element', pa.float64()))),
    pa.field('field4', pa.list_(pa.field('element', pa.float64()))),
])
I have the following code:
import pyarrow as pa
import pyarrow.parquet as pq

loc = "test.parquet"

data = {
    "field1": [0],
    "field2": [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]],
    "field3": [[1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]],
    "field4": [[2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9]]
}

schema1 = pa.schema([
    pa.field('field1', pa.int64()),
    pa.field('field2', pa.list_(pa.field('element', pa.int64()))),
    pa.field('field3', pa.list_(pa.field('element', pa.float64()))),
    pa.field('field4', pa.list_(pa.field('element', pa.float64()))),
])

schema2 = pa.schema([
    pa.field('field1', pa.int64()),
    pa.field('field2', pa.list_(pa.int64())),
    pa.field('field3', pa.list_(pa.float64())),
    pa.field('field4', pa.list_(pa.float64())),
])

writer = pq.ParquetWriter(loc, schema1)
writer.write(pa.table(data))
writer.close()
The dictionary in the code, when converted to a PyArrow table and written to a parquet file, generates a file whose schema matches schema2. Passing schema1 to the writer gives an error. How can I change the dictionary in such a way that its schema matches schema1 when converted to a table?
Semantically the schemas are the same; the name of the list item ("element") should not matter. This used to be an issue, but it was fixed in pyarrow 11.0.0 (https://issues.apache.org/jira/browse/ARROW-14999), so you can upgrade pyarrow and it should work.
Alternatively, you can make sure your table has the correct schema by either building it against the schema:
writer.write(pa.table(data, schema=schema1))
or by casting it:
writer.write(pa.table(data).cast(schema1))
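Putting that together with the snippet from the question, a minimal sketch (reusing data and schema1 from above; pq.read_schema is only used here to double-check the result):

import pyarrow as pa
import pyarrow.parquet as pq

loc = "test.parquet"

# Build the table directly against the target schema so the list item
# name ("element") matches what the writer expects.
table = pa.table(data, schema=schema1)

with pq.ParquetWriter(loc, schema1) as writer:
    writer.write(table)

# Verify that the file carries the intended schema.
print(pq.read_schema(loc))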

My altair labels are cut off - how can I see the whole label? [duplicate]

bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('Pcnt:Q', axis=None),
    y=alt.Y('Name',
            axis=alt.Axis(domain=False,
                          ticks=False,
                          title=None,
                          labelPadding=15,
                          labelFontSize=16,
                          labelColor='#404040',
                          labelBaseline='middle',
                          # labelAngle=-45,
                          # labelExpr=axis_labels
                          ),
            sort=name_sort
            )
)

text = bars.mark_text(align='left', baseline='middle', dx=3, size=14).\
    encode(text=alt.Text('Pcnt:Q', format='.0%'))

Votes = (bars + text).properties(width=500, height=100
).properties(title={
    "text": ["Who Shot First?"],
    "subtitle": ["According to 834 respondents"],
    "fontSize": 26, "color": '#353535',
    "subtitleFontSize": 20, "subtitleColor": '#353535',
    "anchor": 'start'}
).configure_mark(color='#008fd5'
).configure_view(strokeWidth=0
).configure_scale(bandPaddingInner=0.2
)

Votes
Currently (see the output below), the third label on the y-axis (i.e. "I don't understand this question") gets truncated. I want to wrap it so the whole label is visible. Can anyone help? Thank you very much!
The desired chart looks like this:
You can use labelLimit to control when the label is truncated:
import pandas as pd
import altair as alt

df = pd.DataFrame({
    'label': ['Really long label here that will be truncated', 'Short label'],
    'value': [4, 5]
})

alt.Chart(df).mark_bar().encode(
    x='value',
    y='label'
)

alt.Chart(df).mark_bar().encode(
    x='value',
    y=alt.Y('label', axis=alt.Axis(labelLimit=200))
)
You can also wrap on multiple lines by creating a list, as suggested in the comments:
from textwrap import wrap

# Wrap on whitespace with a max line length of 30 chars
df['label'] = df['label'].apply(wrap, args=[30])

alt.Chart(df).mark_bar().encode(
    x='value',
    y=alt.Y('label', axis=alt.Axis(labelFontSize=9)),
)
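Applied to the chart in the question, that means either raising labelLimit on the y-axis or wrapping the 'Name' column before plotting, e.g. (a sketch, reusing the df and name_sort from the question):

# Only the y encoding changes; the rest of the original chart stays as-is.
bars = alt.Chart(df).mark_bar().encode(
    x=alt.X('Pcnt:Q', axis=None),
    y=alt.Y('Name',
            axis=alt.Axis(domain=False,
                          ticks=False,
                          title=None,
                          labelPadding=15,
                          labelFontSize=16,
                          labelColor='#404040',
                          labelBaseline='middle',
                          labelLimit=300),  # allow labels up to 300px before truncating
            sort=name_sort)
)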

Azure Event Hub no longer receiving messages: Event Hub shows requests but no messages

For some reason my Azure Event Hub is no longer receiving messages. It was working fine last night.
I am using Databricks Data Generator to send data to Azure Event Hubs with the following code:
import dbldatagen as dg
from pyspark.sql.types import IntegerType, StringType, FloatType
import json
from pyspark.sql.types import StructType, StructField, IntegerType, DecimalType, StringType, TimestampType, Row
from pyspark.sql.functions import *
import pyspark.sql.functions as F

num_rows = 1 * 10000   # number of rows to generate
num_partitions = 2     # number of Spark dataframe partitions

delay_reasons = ["Air Carrier", "Extreme Weather", "National Aviation System", "Security", "Late Aircraft"]

# will have implied column `id` for ordinal of row
flightdata_defn = (dg.DataGenerator(spark, name="flight_delay_data", rows=num_rows, partitions=num_partitions)
    # .withColumn("body", StringType(), False)
    .withColumn("flightNumber", "int", minValue=1000, uniqueValues=10000, random=True)
    .withColumn("airline", "string", minValue=1, maxValue=500, prefix="airline", random=True, distribution="normal")
    .withColumn("original_departure", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute", random=True)
    .withColumn("delay_minutes", "int", minValue=20, maxValue=600, distribution=dg.distributions.Gamma(1.0, 2.0))
    .withColumn("delayed_departure", "timestamp", expr="cast(original_departure as bigint) + (delay_minutes * 60)", baseColumn=["original_departure", "delay_minutes"])
    .withColumn("reason", "string", values=delay_reasons, random=True)
)

df_flight_data = flightdata_defn.build(withStreaming=True, options={'rowsPerSecond': 100})

streamingDelays = (
    df_flight_data
    .groupBy(
        # df_flight_data.body,
        df_flight_data.flightNumber,
        df_flight_data.airline,
        df_flight_data.original_departure,
        df_flight_data.delay_minutes,
        df_flight_data.delayed_departure,
        df_flight_data.reason,
        window(df_flight_data.original_departure, "1 hour")
    )
    .count()
)

writeConnectionString = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString)
checkpointLocation = "///checkpoint"

ehWriteConf = {
    'eventhubs.connectionString': writeConnectionString
}

# Write body data from a DataFrame to EventHubs. Events are distributed across partitions using a round-robin model.
ds = streamingDelays \
    .select(F.to_json(F.struct("*")).alias("body")) \
    .writeStream.format("eventhubs") \
    .options(**ehWriteConf) \
    .outputMode("complete") \
    .option("checkpointLocation", "...") \
    .start()

# display(streamingDelays)
From the image you will notice that I'm barely receiving any requests, and absolutely no messages. However, just yesterday I was getting both requests and messages.
I created a new Event Hub, but I'm sure it's something very simple that I'm missing...
I should mention that my Databricks notebook appears to get stuck at 'Stream initializing...'.
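Since the notebook seems to hang at 'Stream initializing...', one thing worth checking is whether the streaming query is making any progress at all, e.g. (a sketch using the ds handle from the snippet above):

# StreamingQuery exposes its current state and the last micro-batch metrics.
print(ds.status)        # e.g. {'message': 'Initializing sources', 'isDataAvailable': False, ...}
print(ds.lastProgress)  # None until the first micro-batch has completed
print(ds.exception())   # shows the underlying error if the query has failed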

How to change column datatype with pyarrow

I am reading a set of arrow files and am writing them to a parquet file:
import pathlib
from pyarrow import parquet as pq
from pyarrow import feather
import pyarrow as pa

base_path = pathlib.Path('../mydata')

fields = [
    pa.field('value', pa.int64()),
    pa.field('code', pa.dictionary(pa.int32(), pa.uint64(), ordered=False)),
]
schema = pa.schema(fields)

with pq.ParquetWriter('sample.parquet', schema) as pqwriter:
    for file_path in base_path.glob('*.arrow'):
        table = feather.read_table(file_path)
        pqwriter.write_table(table)
My problem is that the code field in the arrow files is defined with an int8 index instead of int32. The range of int8, however, is insufficient. Hence I defined a schema with an int32 index for the code field in the parquet file.
However, writing the arrow table to parquet now complains that the schemas do not match.
How can I change the datatype of the arrow column? I checked the pyarrow API and did not find a way to change the schema. Can this be done without roundtripping to pandas?
Arrow ChunkedArray has a cast function, but unfortunately it doesn't work for what you want to do:
>>> table['code'].cast(pa.dictionary(pa.int32(), pa.uint64(), ordered=False))
Unsupported cast from dictionary<values=uint64, indices=int8, ordered=0> to dictionary<values=uint64, indices=int32, ordered=0> (no available cast function for target type)
Instead you can cast to pa.uint64() and encode it to dictionary:
>>> table['code'].cast(pa.uint64()).dictionary_encode().type
DictionaryType(dictionary<values=uint64, indices=int32, ordered=0>)
Here's a self-contained example:
import pyarrow as pa

source_schema = pa.schema([
    pa.field('value', pa.int64()),
    pa.field('code', pa.dictionary(pa.int8(), pa.uint64(), ordered=False)),
])
source_table = pa.Table.from_arrays([
    pa.array([1, 2, 3], pa.int64()),
    pa.array([1, 2, 1000], pa.dictionary(pa.int8(), pa.uint64(), ordered=False)),
], schema=source_schema)

destination_schema = pa.schema([
    pa.field('value', pa.int64()),
    pa.field('code', pa.dictionary(pa.int32(), pa.uint64(), ordered=False)),
])
destination_data = pa.Table.from_arrays([
    source_table['value'],
    source_table['code'].cast(pa.uint64()).dictionary_encode(),
], schema=destination_schema)
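Applied to the loop in the question, that would look roughly like this (a sketch, assuming the int32 target schema and paths from the question):

import pathlib
import pyarrow as pa
from pyarrow import feather
from pyarrow import parquet as pq

base_path = pathlib.Path('../mydata')

schema = pa.schema([
    pa.field('value', pa.int64()),
    pa.field('code', pa.dictionary(pa.int32(), pa.uint64(), ordered=False)),
])

with pq.ParquetWriter('sample.parquet', schema) as pqwriter:
    for file_path in base_path.glob('*.arrow'):
        table = feather.read_table(file_path)
        # Re-encode the dictionary column so its index type matches the target schema
        # (dictionary_encode produces int32 indices by default).
        code = table['code'].cast(pa.uint64()).dictionary_encode()
        table = pa.Table.from_arrays([table['value'], code], schema=schema)
        pqwriter.write_table(table)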

Transform a list of files (JSON) to a dataframe

Spark Version: '2.0.0.2.5.0.0-1245'
So, my original question changed a bit but it's still the same issue.
What I want to do is load a huge amount of JSON files and transform those to a DataFrame - also probably save them as CSV or parquet file for further processing. Each JSON file represents one row in the final DataFrame.
import os
import glob

HDFS_MOUNT = # ...
DATA_SET_BASE = # ...

schema = StructType([
    StructField("documentId", StringType(), True),
    StructField("group", StringType(), True),
    StructField("text", StringType(), True)
])

# Get the file paths
file_paths = glob.glob(os.path.join(HDFS_MOUNT, DATA_SET_BASE, '**/*.json'))
file_paths = [f.replace(HDFS_MOUNT + '/', '') for f in file_paths]
print('Found {:d} files'.format(len(file_paths)))  # 676 files

sql = SQLContext(sc)
df = sql.read.json(file_paths, schema=schema)
print('Loaded {:d} rows'.format(df.count()))  # 9660 rows (what!?)
Besides the fact that there are 9660 rows instead of 676 (the number of available files), I also have the problem that the content seems to be None:
df.head(2)[0].asDict()
gives
{
    'documentId': None,
    'group': None,
    'text': None,
}
Example Data
This is just fake data of course but it resembles the actual data.
Note: some fields may be missing, e.g. text is not always present.
a.json
{
    "documentId": "001",
    "group": "A",
    "category": "indexed_document",
    "linkIDs": ["adiojer", "asdi555", "1337"]
}
b.json
{
    "documentId": "002",
    "group": "B",
    "category": "indexed_document",
    "linkIDs": ["linkId", "1000"],
    "text": "This is the text of this document"
}
Assuming that all your files have the same structure and are in the same directory:
df = sql_cntx.read.json('/hdfs/path/to/folder/*.json')
There might be a problem if any of the columns has null values for all rows; then Spark will not be able to infer the schema, so you have the option of telling Spark which schema to use:
from pyspark import SparkContext, SQLContext
from pyspark.sql.types import StructType, StructField, StringType, LongType

sc = SparkContext(appName="My app")
sql_cntx = SQLContext(sc)

schema = StructType([
    StructField("field1", StringType(), True),
    StructField("field2", LongType(), True)
])

df = sql_cntx.read.json('/hdfs/path/to/folder/*.json', schema=schema)
UPD: if the files contain multi-line (pretty-printed) JSON, spark.read.json treats each physical line as a separate record, which also explains the inflated row count and the all-null columns. In that case you can read whole files instead:
sc = SparkContext(appName='Test')
sql_context = SQLContext(sc)
rdd = sc.wholeTextFiles('/tmp/test/*.json').values()
df = sql_context.read.json(rdd, schema=schema)
df.show()
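For what it's worth, on newer Spark versions (2.2+) the same thing can be done without wholeTextFiles via the multiLine reader option; this is not available on the 2.0.0 build mentioned in the question:

# Read pretty-printed (multi-line) JSON files directly, one record per file.
df = sql_context.read.option("multiLine", "true").schema(schema).json('/tmp/test/*.json')
df.show()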
