commit d660f2a4ca
Author: Ayxan
Date:   2022-05-23 00:16:32 +04:00

    first commit

24786 changed files with 4428337 additions and 0 deletions


@@ -0,0 +1,27 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = [
pytest.mark.parquet,
pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':DeprecationWarning"
),
]


@@ -0,0 +1,189 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.tests import util
legacy_filter_mark = pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy:FutureWarning"
)
parametrize_legacy_dataset = pytest.mark.parametrize(
"use_legacy_dataset",
[pytest.param(True, marks=legacy_filter_mark),
pytest.param(False, marks=pytest.mark.dataset)]
)
parametrize_legacy_dataset_not_supported = pytest.mark.parametrize(
"use_legacy_dataset",
[pytest.param(True, marks=legacy_filter_mark),
pytest.param(False, marks=pytest.mark.skip)]
)
parametrize_legacy_dataset_fixed = pytest.mark.parametrize(
"use_legacy_dataset",
[pytest.param(True, marks=[pytest.mark.xfail, legacy_filter_mark]),
pytest.param(False, marks=pytest.mark.dataset)]
)
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = pytest.mark.parquet
def _write_table(table, path, **kwargs):
# So we see the ImportError somewhere
import pyarrow.parquet as pq
from pyarrow.pandas_compat import _pandas_api
if _pandas_api.is_data_frame(table):
table = pa.Table.from_pandas(table)
pq.write_table(table, path, **kwargs)
return table
def _read_table(*args, **kwargs):
import pyarrow.parquet as pq
table = pq.read_table(*args, **kwargs)
table.validate(full=True)
return table
def _roundtrip_table(table, read_table_kwargs=None,
write_table_kwargs=None, use_legacy_dataset=False):
read_table_kwargs = read_table_kwargs or {}
write_table_kwargs = write_table_kwargs or {}
writer = pa.BufferOutputStream()
_write_table(table, writer, **write_table_kwargs)
reader = pa.BufferReader(writer.getvalue())
return _read_table(reader, use_legacy_dataset=use_legacy_dataset,
**read_table_kwargs)
def _check_roundtrip(table, expected=None, read_table_kwargs=None,
use_legacy_dataset=False, **write_table_kwargs):
if expected is None:
expected = table
read_table_kwargs = read_table_kwargs or {}
# intentionally check twice
result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs,
write_table_kwargs=write_table_kwargs,
use_legacy_dataset=use_legacy_dataset)
assert result.equals(expected)
result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs,
write_table_kwargs=write_table_kwargs,
use_legacy_dataset=use_legacy_dataset)
assert result.equals(expected)
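# --- Illustrative usage sketch (not part of the original module) ----------
# A minimal example of how the round-trip helpers above are typically
# combined by the test modules that import them; the table contents are
# made up for illustration, only the call pattern comes from those tests.
def _example_check_roundtrip_usage():
    table = pa.table({'f0': [1, 2, 3], 'f1': ['a', 'b', 'c']})
    # Writes to an in-memory buffer, reads back and compares, twice.
    _check_roundtrip(table, compression='snappy', use_legacy_dataset=False)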
def _roundtrip_pandas_dataframe(df, write_kwargs, use_legacy_dataset=False):
table = pa.Table.from_pandas(df)
result = _roundtrip_table(
table, write_table_kwargs=write_kwargs,
use_legacy_dataset=use_legacy_dataset)
return result.to_pandas()
def _random_integers(size, dtype):
# We do not generate integers outside the int64 range
platform_int_info = np.iinfo('int_')
iinfo = np.iinfo(dtype)
return np.random.randint(max(iinfo.min, platform_int_info.min),
min(iinfo.max, platform_int_info.max),
size=size).astype(dtype)
def _test_dataframe(size=10000, seed=0):
import pandas as pd
np.random.seed(seed)
df = pd.DataFrame({
'uint8': _random_integers(size, np.uint8),
'uint16': _random_integers(size, np.uint16),
'uint32': _random_integers(size, np.uint32),
'uint64': _random_integers(size, np.uint64),
'int8': _random_integers(size, np.int8),
'int16': _random_integers(size, np.int16),
'int32': _random_integers(size, np.int32),
'int64': _random_integers(size, np.int64),
'float32': np.random.randn(size).astype(np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
'strings': [util.rands(10) for i in range(size)],
'all_none': [None] * size,
'all_none_category': [None] * size
})
# TODO(PARQUET-1015)
# df['all_none_category'] = df['all_none_category'].astype('category')
return df
def make_sample_file(table_or_df):
import pyarrow.parquet as pq
if isinstance(table_or_df, pa.Table):
a_table = table_or_df
else:
a_table = pa.Table.from_pandas(table_or_df)
buf = io.BytesIO()
_write_table(a_table, buf, compression='SNAPPY', version='2.6',
coerce_timestamps='ms')
buf.seek(0)
return pq.ParquetFile(buf)
def alltypes_sample(size=10000, seed=0, categorical=False):
import pandas as pd
np.random.seed(seed)
arrays = {
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16),
'uint32': np.arange(size, dtype=np.uint32),
'uint64': np.arange(size, dtype=np.uint64),
'int8': np.arange(size, dtype=np.int16),
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
# TODO(wesm): Test other timestamp resolutions now that arrow supports
# them
'datetime': np.arange("2016-01-01T00:00:00.001", size,
dtype='datetime64[ms]'),
'timedelta': np.arange(0, size, dtype="timedelta64[s]"),
'str': pd.Series([str(x) for x in range(size)]),
'empty_str': [''] * size,
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
'null': [None] * size,
'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)],
}
if categorical:
arrays['str_category'] = arrays['str'].astype('category')
return pd.DataFrame(arrays)
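# --- Illustrative usage sketch (not part of the original module) ----------
# How alltypes_sample() and make_sample_file() above are typically combined
# (assumes pandas and a Snappy-enabled pyarrow build, just like the helpers
# themselves); the size argument is arbitrary.
def _example_make_sample_file():
    df = alltypes_sample(size=100)
    parquet_file = make_sample_file(df)  # a pyarrow.parquet.ParquetFile
    return parquet_file.metadata.num_rows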


@@ -0,0 +1,87 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from pyarrow.util import guid
@pytest.fixture(scope='module')
def datadir(base_datadir):
return base_datadir / 'parquet'
@pytest.fixture
def s3_bucket(s3_server):
boto3 = pytest.importorskip('boto3')
botocore = pytest.importorskip('botocore')
host, port, access_key, secret_key = s3_server['connection']
s3 = boto3.resource(
's3',
endpoint_url='http://{}:{}'.format(host, port),
aws_access_key_id=access_key,
aws_secret_access_key=secret_key,
config=botocore.client.Config(signature_version='s3v4'),
region_name='us-east-1'
)
bucket = s3.Bucket('test-s3fs')
try:
bucket.create()
except Exception:
        # we get a BucketAlreadyOwnedByYou error with the fsspec handler
pass
return 'test-s3fs'
@pytest.fixture
def s3_example_s3fs(s3_server, s3_bucket):
s3fs = pytest.importorskip('s3fs')
host, port, access_key, secret_key = s3_server['connection']
fs = s3fs.S3FileSystem(
key=access_key,
secret=secret_key,
client_kwargs={
'endpoint_url': 'http://{}:{}'.format(host, port)
}
)
test_path = '{}/{}'.format(s3_bucket, guid())
fs.mkdir(test_path)
yield fs, test_path
try:
fs.rm(test_path, recursive=True)
except FileNotFoundError:
pass
@pytest.fixture
def s3_example_fs(s3_server):
from pyarrow.fs import FileSystem
host, port, access_key, secret_key = s3_server['connection']
uri = (
"s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}"
.format(access_key, secret_key, host, port)
)
fs, path = FileSystem.from_uri(uri)
fs.create_dir("mybucket")
yield fs, uri, path
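# --- Hypothetical usage sketch (not part of the original conftest) --------
# Shows how a test would typically consume the s3_example_fs fixture above;
# the table contents and the write/read round trip are illustrative
# assumptions (the leading underscore keeps pytest from collecting it).
def _example_s3_example_fs_usage(s3_example_fs):
    import pyarrow as pa
    import pyarrow.parquet as pq
    fs, uri, path = s3_example_fs
    table = pa.table({'a': [1, 2, 3]})
    pq.write_table(table, path, filesystem=fs)
    assert pq.read_table(path, filesystem=fs).equals(table)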


@@ -0,0 +1,60 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import base64
import pyarrow.parquet.encryption as pe
class InMemoryKmsClient(pe.KmsClient):
"""This is a mock class implementation of KmsClient, built for testing only.
"""
def __init__(self, config):
"""Create an InMemoryKmsClient instance."""
pe.KmsClient.__init__(self)
self.master_keys_map = config.custom_kms_conf
def wrap_key(self, key_bytes, master_key_identifier):
"""Not a secure cipher - the wrapped key
is just the master key concatenated with key bytes"""
master_key_bytes = self.master_keys_map[master_key_identifier].encode(
'utf-8')
wrapped_key = b"".join([master_key_bytes, key_bytes])
result = base64.b64encode(wrapped_key)
return result
def unwrap_key(self, wrapped_key, master_key_identifier):
"""Not a secure cipher - just extract the key from
the wrapped key"""
expected_master_key = self.master_keys_map[master_key_identifier]
decoded_wrapped_key = base64.b64decode(wrapped_key)
master_key_bytes = decoded_wrapped_key[:16]
decrypted_key = decoded_wrapped_key[16:]
if (expected_master_key == master_key_bytes.decode('utf-8')):
return decrypted_key
raise ValueError("Incorrect master key used",
master_key_bytes, decrypted_key)
def verify_file_encrypted(path):
"""Verify that the file is encrypted by looking at its first 4 bytes.
If it's the magic string PARE
then this is a parquet with encrypted footer."""
with open(path, "rb") as file:
magic_str = file.read(4)
# Verify magic string for parquet with encrypted footer is PARE
assert magic_str == b'PARE'
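# --- Hypothetical usage sketch (not part of the original file) ------------
# Shows how InMemoryKmsClient and verify_file_encrypted above are typically
# wired into pyarrow.parquet.encryption. The key names, 16-byte key material,
# column names and file path are illustrative assumptions.
def _example_write_encrypted_file(path):
    import pyarrow as pa
    import pyarrow.parquet as pq
    table = pa.table({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    kms_connection_config = pe.KmsConnectionConfig(
        custom_kms_conf={'footer_key': '0123456789012345',
                         'col_key': 'abcdefghijklmnop'})
    crypto_factory = pe.CryptoFactory(lambda config: InMemoryKmsClient(config))
    encryption_config = pe.EncryptionConfiguration(
        footer_key='footer_key',
        column_keys={'col_key': ['a', 'b']})
    file_encryption_properties = crypto_factory.file_encryption_properties(
        kms_connection_config, encryption_config)
    with pq.ParquetWriter(path, table.schema,
                          encryption_properties=file_encryption_properties
                          ) as writer:
        writer.write_table(table)
    verify_file_encrypted(path)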


@@ -0,0 +1,799 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from collections import OrderedDict
import io
import numpy as np
import pytest
import pyarrow as pa
from pyarrow import fs
from pyarrow.filesystem import LocalFileSystem, FileSystem
from pyarrow.tests import util
from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
parametrize_legacy_dataset,
_test_dataframe)
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.pandas_examples import dataframe_with_lists
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
pd = tm = None
def test_parquet_invalid_version(tempdir):
table = pa.table({'a': [1, 2, 3]})
with pytest.raises(ValueError, match="Unsupported Parquet format version"):
_write_table(table, tempdir / 'test_version.parquet', version="2.2")
with pytest.raises(ValueError, match="Unsupported Parquet data page " +
"version"):
_write_table(table, tempdir / 'test_version.parquet',
data_page_version="2.2")
@parametrize_legacy_dataset
def test_set_data_page_size(use_legacy_dataset):
arr = pa.array([1, 2, 3] * 100000)
t = pa.Table.from_arrays([arr], names=['f0'])
# 128K, 512K
page_sizes = [2 << 16, 2 << 18]
for target_page_size in page_sizes:
_check_roundtrip(t, data_page_size=target_page_size,
use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_set_write_batch_size(use_legacy_dataset):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
_check_roundtrip(
table, data_page_size=10, write_batch_size=1, version='2.4'
)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_set_dictionary_pagesize_limit(use_legacy_dataset):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
_check_roundtrip(table, dictionary_pagesize_limit=1,
data_page_size=10, version='2.4')
with pytest.raises(TypeError):
_check_roundtrip(table, dictionary_pagesize_limit="a",
data_page_size=10, version='2.4')
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_chunked_table_write(use_legacy_dataset):
# ARROW-232
tables = []
batch = pa.RecordBatch.from_pandas(alltypes_sample(size=10))
tables.append(pa.Table.from_batches([batch] * 3))
df, _ = dataframe_with_lists()
batch = pa.RecordBatch.from_pandas(df)
tables.append(pa.Table.from_batches([batch] * 3))
for data_page_version in ['1.0', '2.0']:
for use_dictionary in [True, False]:
for table in tables:
_check_roundtrip(
table, version='2.6',
use_legacy_dataset=use_legacy_dataset,
data_page_version=data_page_version,
use_dictionary=use_dictionary)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_memory_map(tempdir, use_legacy_dataset):
df = alltypes_sample(size=10)
table = pa.Table.from_pandas(df)
_check_roundtrip(table, read_table_kwargs={'memory_map': True},
version='2.6', use_legacy_dataset=use_legacy_dataset)
filename = str(tempdir / 'tmp_file')
with open(filename, 'wb') as f:
_write_table(table, f, version='2.6')
table_read = pq.read_pandas(filename, memory_map=True,
use_legacy_dataset=use_legacy_dataset)
assert table_read.equals(table)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_enable_buffered_stream(tempdir, use_legacy_dataset):
df = alltypes_sample(size=10)
table = pa.Table.from_pandas(df)
_check_roundtrip(table, read_table_kwargs={'buffer_size': 1025},
version='2.6', use_legacy_dataset=use_legacy_dataset)
filename = str(tempdir / 'tmp_file')
with open(filename, 'wb') as f:
_write_table(table, f, version='2.6')
table_read = pq.read_pandas(filename, buffer_size=4096,
use_legacy_dataset=use_legacy_dataset)
assert table_read.equals(table)
@parametrize_legacy_dataset
def test_special_chars_filename(tempdir, use_legacy_dataset):
table = pa.Table.from_arrays([pa.array([42])], ["ints"])
filename = "foo # bar"
path = tempdir / filename
assert not path.exists()
_write_table(table, str(path))
assert path.exists()
table_read = _read_table(str(path), use_legacy_dataset=use_legacy_dataset)
assert table_read.equals(table)
@parametrize_legacy_dataset
def test_invalid_source(use_legacy_dataset):
    # Test that we provide a helpful error message pointing out
    # that None wasn't expected when trying to open None as a Parquet file.
#
# Depending on use_legacy_dataset the message changes slightly
# but in both cases it should point out that None wasn't expected.
with pytest.raises(TypeError, match="None"):
pq.read_table(None, use_legacy_dataset=use_legacy_dataset)
with pytest.raises(TypeError, match="None"):
pq.ParquetFile(None)
@pytest.mark.slow
def test_file_with_over_int16_max_row_groups():
    # PARQUET-1857: Parquet encryption support introduced an INT16_MAX upper
# limit on the number of row groups, but this limit only impacts files with
# encrypted row group metadata because of the int16 row group ordinal used
# in the Parquet Thrift metadata. Unencrypted files are not impacted, so
# this test checks that it works (even if it isn't a good idea)
t = pa.table([list(range(40000))], names=['f0'])
_check_roundtrip(t, row_group_size=1)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_empty_table_roundtrip(use_legacy_dataset):
df = alltypes_sample(size=10)
# Create a non-empty table to infer the types correctly, then slice to 0
table = pa.Table.from_pandas(df)
table = pa.Table.from_arrays(
[col.chunk(0)[:0] for col in table.itercolumns()],
names=table.schema.names)
assert table.schema.field('null').type == pa.null()
assert table.schema.field('null_list').type == pa.list_(pa.null())
_check_roundtrip(
table, version='2.6', use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_empty_table_no_columns(use_legacy_dataset):
df = pd.DataFrame()
empty = pa.Table.from_pandas(df, preserve_index=False)
_check_roundtrip(empty, use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset):
# Bug report in ARROW-3792
cols = OrderedDict(
int32=pa.int32(),
list_string=pa.list_(pa.string())
)
data = [[], [OrderedDict(int32=1, list_string=('G',)), ]]
# This produces a table with a column like
# <Column name='list_string' type=ListType(list<item: string>)>
# [
# [],
# [
# [
# "G"
# ]
# ]
# ]
#
# Each column is a ChunkedArray with 2 elements
my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten()
for batch in data]
my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols))
for batch in my_arrays]
tbl = pa.Table.from_batches(my_batches, pa.schema(cols))
_check_roundtrip(tbl, use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multiple_path_types(tempdir, use_legacy_dataset):
# Test compatibility with PEP 519 path-like objects
path = tempdir / 'zzz.parquet'
df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
_write_table(df, path)
table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
# Test compatibility with plain string paths
path = str(tempdir) + 'zzz.parquet'
df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
_write_table(df, path)
table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@parametrize_legacy_dataset
def test_fspath(tempdir, use_legacy_dataset):
# ARROW-12472 support __fspath__ objects without using str()
path = tempdir / "test.parquet"
table = pa.table({"a": [1, 2, 3]})
_write_table(table, path)
fs_protocol_obj = util.FSProtocolClass(path)
result = _read_table(
fs_protocol_obj, use_legacy_dataset=use_legacy_dataset
)
assert result.equals(table)
# combined with non-local filesystem raises
with pytest.raises(TypeError):
_read_table(fs_protocol_obj, filesystem=FileSystem())
@pytest.mark.dataset
@parametrize_legacy_dataset
@pytest.mark.parametrize("filesystem", [
None, fs.LocalFileSystem(), LocalFileSystem._get_instance()
])
def test_relative_paths(tempdir, use_legacy_dataset, filesystem):
# reading and writing from relative paths
table = pa.table({"a": [1, 2, 3]})
# reading
pq.write_table(table, str(tempdir / "data.parquet"))
with util.change_cwd(tempdir):
result = pq.read_table("data.parquet", filesystem=filesystem,
use_legacy_dataset=use_legacy_dataset)
assert result.equals(table)
# writing
with util.change_cwd(tempdir):
pq.write_table(table, "data2.parquet", filesystem=filesystem)
result = pq.read_table(tempdir / "data2.parquet")
assert result.equals(table)
def test_read_non_existing_file():
# ensure we have a proper error message
with pytest.raises(FileNotFoundError):
pq.read_table('i-am-not-existing.parquet')
def test_file_error_python_exception():
class BogusFile(io.BytesIO):
def read(self, *args):
raise ZeroDivisionError("zorglub")
def seek(self, *args):
raise ZeroDivisionError("zorglub")
# ensure the Python exception is restored
with pytest.raises(ZeroDivisionError, match="zorglub"):
pq.read_table(BogusFile(b""))
@parametrize_legacy_dataset
def test_parquet_read_from_buffer(tempdir, use_legacy_dataset):
# reading from a buffer from python's open()
table = pa.table({"a": [1, 2, 3]})
pq.write_table(table, str(tempdir / "data.parquet"))
with open(str(tempdir / "data.parquet"), "rb") as f:
result = pq.read_table(f, use_legacy_dataset=use_legacy_dataset)
assert result.equals(table)
with open(str(tempdir / "data.parquet"), "rb") as f:
result = pq.read_table(pa.PythonFile(f),
use_legacy_dataset=use_legacy_dataset)
assert result.equals(table)
@parametrize_legacy_dataset
def test_byte_stream_split(use_legacy_dataset):
# This is only a smoke test.
arr_float = pa.array(list(map(float, range(100))))
arr_int = pa.array(list(map(int, range(100))))
data_float = [arr_float, arr_float]
table = pa.Table.from_arrays(data_float, names=['a', 'b'])
# Check with byte_stream_split for both columns.
_check_roundtrip(table, expected=table, compression="gzip",
use_dictionary=False, use_byte_stream_split=True)
# Check with byte_stream_split for column 'b' and dictionary
# for column 'a'.
_check_roundtrip(table, expected=table, compression="gzip",
use_dictionary=['a'],
use_byte_stream_split=['b'])
# Check with a collision for both columns.
_check_roundtrip(table, expected=table, compression="gzip",
use_dictionary=['a', 'b'],
use_byte_stream_split=['a', 'b'])
# Check with mixed column types.
mixed_table = pa.Table.from_arrays([arr_float, arr_int],
names=['a', 'b'])
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=['b'],
use_byte_stream_split=['a'])
# Try to use the wrong data type with the byte_stream_split encoding.
# This should throw an exception.
table = pa.Table.from_arrays([arr_int], names=['tmp'])
with pytest.raises(IOError):
_check_roundtrip(table, expected=table, use_byte_stream_split=True,
use_dictionary=False,
use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_column_encoding(use_legacy_dataset):
arr_float = pa.array(list(map(float, range(100))))
arr_int = pa.array(list(map(int, range(100))))
mixed_table = pa.Table.from_arrays([arr_float, arr_int],
names=['a', 'b'])
# Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding for
# column 'b'.
_check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False,
column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN"},
use_legacy_dataset=use_legacy_dataset)
# Check "PLAIN" for all columns.
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding="PLAIN",
use_legacy_dataset=use_legacy_dataset)
# Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'.
    # This should throw an error as it only supports FLOAT and DOUBLE.
with pytest.raises(IOError,
match="BYTE_STREAM_SPLIT only supports FLOAT and"
" DOUBLE"):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding={'b': "BYTE_STREAM_SPLIT"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass "DELTA_BINARY_PACKED".
# This should throw an error as it is only supported for reading.
with pytest.raises(IOError,
match="Not yet implemented: Selected encoding is"
" not supported."):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding={'b': "DELTA_BINARY_PACKED"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass "RLE_DICTIONARY".
    # This should throw an error as dictionary encoding is already used by
    # default and cannot be specified as the "fallback" encoding.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding="RLE_DICTIONARY",
use_legacy_dataset=use_legacy_dataset)
# Try to pass unsupported encoding.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding={'a': "MADE_UP_ENCODING"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding and use_dictionary.
# This should throw an error.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=['b'],
column_encoding={'b': "PLAIN"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding and use_dictionary=True (default value).
# This should throw an error.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
column_encoding={'b': "PLAIN"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding and use_byte_stream_split on same column.
# This should throw an error.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
use_byte_stream_split=['a'],
column_encoding={'a': "RLE",
'b': "BYTE_STREAM_SPLIT"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding and use_byte_stream_split=True.
# This should throw an error.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
use_byte_stream_split=True,
column_encoding={'a': "RLE",
'b': "BYTE_STREAM_SPLIT"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding=True.
# This should throw an error.
with pytest.raises(TypeError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding=True,
use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_compression_level(use_legacy_dataset):
arr = pa.array(list(map(int, range(1000))))
data = [arr, arr]
table = pa.Table.from_arrays(data, names=['a', 'b'])
# Check one compression level.
_check_roundtrip(table, expected=table, compression="gzip",
compression_level=1,
use_legacy_dataset=use_legacy_dataset)
# Check another one to make sure that compression_level=1 does not
# coincide with the default one in Arrow.
_check_roundtrip(table, expected=table, compression="gzip",
compression_level=5,
use_legacy_dataset=use_legacy_dataset)
# Check that the user can provide a compression per column
_check_roundtrip(table, expected=table,
compression={'a': "gzip", 'b': "snappy"},
use_legacy_dataset=use_legacy_dataset)
# Check that the user can provide a compression level per column
_check_roundtrip(table, expected=table, compression="gzip",
compression_level={'a': 2, 'b': 3},
use_legacy_dataset=use_legacy_dataset)
# Check if both LZ4 compressors are working
# (level < 3 -> fast, level >= 3 -> HC)
_check_roundtrip(table, expected=table, compression="lz4",
compression_level=1,
use_legacy_dataset=use_legacy_dataset)
_check_roundtrip(table, expected=table, compression="lz4",
compression_level=9,
use_legacy_dataset=use_legacy_dataset)
    # Check that specifying a compression level for a codec which does not
    # allow one, or an invalid level for one that does, results in an error.
    # Uncompressed, snappy and lzo do not support specifying a compression
    # level.
    # GZIP (zlib) allows specifying a compression level, but as of zlib
    # version 1.2.11 the valid range is [-1, 9].
invalid_combinations = [("snappy", 4), ("gzip", -1337),
("None", 444), ("lzo", 14)]
buf = io.BytesIO()
for (codec, level) in invalid_combinations:
with pytest.raises((ValueError, OSError)):
_write_table(table, buf, compression=codec,
compression_level=level)
def test_sanitized_spark_field_names():
a0 = pa.array([0, 1, 2, 3, 4])
name = 'prohib; ,\t{}'
table = pa.Table.from_arrays([a0], [name])
result = _roundtrip_table(table, write_table_kwargs={'flavor': 'spark'})
expected_name = 'prohib______'
assert result.schema[0].name == expected_name
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multithreaded_read(use_legacy_dataset):
df = alltypes_sample(size=10000)
table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(table, buf, compression='SNAPPY', version='2.6')
buf.seek(0)
table1 = _read_table(
buf, use_threads=True, use_legacy_dataset=use_legacy_dataset)
buf.seek(0)
table2 = _read_table(
buf, use_threads=False, use_legacy_dataset=use_legacy_dataset)
assert table1.equals(table2)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_min_chunksize(use_legacy_dataset):
data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
table = pa.Table.from_pandas(data.reset_index())
buf = io.BytesIO()
_write_table(table, buf, chunk_size=-1)
buf.seek(0)
result = _read_table(buf, use_legacy_dataset=use_legacy_dataset)
assert result.equals(table)
with pytest.raises(ValueError):
_write_table(table, buf, chunk_size=0)
@pytest.mark.pandas
def test_write_error_deletes_incomplete_file(tempdir):
# ARROW-1285
df = pd.DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3, freq='ns')})
pdf = pa.Table.from_pandas(df)
filename = tempdir / 'tmp_file'
try:
_write_table(pdf, filename)
except pa.ArrowException:
pass
assert not filename.exists()
@parametrize_legacy_dataset
def test_read_non_existent_file(tempdir, use_legacy_dataset):
path = 'non-existent-file.parquet'
try:
pq.read_table(path, use_legacy_dataset=use_legacy_dataset)
except Exception as e:
assert path in e.args[0]
@parametrize_legacy_dataset
def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
with pytest.warns(None) as record:
pq.read_table(datadir / 'v0.7.1.parquet',
use_legacy_dataset=use_legacy_dataset)
if use_legacy_dataset:
# FutureWarning: 'use_legacy_dataset=True'
assert len(record) == 1
else:
assert len(record) == 0
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_zlib_compression_bug(use_legacy_dataset):
# ARROW-3514: "zlib deflate failed, output buffer too small"
table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col'])
f = io.BytesIO()
pq.write_table(table, f, compression='gzip')
f.seek(0)
roundtrip = pq.read_table(f, use_legacy_dataset=use_legacy_dataset)
tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas())
@parametrize_legacy_dataset
def test_parquet_file_too_small(tempdir, use_legacy_dataset):
path = str(tempdir / "test.parquet")
# TODO(dataset) with datasets API it raises OSError instead
with pytest.raises((pa.ArrowInvalid, OSError),
match='size is 0 bytes'):
with open(path, 'wb') as f:
pass
pq.read_table(path, use_legacy_dataset=use_legacy_dataset)
with pytest.raises((pa.ArrowInvalid, OSError),
match='size is 4 bytes'):
with open(path, 'wb') as f:
f.write(b'ffff')
pq.read_table(path, use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
@pytest.mark.fastparquet
@pytest.mark.filterwarnings("ignore:RangeIndex:FutureWarning")
@pytest.mark.filterwarnings("ignore:tostring:DeprecationWarning:fastparquet")
def test_fastparquet_cross_compatibility(tempdir):
fp = pytest.importorskip('fastparquet')
df = pd.DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(4.0, 7.0, dtype="float64"),
"d": [True, False, True],
"e": pd.date_range("20130101", periods=3),
"f": pd.Categorical(["a", "b", "a"]),
# fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip
# "g": [[1, 2], None, [1, 2, 3]],
}
)
table = pa.table(df)
# Arrow -> fastparquet
file_arrow = str(tempdir / "cross_compat_arrow.parquet")
pq.write_table(table, file_arrow, compression=None)
fp_file = fp.ParquetFile(file_arrow)
df_fp = fp_file.to_pandas()
tm.assert_frame_equal(df, df_fp)
# Fastparquet -> arrow
file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet")
fp.write(file_fastparquet, df)
table_fp = pq.read_pandas(file_fastparquet)
    # for a fastparquet-written file, categoricals come back as strings
    # (no Arrow schema in the Parquet metadata)
df['f'] = df['f'].astype(object)
tm.assert_frame_equal(table_fp.to_pandas(), df)
@parametrize_legacy_dataset
@pytest.mark.parametrize('array_factory', [
lambda: pa.array([0, None] * 10),
lambda: pa.array([0, None] * 10).dictionary_encode(),
lambda: pa.array(["", None] * 10),
lambda: pa.array(["", None] * 10).dictionary_encode(),
])
@pytest.mark.parametrize('use_dictionary', [False, True])
@pytest.mark.parametrize('read_dictionary', [False, True])
def test_buffer_contents(
array_factory, use_dictionary, read_dictionary, use_legacy_dataset
):
# Test that null values are deterministically initialized to zero
# after a roundtrip through Parquet.
# See ARROW-8006 and ARROW-8011.
orig_table = pa.Table.from_pydict({"col": array_factory()})
bio = io.BytesIO()
pq.write_table(orig_table, bio, use_dictionary=True)
bio.seek(0)
read_dictionary = ['col'] if read_dictionary else None
table = pq.read_table(bio, use_threads=False,
read_dictionary=read_dictionary,
use_legacy_dataset=use_legacy_dataset)
for col in table.columns:
[chunk] = col.chunks
buf = chunk.buffers()[1]
assert buf.to_pybytes() == buf.size * b"\0"
def test_parquet_compression_roundtrip(tempdir):
# ARROW-10480: ensure even with nonstandard Parquet file naming
# conventions, writing and then reading a file works. In
# particular, ensure that we don't automatically double-compress
# the stream due to auto-detecting the extension in the filename
table = pa.table([pa.array(range(4))], names=["ints"])
path = tempdir / "arrow-10480.pyarrow.gz"
pq.write_table(table, path, compression="GZIP")
result = pq.read_table(path)
assert result.equals(table)
def test_empty_row_groups(tempdir):
# ARROW-3020
table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
path = tempdir / 'empty_row_groups.parquet'
num_groups = 3
with pq.ParquetWriter(path, table.schema) as writer:
for i in range(num_groups):
writer.write_table(table)
reader = pq.ParquetFile(path)
assert reader.metadata.num_row_groups == num_groups
for i in range(num_groups):
assert reader.read_row_group(i).equals(table)
def test_reads_over_batch(tempdir):
data = [None] * (1 << 20)
data.append([1])
# Large list<int64> with mostly nones and one final
# value. This should force batched reads when
# reading back.
table = pa.Table.from_arrays([data], ['column'])
path = tempdir / 'arrow-11607.parquet'
pq.write_table(table, path)
table2 = pq.read_table(path)
assert table == table2
@pytest.mark.dataset
def test_permutation_of_column_order(tempdir):
# ARROW-2366
case = tempdir / "dataset_column_order_permutation"
case.mkdir(exist_ok=True)
data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
pq.write_table(data1, case / "data1.parquet")
data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
pq.write_table(data2, case / "data2.parquet")
table = pq.read_table(str(case))
table2 = pa.table([[1, 2, 3, 4, 5, 6],
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6]],
names=['a', 'b'])
assert table == table2
def test_read_table_legacy_deprecated(tempdir):
# ARROW-15870
table = pa.table({'a': [1, 2, 3]})
path = tempdir / 'data.parquet'
pq.write_table(table, path)
with pytest.warns(
FutureWarning, match="Passing 'use_legacy_dataset=True'"
):
pq.read_table(path, use_legacy_dataset=True)


@@ -0,0 +1,114 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import pyarrow as pa
from pyarrow.tests.parquet.common import parametrize_legacy_dataset
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (_read_table,
_check_roundtrip)
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
except ImportError:
pd = tm = None
# Tests for ARROW-11497
_test_data_simple = [
{'items': [1, 2]},
{'items': [0]},
]
_test_data_complex = [
{'items': [{'name': 'elem1', 'value': '1'},
{'name': 'elem2', 'value': '2'}]},
{'items': [{'name': 'elem1', 'value': '0'}]},
]
parametrize_test_data = pytest.mark.parametrize(
"test_data", [_test_data_simple, _test_data_complex])
@pytest.mark.pandas
@parametrize_legacy_dataset
@parametrize_test_data
def test_write_compliant_nested_type_enable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write pandas df with new flag
_roundtrip_pandas_dataframe(df,
write_kwargs={
'use_compliant_nested_type': True},
use_legacy_dataset=use_legacy_dataset)
# Write to a parquet file with compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema,
use_compliant_nested_type=True,
version='2.6') as writer:
writer.write_table(table)
# Read back as a table
new_table = _read_table(path)
    # Validate that the "items" column is compliant with the Parquet nested
    # format; it should look like:
    # list<element: struct<name: string, value: string>>
assert isinstance(new_table.schema.types[0], pa.ListType)
assert new_table.schema.types[0].value_field.name == 'element'
# Verify that the new table can be read/written correctly
_check_roundtrip(new_table,
use_legacy_dataset=use_legacy_dataset,
use_compliant_nested_type=True)
@pytest.mark.pandas
@parametrize_legacy_dataset
@parametrize_test_data
def test_write_compliant_nested_type_disable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write with new flag disabled (default behaviour)
_roundtrip_pandas_dataframe(df, write_kwargs={},
use_legacy_dataset=use_legacy_dataset)
# Write to a parquet file while disabling compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
writer.write_table(table)
new_table = _read_table(path)
    # Validate that the "items" column is not compliant with the Parquet
    # nested format; it should look like:
    # list<item: struct<name: string, value: string>>
assert isinstance(new_table.schema.types[0], pa.ListType)
assert new_table.schema.types[0].value_field.name == 'item'
# Verify that the new table can be read/written correctly
_check_roundtrip(new_table,
use_legacy_dataset=use_legacy_dataset,
use_compliant_nested_type=False)


@@ -0,0 +1,526 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import decimal
import io
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.tests import util
from pyarrow.tests.parquet.common import (_check_roundtrip,
parametrize_legacy_dataset)
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.pandas_examples import (dataframe_with_arrays,
dataframe_with_lists)
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
pd = tm = None
# General roundtrip of data types
# -----------------------------------------------------------------------------
@pytest.mark.pandas
@parametrize_legacy_dataset
@pytest.mark.parametrize('chunk_size', [None, 1000])
def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset):
df = alltypes_sample(size=10000, categorical=True)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
assert arrow_table.schema.pandas_metadata is not None
_write_table(arrow_table, filename, version='2.6',
coerce_timestamps='ms', chunk_size=chunk_size)
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
assert table_read.schema.pandas_metadata is not None
read_metadata = table_read.schema.metadata
assert arrow_table.schema.metadata == read_metadata
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset):
size = 10000
np.random.seed(0)
df = pd.DataFrame({
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16),
'uint32': np.arange(size, dtype=np.uint32),
'uint64': np.arange(size, dtype=np.uint64),
'int8': np.arange(size, dtype=np.int16),
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
'str': [str(x) for x in range(size)],
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
'empty_str': [''] * size
})
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, filename, version='1.0')
table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
# We pass uint32_t as int64_t if we write Parquet version 1.0
df['uint32'] = df['uint32'].values.astype(np.int64)
tm.assert_frame_equal(df, df_read)
# Dictionary
# -----------------------------------------------------------------------------
def _simple_table_write_read(table, use_legacy_dataset):
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
contents = bio.getvalue()
return pq.read_table(
pa.BufferReader(contents), use_legacy_dataset=use_legacy_dataset
)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_direct_read_dictionary(use_legacy_dataset):
# ARROW-3325
repeats = 10
nunique = 5
data = [
[util.rands(10) for i in range(nunique)] * repeats,
]
table = pa.table(data, names=['f0'])
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
contents = bio.getvalue()
result = pq.read_table(pa.BufferReader(contents),
read_dictionary=['f0'],
use_legacy_dataset=use_legacy_dataset)
# Compute dictionary-encoded subfield
expected = pa.table([table[0].dictionary_encode()], names=['f0'])
assert result.equals(expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_direct_read_dictionary_subfield(use_legacy_dataset):
repeats = 10
nunique = 5
data = [
[[util.rands(10)] for i in range(nunique)] * repeats,
]
table = pa.table(data, names=['f0'])
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
contents = bio.getvalue()
result = pq.read_table(pa.BufferReader(contents),
read_dictionary=['f0.list.item'],
use_legacy_dataset=use_legacy_dataset)
arr = pa.array(data[0])
values_as_dict = arr.values.dictionary_encode()
inner_indices = values_as_dict.indices.cast('int32')
new_values = pa.DictionaryArray.from_arrays(inner_indices,
values_as_dict.dictionary)
offsets = pa.array(range(51), type='int32')
expected_arr = pa.ListArray.from_arrays(offsets, new_values)
expected = pa.table([expected_arr], names=['f0'])
assert result.equals(expected)
assert result[0].num_chunks == 1
@parametrize_legacy_dataset
def test_dictionary_array_automatically_read(use_legacy_dataset):
# ARROW-3246
# Make a large dictionary, a little over 4MB of data
dict_length = 4000
dict_values = pa.array([('x' * 1000 + '_{}'.format(i))
for i in range(dict_length)])
num_chunks = 10
chunk_size = 100
chunks = []
for i in range(num_chunks):
indices = np.random.randint(0, dict_length,
size=chunk_size).astype(np.int32)
chunks.append(pa.DictionaryArray.from_arrays(pa.array(indices),
dict_values))
table = pa.table([pa.chunked_array(chunks)], names=['f0'])
result = _simple_table_write_read(table, use_legacy_dataset)
assert result.equals(table)
# The only key in the metadata was the Arrow schema key
assert result.schema.metadata is None
# Decimal
# -----------------------------------------------------------------------------
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_decimal_roundtrip(tempdir, use_legacy_dataset):
num_values = 10
columns = {}
for precision in range(1, 39):
for scale in range(0, precision + 1):
with util.random_seed(0):
random_decimal_values = [
util.randdecimal(precision, scale)
for _ in range(num_values)
]
column_name = ('dec_precision_{:d}_scale_{:d}'
.format(precision, scale))
columns[column_name] = random_decimal_values
expected = pd.DataFrame(columns)
filename = tempdir / 'decimals.parquet'
string_filename = str(filename)
table = pa.Table.from_pandas(expected)
_write_table(table, string_filename)
result_table = _read_table(
string_filename, use_legacy_dataset=use_legacy_dataset)
result = result_table.to_pandas()
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@pytest.mark.xfail(
raises=OSError, reason='Parquet does not support negative scale'
)
def test_decimal_roundtrip_negative_scale(tempdir):
expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]})
filename = tempdir / 'decimals.parquet'
string_filename = str(filename)
t = pa.Table.from_pandas(expected)
_write_table(t, string_filename)
result_table = _read_table(string_filename)
result = result_table.to_pandas()
tm.assert_frame_equal(result, expected)
# List types
# -----------------------------------------------------------------------------
@parametrize_legacy_dataset
@pytest.mark.parametrize('dtype', [int, float])
def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset):
filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__)
data = [pa.array(list(map(dtype, range(5))))]
table = pa.Table.from_arrays(data, names=['a'])
_write_table(table, filename)
table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
for i in range(table.num_columns):
col_written = table[i]
col_read = table_read[i]
assert table.field(i).name == table_read.field(i).name
assert col_read.num_chunks == 1
data_written = col_written.chunk(0)
data_read = col_read.chunk(0)
assert data_written.equals(data_read)
@parametrize_legacy_dataset
def test_empty_lists_table_roundtrip(use_legacy_dataset):
# ARROW-2744: Shouldn't crash when writing an array of empty lists
arr = pa.array([[], []], type=pa.list_(pa.int32()))
table = pa.Table.from_arrays([arr], ["A"])
_check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset):
# Reproduce failure in ARROW-5630
typ = pa.list_(pa.field("item", pa.float32(), False))
num_rows = 10000
t = pa.table([
pa.array(([[0] * ((i + 5) % 10) for i in range(0, 10)] *
(num_rows // 10)), type=typ)
], ['a'])
_check_roundtrip(
t, data_page_size=4096, use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_nested_list_struct_multiple_batches_roundtrip(
tempdir, use_legacy_dataset
):
# Reproduce failure in ARROW-11024
data = [[{'x': 'abc', 'y': 'abc'}]]*100 + [[{'x': 'abc', 'y': 'gcb'}]]*100
table = pa.table([pa.array(data)], names=['column'])
_check_roundtrip(
table, row_group_size=20, use_legacy_dataset=use_legacy_dataset)
# Reproduce failure in ARROW-11069 (plain non-nested structs with strings)
data = pa.array(
[{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]*10
)
table = pa.table({'column': data})
_check_roundtrip(
table, row_group_size=10, use_legacy_dataset=use_legacy_dataset)
def test_writing_empty_lists():
# ARROW-2591: [Python] Segmentation fault issue in pq.write_table
arr1 = pa.array([[], []], pa.list_(pa.int32()))
table = pa.Table.from_arrays([arr1], ['list(int32)'])
_check_roundtrip(table)
@pytest.mark.pandas
def test_column_of_arrays(tempdir):
df, schema = dataframe_with_arrays()
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df, schema=schema)
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
table_read = _read_table(filename)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
def test_column_of_lists(tempdir):
df, schema = dataframe_with_lists(parquet_compatible=True)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df, schema=schema)
_write_table(arrow_table, filename, version='2.6')
table_read = _read_table(filename)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
def test_large_list_records():
# This was fixed in PARQUET-1100
list_lengths = np.random.randint(0, 500, size=50)
list_lengths[::10] = 0
list_values = [list(map(int, np.random.randint(0, 100, size=x)))
if i % 8 else None
for i, x in enumerate(list_lengths)]
a1 = pa.array(list_values)
table = pa.Table.from_arrays([a1], ['int_lists'])
_check_roundtrip(table)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_nested_convenience(tempdir, use_legacy_dataset):
# ARROW-1684
df = pd.DataFrame({
'a': [[1, 2, 3], None, [4, 5], []],
'b': [[1.], None, None, [6., 7.]],
})
path = str(tempdir / 'nested_convenience.parquet')
table = pa.Table.from_pandas(df, preserve_index=False)
_write_table(table, path)
read = pq.read_table(
path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
tm.assert_frame_equal(read.to_pandas(), df[['a']])
read = pq.read_table(
path, columns=['a', 'b'], use_legacy_dataset=use_legacy_dataset)
tm.assert_frame_equal(read.to_pandas(), df)
# Binary
# -----------------------------------------------------------------------------
def test_fixed_size_binary():
t0 = pa.binary(10)
data = [b'fooooooooo', None, b'barooooooo', b'quxooooooo']
a0 = pa.array(data, type=t0)
table = pa.Table.from_arrays([a0],
['binary[10]'])
_check_roundtrip(table)
# Large types
# -----------------------------------------------------------------------------
@pytest.mark.slow
@pytest.mark.large_memory
def test_large_table_int32_overflow():
size = np.iinfo('int32').max + 1
arr = np.ones(size, dtype='uint8')
parr = pa.array(arr, type=pa.uint8())
table = pa.Table.from_arrays([parr], names=['one'])
f = io.BytesIO()
_write_table(table, f)
def _simple_table_roundtrip(table, use_legacy_dataset=False, **write_kwargs):
stream = pa.BufferOutputStream()
_write_table(table, stream, **write_kwargs)
buf = stream.getvalue()
return _read_table(buf, use_legacy_dataset=use_legacy_dataset)
@pytest.mark.slow
@pytest.mark.large_memory
@parametrize_legacy_dataset
def test_byte_array_exactly_2gb(use_legacy_dataset):
# Test edge case reported in ARROW-3762
val = b'x' * (1 << 10)
base = pa.array([val] * ((1 << 21) - 1))
cases = [
[b'x' * 1023], # 2^31 - 1
[b'x' * 1024], # 2^31
[b'x' * 1025] # 2^31 + 1
]
for case in cases:
values = pa.chunked_array([base, pa.array(case)])
t = pa.table([values], names=['f0'])
result = _simple_table_roundtrip(
t, use_legacy_dataset=use_legacy_dataset, use_dictionary=False)
assert t.equals(result)
@pytest.mark.slow
@pytest.mark.pandas
@pytest.mark.large_memory
@parametrize_legacy_dataset
def test_binary_array_overflow_to_chunked(use_legacy_dataset):
# ARROW-3762
# 2^31 + 1 bytes
values = [b'x'] + [
b'x' * (1 << 20)
] * 2 * (1 << 10)
df = pd.DataFrame({'byte_col': values})
tbl = pa.Table.from_pandas(df, preserve_index=False)
read_tbl = _simple_table_roundtrip(
tbl, use_legacy_dataset=use_legacy_dataset)
col0_data = read_tbl[0]
assert isinstance(col0_data, pa.ChunkedArray)
# Split up into 2GB chunks
assert col0_data.num_chunks == 2
assert tbl.equals(read_tbl)
@pytest.mark.slow
@pytest.mark.pandas
@pytest.mark.large_memory
@parametrize_legacy_dataset
def test_list_of_binary_large_cell(use_legacy_dataset):
# ARROW-4688
data = []
# TODO(wesm): handle chunked children
# 2^31 - 1 bytes in a single cell
# data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
    # A little under 2GiB in total, in cells of approximately 10MB each
data.extend([[b'x' * 1000000] * 10] * 214)
arr = pa.array(data)
table = pa.Table.from_arrays([arr], ['chunky_cells'])
read_table = _simple_table_roundtrip(
table, use_legacy_dataset=use_legacy_dataset)
assert table.equals(read_table)
def test_large_binary():
data = [b'foo', b'bar'] * 50
for type in [pa.large_binary(), pa.large_string()]:
arr = pa.array(data, type=type)
table = pa.Table.from_arrays([arr], names=['strs'])
for use_dictionary in [False, True]:
_check_roundtrip(table, use_dictionary=use_dictionary)
@pytest.mark.slow
@pytest.mark.large_memory
def test_large_binary_huge():
s = b'xy' * 997
data = [s] * ((1 << 33) // len(s))
for type in [pa.large_binary(), pa.large_string()]:
arr = pa.array(data, type=type)
table = pa.Table.from_arrays([arr], names=['strs'])
for use_dictionary in [False, True]:
_check_roundtrip(table, use_dictionary=use_dictionary)
del arr, table
@pytest.mark.large_memory
def test_large_binary_overflow():
s = b'x' * (1 << 31)
arr = pa.array([s], type=pa.large_binary())
table = pa.Table.from_arrays([arr], names=['strs'])
for use_dictionary in [False, True]:
writer = pa.BufferOutputStream()
with pytest.raises(
pa.ArrowInvalid,
match="Parquet cannot store strings with size 2GB or more"):
_write_table(table, writer, use_dictionary=use_dictionary)

File diff suppressed because it is too large


@@ -0,0 +1,446 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import datetime
import io
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.tests.parquet.common import (
_check_roundtrip, parametrize_legacy_dataset)
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
except ImportError:
pd = tm = None
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_datetime_tz(use_legacy_dataset):
s = pd.Series([datetime.datetime(2017, 9, 6)])
s = s.dt.tz_localize('utc')
s.index = s
# Both a column and an index to hit both use cases
df = pd.DataFrame({'tz_aware': s,
'tz_eastern': s.dt.tz_convert('US/Eastern')},
index=s)
f = io.BytesIO()
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, f, coerce_timestamps='ms')
f.seek(0)
table_read = pq.read_pandas(f, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_datetime_timezone_tzinfo(use_legacy_dataset):
value = datetime.datetime(2018, 1, 1, 1, 23, 45,
tzinfo=datetime.timezone.utc)
df = pd.DataFrame({'foo': [value]})
_roundtrip_pandas_dataframe(
df, write_kwargs={}, use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
def test_coerce_timestamps(tempdir):
from collections import OrderedDict
# ARROW-622
arrays = OrderedDict()
fields = [pa.field('datetime64',
pa.list_(pa.timestamp('ms')))]
arrays['datetime64'] = [
np.array(['2007-07-13T01:23:34.123456789',
None,
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ms]'),
None,
None,
np.array(['2007-07-13T02',
None,
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ms]'),
]
df = pd.DataFrame(arrays)
schema = pa.schema(fields)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df, schema=schema)
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='us')
table_read = _read_table(filename)
df_read = table_read.to_pandas()
df_expected = df.copy()
for i, x in enumerate(df_expected['datetime64']):
if isinstance(x, np.ndarray):
df_expected['datetime64'][i] = x.astype('M8[us]')
tm.assert_frame_equal(df_expected, df_read)
with pytest.raises(ValueError):
_write_table(arrow_table, filename, version='2.6',
coerce_timestamps='unknown')
@pytest.mark.pandas
def test_coerce_timestamps_truncated(tempdir):
"""
ARROW-2555: Test that we can truncate timestamps when coercing if
explicitly allowed.
"""
dt_us = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
second=1, microsecond=1)
dt_ms = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
second=1)
fields_us = [pa.field('datetime64', pa.timestamp('us'))]
arrays_us = {'datetime64': [dt_us, dt_ms]}
df_us = pd.DataFrame(arrays_us)
schema_us = pa.schema(fields_us)
filename = tempdir / 'pandas_truncated.parquet'
table_us = pa.Table.from_pandas(df_us, schema=schema_us)
_write_table(table_us, filename, version='2.6', coerce_timestamps='ms',
allow_truncated_timestamps=True)
table_ms = _read_table(filename)
df_ms = table_ms.to_pandas()
arrays_expected = {'datetime64': [dt_ms, dt_ms]}
df_expected = pd.DataFrame(arrays_expected)
tm.assert_frame_equal(df_expected, df_ms)
@pytest.mark.pandas
def test_date_time_types(tempdir):
t1 = pa.date32()
data1 = np.array([17259, 17260, 17261], dtype='int32')
a1 = pa.array(data1, type=t1)
t2 = pa.date64()
data2 = data1.astype('int64') * 86400000
a2 = pa.array(data2, type=t2)
t3 = pa.timestamp('us')
start = pd.Timestamp('2001-01-01').value / 1000
data3 = np.array([start, start + 1, start + 2], dtype='int64')
a3 = pa.array(data3, type=t3)
t4 = pa.time32('ms')
data4 = np.arange(3, dtype='i4')
a4 = pa.array(data4, type=t4)
t5 = pa.time64('us')
a5 = pa.array(data4.astype('int64'), type=t5)
t6 = pa.time32('s')
a6 = pa.array(data4, type=t6)
ex_t6 = pa.time32('ms')
ex_a6 = pa.array(data4 * 1000, type=ex_t6)
t7 = pa.timestamp('ns')
start = pd.Timestamp('2001-01-01').value
data7 = np.array([start, start + 1000, start + 2000],
dtype='int64')
a7 = pa.array(data7, type=t7)
table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
['date32', 'date64', 'timestamp[us]',
'time32[s]', 'time64[us]',
'time32_from64[s]',
'timestamp[ns]'])
# date64 as date32
# time32[s] to time32[ms]
expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
['date32', 'date64', 'timestamp[us]',
'time32[s]', 'time64[us]',
'time32_from64[s]',
'timestamp[ns]'])
_check_roundtrip(table, expected=expected, version='2.6')
t0 = pa.timestamp('ms')
data0 = np.arange(4, dtype='int64')
a0 = pa.array(data0, type=t0)
t1 = pa.timestamp('us')
data1 = np.arange(4, dtype='int64')
a1 = pa.array(data1, type=t1)
t2 = pa.timestamp('ns')
data2 = np.arange(4, dtype='int64')
a2 = pa.array(data2, type=t2)
table = pa.Table.from_arrays([a0, a1, a2],
['ts[ms]', 'ts[us]', 'ts[ns]'])
expected = pa.Table.from_arrays([a0, a1, a2],
['ts[ms]', 'ts[us]', 'ts[ns]'])
# int64 for all timestamps supported by default
filename = tempdir / 'int64_timestamps.parquet'
_write_table(table, filename, version='2.6')
parquet_schema = pq.ParquetFile(filename).schema
for i in range(3):
assert parquet_schema.column(i).physical_type == 'INT64'
read_table = _read_table(filename)
assert read_table.equals(expected)
t0_ns = pa.timestamp('ns')
data0_ns = np.array(data0 * 1000000, dtype='int64')
a0_ns = pa.array(data0_ns, type=t0_ns)
t1_ns = pa.timestamp('ns')
data1_ns = np.array(data1 * 1000, dtype='int64')
a1_ns = pa.array(data1_ns, type=t1_ns)
expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
['ts[ms]', 'ts[us]', 'ts[ns]'])
# int96 nanosecond timestamps produced upon request
filename = tempdir / 'explicit_int96_timestamps.parquet'
_write_table(table, filename, version='2.6',
use_deprecated_int96_timestamps=True)
parquet_schema = pq.ParquetFile(filename).schema
for i in range(3):
assert parquet_schema.column(i).physical_type == 'INT96'
read_table = _read_table(filename)
assert read_table.equals(expected)
# int96 nanosecond timestamps implied by flavor 'spark'
filename = tempdir / 'spark_int96_timestamps.parquet'
_write_table(table, filename, version='2.6',
flavor='spark')
parquet_schema = pq.ParquetFile(filename).schema
for i in range(3):
assert parquet_schema.column(i).physical_type == 'INT96'
read_table = _read_table(filename)
assert read_table.equals(expected)
@pytest.mark.pandas
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_coerce_int96_timestamp_unit(unit):
i_s = pd.Timestamp('2010-01-01').value / 1000000000 # := 1262304000
d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
d_ms = d_s * 1000
d_us = d_ms * 1000
d_ns = d_us * 1000
a_s = pa.array(d_s, type=pa.timestamp('s'))
a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
a_us = pa.array(d_us, type=pa.timestamp('us'))
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
arrays = {"s": a_s, "ms": a_ms, "us": a_us, "ns": a_ns}
names = ['ts_s', 'ts_ms', 'ts_us', 'ts_ns']
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
    # INT96 stores the values at full (nanosecond) resolution, so on read
    # they can be coerced to whichever unit is requested
expected = pa.Table.from_arrays([arrays.get(unit)]*4, names)
read_table_kwargs = {"coerce_int96_timestamp_unit": unit}
_check_roundtrip(table, expected,
read_table_kwargs=read_table_kwargs,
use_deprecated_int96_timestamps=True)
_check_roundtrip(table, expected, version='2.6',
read_table_kwargs=read_table_kwargs,
use_deprecated_int96_timestamps=True)
@pytest.mark.pandas
@pytest.mark.parametrize('pq_reader_method', ['ParquetFile', 'read_table'])
def test_coerce_int96_timestamp_overflow(pq_reader_method, tempdir):
def get_table(pq_reader_method, filename, **kwargs):
if pq_reader_method == "ParquetFile":
return pq.ParquetFile(filename, **kwargs).read()
elif pq_reader_method == "read_table":
return pq.read_table(filename, **kwargs)
# Recreating the initial JIRA issue referenced in ARROW-12096
oob_dts = [
datetime.datetime(1000, 1, 1),
datetime.datetime(2000, 1, 1),
datetime.datetime(3000, 1, 1)
]
df = pd.DataFrame({"a": oob_dts})
table = pa.table(df)
filename = tempdir / "test_round_trip_overflow.parquet"
pq.write_table(table, filename, use_deprecated_int96_timestamps=True,
version="1.0")
# with the default resolution of ns, we get wrong values for INT96
# that are out of bounds for nanosecond range
tab_error = get_table(pq_reader_method, filename)
assert tab_error["a"].to_pylist() != oob_dts
# avoid this overflow by specifying the resolution to use for INT96 values
tab_correct = get_table(
pq_reader_method, filename, coerce_int96_timestamp_unit="s"
)
df_correct = tab_correct.to_pandas(timestamp_as_object=True)
tm.assert_frame_equal(df, df_correct)
def test_timestamp_restore_timezone():
# ARROW-5888, restore timezone from serialized metadata
ty = pa.timestamp('ms', tz='America/New_York')
arr = pa.array([1, 2, 3], type=ty)
t = pa.table([arr], names=['f0'])
_check_roundtrip(t)
def test_timestamp_restore_timezone_nanosecond():
# ARROW-9634, also restore timezone for nanosecond data that get stored
# as microseconds in the parquet file
ty = pa.timestamp('ns', tz='America/New_York')
arr = pa.array([1000, 2000, 3000], type=ty)
table = pa.table([arr], names=['f0'])
ty_us = pa.timestamp('us', tz='America/New_York')
expected = pa.table([arr.cast(ty_us)], names=['f0'])
_check_roundtrip(table, expected=expected)
@pytest.mark.pandas
def test_list_of_datetime_time_roundtrip():
# ARROW-4135
times = pd.to_datetime(['09:00', '09:30', '10:00', '10:30', '11:00',
'11:30', '12:00'])
df = pd.DataFrame({'time': [times.time]})
_roundtrip_pandas_dataframe(df, write_kwargs={})
@pytest.mark.pandas
def test_parquet_version_timestamp_differences():
i_s = pd.Timestamp('2010-01-01').value / 1000000000 # := 1262304000
d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
d_ms = d_s * 1000
d_us = d_ms * 1000
d_ns = d_us * 1000
a_s = pa.array(d_s, type=pa.timestamp('s'))
a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
a_us = pa.array(d_us, type=pa.timestamp('us'))
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
# Using Parquet version 1.0, seconds should be coerced to milliseconds
# and nanoseconds should be coerced to microseconds by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
_check_roundtrip(table, expected)
    # Using Parquet version 2.6, seconds should be coerced to milliseconds
    # and nanoseconds should be retained by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
_check_roundtrip(table, expected, version='2.6')
# Using Parquet version 1.0, coercing to milliseconds or microseconds
# is allowed
expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
_check_roundtrip(table, expected, coerce_timestamps='ms')
    # Using Parquet version 2.6, coercing to milliseconds or microseconds
    # is allowed
expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
_check_roundtrip(table, expected, version='2.6', coerce_timestamps='us')
# TODO: after pyarrow allows coerce_timestamps='ns', tests like the
# following should pass ...
# Using Parquet version 1.0, coercing to nanoseconds is not allowed
# expected = None
# with pytest.raises(NotImplementedError):
# _roundtrip_table(table, coerce_timestamps='ns')
# Using Parquet version 2.0, coercing to nanoseconds is allowed
# expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
# _check_roundtrip(table, expected, version='2.6', coerce_timestamps='ns')
# For either Parquet version, coercing to nanoseconds is allowed
# if Int96 storage is used
expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
_check_roundtrip(table, expected,
use_deprecated_int96_timestamps=True)
_check_roundtrip(table, expected, version='2.6',
use_deprecated_int96_timestamps=True)
@pytest.mark.pandas
def test_noncoerced_nanoseconds_written_without_exception(tempdir):
    # ARROW-1957: the Parquet version 2.6 writer preserves Arrow
    # nanosecond timestamps by default
n = 9
df = pd.DataFrame({'x': range(n)},
index=pd.date_range('2017-01-01', freq='1n', periods=n))
tb = pa.Table.from_pandas(df)
filename = tempdir / 'written.parquet'
try:
pq.write_table(tb, filename, version='2.6')
except Exception:
pass
assert filename.exists()
recovered_table = pq.read_table(filename)
assert tb.equals(recovered_table)
# Loss of data through coercion (without explicit override) still an error
filename = tempdir / 'not_written.parquet'
with pytest.raises(ValueError):
pq.write_table(tb, filename, coerce_timestamps='ms', version='2.6')
def test_duration_type():
# ARROW-6780
arrays = [pa.array([0, 1, 2, 3], type=pa.duration(unit))
for unit in ["s", "ms", "us", "ns"]]
table = pa.Table.from_arrays(arrays, ["d[s]", "d[ms]", "d[us]", "d[ns]"])
_check_roundtrip(table)
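# Illustrative sketch (not collected by pytest): a minimal coerce_timestamps
# round trip through an in-memory buffer, using only APIs exercised above.
# The function name and the column name 'ts' are arbitrary.
def _example_coerce_timestamps_roundtrip():
    arr = pa.array([0, 1000, 2000], type=pa.timestamp('ns'))
    table = pa.Table.from_arrays([arr], ['ts'])
    sink = pa.BufferOutputStream()
    # Nanosecond values that are exact multiples of 1 us coerce losslessly
    pq.write_table(table, sink, coerce_timestamps='us')
    result = pq.read_table(pa.BufferReader(sink.getvalue()))
    assert result['ts'].type == pa.timestamp('us')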

View File

@ -0,0 +1,530 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from datetime import timedelta
import pyarrow as pa
try:
import pyarrow.parquet as pq
import pyarrow.parquet.encryption as pe
except ImportError:
pq = None
pe = None
else:
from pyarrow.tests.parquet.encryption import (
InMemoryKmsClient, verify_file_encrypted)
PARQUET_NAME = 'encrypted_table.in_mem.parquet'
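# 16-byte (128-bit) master keys for the in-memory KMS used throughout this module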
FOOTER_KEY = b"0123456789112345"
FOOTER_KEY_NAME = "footer_key"
COL_KEY = b"1234567890123450"
COL_KEY_NAME = "col_key"
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet_encryption'
pytestmark = pytest.mark.parquet_encryption
@pytest.fixture(scope='module')
def data_table():
data_table = pa.Table.from_pydict({
'a': pa.array([1, 2, 3]),
'b': pa.array(['a', 'b', 'c']),
'c': pa.array(['x', 'y', 'z'])
})
return data_table
@pytest.fixture(scope='module')
def basic_encryption_config():
basic_encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={
COL_KEY_NAME: ["a", "b"],
})
return basic_encryption_config
def test_encrypted_parquet_write_read(tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted, and then read it."""
path = tempdir / PARQUET_NAME
# Encrypt the footer with the footer key,
# encrypt column `a` and column `b` with another key,
# keep `c` plaintext
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={
COL_KEY_NAME: ["a", "b"],
},
encryption_algorithm="AES_GCM_V1",
cache_lifetime=timedelta(minutes=5.0),
data_key_length_bits=256)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
verify_file_encrypted(path)
# Read with decryption properties
decryption_config = pe.DecryptionConfiguration(
cache_lifetime=timedelta(minutes=5.0))
result_table = read_encrypted_parquet(
path, decryption_config, kms_connection_config, crypto_factory)
assert data_table.equals(result_table)
def write_encrypted_parquet(path, table, encryption_config,
kms_connection_config, crypto_factory):
file_encryption_properties = crypto_factory.file_encryption_properties(
kms_connection_config, encryption_config)
assert(file_encryption_properties is not None)
with pq.ParquetWriter(
path, table.schema,
encryption_properties=file_encryption_properties) as writer:
writer.write_table(table)
def read_encrypted_parquet(path, decryption_config,
kms_connection_config, crypto_factory):
file_decryption_properties = crypto_factory.file_decryption_properties(
kms_connection_config, decryption_config)
assert(file_decryption_properties is not None)
meta = pq.read_metadata(
path, decryption_properties=file_decryption_properties)
assert(meta.num_columns == 3)
schema = pq.read_schema(
path, decryption_properties=file_decryption_properties)
assert(len(schema.names) == 3)
result = pq.ParquetFile(
path, decryption_properties=file_decryption_properties)
return result.read(use_threads=False)
def test_encrypted_parquet_write_read_wrong_key(tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted,
and then read it using wrong keys."""
path = tempdir / PARQUET_NAME
# Encrypt the footer with the footer key,
# encrypt column `a` and column `b` with another key,
# keep `c` plaintext
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={
COL_KEY_NAME: ["a", "b"],
},
encryption_algorithm="AES_GCM_V1",
cache_lifetime=timedelta(minutes=5.0),
data_key_length_bits=256)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
verify_file_encrypted(path)
# Read with decryption properties
wrong_kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
# Wrong keys - mixup in names
FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"),
COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
}
)
decryption_config = pe.DecryptionConfiguration(
cache_lifetime=timedelta(minutes=5.0))
with pytest.raises(ValueError, match=r"Incorrect master key used"):
read_encrypted_parquet(
path, decryption_config, wrong_kms_connection_config,
crypto_factory)
def test_encrypted_parquet_read_no_decryption_config(tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted,
but then try to read it without decryption properties."""
test_encrypted_parquet_write_read(tempdir, data_table)
# Read without decryption properties
with pytest.raises(IOError, match=r"no decryption"):
pq.ParquetFile(tempdir / PARQUET_NAME).read()
def test_encrypted_parquet_read_metadata_no_decryption_config(
tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted,
but then try to read its metadata without decryption properties."""
test_encrypted_parquet_write_read(tempdir, data_table)
# Read metadata without decryption properties
with pytest.raises(IOError, match=r"no decryption"):
pq.read_metadata(tempdir / PARQUET_NAME)
def test_encrypted_parquet_read_schema_no_decryption_config(
tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted,
but then try to read its schema without decryption properties."""
test_encrypted_parquet_write_read(tempdir, data_table)
with pytest.raises(IOError, match=r"no decryption"):
pq.read_schema(tempdir / PARQUET_NAME)
def test_encrypted_parquet_write_no_col_key(tempdir, data_table):
"""Write an encrypted parquet, but give only footer key,
without column key."""
path = tempdir / 'encrypted_table_no_col_key.in_mem.parquet'
# Encrypt the footer with the footer key
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(OSError,
match="Either column_keys or uniform_encryption "
"must be set"):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_write_kms_error(tempdir, data_table,
basic_encryption_config):
"""Write an encrypted parquet, but raise KeyError in KmsClient."""
path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'
encryption_config = basic_encryption_config
# Empty master_keys_map
kms_connection_config = pe.KmsConnectionConfig()
def kms_factory(kms_connection_configuration):
# Empty master keys map will cause KeyError to be raised
# on wrap/unwrap calls
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(KeyError, match="footer_key"):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_write_kms_specific_error(tempdir, data_table,
basic_encryption_config):
"""Write an encrypted parquet, but raise KeyError in KmsClient."""
path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'
encryption_config = basic_encryption_config
# Empty master_keys_map
kms_connection_config = pe.KmsConnectionConfig()
class ThrowingKmsClient(pe.KmsClient):
"""A KmsClient implementation that throws exception in
wrap/unwrap calls
"""
def __init__(self, config):
"""Create an InMemoryKmsClient instance."""
pe.KmsClient.__init__(self)
self.config = config
def wrap_key(self, key_bytes, master_key_identifier):
raise ValueError("Cannot Wrap Key")
def unwrap_key(self, wrapped_key, master_key_identifier):
raise ValueError("Cannot Unwrap Key")
def kms_factory(kms_connection_configuration):
# Exception thrown in wrap/unwrap calls
return ThrowingKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(ValueError, match="Cannot Wrap Key"):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_write_kms_factory_error(tempdir, data_table,
basic_encryption_config):
"""Write an encrypted parquet, but raise ValueError in kms_factory."""
path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'
encryption_config = basic_encryption_config
# Empty master_keys_map
kms_connection_config = pe.KmsConnectionConfig()
def kms_factory(kms_connection_configuration):
raise ValueError('Cannot create KmsClient')
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(ValueError,
match="Cannot create KmsClient"):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_write_kms_factory_type_error(
tempdir, data_table, basic_encryption_config):
"""Write an encrypted parquet, but use wrong KMS client type
that doesn't implement KmsClient."""
path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'
encryption_config = basic_encryption_config
# Empty master_keys_map
kms_connection_config = pe.KmsConnectionConfig()
class WrongTypeKmsClient():
"""This is not an implementation of KmsClient.
"""
def __init__(self, config):
self.master_keys_map = config.custom_kms_conf
def wrap_key(self, key_bytes, master_key_identifier):
return None
def unwrap_key(self, wrapped_key, master_key_identifier):
return None
def kms_factory(kms_connection_configuration):
return WrongTypeKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(TypeError):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_encryption_configuration():
def validate_encryption_configuration(encryption_config):
assert(FOOTER_KEY_NAME == encryption_config.footer_key)
assert(["a", "b"] == encryption_config.column_keys[COL_KEY_NAME])
assert("AES_GCM_CTR_V1" == encryption_config.encryption_algorithm)
assert(encryption_config.plaintext_footer)
assert(not encryption_config.double_wrapping)
assert(timedelta(minutes=10.0) == encryption_config.cache_lifetime)
assert(not encryption_config.internal_key_material)
assert(192 == encryption_config.data_key_length_bits)
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={COL_KEY_NAME: ["a", "b"], },
encryption_algorithm="AES_GCM_CTR_V1",
plaintext_footer=True,
double_wrapping=False,
cache_lifetime=timedelta(minutes=10.0),
internal_key_material=False,
data_key_length_bits=192,
)
validate_encryption_configuration(encryption_config)
encryption_config_1 = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME)
encryption_config_1.column_keys = {COL_KEY_NAME: ["a", "b"], }
encryption_config_1.encryption_algorithm = "AES_GCM_CTR_V1"
encryption_config_1.plaintext_footer = True
encryption_config_1.double_wrapping = False
encryption_config_1.cache_lifetime = timedelta(minutes=10.0)
encryption_config_1.internal_key_material = False
encryption_config_1.data_key_length_bits = 192
validate_encryption_configuration(encryption_config_1)
def test_encrypted_parquet_decryption_configuration():
decryption_config = pe.DecryptionConfiguration(
cache_lifetime=timedelta(minutes=10.0))
assert(timedelta(minutes=10.0) == decryption_config.cache_lifetime)
decryption_config_1 = pe.DecryptionConfiguration()
decryption_config_1.cache_lifetime = timedelta(minutes=10.0)
assert(timedelta(minutes=10.0) == decryption_config_1.cache_lifetime)
def test_encrypted_parquet_kms_configuration():
def validate_kms_connection_config(kms_connection_config):
assert("Instance1" == kms_connection_config.kms_instance_id)
assert("URL1" == kms_connection_config.kms_instance_url)
assert("MyToken" == kms_connection_config.key_access_token)
assert({"key1": "key_material_1", "key2": "key_material_2"} ==
kms_connection_config.custom_kms_conf)
kms_connection_config = pe.KmsConnectionConfig(
kms_instance_id="Instance1",
kms_instance_url="URL1",
key_access_token="MyToken",
custom_kms_conf={
"key1": "key_material_1",
"key2": "key_material_2",
})
validate_kms_connection_config(kms_connection_config)
kms_connection_config_1 = pe.KmsConnectionConfig()
kms_connection_config_1.kms_instance_id = "Instance1"
kms_connection_config_1.kms_instance_url = "URL1"
kms_connection_config_1.key_access_token = "MyToken"
kms_connection_config_1.custom_kms_conf = {
"key1": "key_material_1",
"key2": "key_material_2",
}
validate_kms_connection_config(kms_connection_config_1)
@pytest.mark.xfail(reason="Plaintext footer - reading plaintext column subset"
" reads encrypted columns too")
def test_encrypted_parquet_write_read_plain_footer_single_wrapping(
tempdir, data_table):
"""Write an encrypted parquet, with plaintext footer
and with single wrapping,
verify it's encrypted, and then read plaintext columns."""
path = tempdir / PARQUET_NAME
# Encrypt the footer with the footer key,
# encrypt column `a` and column `b` with another key,
# keep `c` plaintext
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={
COL_KEY_NAME: ["a", "b"],
},
plaintext_footer=True,
double_wrapping=False)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
# # Read without decryption properties only the plaintext column
# result = pq.ParquetFile(path)
# result_table = result.read(columns='c', use_threads=False)
# assert table.num_rows == result_table.num_rows
@pytest.mark.xfail(reason="External key material not supported yet")
def test_encrypted_parquet_write_external(tempdir, data_table):
"""Write an encrypted parquet, with external key
material.
Currently it's not implemented, so should throw
an exception"""
path = tempdir / PARQUET_NAME
# Encrypt the file with the footer key
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={},
internal_key_material=False)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8")}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
@pytest.mark.skip(reason="ARROW-14114: Multithreaded read sometimes fails "
                         "decryption finalization or with a segmentation fault")
def test_encrypted_parquet_loop(tempdir, data_table, basic_encryption_config):
"""Write an encrypted parquet, verify it's encrypted,
and then read it multithreaded in a loop."""
path = tempdir / PARQUET_NAME
# Encrypt the footer with the footer key,
# encrypt column `a` and column `b` with another key,
# keep `c` plaintext
encryption_config = basic_encryption_config
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
verify_file_encrypted(path)
decryption_config = pe.DecryptionConfiguration(
cache_lifetime=timedelta(minutes=5.0))
for i in range(50):
# Read with decryption properties
file_decryption_properties = crypto_factory.file_decryption_properties(
kms_connection_config, decryption_config)
assert(file_decryption_properties is not None)
result = pq.ParquetFile(
path, decryption_properties=file_decryption_properties)
result_table = result.read(use_threads=True)
assert data_table.equals(result_table)

View File

@ -0,0 +1,528 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import datetime
import decimal
from collections import OrderedDict
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.tests.parquet.common import _check_roundtrip, make_sample_file
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
pd = tm = None
@pytest.mark.pandas
def test_parquet_metadata_api():
df = alltypes_sample(size=10000)
df = df.reindex(columns=sorted(df.columns))
df.index = np.random.randint(0, 1000000, size=len(df))
fileh = make_sample_file(df)
ncols = len(df.columns)
# Series of sniff tests
meta = fileh.metadata
repr(meta)
assert meta.num_rows == len(df)
assert meta.num_columns == ncols + 1 # +1 for index
assert meta.num_row_groups == 1
assert meta.format_version == '2.6'
assert 'parquet-cpp' in meta.created_by
assert isinstance(meta.serialized_size, int)
assert isinstance(meta.metadata, dict)
# Schema
schema = fileh.schema
assert meta.schema is schema
assert len(schema) == ncols + 1 # +1 for index
repr(schema)
col = schema[0]
repr(col)
assert col.name == df.columns[0]
assert col.max_definition_level == 1
    assert col.max_repetition_level == 0
assert col.physical_type == 'BOOLEAN'
assert col.converted_type == 'NONE'
with pytest.raises(IndexError):
schema[ncols + 1] # +1 for index
with pytest.raises(IndexError):
schema[-1]
# Row group
for rg in range(meta.num_row_groups):
rg_meta = meta.row_group(rg)
assert isinstance(rg_meta, pq.RowGroupMetaData)
repr(rg_meta)
for col in range(rg_meta.num_columns):
col_meta = rg_meta.column(col)
assert isinstance(col_meta, pq.ColumnChunkMetaData)
repr(col_meta)
with pytest.raises(IndexError):
meta.row_group(-1)
with pytest.raises(IndexError):
meta.row_group(meta.num_row_groups + 1)
rg_meta = meta.row_group(0)
assert rg_meta.num_rows == len(df)
assert rg_meta.num_columns == ncols + 1 # +1 for index
assert rg_meta.total_byte_size > 0
with pytest.raises(IndexError):
col_meta = rg_meta.column(-1)
with pytest.raises(IndexError):
col_meta = rg_meta.column(ncols + 2)
col_meta = rg_meta.column(0)
assert col_meta.file_offset > 0
assert col_meta.file_path == '' # created from BytesIO
assert col_meta.physical_type == 'BOOLEAN'
assert col_meta.num_values == 10000
assert col_meta.path_in_schema == 'bool'
assert col_meta.is_stats_set is True
assert isinstance(col_meta.statistics, pq.Statistics)
assert col_meta.compression == 'SNAPPY'
assert col_meta.encodings == ('PLAIN', 'RLE')
assert col_meta.has_dictionary_page is False
assert col_meta.dictionary_page_offset is None
assert col_meta.data_page_offset > 0
assert col_meta.total_compressed_size > 0
assert col_meta.total_uncompressed_size > 0
with pytest.raises(NotImplementedError):
col_meta.has_index_page
with pytest.raises(NotImplementedError):
col_meta.index_page_offset
def test_parquet_metadata_lifetime(tempdir):
# ARROW-6642 - ensure that chained access keeps parent objects alive
table = pa.table({'a': [1, 2, 3]})
pq.write_table(table, tempdir / 'test_metadata_segfault.parquet')
parquet_file = pq.ParquetFile(tempdir / 'test_metadata_segfault.parquet')
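    # Chained access through temporary metadata objects must not crash once
    # the intermediate FileMetaData/RowGroupMetaData objects go out of scope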
parquet_file.metadata.row_group(0).column(0).statistics
@pytest.mark.pandas
@pytest.mark.parametrize(
(
'data',
'type',
'physical_type',
'min_value',
'max_value',
'null_count',
'num_values',
'distinct_count'
),
[
([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
(
[-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
'FLOAT', -1.1, 4.4, 1, 4, 0
),
(
[-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
'DOUBLE', -1.1, 4.4, 1, 4, 0
),
(
['', 'b', chr(1000), None, 'aaa'], pa.binary(),
'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, 0
),
(
[True, False, False, True, True], pa.bool_(),
'BOOLEAN', False, True, 0, 5, 0
),
(
[b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
'BYTE_ARRAY', b'\x00', b'b', 1, 4, 0
),
]
)
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
max_value, null_count, num_values,
distinct_count):
df = pd.DataFrame({'data': data})
schema = pa.schema([pa.field('data', type)])
table = pa.Table.from_pandas(df, schema=schema, safe=False)
fileh = make_sample_file(table)
meta = fileh.metadata
rg_meta = meta.row_group(0)
col_meta = rg_meta.column(0)
stat = col_meta.statistics
assert stat.has_min_max
assert _close(type, stat.min, min_value)
assert _close(type, stat.max, max_value)
assert stat.null_count == null_count
assert stat.num_values == num_values
    # TODO(kszucs): until the parquet-cpp API exposes the HasDistinctCount
    # method, a missing distinct_count is reported as zero instead of None
assert stat.distinct_count == distinct_count
assert stat.physical_type == physical_type
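# Compare statistics values exactly, except floats, which allow a small tolerance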
def _close(type, left, right):
if type == pa.float32():
return abs(left - right) < 1E-7
elif type == pa.float64():
return abs(left - right) < 1E-13
else:
return left == right
# ARROW-6339
@pytest.mark.pandas
def test_parquet_raise_on_unset_statistics():
df = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")})
meta = make_sample_file(pa.Table.from_pandas(df)).metadata
assert not meta.row_group(0).column(0).statistics.has_min_max
assert meta.row_group(0).column(0).statistics.max is None
def test_statistics_convert_logical_types(tempdir):
# ARROW-5166, ARROW-4139
# (min, max, type)
cases = [(10, 11164359321221007157, pa.uint64()),
(10, 4294967295, pa.uint32()),
("ähnlich", "öffentlich", pa.utf8()),
(datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
pa.time32('ms')),
(datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
pa.time64('us')),
(datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
pa.timestamp('ms')),
(datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
pa.timestamp('us')),
(datetime.date(2019, 6, 24),
datetime.date(2019, 6, 25),
pa.date32()),
(decimal.Decimal("20.123"),
decimal.Decimal("20.124"),
pa.decimal128(12, 5))]
for i, (min_val, max_val, typ) in enumerate(cases):
t = pa.Table.from_arrays([pa.array([min_val, max_val], type=typ)],
['col'])
path = str(tempdir / ('example{}.parquet'.format(i)))
pq.write_table(t, path, version='2.6')
pf = pq.ParquetFile(path)
stats = pf.metadata.row_group(0).column(0).statistics
assert stats.min == min_val
assert stats.max == max_val
def test_parquet_write_disable_statistics(tempdir):
table = pa.Table.from_pydict(
OrderedDict([
('a', pa.array([1, 2, 3])),
('b', pa.array(['a', 'b', 'c']))
])
)
_write_table(table, tempdir / 'data.parquet')
meta = pq.read_metadata(tempdir / 'data.parquet')
for col in [0, 1]:
cc = meta.row_group(0).column(col)
assert cc.is_stats_set is True
assert cc.statistics is not None
_write_table(table, tempdir / 'data2.parquet', write_statistics=False)
meta = pq.read_metadata(tempdir / 'data2.parquet')
for col in [0, 1]:
cc = meta.row_group(0).column(col)
assert cc.is_stats_set is False
assert cc.statistics is None
_write_table(table, tempdir / 'data3.parquet', write_statistics=['a'])
meta = pq.read_metadata(tempdir / 'data3.parquet')
cc_a = meta.row_group(0).column(0)
cc_b = meta.row_group(0).column(1)
assert cc_a.is_stats_set is True
assert cc_b.is_stats_set is False
assert cc_a.statistics is not None
assert cc_b.statistics is None
def test_field_id_metadata():
# ARROW-7080
field_id = b'PARQUET:field_id'
inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
middle = pa.field('middle', pa.struct(
[inner]), metadata={field_id: b'101'})
fields = [
pa.field('basic', pa.int32(), metadata={
b'other': b'abc', field_id: b'1'}),
pa.field(
'list',
pa.list_(pa.field('list-inner', pa.int32(),
metadata={field_id: b'10'})),
metadata={field_id: b'11'}),
pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
pa.field('no-metadata', pa.int32()),
pa.field('non-integral-field-id', pa.int32(),
metadata={field_id: b'xyz'}),
pa.field('negative-field-id', pa.int32(),
metadata={field_id: b'-1000'})
]
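    # Empty arrays: only the schema (and its field_id metadata) matters here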
arrs = [[] for _ in fields]
table = pa.table(arrs, schema=pa.schema(fields))
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
contents = bio.getvalue()
pf = pq.ParquetFile(pa.BufferReader(contents))
schema = pf.schema_arrow
assert schema[0].metadata[field_id] == b'1'
assert schema[0].metadata[b'other'] == b'abc'
list_field = schema[1]
assert list_field.metadata[field_id] == b'11'
list_item_field = list_field.type.value_field
assert list_item_field.metadata[field_id] == b'10'
struct_field = schema[2]
assert struct_field.metadata[field_id] == b'102'
struct_middle_field = struct_field.type[0]
assert struct_middle_field.metadata[field_id] == b'101'
struct_inner_field = struct_middle_field.type[0]
assert struct_inner_field.metadata[field_id] == b'100'
assert schema[3].metadata is None
# Invalid input is passed through (ok) but does not
# have field_id in parquet (not tested)
assert schema[4].metadata[field_id] == b'xyz'
assert schema[5].metadata[field_id] == b'-1000'
@pytest.mark.pandas
def test_multi_dataset_metadata(tempdir):
filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
metapath = str(tempdir / "_metadata")
# create a test dataset
df = pd.DataFrame({
'one': [1, 2, 3],
'two': [-1, -2, -3],
'three': [[1, 2], [2, 3], [3, 4]],
})
table = pa.Table.from_pandas(df)
# write dataset twice and collect/merge metadata
_meta = None
for filename in filenames:
meta = []
pq.write_table(table, str(tempdir / filename),
metadata_collector=meta)
meta[0].set_file_path(filename)
if _meta is None:
_meta = meta[0]
else:
_meta.append_row_groups(meta[0])
# Write merged metadata-only file
with open(metapath, "wb") as f:
_meta.write_metadata_file(f)
# Read back the metadata
meta = pq.read_metadata(metapath)
md = meta.to_dict()
_md = _meta.to_dict()
for key in _md:
if key != 'serialized_size':
assert _md[key] == md[key]
assert _md['num_columns'] == 3
assert _md['num_rows'] == 6
assert _md['num_row_groups'] == 2
assert _md['serialized_size'] == 0
assert md['serialized_size'] > 0
def test_write_metadata(tempdir):
path = str(tempdir / "metadata")
schema = pa.schema([("a", "int64"), ("b", "float64")])
# write a pyarrow schema
pq.write_metadata(schema, path)
parquet_meta = pq.read_metadata(path)
schema_as_arrow = parquet_meta.schema.to_arrow_schema()
assert schema_as_arrow.equals(schema)
# ARROW-8980: Check that the ARROW:schema metadata key was removed
if schema_as_arrow.metadata:
assert b'ARROW:schema' not in schema_as_arrow.metadata
# pass through writer keyword arguments
for version in ["1.0", "2.0", "2.4", "2.6"]:
pq.write_metadata(schema, path, version=version)
parquet_meta = pq.read_metadata(path)
# The version is stored as a single integer in the Parquet metadata,
# so it cannot correctly express dotted format versions
expected_version = "1.0" if version == "1.0" else "2.6"
assert parquet_meta.format_version == expected_version
# metadata_collector: list of FileMetaData objects
table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
pq.write_table(table, tempdir / "data.parquet")
parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
pq.write_metadata(
schema, path, metadata_collector=[parquet_meta, parquet_meta]
)
parquet_meta_mult = pq.read_metadata(path)
assert parquet_meta_mult.num_row_groups == 2
# append metadata with different schema raises an error
with pytest.raises(RuntimeError, match="requires equal schemas"):
pq.write_metadata(
pa.schema([("a", "int32"), ("b", "null")]),
path, metadata_collector=[parquet_meta, parquet_meta]
)
def test_table_large_metadata():
# ARROW-8694
my_schema = pa.schema([pa.field('f0', 'double')],
metadata={'large': 'x' * 10000000})
table = pa.table([np.arange(10)], schema=my_schema)
_check_roundtrip(table)
@pytest.mark.pandas
def test_compare_schemas():
df = alltypes_sample(size=10000)
fileh = make_sample_file(df)
fileh2 = make_sample_file(df)
fileh3 = make_sample_file(df[df.columns[::2]])
# ParquetSchema
assert isinstance(fileh.schema, pq.ParquetSchema)
assert fileh.schema.equals(fileh.schema)
assert fileh.schema == fileh.schema
assert fileh.schema.equals(fileh2.schema)
assert fileh.schema == fileh2.schema
assert fileh.schema != 'arbitrary object'
assert not fileh.schema.equals(fileh3.schema)
assert fileh.schema != fileh3.schema
# ColumnSchema
assert isinstance(fileh.schema[0], pq.ColumnSchema)
assert fileh.schema[0].equals(fileh.schema[0])
assert fileh.schema[0] == fileh.schema[0]
assert not fileh.schema[0].equals(fileh.schema[1])
assert fileh.schema[0] != fileh.schema[1]
assert fileh.schema[0] != 'arbitrary object'
@pytest.mark.pandas
def test_read_schema(tempdir):
N = 100
df = pd.DataFrame({
'index': np.arange(N),
'values': np.random.randn(N)
}, columns=['index', 'values'])
data_path = tempdir / 'test.parquet'
table = pa.Table.from_pandas(df)
_write_table(table, data_path)
read1 = pq.read_schema(data_path)
read2 = pq.read_schema(data_path, memory_map=True)
assert table.schema.equals(read1)
assert table.schema.equals(read2)
assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas']
def test_parquet_metadata_empty_to_dict(tempdir):
# https://issues.apache.org/jira/browse/ARROW-10146
table = pa.table({"a": pa.array([], type="int64")})
pq.write_table(table, tempdir / "data.parquet")
metadata = pq.read_metadata(tempdir / "data.parquet")
# ensure this doesn't error / statistics set to None
metadata_dict = metadata.to_dict()
assert len(metadata_dict["row_groups"]) == 1
assert len(metadata_dict["row_groups"][0]["columns"]) == 1
assert metadata_dict["row_groups"][0]["columns"][0]["statistics"] is None
@pytest.mark.slow
@pytest.mark.large_memory
def test_metadata_exceeds_message_size():
# ARROW-13655: Thrift may enable a default message size that limits
# the size of Parquet metadata that can be written.
NCOLS = 1000
NREPEATS = 4000
table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})
with pa.BufferOutputStream() as out:
pq.write_table(table, out)
buf = out.getvalue()
original_metadata = pq.read_metadata(pa.BufferReader(buf))
metadata = pq.read_metadata(pa.BufferReader(buf))
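    # Append the same row groups repeatedly to grow the metadata well past
    # the default Thrift message size limit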
for i in range(NREPEATS):
metadata.append_row_groups(original_metadata)
with pa.BufferOutputStream() as out:
metadata.write_metadata_file(out)
buf = out.getvalue()
metadata = pq.read_metadata(pa.BufferReader(buf))
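# Illustrative sketch (not collected by pytest): write a single data file plus
# a standalone _metadata sidecar via metadata_collector, mirroring
# test_multi_dataset_metadata above. (`tempdir` is assumed to be a
# pathlib.Path directory; the file names here are arbitrary.)
def _example_write_metadata_sidecar(tempdir):
    table = pa.table({'a': [1, 2, 3]})
    collector = []
    pq.write_table(table, tempdir / 'part-0.parquet',
                   metadata_collector=collector)
    # Record the relative path of the data file inside the collected metadata
    collector[0].set_file_path('part-0.parquet')
    pq.write_metadata(table.schema, tempdir / '_metadata',
                      metadata_collector=collector)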

View File

@ -0,0 +1,707 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import json
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.fs import LocalFileSystem, SubTreeFileSystem
from pyarrow.tests.parquet.common import (
parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
from pyarrow.util import guid
from pyarrow.vendored.version import Version
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
_write_table)
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
alltypes_sample)
except ImportError:
pd = tm = None
@pytest.mark.pandas
def test_pandas_parquet_custom_metadata(tempdir):
df = alltypes_sample(size=10000)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
assert b'pandas' in arrow_table.schema.metadata
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
metadata = pq.read_metadata(filename).metadata
assert b'pandas' in metadata
js = json.loads(metadata[b'pandas'].decode('utf8'))
assert js['index_columns'] == [{'kind': 'range',
'name': None,
'start': 0, 'stop': 10000,
'step': 1}]
@pytest.mark.pandas
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
# ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
schema = pa.schema([
pa.field('int', pa.int16()),
pa.field('float', pa.float32()),
pa.field('string', pa.string())
])
df1 = pd.DataFrame({
'int': np.arange(3, dtype=np.uint8),
'float': np.arange(3, dtype=np.float32),
'string': ['ABBA', 'EDDA', 'ACDC']
})
df2 = pd.DataFrame({
'int': [4, 5],
'float': [1.1, None],
'string': [None, None]
})
table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)
assert not table1.schema.equals(table2.schema, check_metadata=True)
assert table1.schema.equals(table2.schema)
writer = pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema)
writer.write_table(table1)
writer.write_table(table2)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset):
df = alltypes_sample(size=10)
df.columns = pd.MultiIndex.from_tuples(
list(zip(df.columns, df.columns[::-1])),
names=['level_1', 'level_2']
)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
assert arrow_table.schema.pandas_metadata is not None
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
tempdir, use_legacy_dataset
):
df = alltypes_sample(size=10000)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
js = arrow_table.schema.pandas_metadata
assert not js['index_columns']
# ARROW-2170
# While index_columns should be empty, columns needs to be filled still.
assert js['columns']
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
js = table_read.schema.pandas_metadata
assert not js['index_columns']
read_metadata = table_read.schema.metadata
assert arrow_table.schema.metadata == read_metadata
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
# TODO(dataset) duplicate column selection actually gives duplicate columns now
@pytest.mark.pandas
@parametrize_legacy_dataset_not_supported
def test_pandas_column_selection(tempdir, use_legacy_dataset):
size = 10000
np.random.seed(0)
df = pd.DataFrame({
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16)
})
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, filename)
table_read = _read_table(
filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df[['uint8']], df_read)
# ARROW-4267: Selection of duplicate columns still leads to these columns
# being read uniquely.
table_read = _read_table(
filename, columns=['uint8', 'uint8'],
use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df[['uint8']], df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset):
df = _test_dataframe(10000)
arrow_table = pa.Table.from_pandas(df)
imos = pa.BufferOutputStream()
_write_table(arrow_table, imos, version='2.6')
buf = imos.getvalue()
reader = pa.BufferReader(buf)
df_read = _read_table(
reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
df = _test_dataframe(10000)
arrow_table = pa.Table.from_pandas(df)
imos = pa.BufferOutputStream()
_write_table(arrow_table, imos, version='2.6')
buf = imos.getvalue()
reader = pa.BufferReader(buf)
df_read = pq.read_pandas(
reader, columns=['strings', 'uint8'],
use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
df = _test_dataframe(0)
arrow_table = pa.Table.from_pandas(df)
imos = pa.BufferOutputStream()
_write_table(arrow_table, imos, version='2.6')
buf = imos.getvalue()
reader = pa.BufferReader(buf)
df_read = _read_table(
reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
def test_pandas_can_write_nested_data(tempdir):
data = {
"agg_col": [
{"page_type": 1},
{"record_type": 1},
{"non_consecutive_home": 0},
],
"uid_first": "1001"
}
df = pd.DataFrame(data=data)
arrow_table = pa.Table.from_pandas(df)
imos = pa.BufferOutputStream()
# This succeeds under V2
_write_table(arrow_table, imos)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
filename = tempdir / 'pandas_pyfile_roundtrip.parquet'
size = 5
df = pd.DataFrame({
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
'strings': ['foo', 'bar', None, 'baz', 'qux']
})
arrow_table = pa.Table.from_pandas(df)
with filename.open('wb') as f:
_write_table(arrow_table, f, version="1.0")
data = io.BytesIO(filename.read_bytes())
table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
size = 10000
np.random.seed(0)
df = pd.DataFrame({
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16),
'uint32': np.arange(size, dtype=np.uint32),
'uint64': np.arange(size, dtype=np.uint64),
'int8': np.arange(size, dtype=np.int16),
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0
})
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
for use_dictionary in [True, False]:
_write_table(arrow_table, filename, version='2.6',
use_dictionary=use_dictionary)
table_read = _read_table(
filename, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
for write_statistics in [True, False]:
_write_table(arrow_table, filename, version='2.6',
write_statistics=write_statistics)
table_read = _read_table(filename,
use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
if (compression != 'NONE' and
not pa.lib.Codec.is_available(compression)):
continue
_write_table(arrow_table, filename, version='2.6',
compression=compression)
table_read = _read_table(
filename, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
def test_spark_flavor_preserves_pandas_metadata():
df = _test_dataframe(size=100)
df.index = np.arange(0, 10 * len(df), 10)
df.index.name = 'foo'
result = _roundtrip_pandas_dataframe(df, {'version': '2.0',
'flavor': 'spark'})
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_index_column_name_duplicate(tempdir, use_legacy_dataset):
data = {
'close': {
pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
},
'time': {
pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp(
'2017-06-30 01:31:00'
),
pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp(
'2017-06-30 01:32:00'
),
}
}
path = str(tempdir / 'data.parquet')
dfx = pd.DataFrame(data).set_index('time', drop=False)
tdfx = pa.Table.from_pandas(dfx)
_write_table(tdfx, path)
arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
result_df = arrow_table.to_pandas()
tm.assert_frame_equal(result_df, dfx)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multiindex_duplicate_values(tempdir, use_legacy_dataset):
num_rows = 3
numbers = list(range(num_rows))
index = pd.MultiIndex.from_arrays(
[['foo', 'foo', 'bar'], numbers],
names=['foobar', 'some_numbers'],
)
df = pd.DataFrame({'numbers': numbers}, index=index)
table = pa.Table.from_pandas(df)
filename = tempdir / 'dup_multi_index_levels.parquet'
_write_table(table, filename)
result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
assert table.equals(result_table)
result_df = result_table.to_pandas()
tm.assert_frame_equal(result_df, df)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_naming(datadir, use_legacy_dataset):
expected_string = b"""\
carat cut color clarity depth table price x y z
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}',
index_col=None, header=0, engine='python')
table = _read_table(
datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_named(
datadir, use_legacy_dataset
):
expected_string = b"""\
carat cut color clarity depth table price x y z
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
expected = pd.read_csv(
io.BytesIO(expected_string), sep=r'\s{2,}',
index_col=['cut', 'color', 'clarity'],
header=0, engine='python'
).sort_index()
table = _read_table(datadir / 'v0.7.1.all-named-index.parquet',
use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_some_named(
datadir, use_legacy_dataset
):
expected_string = b"""\
carat cut color clarity depth table price x y z
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
expected = pd.read_csv(
io.BytesIO(expected_string),
sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'],
header=0, engine='python'
).sort_index()
expected.index = expected.index.set_names(['cut', None, 'clarity'])
table = _read_table(datadir / 'v0.7.1.some-named-index.parquet',
use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_column_metadata_handling(
datadir, use_legacy_dataset
):
expected = pd.DataFrame(
{'a': [1, 2, 3], 'b': [.1, .2, .3],
'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
expected.index = pd.MultiIndex.from_arrays(
[['a', 'b', 'c'],
pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
names=['index', None])
path = datadir / 'v0.7.1.column-metadata-handling.parquet'
table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected)
table = _read_table(
path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_index_survives_roundtrip(use_legacy_dataset):
# ARROW-3652, addressed by ARROW-3246
df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2'])
df['c1'] = df['c1'].astype('category')
df = df.set_index(['c1'])
table = pa.Table.from_pandas(df)
bos = pa.BufferOutputStream()
pq.write_table(table, bos)
ref_df = pq.read_pandas(
bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
assert isinstance(ref_df.index, pd.CategoricalIndex)
assert ref_df.index.equals(df.index)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_order_survives_roundtrip(use_legacy_dataset):
# ARROW-6302
df = pd.DataFrame({"a": pd.Categorical(
["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)})
table = pa.Table.from_pandas(df)
bos = pa.BufferOutputStream()
pq.write_table(table, bos)
contents = bos.getvalue()
result = pq.read_pandas(
contents, use_legacy_dataset=use_legacy_dataset).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_na_type_row_groups(use_legacy_dataset):
# ARROW-5085
df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
df_category = df.astype({"col": "category", "int": "category"})
table = pa.Table.from_pandas(df)
table_cat = pa.Table.from_pandas(df_category)
buf = pa.BufferOutputStream()
    # Writing the all-null column as a categorical across multiple row groups
    # should succeed

pq.write_table(table_cat, buf, version='2.6', chunk_size=10)
result = pq.read_table(
buf.getvalue(), use_legacy_dataset=use_legacy_dataset)
# Result is non-categorical
assert result[0].equals(table[0])
assert result[1].equals(table[1])
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_roundtrip(use_legacy_dataset):
# ARROW-5480, this was enabled by ARROW-3246
# Have one of the categories unobserved and include a null (-1)
codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
categories = ['foo', 'bar', 'baz']
df = pd.DataFrame({'x': pd.Categorical.from_codes(
codes, categories=categories)})
buf = pa.BufferOutputStream()
pq.write_table(pa.table(df), buf)
result = pq.read_table(
buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
assert result.x.dtype == 'category'
assert (result.x.cat.categories == categories).all()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_extensiondtypes(
tempdir, use_legacy_dataset
):
# ARROW-8251 - preserve pandas extension dtypes in roundtrip
if Version(pd.__version__) < Version("1.0.0"):
pytest.skip("__arrow_array__ added to pandas in 1.0.0")
df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
df['col'] = df['col'].astype("Int64")
table = pa.table(df)
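    # The nullable Int64 extension dtype should survive all three write paths
    # exercised below: partitioned dataset, flat dataset and single file.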
pq.write_to_dataset(
table, str(tempdir / "case1"), partition_cols=['part'],
use_legacy_dataset=use_legacy_dataset
)
result = pq.read_table(
str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result[["col"]], df[["col"]])
pq.write_to_dataset(
table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
)
result = pq.read_table(
str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result[["col"]], df[["col"]])
pq.write_table(table, str(tempdir / "data.parquet"))
result = pq.read_table(
str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result[["col"]], df[["col"]])
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
# ARROW-8251 - preserve pandas index in roundtrip
df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
df.index = pd.Index(['a', 'b', 'c'], name="idx")
table = pa.table(df)
df_cat = df[["col", "part"]].copy()
df_cat["part"] = df_cat["part"].astype("category")
pq.write_to_dataset(
table, str(tempdir / "case1"), partition_cols=['part'],
use_legacy_dataset=use_legacy_dataset
)
result = pq.read_table(
str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result, df_cat)
pq.write_to_dataset(
table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
)
result = pq.read_table(
str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result, df)
pq.write_table(table, str(tempdir / "data.parquet"))
result = pq.read_table(
str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.parametrize('preserve_index', [True, False, None])
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
# ARROW-1103
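    # Each file is written with its pandas metadata stripped; a common
    # _metadata file is then written so that read_pandas() can still recover
    # the index information for the whole dataset.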
nfiles = 5
size = 5
dirpath = tempdir / guid()
dirpath.mkdir()
test_data = []
frames = []
paths = []
for i in range(nfiles):
df = _test_dataframe(size, seed=i)
df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')
path = dirpath / '{}.parquet'.format(i)
table = pa.Table.from_pandas(df, preserve_index=preserve_index)
# Obliterate metadata
table = table.replace_schema_metadata(None)
assert table.schema.metadata is None
_write_table(table, path)
test_data.append(table)
frames.append(df)
paths.append(path)
# Write _metadata common file
table_for_metadata = pa.Table.from_pandas(
df, preserve_index=preserve_index
)
pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')
dataset = pq.ParquetDataset(dirpath)
columns = ['uint8', 'strings']
result = dataset.read_pandas(columns=columns).to_pandas()
expected = pd.concat([x[columns] for x in frames])
expected.index.name = (
df.index.name if preserve_index is not False else None)
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
def test_read_pandas_passthrough_keywords(tempdir):
# ARROW-11464 - previously not all keywords were passed through (such as
# the filesystem keyword)
df = pd.DataFrame({'a': [1, 2, 3]})
filename = tempdir / 'data.parquet'
_write_table(df, filename)
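    # The relative path only resolves because the filesystem keyword is
    # honoured: the SubTreeFileSystem below is rooted at tempdir.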
result = pq.read_pandas(
'data.parquet',
filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem())
)
assert result.equals(pa.table(df))
@pytest.mark.pandas
def test_read_pandas_map_fields(tempdir):
# ARROW-10140 - table created from Pandas with mapping fields
df = pd.DataFrame({
'col1': pd.Series([
[('id', 'something'), ('value2', 'else')],
[('id', 'something2'), ('value', 'else2')],
]),
'col2': pd.Series(['foo', 'bar'])
})
filename = tempdir / 'data.parquet'
udt = pa.map_(pa.string(), pa.string())
schema = pa.schema([pa.field('col1', udt), pa.field('col2', pa.string())])
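    # The explicit schema stores col1 as a Parquet MAP; reading it back with
    # read_pandas() should return the same list-of-tuples representation.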
arrow_table = pa.Table.from_pandas(df, schema)
_write_table(arrow_table, filename)
result = pq.read_pandas(filename).to_pandas()
tm.assert_frame_equal(result, df)

View File

@ -0,0 +1,274 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import os
import pytest
import pyarrow as pa
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
pd = tm = None
@pytest.mark.pandas
def test_pass_separate_metadata():
# ARROW-471
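    # Reading through a ParquetFile constructed with separately loaded
    # FileMetaData should still yield the original frame.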
df = alltypes_sample(size=10000)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, compression='snappy', version='2.6')
buf.seek(0)
metadata = pq.read_metadata(buf)
buf.seek(0)
fileh = pq.ParquetFile(buf, metadata=metadata)
tm.assert_frame_equal(df, fileh.read().to_pandas())
@pytest.mark.pandas
def test_read_single_row_group():
# ARROW-471
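    # The frame is written as K row groups; reading each group separately and
    # concatenating the pieces should reassemble the original data.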
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
assert pf.num_row_groups == K
row_groups = [pf.read_row_group(i) for i in range(K)]
result = pa.concat_tables(row_groups)
tm.assert_frame_equal(df, result.to_pandas())
@pytest.mark.pandas
def test_read_single_row_group_with_column_subset():
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
cols = list(df.columns[:2])
row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
result = pa.concat_tables(row_groups)
tm.assert_frame_equal(df[cols], result.to_pandas())
    # ARROW-4267: selecting the same columns more than once should still read
    # each column only once.
row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
result = pa.concat_tables(row_groups)
tm.assert_frame_equal(df[cols], result.to_pandas())
@pytest.mark.pandas
def test_read_multiple_row_groups():
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
assert pf.num_row_groups == K
result = pf.read_row_groups(range(K))
tm.assert_frame_equal(df, result.to_pandas())
@pytest.mark.pandas
def test_read_multiple_row_groups_with_column_subset():
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
cols = list(df.columns[:2])
result = pf.read_row_groups(range(K), columns=cols)
tm.assert_frame_equal(df[cols], result.to_pandas())
    # ARROW-4267: selecting the same columns more than once should still read
    # each column only once.
result = pf.read_row_groups(range(K), columns=cols + cols)
tm.assert_frame_equal(df[cols], result.to_pandas())
@pytest.mark.pandas
def test_scan_contents():
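    # scan_contents() scans the data pages and returns the number of rows,
    # which should match what was written regardless of the column subset.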
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
assert pf.scan_contents() == 10000
assert pf.scan_contents(df.columns[:4]) == 10000
def test_parquet_file_pass_directory_instead_of_file(tempdir):
# ARROW-7208
path = tempdir / 'directory'
os.mkdir(str(path))
with pytest.raises(IOError, match="Expected file path"):
pq.ParquetFile(path)
def test_read_column_invalid_index():
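    # Columns can be read by position through the low-level reader; indices
    # outside the valid range must raise instead of crashing.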
table = pa.table([pa.array([4, 5]), pa.array(["foo", "bar"])],
names=['ints', 'strs'])
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
f = pq.ParquetFile(bio.getvalue())
assert f.reader.read_column(0).to_pylist() == [4, 5]
assert f.reader.read_column(1).to_pylist() == ["foo", "bar"]
for index in (-1, 2):
with pytest.raises((ValueError, IndexError)):
f.reader.read_column(index)
@pytest.mark.pandas
@pytest.mark.parametrize('batch_size', [300, 1000, 1300])
def test_iter_batches_columns_reader(tempdir, batch_size):
total_size = 3000
chunk_size = 1000
# TODO: Add categorical support
df = alltypes_sample(size=total_size)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, filename, version='2.6',
coerce_timestamps='ms', chunk_size=chunk_size)
file_ = pq.ParquetFile(filename)
for columns in [df.columns[:10], df.columns[10:]]:
batches = file_.iter_batches(batch_size=batch_size, columns=columns)
batch_starts = range(0, total_size+batch_size, batch_size)
for batch, start in zip(batches, batch_starts):
end = min(total_size, start + batch_size)
tm.assert_frame_equal(
batch.to_pandas(),
df.iloc[start:end, :].loc[:, columns].reset_index(drop=True)
)
@pytest.mark.pandas
@pytest.mark.parametrize('chunk_size', [1000])
def test_iter_batches_reader(tempdir, chunk_size):
df = alltypes_sample(size=10000, categorical=True)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
assert arrow_table.schema.pandas_metadata is not None
_write_table(arrow_table, filename, version='2.6',
coerce_timestamps='ms', chunk_size=chunk_size)
file_ = pq.ParquetFile(filename)
def get_all_batches(f):
for row_group in range(f.num_row_groups):
batches = f.iter_batches(
batch_size=900,
row_groups=[row_group],
)
for batch in batches:
yield batch
batches = list(get_all_batches(file_))
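    # With 1000-row row groups and batch_size=900, each row group yields two
    # batches: one with 900 rows and one with the remaining 100.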
batch_no = 0
for i in range(file_.num_row_groups):
tm.assert_frame_equal(
batches[batch_no].to_pandas(),
file_.read_row_groups([i]).to_pandas().head(900)
)
batch_no += 1
tm.assert_frame_equal(
batches[batch_no].to_pandas().reset_index(drop=True),
file_.read_row_groups([i]).to_pandas().iloc[900:].reset_index(
drop=True
)
)
batch_no += 1
@pytest.mark.pandas
@pytest.mark.parametrize('pre_buffer', [False, True])
def test_pre_buffer(pre_buffer):
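    # pre_buffer toggles coalesced pre-fetching of column chunk data; the
    # result must be identical with it on or off.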
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf, pre_buffer=pre_buffer)
assert pf.read().num_rows == N

View File

@ -0,0 +1,322 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import pyarrow as pa
from pyarrow import fs
from pyarrow.filesystem import FileSystem, LocalFileSystem
from pyarrow.tests.parquet.common import parametrize_legacy_dataset
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _test_dataframe
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
except ImportError:
pd = tm = None
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_incremental_file_build(tempdir, use_legacy_dataset):
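    # Each write_table() call below appends another row group to the same open
    # writer; the final buffer should contain all ten frames in order.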
df = _test_dataframe(100)
df['unique_id'] = 0
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
out = pa.BufferOutputStream()
writer = pq.ParquetWriter(out, arrow_table.schema, version='2.6')
frames = []
for i in range(10):
df['unique_id'] = i
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
writer.write_table(arrow_table)
frames.append(df.copy())
writer.close()
buf = out.getvalue()
result = _read_table(
pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
expected = pd.concat(frames, ignore_index=True)
tm.assert_frame_equal(result.to_pandas(), expected)
def test_validate_schema_write_table(tempdir):
# ARROW-2926
simple_fields = [
pa.field('POS', pa.uint32()),
pa.field('desc', pa.string())
]
simple_schema = pa.schema(simple_fields)
# simple_table schema does not match simple_schema
simple_from_array = [pa.array([1]), pa.array(['bla'])]
simple_table = pa.Table.from_arrays(simple_from_array, ['POS', 'desc'])
path = tempdir / 'simple_validate_schema.parquet'
with pq.ParquetWriter(path, simple_schema,
version='2.6',
compression='snappy', flavor='spark') as w:
with pytest.raises(ValueError):
w.write_table(simple_table)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_context_obj(tempdir, use_legacy_dataset):
df = _test_dataframe(100)
df['unique_id'] = 0
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
out = pa.BufferOutputStream()
with pq.ParquetWriter(out, arrow_table.schema, version='2.6') as writer:
frames = []
for i in range(10):
df['unique_id'] = i
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
writer.write_table(arrow_table)
frames.append(df.copy())
buf = out.getvalue()
result = _read_table(
pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
expected = pd.concat(frames, ignore_index=True)
tm.assert_frame_equal(result.to_pandas(), expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_context_obj_with_exception(
tempdir, use_legacy_dataset
):
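    # Even though an exception interrupts the writes, the context manager
    # closes the writer, so everything written before the error is readable.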
df = _test_dataframe(100)
df['unique_id'] = 0
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
out = pa.BufferOutputStream()
error_text = 'Artificial Error'
try:
with pq.ParquetWriter(out,
arrow_table.schema,
version='2.6') as writer:
frames = []
for i in range(10):
df['unique_id'] = i
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
writer.write_table(arrow_table)
frames.append(df.copy())
if i == 5:
raise ValueError(error_text)
except Exception as e:
assert str(e) == error_text
buf = out.getvalue()
result = _read_table(
pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
expected = pd.concat(frames, ignore_index=True)
tm.assert_frame_equal(result.to_pandas(), expected)
@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
None,
LocalFileSystem._get_instance(),
fs.LocalFileSystem(),
])
def test_parquet_writer_write_wrappers(tempdir, filesystem):
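    # ParquetWriter.write() accepts both Tables and RecordBatches and should
    # behave like the dedicated write_table() / write_batch() calls.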
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
path_table = str(tempdir / 'data_table.parquet')
path_batch = str(tempdir / 'data_batch.parquet')
with pq.ParquetWriter(
path_table, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write_table(table)
result = _read_table(path_table).to_pandas()
tm.assert_frame_equal(result, df)
with pq.ParquetWriter(
path_batch, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write_batch(batch)
result = _read_table(path_batch).to_pandas()
tm.assert_frame_equal(result, df)
with pq.ParquetWriter(
path_table, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write(table)
result = _read_table(path_table).to_pandas()
tm.assert_frame_equal(result, df)
with pq.ParquetWriter(
path_batch, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write(batch)
result = _read_table(path_batch).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
None,
LocalFileSystem._get_instance(),
fs.LocalFileSystem(),
])
def test_parquet_writer_filesystem_local(tempdir, filesystem):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(
path, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write_table(table)
result = _read_table(path).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3(s3_example_fs):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
fs, uri, path = s3_example_fs
with pq.ParquetWriter(
path, table.schema, filesystem=fs, version='2.6'
) as writer:
writer.write_table(table)
result = _read_table(uri).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3_uri(s3_example_fs):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
fs, uri, path = s3_example_fs
with pq.ParquetWriter(uri, table.schema, version='2.6') as writer:
writer.write_table(table)
result = _read_table(path, filesystem=fs).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
fs, directory = s3_example_s3fs
path = directory + "/test.parquet"
with pq.ParquetWriter(
path, table.schema, filesystem=fs, version='2.6'
) as writer:
writer.write_table(table)
result = _read_table(path, filesystem=fs).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
def test_parquet_writer_filesystem_buffer_raises():
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
filesystem = fs.LocalFileSystem()
# Should raise ValueError when filesystem is passed with file-like object
with pytest.raises(ValueError, match="specified path is file-like"):
pq.ParquetWriter(
pa.BufferOutputStream(), table.schema, filesystem=filesystem
)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset):
out = pa.BufferOutputStream()
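    # Minimal legacy FileSystem stub: records the requested path and mode and
    # returns the in-memory buffer instead of opening a real file.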
class CustomFS(FileSystem):
def __init__(self):
self.path = None
self.mode = None
def open(self, path, mode='rb'):
self.path = path
self.mode = mode
return out
fs = CustomFS()
fname = 'expected_fname.parquet'
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
with pq.ParquetWriter(fname, table.schema, filesystem=fs, version='2.6') \
as writer:
writer.write_table(table)
assert fs.path == fname
assert fs.mode == 'wb'
assert out.closed
buf = out.getvalue()
table_read = _read_table(
pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df_read, df)
# Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError) as err_info:
        pq.ParquetWriter(pa.BufferOutputStream(), table.schema, filesystem=fs)
    expected_msg = ("filesystem passed but where is file-like, so"
                    " there is nothing to open with filesystem.")
    # Compare against the exception value, not the ExceptionInfo wrapper
    assert str(err_info.value) == expected_msg