# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import io
import json

import numpy as np
import pytest

import pyarrow as pa
from pyarrow.fs import LocalFileSystem, SubTreeFileSystem
from pyarrow.tests.parquet.common import (
    parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
from pyarrow.util import guid
from pyarrow.vendored.version import Version

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
                                              _write_table)
except ImportError:
    pq = None


try:
    import pandas as pd
    import pandas.testing as tm

    from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
                                              alltypes_sample)
except ImportError:
    pd = tm = None


@pytest.mark.pandas
def test_pandas_parquet_custom_metadata(tempdir):
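    # Table.from_pandas embeds a JSON blob under the b'pandas' schema
    # metadata key; after writing, the file metadata should still describe
    # the original RangeIndex of the DataFrame.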
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')

    metadata = pq.read_metadata(filename).metadata
    assert b'pandas' in metadata

    js = json.loads(metadata[b'pandas'].decode('utf8'))
    assert js['index_columns'] == [{'kind': 'range',
                                    'name': None,
                                    'start': 0, 'stop': 10000,
                                    'step': 1}]


@pytest.mark.pandas
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
    # ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
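    # The two frames are built with different pandas dtypes, so their
    # serialized pandas metadata differs, while the Arrow schemas (ignoring
    # metadata) are identical after casting to the explicit schema. A single
    # ParquetWriter should accept both tables.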
    schema = pa.schema([
        pa.field('int', pa.int16()),
        pa.field('float', pa.float32()),
        pa.field('string', pa.string())
    ])
    df1 = pd.DataFrame({
        'int': np.arange(3, dtype=np.uint8),
        'float': np.arange(3, dtype=np.float32),
        'string': ['ABBA', 'EDDA', 'ACDC']
    })
    df2 = pd.DataFrame({
        'int': [4, 5],
        'float': [1.1, None],
        'string': [None, None]
    })
    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)

    assert not table1.schema.equals(table2.schema, check_metadata=True)
    assert table1.schema.equals(table2.schema)

    writer = pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema)
    writer.write_table(table1)
    writer.write_table(table2)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset):
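    # MultiIndex columns are recorded in the pandas schema metadata, so
    # read_pandas should reconstruct both column levels on the way back.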
    df = alltypes_sample(size=10)
    df.columns = pd.MultiIndex.from_tuples(
        list(zip(df.columns, df.columns[::-1])),
        names=['level_1', 'level_2']
    )

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert arrow_table.schema.pandas_metadata is not None

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')

    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
    tempdir, use_legacy_dataset
):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = arrow_table.schema.pandas_metadata
    assert not js['index_columns']
    # ARROW-2170
    # While index_columns should be empty, columns needs to be filled still.
    assert js['columns']

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)

    js = table_read.schema.pandas_metadata
    assert not js['index_columns']

    read_metadata = table_read.schema.metadata
    assert arrow_table.schema.metadata == read_metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


# TODO(dataset) duplicate column selection actually gives duplicate columns now
@pytest.mark.pandas
@parametrize_legacy_dataset_not_supported
def test_pandas_column_selection(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename)
    table_read = _read_table(
        filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    table_read = _read_table(
        filename, columns=['uint8', 'uint8'],
        use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset):
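    # Round-trip through an in-memory Arrow buffer (BufferOutputStream /
    # BufferReader) instead of a file on disk.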
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(
        reader, columns=['strings', 'uint8'],
        use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
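    # A zero-row DataFrame should still roundtrip with its schema intact.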
    df = _test_dataframe(0)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
def test_pandas_can_write_nested_data(tempdir):
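    # The dict values in "agg_col" become a struct-typed column; only the
    # write path is exercised here, there is no read-back check.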
    data = {
        "agg_col": [
            {"page_type": 1},
            {"record_type": 1},
            {"non_consecutive_home": 0},
        ],
        "uid_first": "1001"
    }
    df = pd.DataFrame(data=data)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    # This succeeds under V2
    _write_table(arrow_table, imos)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
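    # Write through a plain Python file handle and read back from a BytesIO
    # to check that Python file objects work on both sides.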
    filename = tempdir / 'pandas_pyfile_roundtrip.parquet'
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = pa.Table.from_pandas(df)

    with filename.open('wb') as f:
        _write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(filename.read_bytes())

    table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
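    # The same table is rewritten under different writer options (dictionary
    # encoding, statistics, compression codecs) and must roundtrip each time;
    # codecs not built into this Arrow installation are skipped.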
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        _write_table(arrow_table, filename, version='2.6',
                     use_dictionary=use_dictionary)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)

    for write_statistics in [True, False]:
        _write_table(arrow_table, filename, version='2.6',
                     write_statistics=write_statistics)
        table_read = _read_table(filename,
                                 use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
        if (compression != 'NONE' and
                not pa.lib.Codec.is_available(compression)):
            continue
        _write_table(arrow_table, filename, version='2.6',
                     compression=compression)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
def test_spark_flavor_preserves_pandas_metadata():
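    # flavor='spark' enables Spark-compatible writer settings; the named
    # integer index must still survive the roundtrip via pandas metadata.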
    df = _test_dataframe(size=100)
    df.index = np.arange(0, 10 * len(df), 10)
    df.index.name = 'foo'

    result = _roundtrip_pandas_dataframe(df, {'version': '2.0',
                                              'flavor': 'spark'})
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_index_column_name_duplicate(tempdir, use_legacy_dataset):
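    # 'time' is used both as the index and as a regular column (drop=False),
    # so the same name appears twice and must roundtrip without clashing.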
    data = {
        'close': {
            pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
            pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
        },
        'time': {
            pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp(
                '2017-06-30 01:31:00'
            ),
            pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp(
                '2017-06-30 01:32:00'
            ),
        }
    }
    path = str(tempdir / 'data.parquet')
    dfx = pd.DataFrame(data).set_index('time', drop=False)
    tdfx = pa.Table.from_pandas(dfx)
    _write_table(tdfx, path)
    arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    result_df = arrow_table.to_pandas()
    tm.assert_frame_equal(result_df, dfx)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multiindex_duplicate_values(tempdir, use_legacy_dataset):
    num_rows = 3
    numbers = list(range(num_rows))
    index = pd.MultiIndex.from_arrays(
        [['foo', 'foo', 'bar'], numbers],
        names=['foobar', 'some_numbers'],
    )

    df = pd.DataFrame({'numbers': numbers}, index=index)
    table = pa.Table.from_pandas(df)

    filename = tempdir / 'dup_multi_index_levels.parquet'

    _write_table(table, filename)
    result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    assert table.equals(result_table)

    result_df = result_table.to_pandas()
    tm.assert_frame_equal(result_df, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_naming(datadir, use_legacy_dataset):
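    # The checked-in v0.7.1 data files use the old index metadata layout;
    # the expected frame is parsed from the embedded whitespace-separated
    # table below.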
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}',
                           index_col=None, header=0, engine='python')
    table = _read_table(
        datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_named(
    datadir, use_legacy_dataset
):
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string), sep=r'\s{2,}',
        index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()

    table = _read_table(datadir / 'v0.7.1.all-named-index.parquet',
                        use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_some_named(
    datadir, use_legacy_dataset
):
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string),
        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()
    expected.index = expected.index.set_names(['cut', None, 'clarity'])

    table = _read_table(datadir / 'v0.7.1.some-named-index.parquet',
                        use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_column_metadata_handling(
    datadir, use_legacy_dataset
):
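    # Reading the whole file restores the MultiIndex from the old metadata;
    # selecting only column 'a' skips the serialized index columns, so the
    # result falls back to a default RangeIndex.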
    expected = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [.1, .2, .3],
         'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
    expected.index = pd.MultiIndex.from_arrays(
        [['a', 'b', 'c'],
         pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
        names=['index', None])

    path = datadir / 'v0.7.1.column-metadata-handling.parquet'
    table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)

    table = _read_table(
        path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_index_survives_roundtrip(use_legacy_dataset):
    # ARROW-3652, addressed by ARROW-3246
    df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2'])
    df['c1'] = df['c1'].astype('category')
    df = df.set_index(['c1'])

    table = pa.Table.from_pandas(df)
    bos = pa.BufferOutputStream()
    pq.write_table(table, bos)
    ref_df = pq.read_pandas(
        bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
    assert isinstance(ref_df.index, pd.CategoricalIndex)
    assert ref_df.index.equals(df.index)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_order_survives_roundtrip(use_legacy_dataset):
    # ARROW-6302
    df = pd.DataFrame({"a": pd.Categorical(
        ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)})

    table = pa.Table.from_pandas(df)
    bos = pa.BufferOutputStream()
    pq.write_table(table, bos)

    contents = bos.getvalue()
    result = pq.read_pandas(
        contents, use_legacy_dataset=use_legacy_dataset).to_pandas()

    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_na_type_row_groups(use_legacy_dataset):
    # ARROW-5085
    df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
    df_category = df.astype({"col": "category", "int": "category"})
    table = pa.Table.from_pandas(df)
    table_cat = pa.Table.from_pandas(df_category)
    buf = pa.BufferOutputStream()

    # it works
    pq.write_table(table_cat, buf, version='2.6', chunk_size=10)
    result = pq.read_table(
        buf.getvalue(), use_legacy_dataset=use_legacy_dataset)

    # Result is non-categorical
    assert result[0].equals(table[0])
    assert result[1].equals(table[1])


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_roundtrip(use_legacy_dataset):
    # ARROW-5480, this was enabled by ARROW-3246

    # Have one of the categories unobserved and include a null (-1)
    codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
    categories = ['foo', 'bar', 'baz']
    df = pd.DataFrame({'x': pd.Categorical.from_codes(
        codes, categories=categories)})

    buf = pa.BufferOutputStream()
    pq.write_table(pa.table(df), buf)

    result = pq.read_table(
        buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
    assert result.x.dtype == 'category'
    assert (result.x.cat.categories == categories).all()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_extensiondtypes(
    tempdir, use_legacy_dataset
):
    # ARROW-8251 - preserve pandas extension dtypes in roundtrip
    if Version(pd.__version__) < Version("1.0.0"):
        pytest.skip("__arrow_array__ added to pandas in 1.0.0")

    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
    df['col'] = df['col'].astype("Int64")
    table = pa.table(df)

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
    # ARROW-8251 - preserve pandas index in roundtrip
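    # When partitioning on 'part', the partition column comes back as a
    # categorical and is moved to the end, hence the df_cat expectation for
    # case1; the non-partitioned cases should return the original frame.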

    df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
    df.index = pd.Index(['a', 'b', 'c'], name="idx")
    table = pa.table(df)
    df_cat = df[["col", "part"]].copy()
    df_cat["part"] = df_cat["part"].astype("category")

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df_cat)

    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df)

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@pytest.mark.parametrize('preserve_index', [True, False, None])
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
    # ARROW-1103
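    # Each fragment is written with its schema metadata stripped, so the
    # pandas metadata is only available from the '_metadata' sidecar file,
    # which ParquetDataset.read_pandas is expected to pick up.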
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')

        path = dirpath / '{}.parquet'.format(i)

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    expected.index.name = (
        df.index.name if preserve_index is not False else None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas(
        'data.parquet',
        filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem())
    )
    assert result.equals(pa.table(df))


@pytest.mark.pandas
def test_read_pandas_map_fields(tempdir):
    # ARROW-10140 - table created from Pandas with mapping fields
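    # The lists of (key, value) tuples are stored as an Arrow
    # map<string, string> column via the explicit schema; to_pandas converts
    # them back to lists of tuples, so the frame should compare equal.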
    df = pd.DataFrame({
        'col1': pd.Series([
            [('id', 'something'), ('value2', 'else')],
            [('id', 'something2'), ('value', 'else2')],
        ]),
        'col2': pd.Series(['foo', 'bar'])
    })

    filename = tempdir / 'data.parquet'

    udt = pa.map_(pa.string(), pa.string())
    schema = pa.schema([pa.field('col1', udt), pa.field('col2', pa.string())])
    arrow_table = pa.Table.from_pandas(df, schema)

    _write_table(arrow_table, filename)

    result = pq.read_pandas(filename).to_pandas()
    tm.assert_frame_equal(result, df)