commit d660f2a4ca
Author: Ayxan
Date:   2022-05-23 00:16:32 +04:00

    first commit

24786 changed files with 4428337 additions and 0 deletions


@@ -0,0 +1,27 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = [
pytest.mark.parquet,
pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':DeprecationWarning"
),
]


@@ -0,0 +1,189 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.tests import util
legacy_filter_mark = pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy:FutureWarning"
)
parametrize_legacy_dataset = pytest.mark.parametrize(
"use_legacy_dataset",
[pytest.param(True, marks=legacy_filter_mark),
pytest.param(False, marks=pytest.mark.dataset)]
)
parametrize_legacy_dataset_not_supported = pytest.mark.parametrize(
"use_legacy_dataset",
[pytest.param(True, marks=legacy_filter_mark),
pytest.param(False, marks=pytest.mark.skip)]
)
parametrize_legacy_dataset_fixed = pytest.mark.parametrize(
"use_legacy_dataset",
[pytest.param(True, marks=[pytest.mark.xfail, legacy_filter_mark]),
pytest.param(False, marks=pytest.mark.dataset)]
)
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = pytest.mark.parquet
def _write_table(table, path, **kwargs):
# So we see the ImportError somewhere
import pyarrow.parquet as pq
from pyarrow.pandas_compat import _pandas_api
if _pandas_api.is_data_frame(table):
table = pa.Table.from_pandas(table)
pq.write_table(table, path, **kwargs)
return table
def _read_table(*args, **kwargs):
import pyarrow.parquet as pq
table = pq.read_table(*args, **kwargs)
table.validate(full=True)
return table
def _roundtrip_table(table, read_table_kwargs=None,
write_table_kwargs=None, use_legacy_dataset=False):
read_table_kwargs = read_table_kwargs or {}
write_table_kwargs = write_table_kwargs or {}
writer = pa.BufferOutputStream()
_write_table(table, writer, **write_table_kwargs)
reader = pa.BufferReader(writer.getvalue())
return _read_table(reader, use_legacy_dataset=use_legacy_dataset,
**read_table_kwargs)
def _check_roundtrip(table, expected=None, read_table_kwargs=None,
use_legacy_dataset=False, **write_table_kwargs):
if expected is None:
expected = table
read_table_kwargs = read_table_kwargs or {}
# intentionally check twice
result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs,
write_table_kwargs=write_table_kwargs,
use_legacy_dataset=use_legacy_dataset)
assert result.equals(expected)
result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs,
write_table_kwargs=write_table_kwargs,
use_legacy_dataset=use_legacy_dataset)
assert result.equals(expected)
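# --- Illustrative usage sketch (not part of the original module) ----------
# A minimal example of how the round-trip helpers above are typically
# combined by the test modules that import them; the table contents are
# made up for illustration, only the call pattern comes from those tests.
def _example_check_roundtrip_usage():
    table = pa.table({'f0': [1, 2, 3], 'f1': ['a', 'b', 'c']})
    # Writes to an in-memory buffer, reads back and compares, twice.
    _check_roundtrip(table, compression='snappy', use_legacy_dataset=False)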
def _roundtrip_pandas_dataframe(df, write_kwargs, use_legacy_dataset=False):
table = pa.Table.from_pandas(df)
result = _roundtrip_table(
table, write_table_kwargs=write_kwargs,
use_legacy_dataset=use_legacy_dataset)
return result.to_pandas()
def _random_integers(size, dtype):
# We do not generate integers outside the int64 range
platform_int_info = np.iinfo('int_')
iinfo = np.iinfo(dtype)
return np.random.randint(max(iinfo.min, platform_int_info.min),
min(iinfo.max, platform_int_info.max),
size=size).astype(dtype)
def _test_dataframe(size=10000, seed=0):
import pandas as pd
np.random.seed(seed)
df = pd.DataFrame({
'uint8': _random_integers(size, np.uint8),
'uint16': _random_integers(size, np.uint16),
'uint32': _random_integers(size, np.uint32),
'uint64': _random_integers(size, np.uint64),
'int8': _random_integers(size, np.int8),
'int16': _random_integers(size, np.int16),
'int32': _random_integers(size, np.int32),
'int64': _random_integers(size, np.int64),
'float32': np.random.randn(size).astype(np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
'strings': [util.rands(10) for i in range(size)],
'all_none': [None] * size,
'all_none_category': [None] * size
})
# TODO(PARQUET-1015)
# df['all_none_category'] = df['all_none_category'].astype('category')
return df
def make_sample_file(table_or_df):
import pyarrow.parquet as pq
if isinstance(table_or_df, pa.Table):
a_table = table_or_df
else:
a_table = pa.Table.from_pandas(table_or_df)
buf = io.BytesIO()
_write_table(a_table, buf, compression='SNAPPY', version='2.6',
coerce_timestamps='ms')
buf.seek(0)
return pq.ParquetFile(buf)
def alltypes_sample(size=10000, seed=0, categorical=False):
import pandas as pd
np.random.seed(seed)
arrays = {
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16),
'uint32': np.arange(size, dtype=np.uint32),
'uint64': np.arange(size, dtype=np.uint64),
'int8': np.arange(size, dtype=np.int16),
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
# TODO(wesm): Test other timestamp resolutions now that arrow supports
# them
'datetime': np.arange("2016-01-01T00:00:00.001", size,
dtype='datetime64[ms]'),
'timedelta': np.arange(0, size, dtype="timedelta64[s]"),
'str': pd.Series([str(x) for x in range(size)]),
'empty_str': [''] * size,
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
'null': [None] * size,
'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)],
}
if categorical:
arrays['str_category'] = arrays['str'].astype('category')
return pd.DataFrame(arrays)
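# --- Illustrative usage sketch (not part of the original module) ----------
# How alltypes_sample() and make_sample_file() above are typically combined
# (assumes pandas and a Snappy-enabled pyarrow build, just like the helpers
# themselves); the size argument is arbitrary.
def _example_make_sample_file():
    df = alltypes_sample(size=100)
    parquet_file = make_sample_file(df)  # a pyarrow.parquet.ParquetFile
    return parquet_file.metadata.num_rows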


@@ -0,0 +1,87 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from pyarrow.util import guid
@pytest.fixture(scope='module')
def datadir(base_datadir):
return base_datadir / 'parquet'
@pytest.fixture
def s3_bucket(s3_server):
boto3 = pytest.importorskip('boto3')
botocore = pytest.importorskip('botocore')
host, port, access_key, secret_key = s3_server['connection']
s3 = boto3.resource(
's3',
endpoint_url='http://{}:{}'.format(host, port),
aws_access_key_id=access_key,
aws_secret_access_key=secret_key,
config=botocore.client.Config(signature_version='s3v4'),
region_name='us-east-1'
)
bucket = s3.Bucket('test-s3fs')
try:
bucket.create()
except Exception:
        # we get a BucketAlreadyOwnedByYou error with the fsspec handler
pass
return 'test-s3fs'
@pytest.fixture
def s3_example_s3fs(s3_server, s3_bucket):
s3fs = pytest.importorskip('s3fs')
host, port, access_key, secret_key = s3_server['connection']
fs = s3fs.S3FileSystem(
key=access_key,
secret=secret_key,
client_kwargs={
'endpoint_url': 'http://{}:{}'.format(host, port)
}
)
test_path = '{}/{}'.format(s3_bucket, guid())
fs.mkdir(test_path)
yield fs, test_path
try:
fs.rm(test_path, recursive=True)
except FileNotFoundError:
pass
@pytest.fixture
def s3_example_fs(s3_server):
from pyarrow.fs import FileSystem
host, port, access_key, secret_key = s3_server['connection']
uri = (
"s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}"
.format(access_key, secret_key, host, port)
)
fs, path = FileSystem.from_uri(uri)
fs.create_dir("mybucket")
yield fs, uri, path
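# --- Hypothetical usage sketch (not part of the original conftest) --------
# Shows how a test would typically consume the s3_example_fs fixture above;
# the table contents and the write/read round trip are illustrative
# assumptions (the leading underscore keeps pytest from collecting it).
def _example_s3_example_fs_usage(s3_example_fs):
    import pyarrow as pa
    import pyarrow.parquet as pq
    fs, uri, path = s3_example_fs
    table = pa.table({'a': [1, 2, 3]})
    pq.write_table(table, path, filesystem=fs)
    assert pq.read_table(path, filesystem=fs).equals(table)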


@@ -0,0 +1,60 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import base64
import pyarrow.parquet.encryption as pe
class InMemoryKmsClient(pe.KmsClient):
"""This is a mock class implementation of KmsClient, built for testing only.
"""
def __init__(self, config):
"""Create an InMemoryKmsClient instance."""
pe.KmsClient.__init__(self)
self.master_keys_map = config.custom_kms_conf
def wrap_key(self, key_bytes, master_key_identifier):
"""Not a secure cipher - the wrapped key
is just the master key concatenated with key bytes"""
master_key_bytes = self.master_keys_map[master_key_identifier].encode(
'utf-8')
wrapped_key = b"".join([master_key_bytes, key_bytes])
result = base64.b64encode(wrapped_key)
return result
def unwrap_key(self, wrapped_key, master_key_identifier):
"""Not a secure cipher - just extract the key from
the wrapped key"""
expected_master_key = self.master_keys_map[master_key_identifier]
decoded_wrapped_key = base64.b64decode(wrapped_key)
master_key_bytes = decoded_wrapped_key[:16]
decrypted_key = decoded_wrapped_key[16:]
if (expected_master_key == master_key_bytes.decode('utf-8')):
return decrypted_key
raise ValueError("Incorrect master key used",
master_key_bytes, decrypted_key)
def verify_file_encrypted(path):
"""Verify that the file is encrypted by looking at its first 4 bytes.
If it's the magic string PARE
then this is a parquet with encrypted footer."""
with open(path, "rb") as file:
magic_str = file.read(4)
# Verify magic string for parquet with encrypted footer is PARE
assert magic_str == b'PARE'
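# --- Hypothetical usage sketch (not part of the original file) ------------
# Shows how InMemoryKmsClient and verify_file_encrypted above are typically
# wired into pyarrow.parquet.encryption. The key names, 16-byte key material,
# column names and file path are illustrative assumptions.
def _example_write_encrypted_file(path):
    import pyarrow as pa
    import pyarrow.parquet as pq
    table = pa.table({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    kms_connection_config = pe.KmsConnectionConfig(
        custom_kms_conf={'footer_key': '0123456789012345',
                         'col_key': 'abcdefghijklmnop'})
    crypto_factory = pe.CryptoFactory(lambda config: InMemoryKmsClient(config))
    encryption_config = pe.EncryptionConfiguration(
        footer_key='footer_key',
        column_keys={'col_key': ['a', 'b']})
    file_encryption_properties = crypto_factory.file_encryption_properties(
        kms_connection_config, encryption_config)
    with pq.ParquetWriter(path, table.schema,
                          encryption_properties=file_encryption_properties
                          ) as writer:
        writer.write_table(table)
    verify_file_encrypted(path)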


@@ -0,0 +1,799 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from collections import OrderedDict
import io
import numpy as np
import pytest
import pyarrow as pa
from pyarrow import fs
from pyarrow.filesystem import LocalFileSystem, FileSystem
from pyarrow.tests import util
from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
parametrize_legacy_dataset,
_test_dataframe)
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.pandas_examples import dataframe_with_lists
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
pd = tm = None
def test_parquet_invalid_version(tempdir):
table = pa.table({'a': [1, 2, 3]})
with pytest.raises(ValueError, match="Unsupported Parquet format version"):
_write_table(table, tempdir / 'test_version.parquet', version="2.2")
with pytest.raises(ValueError, match="Unsupported Parquet data page " +
"version"):
_write_table(table, tempdir / 'test_version.parquet',
data_page_version="2.2")
@parametrize_legacy_dataset
def test_set_data_page_size(use_legacy_dataset):
arr = pa.array([1, 2, 3] * 100000)
t = pa.Table.from_arrays([arr], names=['f0'])
# 128K, 512K
page_sizes = [2 << 16, 2 << 18]
for target_page_size in page_sizes:
_check_roundtrip(t, data_page_size=target_page_size,
use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_set_write_batch_size(use_legacy_dataset):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
_check_roundtrip(
table, data_page_size=10, write_batch_size=1, version='2.4'
)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_set_dictionary_pagesize_limit(use_legacy_dataset):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
_check_roundtrip(table, dictionary_pagesize_limit=1,
data_page_size=10, version='2.4')
with pytest.raises(TypeError):
_check_roundtrip(table, dictionary_pagesize_limit="a",
data_page_size=10, version='2.4')
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_chunked_table_write(use_legacy_dataset):
# ARROW-232
tables = []
batch = pa.RecordBatch.from_pandas(alltypes_sample(size=10))
tables.append(pa.Table.from_batches([batch] * 3))
df, _ = dataframe_with_lists()
batch = pa.RecordBatch.from_pandas(df)
tables.append(pa.Table.from_batches([batch] * 3))
for data_page_version in ['1.0', '2.0']:
for use_dictionary in [True, False]:
for table in tables:
_check_roundtrip(
table, version='2.6',
use_legacy_dataset=use_legacy_dataset,
data_page_version=data_page_version,
use_dictionary=use_dictionary)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_memory_map(tempdir, use_legacy_dataset):
df = alltypes_sample(size=10)
table = pa.Table.from_pandas(df)
_check_roundtrip(table, read_table_kwargs={'memory_map': True},
version='2.6', use_legacy_dataset=use_legacy_dataset)
filename = str(tempdir / 'tmp_file')
with open(filename, 'wb') as f:
_write_table(table, f, version='2.6')
table_read = pq.read_pandas(filename, memory_map=True,
use_legacy_dataset=use_legacy_dataset)
assert table_read.equals(table)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_enable_buffered_stream(tempdir, use_legacy_dataset):
df = alltypes_sample(size=10)
table = pa.Table.from_pandas(df)
_check_roundtrip(table, read_table_kwargs={'buffer_size': 1025},
version='2.6', use_legacy_dataset=use_legacy_dataset)
filename = str(tempdir / 'tmp_file')
with open(filename, 'wb') as f:
_write_table(table, f, version='2.6')
table_read = pq.read_pandas(filename, buffer_size=4096,
use_legacy_dataset=use_legacy_dataset)
assert table_read.equals(table)
@parametrize_legacy_dataset
def test_special_chars_filename(tempdir, use_legacy_dataset):
table = pa.Table.from_arrays([pa.array([42])], ["ints"])
filename = "foo # bar"
path = tempdir / filename
assert not path.exists()
_write_table(table, str(path))
assert path.exists()
table_read = _read_table(str(path), use_legacy_dataset=use_legacy_dataset)
assert table_read.equals(table)
@parametrize_legacy_dataset
def test_invalid_source(use_legacy_dataset):
    # Test that we provide a helpful error message pointing out
    # that None wasn't expected when trying to open None as a Parquet file.
#
# Depending on use_legacy_dataset the message changes slightly
# but in both cases it should point out that None wasn't expected.
with pytest.raises(TypeError, match="None"):
pq.read_table(None, use_legacy_dataset=use_legacy_dataset)
with pytest.raises(TypeError, match="None"):
pq.ParquetFile(None)
@pytest.mark.slow
def test_file_with_over_int16_max_row_groups():
    # PARQUET-1857: Parquet encryption support introduced an INT16_MAX upper
# limit on the number of row groups, but this limit only impacts files with
# encrypted row group metadata because of the int16 row group ordinal used
# in the Parquet Thrift metadata. Unencrypted files are not impacted, so
# this test checks that it works (even if it isn't a good idea)
t = pa.table([list(range(40000))], names=['f0'])
_check_roundtrip(t, row_group_size=1)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_empty_table_roundtrip(use_legacy_dataset):
df = alltypes_sample(size=10)
# Create a non-empty table to infer the types correctly, then slice to 0
table = pa.Table.from_pandas(df)
table = pa.Table.from_arrays(
[col.chunk(0)[:0] for col in table.itercolumns()],
names=table.schema.names)
assert table.schema.field('null').type == pa.null()
assert table.schema.field('null_list').type == pa.list_(pa.null())
_check_roundtrip(
table, version='2.6', use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_empty_table_no_columns(use_legacy_dataset):
df = pd.DataFrame()
empty = pa.Table.from_pandas(df, preserve_index=False)
_check_roundtrip(empty, use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset):
# Bug report in ARROW-3792
cols = OrderedDict(
int32=pa.int32(),
list_string=pa.list_(pa.string())
)
data = [[], [OrderedDict(int32=1, list_string=('G',)), ]]
# This produces a table with a column like
# <Column name='list_string' type=ListType(list<item: string>)>
# [
# [],
# [
# [
# "G"
# ]
# ]
# ]
#
# Each column is a ChunkedArray with 2 elements
my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten()
for batch in data]
my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols))
for batch in my_arrays]
tbl = pa.Table.from_batches(my_batches, pa.schema(cols))
_check_roundtrip(tbl, use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multiple_path_types(tempdir, use_legacy_dataset):
# Test compatibility with PEP 519 path-like objects
path = tempdir / 'zzz.parquet'
df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
_write_table(df, path)
table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
# Test compatibility with plain string paths
path = str(tempdir) + 'zzz.parquet'
df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
_write_table(df, path)
table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@parametrize_legacy_dataset
def test_fspath(tempdir, use_legacy_dataset):
# ARROW-12472 support __fspath__ objects without using str()
path = tempdir / "test.parquet"
table = pa.table({"a": [1, 2, 3]})
_write_table(table, path)
fs_protocol_obj = util.FSProtocolClass(path)
result = _read_table(
fs_protocol_obj, use_legacy_dataset=use_legacy_dataset
)
assert result.equals(table)
# combined with non-local filesystem raises
with pytest.raises(TypeError):
_read_table(fs_protocol_obj, filesystem=FileSystem())
@pytest.mark.dataset
@parametrize_legacy_dataset
@pytest.mark.parametrize("filesystem", [
None, fs.LocalFileSystem(), LocalFileSystem._get_instance()
])
def test_relative_paths(tempdir, use_legacy_dataset, filesystem):
# reading and writing from relative paths
table = pa.table({"a": [1, 2, 3]})
# reading
pq.write_table(table, str(tempdir / "data.parquet"))
with util.change_cwd(tempdir):
result = pq.read_table("data.parquet", filesystem=filesystem,
use_legacy_dataset=use_legacy_dataset)
assert result.equals(table)
# writing
with util.change_cwd(tempdir):
pq.write_table(table, "data2.parquet", filesystem=filesystem)
result = pq.read_table(tempdir / "data2.parquet")
assert result.equals(table)
def test_read_non_existing_file():
# ensure we have a proper error message
with pytest.raises(FileNotFoundError):
pq.read_table('i-am-not-existing.parquet')
def test_file_error_python_exception():
class BogusFile(io.BytesIO):
def read(self, *args):
raise ZeroDivisionError("zorglub")
def seek(self, *args):
raise ZeroDivisionError("zorglub")
# ensure the Python exception is restored
with pytest.raises(ZeroDivisionError, match="zorglub"):
pq.read_table(BogusFile(b""))
@parametrize_legacy_dataset
def test_parquet_read_from_buffer(tempdir, use_legacy_dataset):
# reading from a buffer from python's open()
table = pa.table({"a": [1, 2, 3]})
pq.write_table(table, str(tempdir / "data.parquet"))
with open(str(tempdir / "data.parquet"), "rb") as f:
result = pq.read_table(f, use_legacy_dataset=use_legacy_dataset)
assert result.equals(table)
with open(str(tempdir / "data.parquet"), "rb") as f:
result = pq.read_table(pa.PythonFile(f),
use_legacy_dataset=use_legacy_dataset)
assert result.equals(table)
@parametrize_legacy_dataset
def test_byte_stream_split(use_legacy_dataset):
# This is only a smoke test.
arr_float = pa.array(list(map(float, range(100))))
arr_int = pa.array(list(map(int, range(100))))
data_float = [arr_float, arr_float]
table = pa.Table.from_arrays(data_float, names=['a', 'b'])
# Check with byte_stream_split for both columns.
_check_roundtrip(table, expected=table, compression="gzip",
use_dictionary=False, use_byte_stream_split=True)
# Check with byte_stream_split for column 'b' and dictionary
# for column 'a'.
_check_roundtrip(table, expected=table, compression="gzip",
use_dictionary=['a'],
use_byte_stream_split=['b'])
# Check with a collision for both columns.
_check_roundtrip(table, expected=table, compression="gzip",
use_dictionary=['a', 'b'],
use_byte_stream_split=['a', 'b'])
# Check with mixed column types.
mixed_table = pa.Table.from_arrays([arr_float, arr_int],
names=['a', 'b'])
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=['b'],
use_byte_stream_split=['a'])
# Try to use the wrong data type with the byte_stream_split encoding.
# This should throw an exception.
table = pa.Table.from_arrays([arr_int], names=['tmp'])
with pytest.raises(IOError):
_check_roundtrip(table, expected=table, use_byte_stream_split=True,
use_dictionary=False,
use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_column_encoding(use_legacy_dataset):
arr_float = pa.array(list(map(float, range(100))))
arr_int = pa.array(list(map(int, range(100))))
mixed_table = pa.Table.from_arrays([arr_float, arr_int],
names=['a', 'b'])
# Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding for
# column 'b'.
_check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False,
column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN"},
use_legacy_dataset=use_legacy_dataset)
# Check "PLAIN" for all columns.
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding="PLAIN",
use_legacy_dataset=use_legacy_dataset)
# Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'.
    # This should throw an error as it only supports FLOAT and DOUBLE.
with pytest.raises(IOError,
match="BYTE_STREAM_SPLIT only supports FLOAT and"
" DOUBLE"):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding={'b': "BYTE_STREAM_SPLIT"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass "DELTA_BINARY_PACKED".
# This should throw an error as it is only supported for reading.
with pytest.raises(IOError,
match="Not yet implemented: Selected encoding is"
" not supported."):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding={'b': "DELTA_BINARY_PACKED"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass "RLE_DICTIONARY".
    # This should throw an error as dictionary encoding is already used by
    # default and cannot be specified as the "fallback" encoding.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding="RLE_DICTIONARY",
use_legacy_dataset=use_legacy_dataset)
# Try to pass unsupported encoding.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding={'a': "MADE_UP_ENCODING"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding and use_dictionary.
# This should throw an error.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=['b'],
column_encoding={'b': "PLAIN"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding and use_dictionary=True (default value).
# This should throw an error.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
column_encoding={'b': "PLAIN"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding and use_byte_stream_split on same column.
# This should throw an error.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
use_byte_stream_split=['a'],
column_encoding={'a': "RLE",
'b': "BYTE_STREAM_SPLIT"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding and use_byte_stream_split=True.
# This should throw an error.
with pytest.raises(ValueError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
use_byte_stream_split=True,
column_encoding={'a': "RLE",
'b': "BYTE_STREAM_SPLIT"},
use_legacy_dataset=use_legacy_dataset)
# Try to pass column_encoding=True.
# This should throw an error.
with pytest.raises(TypeError):
_check_roundtrip(mixed_table, expected=mixed_table,
use_dictionary=False,
column_encoding=True,
use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_compression_level(use_legacy_dataset):
arr = pa.array(list(map(int, range(1000))))
data = [arr, arr]
table = pa.Table.from_arrays(data, names=['a', 'b'])
# Check one compression level.
_check_roundtrip(table, expected=table, compression="gzip",
compression_level=1,
use_legacy_dataset=use_legacy_dataset)
# Check another one to make sure that compression_level=1 does not
# coincide with the default one in Arrow.
_check_roundtrip(table, expected=table, compression="gzip",
compression_level=5,
use_legacy_dataset=use_legacy_dataset)
# Check that the user can provide a compression per column
_check_roundtrip(table, expected=table,
compression={'a': "gzip", 'b': "snappy"},
use_legacy_dataset=use_legacy_dataset)
# Check that the user can provide a compression level per column
_check_roundtrip(table, expected=table, compression="gzip",
compression_level={'a': 2, 'b': 3},
use_legacy_dataset=use_legacy_dataset)
# Check if both LZ4 compressors are working
# (level < 3 -> fast, level >= 3 -> HC)
_check_roundtrip(table, expected=table, compression="lz4",
compression_level=1,
use_legacy_dataset=use_legacy_dataset)
_check_roundtrip(table, expected=table, compression="lz4",
compression_level=9,
use_legacy_dataset=use_legacy_dataset)
    # Check that specifying a compression level for a codec which does not
    # allow one, or an invalid level for one that does, results in an error.
    # Uncompressed, snappy and lzo do not support specifying a compression
    # level.
    # GZIP (zlib) allows specifying a compression level, but as of zlib
    # version 1.2.11 the valid range is [-1, 9].
invalid_combinations = [("snappy", 4), ("gzip", -1337),
("None", 444), ("lzo", 14)]
buf = io.BytesIO()
for (codec, level) in invalid_combinations:
with pytest.raises((ValueError, OSError)):
_write_table(table, buf, compression=codec,
compression_level=level)
def test_sanitized_spark_field_names():
a0 = pa.array([0, 1, 2, 3, 4])
name = 'prohib; ,\t{}'
table = pa.Table.from_arrays([a0], [name])
result = _roundtrip_table(table, write_table_kwargs={'flavor': 'spark'})
expected_name = 'prohib______'
assert result.schema[0].name == expected_name
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multithreaded_read(use_legacy_dataset):
df = alltypes_sample(size=10000)
table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(table, buf, compression='SNAPPY', version='2.6')
buf.seek(0)
table1 = _read_table(
buf, use_threads=True, use_legacy_dataset=use_legacy_dataset)
buf.seek(0)
table2 = _read_table(
buf, use_threads=False, use_legacy_dataset=use_legacy_dataset)
assert table1.equals(table2)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_min_chunksize(use_legacy_dataset):
data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
table = pa.Table.from_pandas(data.reset_index())
buf = io.BytesIO()
_write_table(table, buf, chunk_size=-1)
buf.seek(0)
result = _read_table(buf, use_legacy_dataset=use_legacy_dataset)
assert result.equals(table)
with pytest.raises(ValueError):
_write_table(table, buf, chunk_size=0)
@pytest.mark.pandas
def test_write_error_deletes_incomplete_file(tempdir):
# ARROW-1285
df = pd.DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3, freq='ns')})
pdf = pa.Table.from_pandas(df)
filename = tempdir / 'tmp_file'
try:
_write_table(pdf, filename)
except pa.ArrowException:
pass
assert not filename.exists()
@parametrize_legacy_dataset
def test_read_non_existent_file(tempdir, use_legacy_dataset):
path = 'non-existent-file.parquet'
try:
pq.read_table(path, use_legacy_dataset=use_legacy_dataset)
except Exception as e:
assert path in e.args[0]
@parametrize_legacy_dataset
def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
with pytest.warns(None) as record:
pq.read_table(datadir / 'v0.7.1.parquet',
use_legacy_dataset=use_legacy_dataset)
if use_legacy_dataset:
# FutureWarning: 'use_legacy_dataset=True'
assert len(record) == 1
else:
assert len(record) == 0
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_zlib_compression_bug(use_legacy_dataset):
# ARROW-3514: "zlib deflate failed, output buffer too small"
table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col'])
f = io.BytesIO()
pq.write_table(table, f, compression='gzip')
f.seek(0)
roundtrip = pq.read_table(f, use_legacy_dataset=use_legacy_dataset)
tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas())
@parametrize_legacy_dataset
def test_parquet_file_too_small(tempdir, use_legacy_dataset):
path = str(tempdir / "test.parquet")
# TODO(dataset) with datasets API it raises OSError instead
with pytest.raises((pa.ArrowInvalid, OSError),
match='size is 0 bytes'):
with open(path, 'wb') as f:
pass
pq.read_table(path, use_legacy_dataset=use_legacy_dataset)
with pytest.raises((pa.ArrowInvalid, OSError),
match='size is 4 bytes'):
with open(path, 'wb') as f:
f.write(b'ffff')
pq.read_table(path, use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
@pytest.mark.fastparquet
@pytest.mark.filterwarnings("ignore:RangeIndex:FutureWarning")
@pytest.mark.filterwarnings("ignore:tostring:DeprecationWarning:fastparquet")
def test_fastparquet_cross_compatibility(tempdir):
fp = pytest.importorskip('fastparquet')
df = pd.DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(4.0, 7.0, dtype="float64"),
"d": [True, False, True],
"e": pd.date_range("20130101", periods=3),
"f": pd.Categorical(["a", "b", "a"]),
# fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip
# "g": [[1, 2], None, [1, 2, 3]],
}
)
table = pa.table(df)
# Arrow -> fastparquet
file_arrow = str(tempdir / "cross_compat_arrow.parquet")
pq.write_table(table, file_arrow, compression=None)
fp_file = fp.ParquetFile(file_arrow)
df_fp = fp_file.to_pandas()
tm.assert_frame_equal(df, df_fp)
# Fastparquet -> arrow
file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet")
fp.write(file_fastparquet, df)
table_fp = pq.read_pandas(file_fastparquet)
    # for a fastparquet-written file, categoricals come back as strings
    # (no Arrow schema in the Parquet metadata)
df['f'] = df['f'].astype(object)
tm.assert_frame_equal(table_fp.to_pandas(), df)
@parametrize_legacy_dataset
@pytest.mark.parametrize('array_factory', [
lambda: pa.array([0, None] * 10),
lambda: pa.array([0, None] * 10).dictionary_encode(),
lambda: pa.array(["", None] * 10),
lambda: pa.array(["", None] * 10).dictionary_encode(),
])
@pytest.mark.parametrize('use_dictionary', [False, True])
@pytest.mark.parametrize('read_dictionary', [False, True])
def test_buffer_contents(
array_factory, use_dictionary, read_dictionary, use_legacy_dataset
):
# Test that null values are deterministically initialized to zero
# after a roundtrip through Parquet.
# See ARROW-8006 and ARROW-8011.
orig_table = pa.Table.from_pydict({"col": array_factory()})
bio = io.BytesIO()
pq.write_table(orig_table, bio, use_dictionary=True)
bio.seek(0)
read_dictionary = ['col'] if read_dictionary else None
table = pq.read_table(bio, use_threads=False,
read_dictionary=read_dictionary,
use_legacy_dataset=use_legacy_dataset)
for col in table.columns:
[chunk] = col.chunks
buf = chunk.buffers()[1]
assert buf.to_pybytes() == buf.size * b"\0"
def test_parquet_compression_roundtrip(tempdir):
# ARROW-10480: ensure even with nonstandard Parquet file naming
# conventions, writing and then reading a file works. In
# particular, ensure that we don't automatically double-compress
# the stream due to auto-detecting the extension in the filename
table = pa.table([pa.array(range(4))], names=["ints"])
path = tempdir / "arrow-10480.pyarrow.gz"
pq.write_table(table, path, compression="GZIP")
result = pq.read_table(path)
assert result.equals(table)
def test_empty_row_groups(tempdir):
# ARROW-3020
table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
path = tempdir / 'empty_row_groups.parquet'
num_groups = 3
with pq.ParquetWriter(path, table.schema) as writer:
for i in range(num_groups):
writer.write_table(table)
reader = pq.ParquetFile(path)
assert reader.metadata.num_row_groups == num_groups
for i in range(num_groups):
assert reader.read_row_group(i).equals(table)
def test_reads_over_batch(tempdir):
data = [None] * (1 << 20)
data.append([1])
# Large list<int64> with mostly nones and one final
# value. This should force batched reads when
# reading back.
table = pa.Table.from_arrays([data], ['column'])
path = tempdir / 'arrow-11607.parquet'
pq.write_table(table, path)
table2 = pq.read_table(path)
assert table == table2
@pytest.mark.dataset
def test_permutation_of_column_order(tempdir):
# ARROW-2366
case = tempdir / "dataset_column_order_permutation"
case.mkdir(exist_ok=True)
data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
pq.write_table(data1, case / "data1.parquet")
data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
pq.write_table(data2, case / "data2.parquet")
table = pq.read_table(str(case))
table2 = pa.table([[1, 2, 3, 4, 5, 6],
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6]],
names=['a', 'b'])
assert table == table2
def test_read_table_legacy_deprecated(tempdir):
# ARROW-15870
table = pa.table({'a': [1, 2, 3]})
path = tempdir / 'data.parquet'
pq.write_table(table, path)
with pytest.warns(
FutureWarning, match="Passing 'use_legacy_dataset=True'"
):
pq.read_table(path, use_legacy_dataset=True)


@@ -0,0 +1,114 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import pyarrow as pa
from pyarrow.tests.parquet.common import parametrize_legacy_dataset
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (_read_table,
_check_roundtrip)
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
except ImportError:
pd = tm = None
# Tests for ARROW-11497
_test_data_simple = [
{'items': [1, 2]},
{'items': [0]},
]
_test_data_complex = [
{'items': [{'name': 'elem1', 'value': '1'},
{'name': 'elem2', 'value': '2'}]},
{'items': [{'name': 'elem1', 'value': '0'}]},
]
parametrize_test_data = pytest.mark.parametrize(
"test_data", [_test_data_simple, _test_data_complex])
@pytest.mark.pandas
@parametrize_legacy_dataset
@parametrize_test_data
def test_write_compliant_nested_type_enable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write pandas df with new flag
_roundtrip_pandas_dataframe(df,
write_kwargs={
'use_compliant_nested_type': True},
use_legacy_dataset=use_legacy_dataset)
# Write to a parquet file with compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema,
use_compliant_nested_type=True,
version='2.6') as writer:
writer.write_table(table)
# Read back as a table
new_table = _read_table(path)
    # Validate that the "items" column is compliant with the Parquet nested
    # format; it should look like:
    # list<element: struct<name: string, value: string>>
assert isinstance(new_table.schema.types[0], pa.ListType)
assert new_table.schema.types[0].value_field.name == 'element'
# Verify that the new table can be read/written correctly
_check_roundtrip(new_table,
use_legacy_dataset=use_legacy_dataset,
use_compliant_nested_type=True)
@pytest.mark.pandas
@parametrize_legacy_dataset
@parametrize_test_data
def test_write_compliant_nested_type_disable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write with new flag disabled (default behaviour)
_roundtrip_pandas_dataframe(df, write_kwargs={},
use_legacy_dataset=use_legacy_dataset)
# Write to a parquet file while disabling compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
writer.write_table(table)
new_table = _read_table(path)
    # Validate that the "items" column is not compliant with the Parquet
    # nested format; it should look like:
    # list<item: struct<name: string, value: string>>
assert isinstance(new_table.schema.types[0], pa.ListType)
assert new_table.schema.types[0].value_field.name == 'item'
# Verify that the new table can be read/written correctly
_check_roundtrip(new_table,
use_legacy_dataset=use_legacy_dataset,
use_compliant_nested_type=False)


@@ -0,0 +1,526 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import decimal
import io
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.tests import util
from pyarrow.tests.parquet.common import (_check_roundtrip,
parametrize_legacy_dataset)
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.pandas_examples import (dataframe_with_arrays,
dataframe_with_lists)
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
pd = tm = None
# General roundtrip of data types
# -----------------------------------------------------------------------------
@pytest.mark.pandas
@parametrize_legacy_dataset
@pytest.mark.parametrize('chunk_size', [None, 1000])
def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset):
df = alltypes_sample(size=10000, categorical=True)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
assert arrow_table.schema.pandas_metadata is not None
_write_table(arrow_table, filename, version='2.6',
coerce_timestamps='ms', chunk_size=chunk_size)
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
assert table_read.schema.pandas_metadata is not None
read_metadata = table_read.schema.metadata
assert arrow_table.schema.metadata == read_metadata
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset):
size = 10000
np.random.seed(0)
df = pd.DataFrame({
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16),
'uint32': np.arange(size, dtype=np.uint32),
'uint64': np.arange(size, dtype=np.uint64),
'int8': np.arange(size, dtype=np.int16),
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
'str': [str(x) for x in range(size)],
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
'empty_str': [''] * size
})
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, filename, version='1.0')
table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
# We pass uint32_t as int64_t if we write Parquet version 1.0
df['uint32'] = df['uint32'].values.astype(np.int64)
tm.assert_frame_equal(df, df_read)
# Dictionary
# -----------------------------------------------------------------------------
def _simple_table_write_read(table, use_legacy_dataset):
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
contents = bio.getvalue()
return pq.read_table(
pa.BufferReader(contents), use_legacy_dataset=use_legacy_dataset
)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_direct_read_dictionary(use_legacy_dataset):
# ARROW-3325
repeats = 10
nunique = 5
data = [
[util.rands(10) for i in range(nunique)] * repeats,
]
table = pa.table(data, names=['f0'])
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
contents = bio.getvalue()
result = pq.read_table(pa.BufferReader(contents),
read_dictionary=['f0'],
use_legacy_dataset=use_legacy_dataset)
# Compute dictionary-encoded subfield
expected = pa.table([table[0].dictionary_encode()], names=['f0'])
assert result.equals(expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_direct_read_dictionary_subfield(use_legacy_dataset):
repeats = 10
nunique = 5
data = [
[[util.rands(10)] for i in range(nunique)] * repeats,
]
table = pa.table(data, names=['f0'])
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
contents = bio.getvalue()
result = pq.read_table(pa.BufferReader(contents),
read_dictionary=['f0.list.item'],
use_legacy_dataset=use_legacy_dataset)
arr = pa.array(data[0])
values_as_dict = arr.values.dictionary_encode()
inner_indices = values_as_dict.indices.cast('int32')
new_values = pa.DictionaryArray.from_arrays(inner_indices,
values_as_dict.dictionary)
offsets = pa.array(range(51), type='int32')
expected_arr = pa.ListArray.from_arrays(offsets, new_values)
expected = pa.table([expected_arr], names=['f0'])
assert result.equals(expected)
assert result[0].num_chunks == 1
@parametrize_legacy_dataset
def test_dictionary_array_automatically_read(use_legacy_dataset):
# ARROW-3246
# Make a large dictionary, a little over 4MB of data
dict_length = 4000
dict_values = pa.array([('x' * 1000 + '_{}'.format(i))
for i in range(dict_length)])
num_chunks = 10
chunk_size = 100
chunks = []
for i in range(num_chunks):
indices = np.random.randint(0, dict_length,
size=chunk_size).astype(np.int32)
chunks.append(pa.DictionaryArray.from_arrays(pa.array(indices),
dict_values))
table = pa.table([pa.chunked_array(chunks)], names=['f0'])
result = _simple_table_write_read(table, use_legacy_dataset)
assert result.equals(table)
# The only key in the metadata was the Arrow schema key
assert result.schema.metadata is None
# Decimal
# -----------------------------------------------------------------------------
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_decimal_roundtrip(tempdir, use_legacy_dataset):
num_values = 10
columns = {}
for precision in range(1, 39):
for scale in range(0, precision + 1):
with util.random_seed(0):
random_decimal_values = [
util.randdecimal(precision, scale)
for _ in range(num_values)
]
column_name = ('dec_precision_{:d}_scale_{:d}'
.format(precision, scale))
columns[column_name] = random_decimal_values
expected = pd.DataFrame(columns)
filename = tempdir / 'decimals.parquet'
string_filename = str(filename)
table = pa.Table.from_pandas(expected)
_write_table(table, string_filename)
result_table = _read_table(
string_filename, use_legacy_dataset=use_legacy_dataset)
result = result_table.to_pandas()
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@pytest.mark.xfail(
raises=OSError, reason='Parquet does not support negative scale'
)
def test_decimal_roundtrip_negative_scale(tempdir):
expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]})
filename = tempdir / 'decimals.parquet'
string_filename = str(filename)
t = pa.Table.from_pandas(expected)
_write_table(t, string_filename)
result_table = _read_table(string_filename)
result = result_table.to_pandas()
tm.assert_frame_equal(result, expected)
# List types
# -----------------------------------------------------------------------------
@parametrize_legacy_dataset
@pytest.mark.parametrize('dtype', [int, float])
def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset):
filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__)
data = [pa.array(list(map(dtype, range(5))))]
table = pa.Table.from_arrays(data, names=['a'])
_write_table(table, filename)
table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
for i in range(table.num_columns):
col_written = table[i]
col_read = table_read[i]
assert table.field(i).name == table_read.field(i).name
assert col_read.num_chunks == 1
data_written = col_written.chunk(0)
data_read = col_read.chunk(0)
assert data_written.equals(data_read)
@parametrize_legacy_dataset
def test_empty_lists_table_roundtrip(use_legacy_dataset):
# ARROW-2744: Shouldn't crash when writing an array of empty lists
arr = pa.array([[], []], type=pa.list_(pa.int32()))
table = pa.Table.from_arrays([arr], ["A"])
_check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset):
# Reproduce failure in ARROW-5630
typ = pa.list_(pa.field("item", pa.float32(), False))
num_rows = 10000
t = pa.table([
pa.array(([[0] * ((i + 5) % 10) for i in range(0, 10)] *
(num_rows // 10)), type=typ)
], ['a'])
_check_roundtrip(
t, data_page_size=4096, use_legacy_dataset=use_legacy_dataset)
@parametrize_legacy_dataset
def test_nested_list_struct_multiple_batches_roundtrip(
tempdir, use_legacy_dataset
):
# Reproduce failure in ARROW-11024
data = [[{'x': 'abc', 'y': 'abc'}]]*100 + [[{'x': 'abc', 'y': 'gcb'}]]*100
table = pa.table([pa.array(data)], names=['column'])
_check_roundtrip(
table, row_group_size=20, use_legacy_dataset=use_legacy_dataset)
# Reproduce failure in ARROW-11069 (plain non-nested structs with strings)
data = pa.array(
[{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]*10
)
table = pa.table({'column': data})
_check_roundtrip(
table, row_group_size=10, use_legacy_dataset=use_legacy_dataset)
def test_writing_empty_lists():
# ARROW-2591: [Python] Segmentation fault issue in pq.write_table
arr1 = pa.array([[], []], pa.list_(pa.int32()))
table = pa.Table.from_arrays([arr1], ['list(int32)'])
_check_roundtrip(table)
@pytest.mark.pandas
def test_column_of_arrays(tempdir):
df, schema = dataframe_with_arrays()
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df, schema=schema)
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
table_read = _read_table(filename)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
def test_column_of_lists(tempdir):
df, schema = dataframe_with_lists(parquet_compatible=True)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df, schema=schema)
_write_table(arrow_table, filename, version='2.6')
table_read = _read_table(filename)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
def test_large_list_records():
# This was fixed in PARQUET-1100
list_lengths = np.random.randint(0, 500, size=50)
list_lengths[::10] = 0
list_values = [list(map(int, np.random.randint(0, 100, size=x)))
if i % 8 else None
for i, x in enumerate(list_lengths)]
a1 = pa.array(list_values)
table = pa.Table.from_arrays([a1], ['int_lists'])
_check_roundtrip(table)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_nested_convenience(tempdir, use_legacy_dataset):
# ARROW-1684
df = pd.DataFrame({
'a': [[1, 2, 3], None, [4, 5], []],
'b': [[1.], None, None, [6., 7.]],
})
path = str(tempdir / 'nested_convenience.parquet')
table = pa.Table.from_pandas(df, preserve_index=False)
_write_table(table, path)
read = pq.read_table(
path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
tm.assert_frame_equal(read.to_pandas(), df[['a']])
read = pq.read_table(
path, columns=['a', 'b'], use_legacy_dataset=use_legacy_dataset)
tm.assert_frame_equal(read.to_pandas(), df)
# Binary
# -----------------------------------------------------------------------------
def test_fixed_size_binary():
t0 = pa.binary(10)
data = [b'fooooooooo', None, b'barooooooo', b'quxooooooo']
a0 = pa.array(data, type=t0)
table = pa.Table.from_arrays([a0],
['binary[10]'])
_check_roundtrip(table)
# Large types
# -----------------------------------------------------------------------------
@pytest.mark.slow
@pytest.mark.large_memory
def test_large_table_int32_overflow():
size = np.iinfo('int32').max + 1
arr = np.ones(size, dtype='uint8')
parr = pa.array(arr, type=pa.uint8())
table = pa.Table.from_arrays([parr], names=['one'])
f = io.BytesIO()
_write_table(table, f)
def _simple_table_roundtrip(table, use_legacy_dataset=False, **write_kwargs):
stream = pa.BufferOutputStream()
_write_table(table, stream, **write_kwargs)
buf = stream.getvalue()
return _read_table(buf, use_legacy_dataset=use_legacy_dataset)
@pytest.mark.slow
@pytest.mark.large_memory
@parametrize_legacy_dataset
def test_byte_array_exactly_2gb(use_legacy_dataset):
# Test edge case reported in ARROW-3762
val = b'x' * (1 << 10)
base = pa.array([val] * ((1 << 21) - 1))
cases = [
[b'x' * 1023], # 2^31 - 1
[b'x' * 1024], # 2^31
[b'x' * 1025] # 2^31 + 1
]
for case in cases:
values = pa.chunked_array([base, pa.array(case)])
t = pa.table([values], names=['f0'])
result = _simple_table_roundtrip(
t, use_legacy_dataset=use_legacy_dataset, use_dictionary=False)
assert t.equals(result)
@pytest.mark.slow
@pytest.mark.pandas
@pytest.mark.large_memory
@parametrize_legacy_dataset
def test_binary_array_overflow_to_chunked(use_legacy_dataset):
# ARROW-3762
# 2^31 + 1 bytes
values = [b'x'] + [
b'x' * (1 << 20)
] * 2 * (1 << 10)
df = pd.DataFrame({'byte_col': values})
tbl = pa.Table.from_pandas(df, preserve_index=False)
read_tbl = _simple_table_roundtrip(
tbl, use_legacy_dataset=use_legacy_dataset)
col0_data = read_tbl[0]
assert isinstance(col0_data, pa.ChunkedArray)
# Split up into 2GB chunks
assert col0_data.num_chunks == 2
assert tbl.equals(read_tbl)
@pytest.mark.slow
@pytest.mark.pandas
@pytest.mark.large_memory
@parametrize_legacy_dataset
def test_list_of_binary_large_cell(use_legacy_dataset):
# ARROW-4688
data = []
# TODO(wesm): handle chunked children
# 2^31 - 1 bytes in a single cell
# data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
    # A little under 2GiB in total, in cells of approximately 10MB each
data.extend([[b'x' * 1000000] * 10] * 214)
arr = pa.array(data)
table = pa.Table.from_arrays([arr], ['chunky_cells'])
read_table = _simple_table_roundtrip(
table, use_legacy_dataset=use_legacy_dataset)
assert table.equals(read_table)
def test_large_binary():
data = [b'foo', b'bar'] * 50
for type in [pa.large_binary(), pa.large_string()]:
arr = pa.array(data, type=type)
table = pa.Table.from_arrays([arr], names=['strs'])
for use_dictionary in [False, True]:
_check_roundtrip(table, use_dictionary=use_dictionary)
@pytest.mark.slow
@pytest.mark.large_memory
def test_large_binary_huge():
s = b'xy' * 997
data = [s] * ((1 << 33) // len(s))
for type in [pa.large_binary(), pa.large_string()]:
arr = pa.array(data, type=type)
table = pa.Table.from_arrays([arr], names=['strs'])
for use_dictionary in [False, True]:
_check_roundtrip(table, use_dictionary=use_dictionary)
del arr, table
@pytest.mark.large_memory
def test_large_binary_overflow():
s = b'x' * (1 << 31)
arr = pa.array([s], type=pa.large_binary())
table = pa.Table.from_arrays([arr], names=['strs'])
for use_dictionary in [False, True]:
writer = pa.BufferOutputStream()
with pytest.raises(
pa.ArrowInvalid,
match="Parquet cannot store strings with size 2GB or more"):
_write_table(table, writer, use_dictionary=use_dictionary)

File diff suppressed because it is too large


@@ -0,0 +1,446 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import datetime
import io
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.tests.parquet.common import (
_check_roundtrip, parametrize_legacy_dataset)
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
except ImportError:
pd = tm = None
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_datetime_tz(use_legacy_dataset):
s = pd.Series([datetime.datetime(2017, 9, 6)])
s = s.dt.tz_localize('utc')
s.index = s
# Both a column and an index to hit both use cases
df = pd.DataFrame({'tz_aware': s,
'tz_eastern': s.dt.tz_convert('US/Eastern')},
index=s)
f = io.BytesIO()
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, f, coerce_timestamps='ms')
f.seek(0)
table_read = pq.read_pandas(f, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_datetime_timezone_tzinfo(use_legacy_dataset):
value = datetime.datetime(2018, 1, 1, 1, 23, 45,
tzinfo=datetime.timezone.utc)
df = pd.DataFrame({'foo': [value]})
_roundtrip_pandas_dataframe(
df, write_kwargs={}, use_legacy_dataset=use_legacy_dataset)
@pytest.mark.pandas
def test_coerce_timestamps(tempdir):
from collections import OrderedDict
# ARROW-622
arrays = OrderedDict()
fields = [pa.field('datetime64',
pa.list_(pa.timestamp('ms')))]
arrays['datetime64'] = [
np.array(['2007-07-13T01:23:34.123456789',
None,
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ms]'),
None,
None,
np.array(['2007-07-13T02',
None,
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ms]'),
]
df = pd.DataFrame(arrays)
schema = pa.schema(fields)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df, schema=schema)
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='us')
table_read = _read_table(filename)
df_read = table_read.to_pandas()
df_expected = df.copy()
for i, x in enumerate(df_expected['datetime64']):
if isinstance(x, np.ndarray):
df_expected['datetime64'][i] = x.astype('M8[us]')
tm.assert_frame_equal(df_expected, df_read)
with pytest.raises(ValueError):
_write_table(arrow_table, filename, version='2.6',
coerce_timestamps='unknown')
@pytest.mark.pandas
def test_coerce_timestamps_truncated(tempdir):
"""
ARROW-2555: Test that we can truncate timestamps when coercing if
explicitly allowed.
"""
dt_us = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
second=1, microsecond=1)
dt_ms = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
second=1)
fields_us = [pa.field('datetime64', pa.timestamp('us'))]
arrays_us = {'datetime64': [dt_us, dt_ms]}
df_us = pd.DataFrame(arrays_us)
schema_us = pa.schema(fields_us)
filename = tempdir / 'pandas_truncated.parquet'
table_us = pa.Table.from_pandas(df_us, schema=schema_us)
_write_table(table_us, filename, version='2.6', coerce_timestamps='ms',
allow_truncated_timestamps=True)
table_ms = _read_table(filename)
df_ms = table_ms.to_pandas()
arrays_expected = {'datetime64': [dt_ms, dt_ms]}
df_expected = pd.DataFrame(arrays_expected)
tm.assert_frame_equal(df_expected, df_ms)
@pytest.mark.pandas
def test_date_time_types(tempdir):
t1 = pa.date32()
data1 = np.array([17259, 17260, 17261], dtype='int32')
a1 = pa.array(data1, type=t1)
t2 = pa.date64()
data2 = data1.astype('int64') * 86400000
a2 = pa.array(data2, type=t2)
t3 = pa.timestamp('us')
start = pd.Timestamp('2001-01-01').value / 1000
data3 = np.array([start, start + 1, start + 2], dtype='int64')
a3 = pa.array(data3, type=t3)
t4 = pa.time32('ms')
data4 = np.arange(3, dtype='i4')
a4 = pa.array(data4, type=t4)
t5 = pa.time64('us')
a5 = pa.array(data4.astype('int64'), type=t5)
t6 = pa.time32('s')
a6 = pa.array(data4, type=t6)
ex_t6 = pa.time32('ms')
ex_a6 = pa.array(data4 * 1000, type=ex_t6)
t7 = pa.timestamp('ns')
start = pd.Timestamp('2001-01-01').value
data7 = np.array([start, start + 1000, start + 2000],
dtype='int64')
a7 = pa.array(data7, type=t7)
table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
['date32', 'date64', 'timestamp[us]',
'time32[s]', 'time64[us]',
'time32_from64[s]',
'timestamp[ns]'])
# date64 as date32
# time32[s] to time32[ms]
expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
['date32', 'date64', 'timestamp[us]',
'time32[s]', 'time64[us]',
'time32_from64[s]',
'timestamp[ns]'])
_check_roundtrip(table, expected=expected, version='2.6')
t0 = pa.timestamp('ms')
data0 = np.arange(4, dtype='int64')
a0 = pa.array(data0, type=t0)
t1 = pa.timestamp('us')
data1 = np.arange(4, dtype='int64')
a1 = pa.array(data1, type=t1)
t2 = pa.timestamp('ns')
data2 = np.arange(4, dtype='int64')
a2 = pa.array(data2, type=t2)
table = pa.Table.from_arrays([a0, a1, a2],
['ts[ms]', 'ts[us]', 'ts[ns]'])
expected = pa.Table.from_arrays([a0, a1, a2],
['ts[ms]', 'ts[us]', 'ts[ns]'])
# int64 for all timestamps supported by default
filename = tempdir / 'int64_timestamps.parquet'
_write_table(table, filename, version='2.6')
parquet_schema = pq.ParquetFile(filename).schema
for i in range(3):
assert parquet_schema.column(i).physical_type == 'INT64'
read_table = _read_table(filename)
assert read_table.equals(expected)
t0_ns = pa.timestamp('ns')
data0_ns = np.array(data0 * 1000000, dtype='int64')
a0_ns = pa.array(data0_ns, type=t0_ns)
t1_ns = pa.timestamp('ns')
data1_ns = np.array(data1 * 1000, dtype='int64')
a1_ns = pa.array(data1_ns, type=t1_ns)
expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
['ts[ms]', 'ts[us]', 'ts[ns]'])
# int96 nanosecond timestamps produced upon request
filename = tempdir / 'explicit_int96_timestamps.parquet'
_write_table(table, filename, version='2.6',
use_deprecated_int96_timestamps=True)
parquet_schema = pq.ParquetFile(filename).schema
for i in range(3):
assert parquet_schema.column(i).physical_type == 'INT96'
read_table = _read_table(filename)
assert read_table.equals(expected)
# int96 nanosecond timestamps implied by flavor 'spark'
filename = tempdir / 'spark_int96_timestamps.parquet'
_write_table(table, filename, version='2.6',
flavor='spark')
parquet_schema = pq.ParquetFile(filename).schema
for i in range(3):
assert parquet_schema.column(i).physical_type == 'INT96'
read_table = _read_table(filename)
assert read_table.equals(expected)
@pytest.mark.pandas
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_coerce_int96_timestamp_unit(unit):
i_s = pd.Timestamp('2010-01-01').value / 1000000000 # := 1262304000
d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
d_ms = d_s * 1000
d_us = d_ms * 1000
d_ns = d_us * 1000
a_s = pa.array(d_s, type=pa.timestamp('s'))
a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
a_us = pa.array(d_us, type=pa.timestamp('us'))
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
arrays = {"s": a_s, "ms": a_ms, "us": a_us, "ns": a_ns}
names = ['ts_s', 'ts_ms', 'ts_us', 'ts_ns']
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
    # INT96 stores the values at full (nanosecond) resolution, so on read
    # they can be coerced to whichever unit is requested
expected = pa.Table.from_arrays([arrays.get(unit)]*4, names)
read_table_kwargs = {"coerce_int96_timestamp_unit": unit}
_check_roundtrip(table, expected,
read_table_kwargs=read_table_kwargs,
use_deprecated_int96_timestamps=True)
_check_roundtrip(table, expected, version='2.6',
read_table_kwargs=read_table_kwargs,
use_deprecated_int96_timestamps=True)
@pytest.mark.pandas
@pytest.mark.parametrize('pq_reader_method', ['ParquetFile', 'read_table'])
def test_coerce_int96_timestamp_overflow(pq_reader_method, tempdir):
def get_table(pq_reader_method, filename, **kwargs):
if pq_reader_method == "ParquetFile":
return pq.ParquetFile(filename, **kwargs).read()
elif pq_reader_method == "read_table":
return pq.read_table(filename, **kwargs)
# Recreating the initial JIRA issue referenced in ARROW-12096
oob_dts = [
datetime.datetime(1000, 1, 1),
datetime.datetime(2000, 1, 1),
datetime.datetime(3000, 1, 1)
]
df = pd.DataFrame({"a": oob_dts})
table = pa.table(df)
filename = tempdir / "test_round_trip_overflow.parquet"
pq.write_table(table, filename, use_deprecated_int96_timestamps=True,
version="1.0")
# with the default resolution of ns, we get wrong values for INT96
# that are out of bounds for nanosecond range
tab_error = get_table(pq_reader_method, filename)
assert tab_error["a"].to_pylist() != oob_dts
# avoid this overflow by specifying the resolution to use for INT96 values
tab_correct = get_table(
pq_reader_method, filename, coerce_int96_timestamp_unit="s"
)
df_correct = tab_correct.to_pandas(timestamp_as_object=True)
tm.assert_frame_equal(df, df_correct)
def test_timestamp_restore_timezone():
# ARROW-5888, restore timezone from serialized metadata
ty = pa.timestamp('ms', tz='America/New_York')
arr = pa.array([1, 2, 3], type=ty)
t = pa.table([arr], names=['f0'])
_check_roundtrip(t)
def test_timestamp_restore_timezone_nanosecond():
# ARROW-9634, also restore timezone for nanosecond data that get stored
# as microseconds in the parquet file
ty = pa.timestamp('ns', tz='America/New_York')
arr = pa.array([1000, 2000, 3000], type=ty)
table = pa.table([arr], names=['f0'])
ty_us = pa.timestamp('us', tz='America/New_York')
expected = pa.table([arr.cast(ty_us)], names=['f0'])
_check_roundtrip(table, expected=expected)
@pytest.mark.pandas
def test_list_of_datetime_time_roundtrip():
# ARROW-4135
times = pd.to_datetime(['09:00', '09:30', '10:00', '10:30', '11:00',
'11:30', '12:00'])
df = pd.DataFrame({'time': [times.time]})
_roundtrip_pandas_dataframe(df, write_kwargs={})
@pytest.mark.pandas
def test_parquet_version_timestamp_differences():
i_s = pd.Timestamp('2010-01-01').value / 1000000000 # := 1262304000
d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
d_ms = d_s * 1000
d_us = d_ms * 1000
d_ns = d_us * 1000
a_s = pa.array(d_s, type=pa.timestamp('s'))
a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
a_us = pa.array(d_us, type=pa.timestamp('us'))
a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
# Using Parquet version 1.0, seconds should be coerced to milliseconds
# and nanoseconds should be coerced to microseconds by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
_check_roundtrip(table, expected)
    # Using Parquet version 2.6, seconds should be coerced to milliseconds
    # and nanoseconds should be retained by default
expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
_check_roundtrip(table, expected, version='2.6')
# Using Parquet version 1.0, coercing to milliseconds or microseconds
# is allowed
expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
_check_roundtrip(table, expected, coerce_timestamps='ms')
    # Using Parquet version 2.6, coercing to milliseconds or microseconds
    # is allowed
expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
_check_roundtrip(table, expected, version='2.6', coerce_timestamps='us')
# TODO: after pyarrow allows coerce_timestamps='ns', tests like the
# following should pass ...
# Using Parquet version 1.0, coercing to nanoseconds is not allowed
# expected = None
# with pytest.raises(NotImplementedError):
# _roundtrip_table(table, coerce_timestamps='ns')
# Using Parquet version 2.0, coercing to nanoseconds is allowed
# expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
# _check_roundtrip(table, expected, version='2.6', coerce_timestamps='ns')
# For either Parquet version, coercing to nanoseconds is allowed
# if Int96 storage is used
expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
_check_roundtrip(table, expected,
use_deprecated_int96_timestamps=True)
_check_roundtrip(table, expected, version='2.6',
use_deprecated_int96_timestamps=True)
@pytest.mark.pandas
def test_noncoerced_nanoseconds_written_without_exception(tempdir):
    # ARROW-1957: the Parquet version 2.6 writer preserves Arrow
    # nanosecond timestamps by default
n = 9
df = pd.DataFrame({'x': range(n)},
index=pd.date_range('2017-01-01', freq='1n', periods=n))
tb = pa.Table.from_pandas(df)
filename = tempdir / 'written.parquet'
try:
pq.write_table(tb, filename, version='2.6')
except Exception:
pass
assert filename.exists()
recovered_table = pq.read_table(filename)
assert tb.equals(recovered_table)
# Loss of data through coercion (without explicit override) still an error
filename = tempdir / 'not_written.parquet'
with pytest.raises(ValueError):
pq.write_table(tb, filename, coerce_timestamps='ms', version='2.6')
def test_duration_type():
# ARROW-6780
arrays = [pa.array([0, 1, 2, 3], type=pa.duration(unit))
for unit in ["s", "ms", "us", "ns"]]
table = pa.Table.from_arrays(arrays, ["d[s]", "d[ms]", "d[us]", "d[ns]"])
_check_roundtrip(table)
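# Illustrative sketch (not collected by pytest): a minimal coerce_timestamps
# round trip through an in-memory buffer, using only APIs exercised above.
# The function name and the column name 'ts' are arbitrary.
def _example_coerce_timestamps_roundtrip():
    arr = pa.array([0, 1000, 2000], type=pa.timestamp('ns'))
    table = pa.Table.from_arrays([arr], ['ts'])
    sink = pa.BufferOutputStream()
    # Nanosecond values that are exact multiples of 1 us coerce losslessly
    pq.write_table(table, sink, coerce_timestamps='us')
    result = pq.read_table(pa.BufferReader(sink.getvalue()))
    assert result['ts'].type == pa.timestamp('us')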

View File

@ -0,0 +1,530 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from datetime import timedelta
import pyarrow as pa
try:
import pyarrow.parquet as pq
import pyarrow.parquet.encryption as pe
except ImportError:
pq = None
pe = None
else:
from pyarrow.tests.parquet.encryption import (
InMemoryKmsClient, verify_file_encrypted)
PARQUET_NAME = 'encrypted_table.in_mem.parquet'
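# 16-byte (128-bit) master keys for the in-memory KMS used throughout this module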
FOOTER_KEY = b"0123456789112345"
FOOTER_KEY_NAME = "footer_key"
COL_KEY = b"1234567890123450"
COL_KEY_NAME = "col_key"
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet_encryption'
pytestmark = pytest.mark.parquet_encryption
@pytest.fixture(scope='module')
def data_table():
data_table = pa.Table.from_pydict({
'a': pa.array([1, 2, 3]),
'b': pa.array(['a', 'b', 'c']),
'c': pa.array(['x', 'y', 'z'])
})
return data_table
@pytest.fixture(scope='module')
def basic_encryption_config():
basic_encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={
COL_KEY_NAME: ["a", "b"],
})
return basic_encryption_config
def test_encrypted_parquet_write_read(tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted, and then read it."""
path = tempdir / PARQUET_NAME
# Encrypt the footer with the footer key,
# encrypt column `a` and column `b` with another key,
# keep `c` plaintext
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={
COL_KEY_NAME: ["a", "b"],
},
encryption_algorithm="AES_GCM_V1",
cache_lifetime=timedelta(minutes=5.0),
data_key_length_bits=256)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
verify_file_encrypted(path)
# Read with decryption properties
decryption_config = pe.DecryptionConfiguration(
cache_lifetime=timedelta(minutes=5.0))
result_table = read_encrypted_parquet(
path, decryption_config, kms_connection_config, crypto_factory)
assert data_table.equals(result_table)
def write_encrypted_parquet(path, table, encryption_config,
kms_connection_config, crypto_factory):
file_encryption_properties = crypto_factory.file_encryption_properties(
kms_connection_config, encryption_config)
assert(file_encryption_properties is not None)
with pq.ParquetWriter(
path, table.schema,
encryption_properties=file_encryption_properties) as writer:
writer.write_table(table)
def read_encrypted_parquet(path, decryption_config,
kms_connection_config, crypto_factory):
file_decryption_properties = crypto_factory.file_decryption_properties(
kms_connection_config, decryption_config)
assert(file_decryption_properties is not None)
meta = pq.read_metadata(
path, decryption_properties=file_decryption_properties)
assert(meta.num_columns == 3)
schema = pq.read_schema(
path, decryption_properties=file_decryption_properties)
assert(len(schema.names) == 3)
result = pq.ParquetFile(
path, decryption_properties=file_decryption_properties)
return result.read(use_threads=False)
def test_encrypted_parquet_write_read_wrong_key(tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted,
and then read it using wrong keys."""
path = tempdir / PARQUET_NAME
# Encrypt the footer with the footer key,
# encrypt column `a` and column `b` with another key,
# keep `c` plaintext
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={
COL_KEY_NAME: ["a", "b"],
},
encryption_algorithm="AES_GCM_V1",
cache_lifetime=timedelta(minutes=5.0),
data_key_length_bits=256)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
verify_file_encrypted(path)
# Read with decryption properties
wrong_kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
# Wrong keys - mixup in names
FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"),
COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
}
)
decryption_config = pe.DecryptionConfiguration(
cache_lifetime=timedelta(minutes=5.0))
with pytest.raises(ValueError, match=r"Incorrect master key used"):
read_encrypted_parquet(
path, decryption_config, wrong_kms_connection_config,
crypto_factory)
def test_encrypted_parquet_read_no_decryption_config(tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted,
but then try to read it without decryption properties."""
test_encrypted_parquet_write_read(tempdir, data_table)
# Read without decryption properties
with pytest.raises(IOError, match=r"no decryption"):
pq.ParquetFile(tempdir / PARQUET_NAME).read()
def test_encrypted_parquet_read_metadata_no_decryption_config(
tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted,
but then try to read its metadata without decryption properties."""
test_encrypted_parquet_write_read(tempdir, data_table)
# Read metadata without decryption properties
with pytest.raises(IOError, match=r"no decryption"):
pq.read_metadata(tempdir / PARQUET_NAME)
def test_encrypted_parquet_read_schema_no_decryption_config(
tempdir, data_table):
"""Write an encrypted parquet, verify it's encrypted,
but then try to read its schema without decryption properties."""
test_encrypted_parquet_write_read(tempdir, data_table)
with pytest.raises(IOError, match=r"no decryption"):
pq.read_schema(tempdir / PARQUET_NAME)
def test_encrypted_parquet_write_no_col_key(tempdir, data_table):
"""Write an encrypted parquet, but give only footer key,
without column key."""
path = tempdir / 'encrypted_table_no_col_key.in_mem.parquet'
# Encrypt the footer with the footer key
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(OSError,
match="Either column_keys or uniform_encryption "
"must be set"):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_write_kms_error(tempdir, data_table,
basic_encryption_config):
"""Write an encrypted parquet, but raise KeyError in KmsClient."""
path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'
encryption_config = basic_encryption_config
# Empty master_keys_map
kms_connection_config = pe.KmsConnectionConfig()
def kms_factory(kms_connection_configuration):
# Empty master keys map will cause KeyError to be raised
# on wrap/unwrap calls
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(KeyError, match="footer_key"):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_write_kms_specific_error(tempdir, data_table,
basic_encryption_config):
"""Write an encrypted parquet, but raise KeyError in KmsClient."""
path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'
encryption_config = basic_encryption_config
# Empty master_keys_map
kms_connection_config = pe.KmsConnectionConfig()
class ThrowingKmsClient(pe.KmsClient):
"""A KmsClient implementation that throws exception in
wrap/unwrap calls
"""
def __init__(self, config):
"""Create an InMemoryKmsClient instance."""
pe.KmsClient.__init__(self)
self.config = config
def wrap_key(self, key_bytes, master_key_identifier):
raise ValueError("Cannot Wrap Key")
def unwrap_key(self, wrapped_key, master_key_identifier):
raise ValueError("Cannot Unwrap Key")
def kms_factory(kms_connection_configuration):
# Exception thrown in wrap/unwrap calls
return ThrowingKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(ValueError, match="Cannot Wrap Key"):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_write_kms_factory_error(tempdir, data_table,
basic_encryption_config):
"""Write an encrypted parquet, but raise ValueError in kms_factory."""
path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'
encryption_config = basic_encryption_config
# Empty master_keys_map
kms_connection_config = pe.KmsConnectionConfig()
def kms_factory(kms_connection_configuration):
raise ValueError('Cannot create KmsClient')
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(ValueError,
match="Cannot create KmsClient"):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_write_kms_factory_type_error(
tempdir, data_table, basic_encryption_config):
"""Write an encrypted parquet, but use wrong KMS client type
that doesn't implement KmsClient."""
path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'
encryption_config = basic_encryption_config
# Empty master_keys_map
kms_connection_config = pe.KmsConnectionConfig()
class WrongTypeKmsClient():
"""This is not an implementation of KmsClient.
"""
def __init__(self, config):
self.master_keys_map = config.custom_kms_conf
def wrap_key(self, key_bytes, master_key_identifier):
return None
def unwrap_key(self, wrapped_key, master_key_identifier):
return None
def kms_factory(kms_connection_configuration):
return WrongTypeKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
with pytest.raises(TypeError):
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
def test_encrypted_parquet_encryption_configuration():
def validate_encryption_configuration(encryption_config):
assert(FOOTER_KEY_NAME == encryption_config.footer_key)
assert(["a", "b"] == encryption_config.column_keys[COL_KEY_NAME])
assert("AES_GCM_CTR_V1" == encryption_config.encryption_algorithm)
assert(encryption_config.plaintext_footer)
assert(not encryption_config.double_wrapping)
assert(timedelta(minutes=10.0) == encryption_config.cache_lifetime)
assert(not encryption_config.internal_key_material)
assert(192 == encryption_config.data_key_length_bits)
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={COL_KEY_NAME: ["a", "b"], },
encryption_algorithm="AES_GCM_CTR_V1",
plaintext_footer=True,
double_wrapping=False,
cache_lifetime=timedelta(minutes=10.0),
internal_key_material=False,
data_key_length_bits=192,
)
validate_encryption_configuration(encryption_config)
encryption_config_1 = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME)
encryption_config_1.column_keys = {COL_KEY_NAME: ["a", "b"], }
encryption_config_1.encryption_algorithm = "AES_GCM_CTR_V1"
encryption_config_1.plaintext_footer = True
encryption_config_1.double_wrapping = False
encryption_config_1.cache_lifetime = timedelta(minutes=10.0)
encryption_config_1.internal_key_material = False
encryption_config_1.data_key_length_bits = 192
validate_encryption_configuration(encryption_config_1)
def test_encrypted_parquet_decryption_configuration():
decryption_config = pe.DecryptionConfiguration(
cache_lifetime=timedelta(minutes=10.0))
assert(timedelta(minutes=10.0) == decryption_config.cache_lifetime)
decryption_config_1 = pe.DecryptionConfiguration()
decryption_config_1.cache_lifetime = timedelta(minutes=10.0)
assert(timedelta(minutes=10.0) == decryption_config_1.cache_lifetime)
def test_encrypted_parquet_kms_configuration():
def validate_kms_connection_config(kms_connection_config):
assert("Instance1" == kms_connection_config.kms_instance_id)
assert("URL1" == kms_connection_config.kms_instance_url)
assert("MyToken" == kms_connection_config.key_access_token)
assert({"key1": "key_material_1", "key2": "key_material_2"} ==
kms_connection_config.custom_kms_conf)
kms_connection_config = pe.KmsConnectionConfig(
kms_instance_id="Instance1",
kms_instance_url="URL1",
key_access_token="MyToken",
custom_kms_conf={
"key1": "key_material_1",
"key2": "key_material_2",
})
validate_kms_connection_config(kms_connection_config)
kms_connection_config_1 = pe.KmsConnectionConfig()
kms_connection_config_1.kms_instance_id = "Instance1"
kms_connection_config_1.kms_instance_url = "URL1"
kms_connection_config_1.key_access_token = "MyToken"
kms_connection_config_1.custom_kms_conf = {
"key1": "key_material_1",
"key2": "key_material_2",
}
validate_kms_connection_config(kms_connection_config_1)
@pytest.mark.xfail(reason="Plaintext footer - reading plaintext column subset"
" reads encrypted columns too")
def test_encrypted_parquet_write_read_plain_footer_single_wrapping(
tempdir, data_table):
"""Write an encrypted parquet, with plaintext footer
and with single wrapping,
verify it's encrypted, and then read plaintext columns."""
path = tempdir / PARQUET_NAME
# Encrypt the footer with the footer key,
# encrypt column `a` and column `b` with another key,
# keep `c` plaintext
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={
COL_KEY_NAME: ["a", "b"],
},
plaintext_footer=True,
double_wrapping=False)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
# # Read without decryption properties only the plaintext column
# result = pq.ParquetFile(path)
# result_table = result.read(columns='c', use_threads=False)
# assert table.num_rows == result_table.num_rows
@pytest.mark.xfail(reason="External key material not supported yet")
def test_encrypted_parquet_write_external(tempdir, data_table):
"""Write an encrypted parquet, with external key
material.
Currently it's not implemented, so should throw
an exception"""
path = tempdir / PARQUET_NAME
# Encrypt the file with the footer key
encryption_config = pe.EncryptionConfiguration(
footer_key=FOOTER_KEY_NAME,
column_keys={},
internal_key_material=False)
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8")}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
@pytest.mark.skip(reason="ARROW-14114: Multithreaded read sometimes fails "
                         "decryption finalization or with a segmentation fault")
def test_encrypted_parquet_loop(tempdir, data_table, basic_encryption_config):
"""Write an encrypted parquet, verify it's encrypted,
and then read it multithreaded in a loop."""
path = tempdir / PARQUET_NAME
# Encrypt the footer with the footer key,
# encrypt column `a` and column `b` with another key,
# keep `c` plaintext
encryption_config = basic_encryption_config
kms_connection_config = pe.KmsConnectionConfig(
custom_kms_conf={
FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
COL_KEY_NAME: COL_KEY.decode("UTF-8"),
}
)
def kms_factory(kms_connection_configuration):
return InMemoryKmsClient(kms_connection_configuration)
crypto_factory = pe.CryptoFactory(kms_factory)
# Write with encryption properties
write_encrypted_parquet(path, data_table, encryption_config,
kms_connection_config, crypto_factory)
verify_file_encrypted(path)
decryption_config = pe.DecryptionConfiguration(
cache_lifetime=timedelta(minutes=5.0))
for i in range(50):
# Read with decryption properties
file_decryption_properties = crypto_factory.file_decryption_properties(
kms_connection_config, decryption_config)
assert(file_decryption_properties is not None)
result = pq.ParquetFile(
path, decryption_properties=file_decryption_properties)
result_table = result.read(use_threads=True)
assert data_table.equals(result_table)

View File

@ -0,0 +1,528 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import datetime
import decimal
from collections import OrderedDict
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.tests.parquet.common import _check_roundtrip, make_sample_file
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
pd = tm = None
@pytest.mark.pandas
def test_parquet_metadata_api():
df = alltypes_sample(size=10000)
df = df.reindex(columns=sorted(df.columns))
df.index = np.random.randint(0, 1000000, size=len(df))
fileh = make_sample_file(df)
ncols = len(df.columns)
# Series of sniff tests
meta = fileh.metadata
repr(meta)
assert meta.num_rows == len(df)
assert meta.num_columns == ncols + 1 # +1 for index
assert meta.num_row_groups == 1
assert meta.format_version == '2.6'
assert 'parquet-cpp' in meta.created_by
assert isinstance(meta.serialized_size, int)
assert isinstance(meta.metadata, dict)
# Schema
schema = fileh.schema
assert meta.schema is schema
assert len(schema) == ncols + 1 # +1 for index
repr(schema)
col = schema[0]
repr(col)
assert col.name == df.columns[0]
assert col.max_definition_level == 1
    assert col.max_repetition_level == 0
assert col.physical_type == 'BOOLEAN'
assert col.converted_type == 'NONE'
with pytest.raises(IndexError):
schema[ncols + 1] # +1 for index
with pytest.raises(IndexError):
schema[-1]
# Row group
for rg in range(meta.num_row_groups):
rg_meta = meta.row_group(rg)
assert isinstance(rg_meta, pq.RowGroupMetaData)
repr(rg_meta)
for col in range(rg_meta.num_columns):
col_meta = rg_meta.column(col)
assert isinstance(col_meta, pq.ColumnChunkMetaData)
repr(col_meta)
with pytest.raises(IndexError):
meta.row_group(-1)
with pytest.raises(IndexError):
meta.row_group(meta.num_row_groups + 1)
rg_meta = meta.row_group(0)
assert rg_meta.num_rows == len(df)
assert rg_meta.num_columns == ncols + 1 # +1 for index
assert rg_meta.total_byte_size > 0
with pytest.raises(IndexError):
col_meta = rg_meta.column(-1)
with pytest.raises(IndexError):
col_meta = rg_meta.column(ncols + 2)
col_meta = rg_meta.column(0)
assert col_meta.file_offset > 0
assert col_meta.file_path == '' # created from BytesIO
assert col_meta.physical_type == 'BOOLEAN'
assert col_meta.num_values == 10000
assert col_meta.path_in_schema == 'bool'
assert col_meta.is_stats_set is True
assert isinstance(col_meta.statistics, pq.Statistics)
assert col_meta.compression == 'SNAPPY'
assert col_meta.encodings == ('PLAIN', 'RLE')
assert col_meta.has_dictionary_page is False
assert col_meta.dictionary_page_offset is None
assert col_meta.data_page_offset > 0
assert col_meta.total_compressed_size > 0
assert col_meta.total_uncompressed_size > 0
with pytest.raises(NotImplementedError):
col_meta.has_index_page
with pytest.raises(NotImplementedError):
col_meta.index_page_offset
def test_parquet_metadata_lifetime(tempdir):
# ARROW-6642 - ensure that chained access keeps parent objects alive
table = pa.table({'a': [1, 2, 3]})
pq.write_table(table, tempdir / 'test_metadata_segfault.parquet')
parquet_file = pq.ParquetFile(tempdir / 'test_metadata_segfault.parquet')
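    # Chained access through temporary metadata objects must not crash once
    # the intermediate FileMetaData/RowGroupMetaData objects go out of scope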
parquet_file.metadata.row_group(0).column(0).statistics
@pytest.mark.pandas
@pytest.mark.parametrize(
(
'data',
'type',
'physical_type',
'min_value',
'max_value',
'null_count',
'num_values',
'distinct_count'
),
[
([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
(
[-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
'FLOAT', -1.1, 4.4, 1, 4, 0
),
(
[-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
'DOUBLE', -1.1, 4.4, 1, 4, 0
),
(
['', 'b', chr(1000), None, 'aaa'], pa.binary(),
'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, 0
),
(
[True, False, False, True, True], pa.bool_(),
'BOOLEAN', False, True, 0, 5, 0
),
(
[b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
'BYTE_ARRAY', b'\x00', b'b', 1, 4, 0
),
]
)
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
max_value, null_count, num_values,
distinct_count):
df = pd.DataFrame({'data': data})
schema = pa.schema([pa.field('data', type)])
table = pa.Table.from_pandas(df, schema=schema, safe=False)
fileh = make_sample_file(table)
meta = fileh.metadata
rg_meta = meta.row_group(0)
col_meta = rg_meta.column(0)
stat = col_meta.statistics
assert stat.has_min_max
assert _close(type, stat.min, min_value)
assert _close(type, stat.max, max_value)
assert stat.null_count == null_count
assert stat.num_values == num_values
    # TODO(kszucs): until the parquet-cpp API exposes the HasDistinctCount
    # method, a missing distinct_count is reported as zero instead of None
assert stat.distinct_count == distinct_count
assert stat.physical_type == physical_type
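# Compare statistics values exactly, except floats, which allow a small tolerance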
def _close(type, left, right):
if type == pa.float32():
return abs(left - right) < 1E-7
elif type == pa.float64():
return abs(left - right) < 1E-13
else:
return left == right
# ARROW-6339
@pytest.mark.pandas
def test_parquet_raise_on_unset_statistics():
df = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")})
meta = make_sample_file(pa.Table.from_pandas(df)).metadata
assert not meta.row_group(0).column(0).statistics.has_min_max
assert meta.row_group(0).column(0).statistics.max is None
def test_statistics_convert_logical_types(tempdir):
# ARROW-5166, ARROW-4139
# (min, max, type)
cases = [(10, 11164359321221007157, pa.uint64()),
(10, 4294967295, pa.uint32()),
("ähnlich", "öffentlich", pa.utf8()),
(datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
pa.time32('ms')),
(datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
pa.time64('us')),
(datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
pa.timestamp('ms')),
(datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
pa.timestamp('us')),
(datetime.date(2019, 6, 24),
datetime.date(2019, 6, 25),
pa.date32()),
(decimal.Decimal("20.123"),
decimal.Decimal("20.124"),
pa.decimal128(12, 5))]
for i, (min_val, max_val, typ) in enumerate(cases):
t = pa.Table.from_arrays([pa.array([min_val, max_val], type=typ)],
['col'])
path = str(tempdir / ('example{}.parquet'.format(i)))
pq.write_table(t, path, version='2.6')
pf = pq.ParquetFile(path)
stats = pf.metadata.row_group(0).column(0).statistics
assert stats.min == min_val
assert stats.max == max_val
def test_parquet_write_disable_statistics(tempdir):
table = pa.Table.from_pydict(
OrderedDict([
('a', pa.array([1, 2, 3])),
('b', pa.array(['a', 'b', 'c']))
])
)
_write_table(table, tempdir / 'data.parquet')
meta = pq.read_metadata(tempdir / 'data.parquet')
for col in [0, 1]:
cc = meta.row_group(0).column(col)
assert cc.is_stats_set is True
assert cc.statistics is not None
_write_table(table, tempdir / 'data2.parquet', write_statistics=False)
meta = pq.read_metadata(tempdir / 'data2.parquet')
for col in [0, 1]:
cc = meta.row_group(0).column(col)
assert cc.is_stats_set is False
assert cc.statistics is None
_write_table(table, tempdir / 'data3.parquet', write_statistics=['a'])
meta = pq.read_metadata(tempdir / 'data3.parquet')
cc_a = meta.row_group(0).column(0)
cc_b = meta.row_group(0).column(1)
assert cc_a.is_stats_set is True
assert cc_b.is_stats_set is False
assert cc_a.statistics is not None
assert cc_b.statistics is None
def test_field_id_metadata():
# ARROW-7080
field_id = b'PARQUET:field_id'
inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
middle = pa.field('middle', pa.struct(
[inner]), metadata={field_id: b'101'})
fields = [
pa.field('basic', pa.int32(), metadata={
b'other': b'abc', field_id: b'1'}),
pa.field(
'list',
pa.list_(pa.field('list-inner', pa.int32(),
metadata={field_id: b'10'})),
metadata={field_id: b'11'}),
pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
pa.field('no-metadata', pa.int32()),
pa.field('non-integral-field-id', pa.int32(),
metadata={field_id: b'xyz'}),
pa.field('negative-field-id', pa.int32(),
metadata={field_id: b'-1000'})
]
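    # Empty arrays: only the schema (and its field_id metadata) matters here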
arrs = [[] for _ in fields]
table = pa.table(arrs, schema=pa.schema(fields))
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
contents = bio.getvalue()
pf = pq.ParquetFile(pa.BufferReader(contents))
schema = pf.schema_arrow
assert schema[0].metadata[field_id] == b'1'
assert schema[0].metadata[b'other'] == b'abc'
list_field = schema[1]
assert list_field.metadata[field_id] == b'11'
list_item_field = list_field.type.value_field
assert list_item_field.metadata[field_id] == b'10'
struct_field = schema[2]
assert struct_field.metadata[field_id] == b'102'
struct_middle_field = struct_field.type[0]
assert struct_middle_field.metadata[field_id] == b'101'
struct_inner_field = struct_middle_field.type[0]
assert struct_inner_field.metadata[field_id] == b'100'
assert schema[3].metadata is None
# Invalid input is passed through (ok) but does not
# have field_id in parquet (not tested)
assert schema[4].metadata[field_id] == b'xyz'
assert schema[5].metadata[field_id] == b'-1000'
@pytest.mark.pandas
def test_multi_dataset_metadata(tempdir):
filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
metapath = str(tempdir / "_metadata")
# create a test dataset
df = pd.DataFrame({
'one': [1, 2, 3],
'two': [-1, -2, -3],
'three': [[1, 2], [2, 3], [3, 4]],
})
table = pa.Table.from_pandas(df)
# write dataset twice and collect/merge metadata
_meta = None
for filename in filenames:
meta = []
pq.write_table(table, str(tempdir / filename),
metadata_collector=meta)
meta[0].set_file_path(filename)
if _meta is None:
_meta = meta[0]
else:
_meta.append_row_groups(meta[0])
# Write merged metadata-only file
with open(metapath, "wb") as f:
_meta.write_metadata_file(f)
# Read back the metadata
meta = pq.read_metadata(metapath)
md = meta.to_dict()
_md = _meta.to_dict()
for key in _md:
if key != 'serialized_size':
assert _md[key] == md[key]
assert _md['num_columns'] == 3
assert _md['num_rows'] == 6
assert _md['num_row_groups'] == 2
assert _md['serialized_size'] == 0
assert md['serialized_size'] > 0
def test_write_metadata(tempdir):
path = str(tempdir / "metadata")
schema = pa.schema([("a", "int64"), ("b", "float64")])
# write a pyarrow schema
pq.write_metadata(schema, path)
parquet_meta = pq.read_metadata(path)
schema_as_arrow = parquet_meta.schema.to_arrow_schema()
assert schema_as_arrow.equals(schema)
# ARROW-8980: Check that the ARROW:schema metadata key was removed
if schema_as_arrow.metadata:
assert b'ARROW:schema' not in schema_as_arrow.metadata
# pass through writer keyword arguments
for version in ["1.0", "2.0", "2.4", "2.6"]:
pq.write_metadata(schema, path, version=version)
parquet_meta = pq.read_metadata(path)
# The version is stored as a single integer in the Parquet metadata,
# so it cannot correctly express dotted format versions
expected_version = "1.0" if version == "1.0" else "2.6"
assert parquet_meta.format_version == expected_version
# metadata_collector: list of FileMetaData objects
table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
pq.write_table(table, tempdir / "data.parquet")
parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
pq.write_metadata(
schema, path, metadata_collector=[parquet_meta, parquet_meta]
)
parquet_meta_mult = pq.read_metadata(path)
assert parquet_meta_mult.num_row_groups == 2
# append metadata with different schema raises an error
with pytest.raises(RuntimeError, match="requires equal schemas"):
pq.write_metadata(
pa.schema([("a", "int32"), ("b", "null")]),
path, metadata_collector=[parquet_meta, parquet_meta]
)
def test_table_large_metadata():
# ARROW-8694
my_schema = pa.schema([pa.field('f0', 'double')],
metadata={'large': 'x' * 10000000})
table = pa.table([np.arange(10)], schema=my_schema)
_check_roundtrip(table)
@pytest.mark.pandas
def test_compare_schemas():
df = alltypes_sample(size=10000)
fileh = make_sample_file(df)
fileh2 = make_sample_file(df)
fileh3 = make_sample_file(df[df.columns[::2]])
# ParquetSchema
assert isinstance(fileh.schema, pq.ParquetSchema)
assert fileh.schema.equals(fileh.schema)
assert fileh.schema == fileh.schema
assert fileh.schema.equals(fileh2.schema)
assert fileh.schema == fileh2.schema
assert fileh.schema != 'arbitrary object'
assert not fileh.schema.equals(fileh3.schema)
assert fileh.schema != fileh3.schema
# ColumnSchema
assert isinstance(fileh.schema[0], pq.ColumnSchema)
assert fileh.schema[0].equals(fileh.schema[0])
assert fileh.schema[0] == fileh.schema[0]
assert not fileh.schema[0].equals(fileh.schema[1])
assert fileh.schema[0] != fileh.schema[1]
assert fileh.schema[0] != 'arbitrary object'
@pytest.mark.pandas
def test_read_schema(tempdir):
N = 100
df = pd.DataFrame({
'index': np.arange(N),
'values': np.random.randn(N)
}, columns=['index', 'values'])
data_path = tempdir / 'test.parquet'
table = pa.Table.from_pandas(df)
_write_table(table, data_path)
read1 = pq.read_schema(data_path)
read2 = pq.read_schema(data_path, memory_map=True)
assert table.schema.equals(read1)
assert table.schema.equals(read2)
assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas']
def test_parquet_metadata_empty_to_dict(tempdir):
# https://issues.apache.org/jira/browse/ARROW-10146
table = pa.table({"a": pa.array([], type="int64")})
pq.write_table(table, tempdir / "data.parquet")
metadata = pq.read_metadata(tempdir / "data.parquet")
# ensure this doesn't error / statistics set to None
metadata_dict = metadata.to_dict()
assert len(metadata_dict["row_groups"]) == 1
assert len(metadata_dict["row_groups"][0]["columns"]) == 1
assert metadata_dict["row_groups"][0]["columns"][0]["statistics"] is None
@pytest.mark.slow
@pytest.mark.large_memory
def test_metadata_exceeds_message_size():
# ARROW-13655: Thrift may enable a default message size that limits
# the size of Parquet metadata that can be written.
NCOLS = 1000
NREPEATS = 4000
table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})
with pa.BufferOutputStream() as out:
pq.write_table(table, out)
buf = out.getvalue()
original_metadata = pq.read_metadata(pa.BufferReader(buf))
metadata = pq.read_metadata(pa.BufferReader(buf))
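    # Append the same row groups repeatedly to grow the metadata well past
    # the default Thrift message size limit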
for i in range(NREPEATS):
metadata.append_row_groups(original_metadata)
with pa.BufferOutputStream() as out:
metadata.write_metadata_file(out)
buf = out.getvalue()
metadata = pq.read_metadata(pa.BufferReader(buf))
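# Illustrative sketch (not collected by pytest): write a single data file plus
# a standalone _metadata sidecar via metadata_collector, mirroring
# test_multi_dataset_metadata above. (`tempdir` is assumed to be a
# pathlib.Path directory; the file names here are arbitrary.)
def _example_write_metadata_sidecar(tempdir):
    table = pa.table({'a': [1, 2, 3]})
    collector = []
    pq.write_table(table, tempdir / 'part-0.parquet',
                   metadata_collector=collector)
    # Record the relative path of the data file inside the collected metadata
    collector[0].set_file_path('part-0.parquet')
    pq.write_metadata(table.schema, tempdir / '_metadata',
                      metadata_collector=collector)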

View File

@ -0,0 +1,707 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import json
import numpy as np
import pytest
import pyarrow as pa
from pyarrow.fs import LocalFileSystem, SubTreeFileSystem
from pyarrow.tests.parquet.common import (
parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
from pyarrow.util import guid
from pyarrow.vendored.version import Version
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
_write_table)
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
alltypes_sample)
except ImportError:
pd = tm = None
@pytest.mark.pandas
def test_pandas_parquet_custom_metadata(tempdir):
df = alltypes_sample(size=10000)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
assert b'pandas' in arrow_table.schema.metadata
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
metadata = pq.read_metadata(filename).metadata
assert b'pandas' in metadata
js = json.loads(metadata[b'pandas'].decode('utf8'))
assert js['index_columns'] == [{'kind': 'range',
'name': None,
'start': 0, 'stop': 10000,
'step': 1}]
@pytest.mark.pandas
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
# ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
schema = pa.schema([
pa.field('int', pa.int16()),
pa.field('float', pa.float32()),
pa.field('string', pa.string())
])
df1 = pd.DataFrame({
'int': np.arange(3, dtype=np.uint8),
'float': np.arange(3, dtype=np.float32),
'string': ['ABBA', 'EDDA', 'ACDC']
})
df2 = pd.DataFrame({
'int': [4, 5],
'float': [1.1, None],
'string': [None, None]
})
table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)
assert not table1.schema.equals(table2.schema, check_metadata=True)
assert table1.schema.equals(table2.schema)
writer = pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema)
writer.write_table(table1)
writer.write_table(table2)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset):
df = alltypes_sample(size=10)
df.columns = pd.MultiIndex.from_tuples(
list(zip(df.columns, df.columns[::-1])),
names=['level_1', 'level_2']
)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
assert arrow_table.schema.pandas_metadata is not None
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
tempdir, use_legacy_dataset
):
df = alltypes_sample(size=10000)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
js = arrow_table.schema.pandas_metadata
assert not js['index_columns']
# ARROW-2170
# While index_columns should be empty, columns needs to be filled still.
assert js['columns']
_write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
js = table_read.schema.pandas_metadata
assert not js['index_columns']
read_metadata = table_read.schema.metadata
assert arrow_table.schema.metadata == read_metadata
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
# TODO(dataset) duplicate column selection actually gives duplicate columns now
@pytest.mark.pandas
@parametrize_legacy_dataset_not_supported
def test_pandas_column_selection(tempdir, use_legacy_dataset):
size = 10000
np.random.seed(0)
df = pd.DataFrame({
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16)
})
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, filename)
table_read = _read_table(
filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df[['uint8']], df_read)
# ARROW-4267: Selection of duplicate columns still leads to these columns
# being read uniquely.
table_read = _read_table(
filename, columns=['uint8', 'uint8'],
use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df[['uint8']], df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset):
df = _test_dataframe(10000)
arrow_table = pa.Table.from_pandas(df)
imos = pa.BufferOutputStream()
_write_table(arrow_table, imos, version='2.6')
buf = imos.getvalue()
reader = pa.BufferReader(buf)
df_read = _read_table(
reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
df = _test_dataframe(10000)
arrow_table = pa.Table.from_pandas(df)
imos = pa.BufferOutputStream()
_write_table(arrow_table, imos, version='2.6')
buf = imos.getvalue()
reader = pa.BufferReader(buf)
df_read = pq.read_pandas(
reader, columns=['strings', 'uint8'],
use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
df = _test_dataframe(0)
arrow_table = pa.Table.from_pandas(df)
imos = pa.BufferOutputStream()
_write_table(arrow_table, imos, version='2.6')
buf = imos.getvalue()
reader = pa.BufferReader(buf)
df_read = _read_table(
reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
def test_pandas_can_write_nested_data(tempdir):
data = {
"agg_col": [
{"page_type": 1},
{"record_type": 1},
{"non_consecutive_home": 0},
],
"uid_first": "1001"
}
df = pd.DataFrame(data=data)
arrow_table = pa.Table.from_pandas(df)
imos = pa.BufferOutputStream()
# This succeeds under V2
_write_table(arrow_table, imos)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
filename = tempdir / 'pandas_pyfile_roundtrip.parquet'
size = 5
df = pd.DataFrame({
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
'strings': ['foo', 'bar', None, 'baz', 'qux']
})
arrow_table = pa.Table.from_pandas(df)
with filename.open('wb') as f:
_write_table(arrow_table, f, version="1.0")
data = io.BytesIO(filename.read_bytes())
table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
size = 10000
np.random.seed(0)
df = pd.DataFrame({
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16),
'uint32': np.arange(size, dtype=np.uint32),
'uint64': np.arange(size, dtype=np.uint64),
'int8': np.arange(size, dtype=np.int16),
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0
})
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
for use_dictionary in [True, False]:
_write_table(arrow_table, filename, version='2.6',
use_dictionary=use_dictionary)
table_read = _read_table(
filename, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
for write_statistics in [True, False]:
_write_table(arrow_table, filename, version='2.6',
write_statistics=write_statistics)
table_read = _read_table(filename,
use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
if (compression != 'NONE' and
not pa.lib.Codec.is_available(compression)):
continue
_write_table(arrow_table, filename, version='2.6',
compression=compression)
table_read = _read_table(
filename, use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
@pytest.mark.pandas
def test_spark_flavor_preserves_pandas_metadata():
df = _test_dataframe(size=100)
df.index = np.arange(0, 10 * len(df), 10)
df.index.name = 'foo'
result = _roundtrip_pandas_dataframe(df, {'version': '2.0',
'flavor': 'spark'})
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_index_column_name_duplicate(tempdir, use_legacy_dataset):
data = {
'close': {
pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
},
'time': {
pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp(
'2017-06-30 01:31:00'
),
pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp(
'2017-06-30 01:32:00'
),
}
}
path = str(tempdir / 'data.parquet')
dfx = pd.DataFrame(data).set_index('time', drop=False)
tdfx = pa.Table.from_pandas(dfx)
_write_table(tdfx, path)
arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
result_df = arrow_table.to_pandas()
tm.assert_frame_equal(result_df, dfx)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multiindex_duplicate_values(tempdir, use_legacy_dataset):
num_rows = 3
numbers = list(range(num_rows))
index = pd.MultiIndex.from_arrays(
[['foo', 'foo', 'bar'], numbers],
names=['foobar', 'some_numbers'],
)
df = pd.DataFrame({'numbers': numbers}, index=index)
table = pa.Table.from_pandas(df)
filename = tempdir / 'dup_multi_index_levels.parquet'
_write_table(table, filename)
result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
assert table.equals(result_table)
result_df = result_table.to_pandas()
tm.assert_frame_equal(result_df, df)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_naming(datadir, use_legacy_dataset):
expected_string = b"""\
carat cut color clarity depth table price x y z
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}',
index_col=None, header=0, engine='python')
table = _read_table(
datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_named(
datadir, use_legacy_dataset
):
expected_string = b"""\
carat cut color clarity depth table price x y z
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
expected = pd.read_csv(
io.BytesIO(expected_string), sep=r'\s{2,}',
index_col=['cut', 'color', 'clarity'],
header=0, engine='python'
).sort_index()
table = _read_table(datadir / 'v0.7.1.all-named-index.parquet',
use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_some_named(
datadir, use_legacy_dataset
):
expected_string = b"""\
carat cut color clarity depth table price x y z
0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
expected = pd.read_csv(
io.BytesIO(expected_string),
sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'],
header=0, engine='python'
).sort_index()
expected.index = expected.index.set_names(['cut', None, 'clarity'])
table = _read_table(datadir / 'v0.7.1.some-named-index.parquet',
use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_column_metadata_handling(
datadir, use_legacy_dataset
):
expected = pd.DataFrame(
{'a': [1, 2, 3], 'b': [.1, .2, .3],
'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
expected.index = pd.MultiIndex.from_arrays(
[['a', 'b', 'c'],
pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
names=['index', None])
path = datadir / 'v0.7.1.column-metadata-handling.parquet'
table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected)
table = _read_table(
path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
result = table.to_pandas()
tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_index_survives_roundtrip(use_legacy_dataset):
# ARROW-3652, addressed by ARROW-3246
df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2'])
df['c1'] = df['c1'].astype('category')
df = df.set_index(['c1'])
table = pa.Table.from_pandas(df)
bos = pa.BufferOutputStream()
pq.write_table(table, bos)
ref_df = pq.read_pandas(
bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
assert isinstance(ref_df.index, pd.CategoricalIndex)
assert ref_df.index.equals(df.index)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_order_survives_roundtrip(use_legacy_dataset):
# ARROW-6302
df = pd.DataFrame({"a": pd.Categorical(
["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)})
table = pa.Table.from_pandas(df)
bos = pa.BufferOutputStream()
pq.write_table(table, bos)
contents = bos.getvalue()
result = pq.read_pandas(
contents, use_legacy_dataset=use_legacy_dataset).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_na_type_row_groups(use_legacy_dataset):
# ARROW-5085
df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
df_category = df.astype({"col": "category", "int": "category"})
table = pa.Table.from_pandas(df)
table_cat = pa.Table.from_pandas(df_category)
buf = pa.BufferOutputStream()
    # Writing the all-null column as a categorical across multiple row groups
    # should succeed

pq.write_table(table_cat, buf, version='2.6', chunk_size=10)
result = pq.read_table(
buf.getvalue(), use_legacy_dataset=use_legacy_dataset)
# Result is non-categorical
assert result[0].equals(table[0])
assert result[1].equals(table[1])
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_roundtrip(use_legacy_dataset):
# ARROW-5480, this was enabled by ARROW-3246
# Have one of the categories unobserved and include a null (-1)
codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
categories = ['foo', 'bar', 'baz']
df = pd.DataFrame({'x': pd.Categorical.from_codes(
codes, categories=categories)})
buf = pa.BufferOutputStream()
pq.write_table(pa.table(df), buf)
result = pq.read_table(
buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
assert result.x.dtype == 'category'
assert (result.x.cat.categories == categories).all()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_extensiondtypes(
tempdir, use_legacy_dataset
):
# ARROW-8251 - preserve pandas extension dtypes in roundtrip
if Version(pd.__version__) < Version("1.0.0"):
pytest.skip("__arrow_array__ added to pandas in 1.0.0")
df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
df['col'] = df['col'].astype("Int64")
table = pa.table(df)
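    # The nullable Int64 extension dtype should survive all three write paths
    # exercised below: partitioned dataset, flat dataset and single file.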
pq.write_to_dataset(
table, str(tempdir / "case1"), partition_cols=['part'],
use_legacy_dataset=use_legacy_dataset
)
result = pq.read_table(
str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result[["col"]], df[["col"]])
pq.write_to_dataset(
table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
)
result = pq.read_table(
str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result[["col"]], df[["col"]])
pq.write_table(table, str(tempdir / "data.parquet"))
result = pq.read_table(
str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result[["col"]], df[["col"]])
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
# ARROW-8251 - preserve pandas index in roundtrip
df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
df.index = pd.Index(['a', 'b', 'c'], name="idx")
table = pa.table(df)
df_cat = df[["col", "part"]].copy()
df_cat["part"] = df_cat["part"].astype("category")
pq.write_to_dataset(
table, str(tempdir / "case1"), partition_cols=['part'],
use_legacy_dataset=use_legacy_dataset
)
result = pq.read_table(
str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result, df_cat)
pq.write_to_dataset(
table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
)
result = pq.read_table(
str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result, df)
pq.write_table(table, str(tempdir / "data.parquet"))
result = pq.read_table(
str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.parametrize('preserve_index', [True, False, None])
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
# ARROW-1103
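    # Each file is written with its pandas metadata stripped; a common
    # _metadata file is then written so that read_pandas() can still recover
    # the index information for the whole dataset.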
nfiles = 5
size = 5
dirpath = tempdir / guid()
dirpath.mkdir()
test_data = []
frames = []
paths = []
for i in range(nfiles):
df = _test_dataframe(size, seed=i)
df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')
path = dirpath / '{}.parquet'.format(i)
table = pa.Table.from_pandas(df, preserve_index=preserve_index)
# Obliterate metadata
table = table.replace_schema_metadata(None)
assert table.schema.metadata is None
_write_table(table, path)
test_data.append(table)
frames.append(df)
paths.append(path)
# Write _metadata common file
table_for_metadata = pa.Table.from_pandas(
df, preserve_index=preserve_index
)
pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')
dataset = pq.ParquetDataset(dirpath)
columns = ['uint8', 'strings']
result = dataset.read_pandas(columns=columns).to_pandas()
expected = pd.concat([x[columns] for x in frames])
expected.index.name = (
df.index.name if preserve_index is not False else None)
tm.assert_frame_equal(result, expected)
@pytest.mark.pandas
def test_read_pandas_passthrough_keywords(tempdir):
# ARROW-11464 - previously not all keywords were passed through (such as
# the filesystem keyword)
df = pd.DataFrame({'a': [1, 2, 3]})
filename = tempdir / 'data.parquet'
_write_table(df, filename)
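    # The relative path only resolves because the filesystem keyword is
    # honoured: the SubTreeFileSystem below is rooted at tempdir.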
result = pq.read_pandas(
'data.parquet',
filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem())
)
assert result.equals(pa.table(df))
@pytest.mark.pandas
def test_read_pandas_map_fields(tempdir):
# ARROW-10140 - table created from Pandas with mapping fields
df = pd.DataFrame({
'col1': pd.Series([
[('id', 'something'), ('value2', 'else')],
[('id', 'something2'), ('value', 'else2')],
]),
'col2': pd.Series(['foo', 'bar'])
})
filename = tempdir / 'data.parquet'
udt = pa.map_(pa.string(), pa.string())
schema = pa.schema([pa.field('col1', udt), pa.field('col2', pa.string())])
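    # The explicit schema stores col1 as a Parquet MAP; reading it back with
    # read_pandas() should return the same list-of-tuples representation.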
arrow_table = pa.Table.from_pandas(df, schema)
_write_table(arrow_table, filename)
result = pq.read_pandas(filename).to_pandas()
tm.assert_frame_equal(result, df)

View File

@ -0,0 +1,274 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import os
import pytest
import pyarrow as pa
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _write_table
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
from pyarrow.tests.parquet.common import alltypes_sample
except ImportError:
pd = tm = None
@pytest.mark.pandas
def test_pass_separate_metadata():
# ARROW-471
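    # Reading through a ParquetFile constructed with separately loaded
    # FileMetaData should still yield the original frame.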
df = alltypes_sample(size=10000)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, compression='snappy', version='2.6')
buf.seek(0)
metadata = pq.read_metadata(buf)
buf.seek(0)
fileh = pq.ParquetFile(buf, metadata=metadata)
tm.assert_frame_equal(df, fileh.read().to_pandas())
@pytest.mark.pandas
def test_read_single_row_group():
# ARROW-471
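    # The frame is written as K row groups; reading each group separately and
    # concatenating the pieces should reassemble the original data.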
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
assert pf.num_row_groups == K
row_groups = [pf.read_row_group(i) for i in range(K)]
result = pa.concat_tables(row_groups)
tm.assert_frame_equal(df, result.to_pandas())
@pytest.mark.pandas
def test_read_single_row_group_with_column_subset():
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
cols = list(df.columns[:2])
row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
result = pa.concat_tables(row_groups)
tm.assert_frame_equal(df[cols], result.to_pandas())
    # ARROW-4267: selecting the same columns more than once should still read
    # each column only once.
row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
result = pa.concat_tables(row_groups)
tm.assert_frame_equal(df[cols], result.to_pandas())
@pytest.mark.pandas
def test_read_multiple_row_groups():
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
assert pf.num_row_groups == K
result = pf.read_row_groups(range(K))
tm.assert_frame_equal(df, result.to_pandas())
@pytest.mark.pandas
def test_read_multiple_row_groups_with_column_subset():
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
cols = list(df.columns[:2])
result = pf.read_row_groups(range(K), columns=cols)
tm.assert_frame_equal(df[cols], result.to_pandas())
    # ARROW-4267: selecting the same columns more than once should still read
    # each column only once.
result = pf.read_row_groups(range(K), columns=cols + cols)
tm.assert_frame_equal(df[cols], result.to_pandas())
@pytest.mark.pandas
def test_scan_contents():
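    # scan_contents() scans the data pages and returns the number of rows,
    # which should match what was written regardless of the column subset.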
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf)
assert pf.scan_contents() == 10000
assert pf.scan_contents(df.columns[:4]) == 10000
def test_parquet_file_pass_directory_instead_of_file(tempdir):
# ARROW-7208
path = tempdir / 'directory'
os.mkdir(str(path))
with pytest.raises(IOError, match="Expected file path"):
pq.ParquetFile(path)
def test_read_column_invalid_index():
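    # Columns can be read by position through the low-level reader; indices
    # outside the valid range must raise instead of crashing.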
table = pa.table([pa.array([4, 5]), pa.array(["foo", "bar"])],
names=['ints', 'strs'])
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
f = pq.ParquetFile(bio.getvalue())
assert f.reader.read_column(0).to_pylist() == [4, 5]
assert f.reader.read_column(1).to_pylist() == ["foo", "bar"]
for index in (-1, 2):
with pytest.raises((ValueError, IndexError)):
f.reader.read_column(index)
@pytest.mark.pandas
@pytest.mark.parametrize('batch_size', [300, 1000, 1300])
def test_iter_batches_columns_reader(tempdir, batch_size):
total_size = 3000
chunk_size = 1000
# TODO: Add categorical support
df = alltypes_sample(size=total_size)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, filename, version='2.6',
coerce_timestamps='ms', chunk_size=chunk_size)
file_ = pq.ParquetFile(filename)
for columns in [df.columns[:10], df.columns[10:]]:
batches = file_.iter_batches(batch_size=batch_size, columns=columns)
batch_starts = range(0, total_size+batch_size, batch_size)
for batch, start in zip(batches, batch_starts):
end = min(total_size, start + batch_size)
tm.assert_frame_equal(
batch.to_pandas(),
df.iloc[start:end, :].loc[:, columns].reset_index(drop=True)
)
@pytest.mark.pandas
@pytest.mark.parametrize('chunk_size', [1000])
def test_iter_batches_reader(tempdir, chunk_size):
df = alltypes_sample(size=10000, categorical=True)
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
assert arrow_table.schema.pandas_metadata is not None
_write_table(arrow_table, filename, version='2.6',
coerce_timestamps='ms', chunk_size=chunk_size)
file_ = pq.ParquetFile(filename)
def get_all_batches(f):
for row_group in range(f.num_row_groups):
batches = f.iter_batches(
batch_size=900,
row_groups=[row_group],
)
for batch in batches:
yield batch
batches = list(get_all_batches(file_))
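    # With 1000-row row groups and batch_size=900, each row group yields two
    # batches: one with 900 rows and one with the remaining 100.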
batch_no = 0
for i in range(file_.num_row_groups):
tm.assert_frame_equal(
batches[batch_no].to_pandas(),
file_.read_row_groups([i]).to_pandas().head(900)
)
batch_no += 1
tm.assert_frame_equal(
batches[batch_no].to_pandas().reset_index(drop=True),
file_.read_row_groups([i]).to_pandas().iloc[900:].reset_index(
drop=True
)
)
batch_no += 1
@pytest.mark.pandas
@pytest.mark.parametrize('pre_buffer', [False, True])
def test_pre_buffer(pre_buffer):
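    # pre_buffer toggles coalesced pre-fetching of column chunk data; the
    # result must be identical with it on or off.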
N, K = 10000, 4
df = alltypes_sample(size=N)
a_table = pa.Table.from_pandas(df)
buf = io.BytesIO()
_write_table(a_table, buf, row_group_size=N / K,
compression='snappy', version='2.6')
buf.seek(0)
pf = pq.ParquetFile(buf, pre_buffer=pre_buffer)
assert pf.read().num_rows == N

View File

@ -0,0 +1,322 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import pyarrow as pa
from pyarrow import fs
from pyarrow.filesystem import FileSystem, LocalFileSystem
from pyarrow.tests.parquet.common import parametrize_legacy_dataset
try:
import pyarrow.parquet as pq
from pyarrow.tests.parquet.common import _read_table, _test_dataframe
except ImportError:
pq = None
try:
import pandas as pd
import pandas.testing as tm
except ImportError:
pd = tm = None
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_incremental_file_build(tempdir, use_legacy_dataset):
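    # Each write_table() call below appends another row group to the same open
    # writer; the final buffer should contain all ten frames in order.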
df = _test_dataframe(100)
df['unique_id'] = 0
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
out = pa.BufferOutputStream()
writer = pq.ParquetWriter(out, arrow_table.schema, version='2.6')
frames = []
for i in range(10):
df['unique_id'] = i
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
writer.write_table(arrow_table)
frames.append(df.copy())
writer.close()
buf = out.getvalue()
result = _read_table(
pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
expected = pd.concat(frames, ignore_index=True)
tm.assert_frame_equal(result.to_pandas(), expected)
def test_validate_schema_write_table(tempdir):
# ARROW-2926
simple_fields = [
pa.field('POS', pa.uint32()),
pa.field('desc', pa.string())
]
simple_schema = pa.schema(simple_fields)
# simple_table schema does not match simple_schema
simple_from_array = [pa.array([1]), pa.array(['bla'])]
simple_table = pa.Table.from_arrays(simple_from_array, ['POS', 'desc'])
path = tempdir / 'simple_validate_schema.parquet'
with pq.ParquetWriter(path, simple_schema,
version='2.6',
compression='snappy', flavor='spark') as w:
with pytest.raises(ValueError):
w.write_table(simple_table)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_context_obj(tempdir, use_legacy_dataset):
df = _test_dataframe(100)
df['unique_id'] = 0
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
out = pa.BufferOutputStream()
with pq.ParquetWriter(out, arrow_table.schema, version='2.6') as writer:
frames = []
for i in range(10):
df['unique_id'] = i
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
writer.write_table(arrow_table)
frames.append(df.copy())
buf = out.getvalue()
result = _read_table(
pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
expected = pd.concat(frames, ignore_index=True)
tm.assert_frame_equal(result.to_pandas(), expected)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_context_obj_with_exception(
tempdir, use_legacy_dataset
):
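    # Even though an exception interrupts the writes, the context manager
    # closes the writer, so everything written before the error is readable.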
df = _test_dataframe(100)
df['unique_id'] = 0
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
out = pa.BufferOutputStream()
error_text = 'Artificial Error'
try:
with pq.ParquetWriter(out,
arrow_table.schema,
version='2.6') as writer:
frames = []
for i in range(10):
df['unique_id'] = i
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
writer.write_table(arrow_table)
frames.append(df.copy())
if i == 5:
raise ValueError(error_text)
except Exception as e:
assert str(e) == error_text
buf = out.getvalue()
result = _read_table(
pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
expected = pd.concat(frames, ignore_index=True)
tm.assert_frame_equal(result.to_pandas(), expected)
@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
None,
LocalFileSystem._get_instance(),
fs.LocalFileSystem(),
])
def test_parquet_writer_write_wrappers(tempdir, filesystem):
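    # ParquetWriter.write() accepts both Tables and RecordBatches and should
    # behave like the dedicated write_table() / write_batch() calls.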
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
path_table = str(tempdir / 'data_table.parquet')
path_batch = str(tempdir / 'data_batch.parquet')
with pq.ParquetWriter(
path_table, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write_table(table)
result = _read_table(path_table).to_pandas()
tm.assert_frame_equal(result, df)
with pq.ParquetWriter(
path_batch, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write_batch(batch)
result = _read_table(path_batch).to_pandas()
tm.assert_frame_equal(result, df)
with pq.ParquetWriter(
path_table, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write(table)
result = _read_table(path_table).to_pandas()
tm.assert_frame_equal(result, df)
with pq.ParquetWriter(
path_batch, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write(batch)
result = _read_table(path_batch).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
None,
LocalFileSystem._get_instance(),
fs.LocalFileSystem(),
])
def test_parquet_writer_filesystem_local(tempdir, filesystem):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(
path, table.schema, filesystem=filesystem, version='2.6'
) as writer:
writer.write_table(table)
result = _read_table(path).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3(s3_example_fs):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
fs, uri, path = s3_example_fs
with pq.ParquetWriter(
path, table.schema, filesystem=fs, version='2.6'
) as writer:
writer.write_table(table)
result = _read_table(uri).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3_uri(s3_example_fs):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
fs, uri, path = s3_example_fs
with pq.ParquetWriter(uri, table.schema, version='2.6') as writer:
writer.write_table(table)
result = _read_table(path, filesystem=fs).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
fs, directory = s3_example_s3fs
path = directory + "/test.parquet"
with pq.ParquetWriter(
path, table.schema, filesystem=fs, version='2.6'
) as writer:
writer.write_table(table)
result = _read_table(path, filesystem=fs).to_pandas()
tm.assert_frame_equal(result, df)
@pytest.mark.pandas
def test_parquet_writer_filesystem_buffer_raises():
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
filesystem = fs.LocalFileSystem()
# Should raise ValueError when filesystem is passed with file-like object
with pytest.raises(ValueError, match="specified path is file-like"):
pq.ParquetWriter(
pa.BufferOutputStream(), table.schema, filesystem=filesystem
)
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset):
out = pa.BufferOutputStream()
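    # Minimal legacy FileSystem stub: records the requested path and mode and
    # returns the in-memory buffer instead of opening a real file.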
class CustomFS(FileSystem):
def __init__(self):
self.path = None
self.mode = None
def open(self, path, mode='rb'):
self.path = path
self.mode = mode
return out
fs = CustomFS()
fname = 'expected_fname.parquet'
df = _test_dataframe(100)
table = pa.Table.from_pandas(df, preserve_index=False)
with pq.ParquetWriter(fname, table.schema, filesystem=fs, version='2.6') \
as writer:
writer.write_table(table)
assert fs.path == fname
assert fs.mode == 'wb'
assert out.closed
buf = out.getvalue()
table_read = _read_table(
pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
df_read = table_read.to_pandas()
tm.assert_frame_equal(df_read, df)
# Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError) as err_info:
        pq.ParquetWriter(pa.BufferOutputStream(), table.schema, filesystem=fs)
    expected_msg = ("filesystem passed but where is file-like, so"
                    " there is nothing to open with filesystem.")
    # Compare against the exception value, not the ExceptionInfo wrapper
    assert str(err_info.value) == expected_msg