first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include "arrow/compute/exec/expression.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_csv.h"
#include "arrow/dataset/file_ipc.h"
#include "arrow/dataset/file_orc.h"
#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/scanner.h"

View File

@@ -0,0 +1,251 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/compute/exec/expression.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/util/macros.h"
#include "arrow/util/mutex.h"
#include "arrow/util/optional.h"
namespace arrow {
namespace dataset {
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
/// \brief A granular piece of a Dataset, such as an individual file.
///
/// A Fragment can be read/scanned separately from other fragments. It yields a
/// collection of RecordBatches when scanned, encapsulated in one or more
/// ScanTasks.
///
/// Note that Fragments have well defined physical schemas which are reconciled by
/// the Datasets which contain them; these physical schemas may differ from a parent
/// Dataset's schema and the physical schemas of sibling Fragments.
class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
public:
/// \brief Return the physical schema of the Fragment.
///
/// The physical schema is also called the writer schema.
/// This method is blocking and may incur high latency on slow filesystems.
/// The schema is cached after being read once, or may be specified at construction.
Result<std::shared_ptr<Schema>> ReadPhysicalSchema();
/// An asynchronous version of Scan
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options) = 0;
/// \brief Count the number of rows in this fragment matching the filter using metadata
/// only. That is, this method may perform I/O, but will not load data.
///
/// If this is not possible, resolve with an empty optional. The fragment can perform
/// I/O (e.g. to read metadata) before deciding whether it can satisfy the request.
virtual Future<util::optional<int64_t>> CountRows(
compute::Expression predicate, const std::shared_ptr<ScanOptions>& options);
virtual std::string type_name() const = 0;
virtual std::string ToString() const { return type_name(); }
/// \brief An expression which evaluates to true for all data viewed by this
/// Fragment.
const compute::Expression& partition_expression() const {
return partition_expression_;
}
virtual ~Fragment() = default;
protected:
Fragment() = default;
explicit Fragment(compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema);
virtual Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() = 0;
util::Mutex physical_schema_mutex_;
compute::Expression partition_expression_ = compute::literal(true);
std::shared_ptr<Schema> physical_schema_;
};
/// \brief Per-scan options for fragment(s) in a dataset.
///
/// These options are not intrinsic to the format or fragment itself, but do affect
/// the results of a scan. These are options which make sense to change between
/// repeated reads of the same dataset, such as format-specific conversion options
/// (that do not affect the schema).
///
/// \ingroup dataset-scanning
class ARROW_DS_EXPORT FragmentScanOptions {
public:
virtual std::string type_name() const = 0;
virtual std::string ToString() const { return type_name(); }
virtual ~FragmentScanOptions() = default;
};
/// \defgroup dataset-implementations Concrete implementations
///
/// @{
/// \brief A trivial Fragment that yields ScanTasks out of a fixed set of
/// RecordBatches.
class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
public:
InMemoryFragment(std::shared_ptr<Schema> schema, RecordBatchVector record_batches,
compute::Expression = compute::literal(true));
explicit InMemoryFragment(RecordBatchVector record_batches,
compute::Expression = compute::literal(true));
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options) override;
Future<util::optional<int64_t>> CountRows(
compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
std::string type_name() const override { return "in-memory"; }
protected:
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
RecordBatchVector record_batches_;
};
/// @}
/// \brief A container of zero or more Fragments.
///
/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a
/// directory. A Dataset has a schema to which Fragments must align during a
/// scan operation. This is analogous to Avro's reader and writer schema.
class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
public:
/// \brief Begin to build a new Scan operation against this Dataset
Result<std::shared_ptr<ScannerBuilder>> NewScan();
/// \brief GetFragments returns an iterator of Fragments given a predicate.
Result<FragmentIterator> GetFragments(compute::Expression predicate);
Result<FragmentIterator> GetFragments();
const std::shared_ptr<Schema>& schema() const { return schema_; }
/// \brief An expression which evaluates to true for all data viewed by this Dataset.
/// May be null, which indicates no information is available.
const compute::Expression& partition_expression() const {
return partition_expression_;
}
/// \brief The name identifying the kind of Dataset
virtual std::string type_name() const = 0;
/// \brief Return a copy of this Dataset with a different schema.
///
/// The copy will view the same Fragments. If the new schema is not compatible with the
/// original dataset's schema then an error will be raised.
virtual Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const = 0;
virtual ~Dataset() = default;
protected:
explicit Dataset(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression);
virtual Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) = 0;
std::shared_ptr<Schema> schema_;
compute::Expression partition_expression_ = compute::literal(true);
};
/// \addtogroup dataset-implementations
///
/// @{
/// \brief A Dataset which yields fragments wrapping a stream of record batches.
///
/// The record batches must match the schema provided at construction.
class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
public:
class RecordBatchGenerator {
public:
virtual ~RecordBatchGenerator() = default;
virtual RecordBatchIterator Get() const = 0;
};
/// Construct a dataset from a schema and a factory of record batch iterators.
InMemoryDataset(std::shared_ptr<Schema> schema,
std::shared_ptr<RecordBatchGenerator> get_batches)
: Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {}
/// Convenience constructor taking a fixed list of batches
InMemoryDataset(std::shared_ptr<Schema> schema, RecordBatchVector batches);
/// Convenience constructor taking a Table
explicit InMemoryDataset(std::shared_ptr<Table> table);
std::string type_name() const override { return "in-memory"; }
Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const override;
protected:
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
std::shared_ptr<RecordBatchGenerator> get_batches_;
};
/// \brief A Dataset wrapping child Datasets.
class ARROW_DS_EXPORT UnionDataset : public Dataset {
public:
/// \brief Construct a UnionDataset wrapping child Datasets.
///
/// \param[in] schema the schema of the resulting dataset.
/// \param[in] children one or more child Datasets. Their schemas must be identical to
/// schema.
static Result<std::shared_ptr<UnionDataset>> Make(std::shared_ptr<Schema> schema,
DatasetVector children);
const DatasetVector& children() const { return children_; }
std::string type_name() const override { return "union"; }
Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const override;
protected:
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
explicit UnionDataset(std::shared_ptr<Schema> schema, DatasetVector children)
: Dataset(std::move(schema)), children_(std::move(children)) {}
DatasetVector children_;
friend class UnionDatasetFactory;
};
/// @}
} // namespace dataset
} // namespace arrow
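A minimal sketch of how the pieces above compose, assuming the ScannerBuilder and Scanner declarations from arrow/dataset/scanner.h and an existing arrow::Table; the function name is only illustrative:

#include <memory>
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/scanner.h"
#include "arrow/result.h"
#include "arrow/table.h"

arrow::Result<std::shared_ptr<arrow::Table>> ScanInMemory(
    const std::shared_ptr<arrow::Table>& table) {
  // Wrap the Table in a Dataset; the Table's schema becomes the dataset schema.
  auto dataset = std::make_shared<arrow::dataset::InMemoryDataset>(table);
  // NewScan() yields a ScannerBuilder which can be configured before Finish().
  ARROW_ASSIGN_OR_RAISE(auto builder, dataset->NewScan());
  ARROW_ASSIGN_OR_RAISE(auto scanner, builder->Finish());
  // Materialize every fragment back into a single Table.
  return scanner->ToTable();
}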

View File

@@ -0,0 +1,98 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/util/async_util.h"
#include "arrow/util/future.h"
namespace arrow {
namespace dataset {
namespace internal {
// This lines up with our other defaults in the scanner and execution plan
constexpr uint64_t kDefaultDatasetWriterMaxRowsQueued = 8 * 1024 * 1024;
/// \brief Utility class that manages a set of writers to different paths
///
/// Writers may be closed and reopened (and a new file created) based on the dataset
/// write options (for example, max_rows_per_file or max_open_files)
///
/// The dataset writer enforces its own back pressure based on the # of rows (as opposed
/// to # of batches which is how it is typically enforced elsewhere) and # of files.
class ARROW_DS_EXPORT DatasetWriter {
public:
/// \brief Create a dataset writer
///
/// Will fail if basename_template is invalid or if there is existing data and
/// existing_data_behavior is kError
///
/// \param write_options options to control how the data should be written
/// \param max_rows_queued max # of rows allowed to be queued before the dataset_writer
/// will ask for backpressure
static Result<std::unique_ptr<DatasetWriter>> Make(
FileSystemDatasetWriteOptions write_options,
uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
~DatasetWriter();
/// \brief Write a batch to the dataset
/// \param[in] batch The batch to write
/// \param[in] directory The directory to write to
///
/// Note: The written filename will be {directory}/{filename_factory(i)} where i is a
/// counter controlled by `max_open_files` and `max_rows_per_file`
///
/// If multiple WriteRecordBatch calls arrive with the same `directory` then the batches
/// may be written to the same file.
///
/// The returned future will be marked finished when the record batch has been queued
/// to be written. If the returned future is unfinished then this indicates the dataset
/// writer's queue is full and the data provider should pause.
///
/// This method is NOT async reentrant. The returned future will only be unfinished
/// if back pressure needs to be applied. Async reentrancy is not necessary for
/// concurrent writes to happen. Calling this method again before the previous future
/// completes will not only violate max_rows_queued but will likely lead to race conditions.
///
/// One thing to note is that the ordering of your data can affect your maximum
/// potential parallelism. If this seems odd then consider a dataset where the first
/// 1000 batches go to the same directory and then the 1001st batch goes to a different
/// directory. The only way to get two parallel writes immediately would be to queue
/// all 1000 pending writes to the first directory.
Future<> WriteRecordBatch(std::shared_ptr<RecordBatch> batch,
const std::string& directory, const std::string& prefix = "");
/// Finish all pending writes and close any open files
Future<> Finish();
protected:
DatasetWriter(FileSystemDatasetWriteOptions write_options,
uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
class DatasetWriterImpl;
std::unique_ptr<DatasetWriterImpl, util::DestroyingDeleter<DatasetWriterImpl>> impl_;
};
} // namespace internal
} // namespace dataset
} // namespace arrow
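A rough sketch of the back-pressure contract described above, assuming this header is installed as arrow/dataset/dataset_writer.h; the function name, the caller-prepared write_options and batches, and the empty directory string are illustrative assumptions:

#include <memory>
#include <vector>
#include "arrow/dataset/dataset_writer.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status WriteAllBatches(
    arrow::dataset::FileSystemDatasetWriteOptions write_options,
    const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches) {
  using arrow::dataset::internal::DatasetWriter;
  ARROW_ASSIGN_OR_RAISE(auto writer, DatasetWriter::Make(std::move(write_options)));
  for (const auto& batch : batches) {
    // WriteRecordBatch is not async reentrant: wait for the queueing future,
    // which is only unfinished while back pressure is being applied.
    ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(batch, /*directory=*/"").status());
  }
  // Flush all pending writes and close any open files.
  return writer->Finish().status();
}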

View File

@@ -0,0 +1,271 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
/// Logic for automatically determining the structure of a multi-file
/// dataset, with partition information discovered according to the
/// available partitioning scheme.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/dataset/partition.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/type_fwd.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/variant.h"
namespace arrow {
namespace dataset {
/// \defgroup dataset-discovery Discovery API
///
/// @{
struct InspectOptions {
/// See `fragments` property.
static constexpr int kInspectAllFragments = -1;
/// Indicate how many fragments should be inspected to infer the unified dataset
/// schema. Limiting the number of fragments accessed improves the latency of
/// the discovery process when dealing with a high number of fragments and/or
/// high latency file systems.
///
/// The default value of `1` inspects the schema of the first (in no particular
/// order) fragment only. If the dataset has a uniform schema for all fragments,
/// this default is the optimal value. In order to inspect all fragments and
/// robustly unify their potentially varying schemas, set this option to
/// `kInspectAllFragments`. A value of `0` disables inspection of fragments
/// altogether so only the partitioning schema will be inspected.
int fragments = 1;
};
struct FinishOptions {
/// Finalize the dataset with this given schema. If the schema is not
/// provided, it is inferred via Inspect; see the `inspect_options`
/// property.
std::shared_ptr<Schema> schema = NULLPTR;
/// If the schema is not provided, it will be discovered by passing the
/// following options to `DatasetDiscovery::Inspect`.
InspectOptions inspect_options{};
/// Indicate if the given Schema (when specified), should be validated against
/// the fragments' schemas. `inspect_options` will control how many fragments
/// are checked.
bool validate_fragments = false;
};
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected
/// schema before materializing said Dataset.
class ARROW_DS_EXPORT DatasetFactory {
public:
/// \brief Get the schemas of the Fragments and Partitioning.
virtual Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) = 0;
/// \brief Get unified schema for the resulting Dataset.
Result<std::shared_ptr<Schema>> Inspect(InspectOptions options = {});
/// \brief Create a Dataset
Result<std::shared_ptr<Dataset>> Finish();
/// \brief Create a Dataset with the given schema (see \a FinishOptions::schema)
Result<std::shared_ptr<Dataset>> Finish(std::shared_ptr<Schema> schema);
/// \brief Create a Dataset with the given options
virtual Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) = 0;
/// \brief Optional root partition for the resulting Dataset.
const compute::Expression& root_partition() const { return root_partition_; }
/// \brief Set the root partition for the resulting Dataset.
Status SetRootPartition(compute::Expression partition) {
root_partition_ = std::move(partition);
return Status::OK();
}
virtual ~DatasetFactory() = default;
protected:
DatasetFactory();
compute::Expression root_partition_;
};
/// @}
/// \brief A DatasetFactory which aggregates the Datasets produced by a set of
/// child DatasetFactories.
/// \ingroup dataset-implementations
class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory {
public:
static Result<std::shared_ptr<DatasetFactory>> Make(
std::vector<std::shared_ptr<DatasetFactory>> factories);
/// \brief Return the list of child DatasetFactory
const std::vector<std::shared_ptr<DatasetFactory>>& factories() const {
return factories_;
}
/// \brief Get the schemas of the Datasets.
///
/// Instead of applying the options globally, they are applied to each child
/// factory individually. This will not respect `options.fragments` exactly, but
/// preserves the spirit of peeking at the first fragments, or at all of them.
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) override;
/// \brief Create a Dataset.
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
protected:
explicit UnionDatasetFactory(std::vector<std::shared_ptr<DatasetFactory>> factories);
std::vector<std::shared_ptr<DatasetFactory>> factories_;
};
/// \ingroup dataset-filesystem
struct FileSystemFactoryOptions {
/// Either an explicit Partitioning or a PartitioningFactory to discover one.
///
/// If a factory is provided, it will be used to infer a schema for partition fields
/// based on file and directory paths, and then to construct a Partitioning. The
/// default is a Partitioning which will yield no partition information.
///
/// The (explicit or discovered) partitioning will be applied to discovered files
/// and the resulting partition information embedded in the Dataset.
PartitioningOrFactory partitioning{Partitioning::Default()};
/// For the purposes of applying the partitioning, paths will be stripped
/// of the partition_base_dir. Files not matching the partition_base_dir
/// prefix will be skipped for partition discovery. The ignored files will still
/// be part of the Dataset, but will not have partition information.
///
/// Example:
/// partition_base_dir = "/dataset";
///
/// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
///
/// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
///
/// This is useful for partitionings which parse directory names and for which
/// ordering is important, e.g. DirectoryPartitioning.
std::string partition_base_dir;
/// Invalid files (via selector or explicitly) will be excluded by checking
/// with the FileFormat::IsSupported method. This will incur IO for each file
/// in a serial, single-threaded fashion. Disabling this feature will skip the
/// IO, but unsupported files may be present in the Dataset
/// (resulting in an error at scan time).
bool exclude_invalid_files = false;
/// When discovering from a Selector (and not from an explicit file list), ignore
/// files and directories matching any of these prefixes.
///
/// Example (with selector = "/dataset/**"):
/// selector_ignore_prefixes = {"_", ".DS_STORE" };
///
/// - "/dataset/data.csv" -> not ignored
/// - "/dataset/_metadata" -> ignored
/// - "/dataset/.DS_STORE" -> ignored
/// - "/dataset/_hidden/dat" -> ignored
/// - "/dataset/nested/.DS_STORE" -> ignored
std::vector<std::string> selector_ignore_prefixes = {
".",
"_",
};
};
/// \brief FileSystemDatasetFactory creates a Dataset from a vector of
/// fs::FileInfo or a fs::FileSelector.
/// \ingroup dataset-filesystem
class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {
public:
/// \brief Build a FileSystemDatasetFactory from an explicit list of
/// paths.
///
/// \param[in] filesystem passed to FileSystemDataset
/// \param[in] paths passed to FileSystemDataset
/// \param[in] format passed to FileSystemDataset
/// \param[in] options see FileSystemFactoryOptions for more information.
static Result<std::shared_ptr<DatasetFactory>> Make(
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
/// \brief Build a FileSystemDatasetFactory from a fs::FileSelector.
///
/// The selector will expand to a vector of FileInfo. The expansion/crawling
/// is performed in this function call. Thus, the finalized Dataset works
/// with a snapshot of the filesystem.
///
/// If options.partition_base_dir is not provided, it will be overwritten
/// with selector.base_dir.
///
/// \param[in] filesystem passed to FileSystemDataset
/// \param[in] selector used to crawl and search files
/// \param[in] format passed to FileSystemDataset
/// \param[in] options see FileSystemFactoryOptions for more information.
static Result<std::shared_ptr<DatasetFactory>> Make(
std::shared_ptr<fs::FileSystem> filesystem, fs::FileSelector selector,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
/// \brief Build a FileSystemDatasetFactory from a URI that includes filesystem
/// information.
///
/// \param[in] uri passed to FileSystemDataset
/// \param[in] format passed to FileSystemDataset
/// \param[in] options see FileSystemFactoryOptions for more information.
static Result<std::shared_ptr<DatasetFactory>> Make(std::string uri,
std::shared_ptr<FileFormat> format,
FileSystemFactoryOptions options);
/// \brief Build a FileSystemDatasetFactory from an explicit list of
/// file information.
///
/// \param[in] filesystem passed to FileSystemDataset
/// \param[in] files passed to FileSystemDataset
/// \param[in] format passed to FileSystemDataset
/// \param[in] options see FileSystemFactoryOptions for more information.
static Result<std::shared_ptr<DatasetFactory>> Make(
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<fs::FileInfo>& files,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) override;
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
protected:
FileSystemDatasetFactory(std::vector<fs::FileInfo> files,
std::shared_ptr<fs::FileSystem> filesystem,
std::shared_ptr<FileFormat> format,
FileSystemFactoryOptions options);
Result<std::shared_ptr<Schema>> PartitionSchema();
std::vector<fs::FileInfo> files_;
std::shared_ptr<fs::FileSystem> fs_;
std::shared_ptr<FileFormat> format_;
FileSystemFactoryOptions options_;
};
} // namespace dataset
} // namespace arrow
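A short sketch of discovery from a fs::FileSelector, assuming IPC-formatted files and Hive-style directory partitioning; both the format and the partitioning factory are illustrative choices, not implied by this header:

#include <memory>
#include <string>
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_ipc.h"
#include "arrow/dataset/partition.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/result.h"

arrow::Result<std::shared_ptr<arrow::dataset::Dataset>> DiscoverDataset(
    std::shared_ptr<arrow::fs::FileSystem> fs, const std::string& base_dir) {
  namespace ds = arrow::dataset;
  arrow::fs::FileSelector selector;
  selector.base_dir = base_dir;
  selector.recursive = true;
  ds::FileSystemFactoryOptions options;
  // Discover Hive-style key=value directories as partition fields.
  options.partitioning = ds::HivePartitioning::MakeFactory();
  ARROW_ASSIGN_OR_RAISE(
      auto factory,
      ds::FileSystemDatasetFactory::Make(std::move(fs), std::move(selector),
                                         std::make_shared<ds::IpcFileFormat>(), options));
  // Inspect() unifies the fragment schema(s) with the partitioning schema.
  ARROW_ASSIGN_OR_RAISE(auto schema, factory->Inspect());
  return factory->Finish(std::move(schema));
}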

View File

@@ -0,0 +1,433 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/partition.h"
#include "arrow/dataset/scanner.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/io/file.h"
#include "arrow/util/compression.h"
namespace arrow {
namespace dataset {
/// \defgroup dataset-file-formats File formats for reading and writing datasets
/// \defgroup dataset-filesystem File system datasets
///
/// @{
/// \brief The path and filesystem where an actual file is located, or a buffer which can
/// be read like a file.
class ARROW_DS_EXPORT FileSource {
public:
FileSource(std::string path, std::shared_ptr<fs::FileSystem> filesystem,
Compression::type compression = Compression::UNCOMPRESSED)
: file_info_(std::move(path)),
filesystem_(std::move(filesystem)),
compression_(compression) {}
FileSource(fs::FileInfo info, std::shared_ptr<fs::FileSystem> filesystem,
Compression::type compression = Compression::UNCOMPRESSED)
: file_info_(std::move(info)),
filesystem_(std::move(filesystem)),
compression_(compression) {}
explicit FileSource(std::shared_ptr<Buffer> buffer,
Compression::type compression = Compression::UNCOMPRESSED)
: buffer_(std::move(buffer)), compression_(compression) {}
using CustomOpen = std::function<Result<std::shared_ptr<io::RandomAccessFile>>()>;
explicit FileSource(CustomOpen open) : custom_open_(std::move(open)) {}
using CustomOpenWithCompression =
std::function<Result<std::shared_ptr<io::RandomAccessFile>>(Compression::type)>;
explicit FileSource(CustomOpenWithCompression open_with_compression,
Compression::type compression = Compression::UNCOMPRESSED)
: custom_open_(std::bind(std::move(open_with_compression), compression)),
compression_(compression) {}
explicit FileSource(std::shared_ptr<io::RandomAccessFile> file,
Compression::type compression = Compression::UNCOMPRESSED)
: custom_open_([=] { return ToResult(file); }), compression_(compression) {}
FileSource() : custom_open_(CustomOpen{&InvalidOpen}) {}
static std::vector<FileSource> FromPaths(const std::shared_ptr<fs::FileSystem>& fs,
std::vector<std::string> paths) {
std::vector<FileSource> sources;
for (auto&& path : paths) {
sources.emplace_back(std::move(path), fs);
}
return sources;
}
/// \brief Return the type of raw compression on the file, if any.
Compression::type compression() const { return compression_; }
/// \brief Return the file path, if any. Only valid when file source wraps a path.
const std::string& path() const {
static std::string buffer_path = "<Buffer>";
static std::string custom_open_path = "<Custom Open>";
return filesystem_ ? file_info_.path() : buffer_ ? buffer_path : custom_open_path;
}
/// \brief Return the filesystem, if any. Otherwise returns nullptr
const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
/// \brief Return the buffer containing the file, if any. Otherwise returns nullptr
const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
/// \brief Get a RandomAccessFile which views this file source
Result<std::shared_ptr<io::RandomAccessFile>> Open() const;
/// \brief Get an InputStream which views this file source (and decompresses if needed)
/// \param[in] compression If nullopt, guess the compression scheme from the
/// filename, else decompress with the given codec
Result<std::shared_ptr<io::InputStream>> OpenCompressed(
util::optional<Compression::type> compression = util::nullopt) const;
private:
static Result<std::shared_ptr<io::RandomAccessFile>> InvalidOpen() {
return Status::Invalid("Called Open() on an uninitialized FileSource");
}
fs::FileInfo file_info_;
std::shared_ptr<fs::FileSystem> filesystem_;
std::shared_ptr<Buffer> buffer_;
CustomOpen custom_open_;
Compression::type compression_ = Compression::UNCOMPRESSED;
};
/// \brief Base class for file format implementation
class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileFormat> {
public:
/// Options affecting how this format is scanned.
///
/// The options here can be overridden at scan time.
std::shared_ptr<FragmentScanOptions> default_fragment_scan_options;
virtual ~FileFormat() = default;
/// \brief The name identifying the kind of file format
virtual std::string type_name() const = 0;
virtual bool Equals(const FileFormat& other) const = 0;
/// \brief Indicate if the FileSource is supported/readable by this format.
virtual Result<bool> IsSupported(const FileSource& source) const = 0;
/// \brief Return the schema of the file if possible.
virtual Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const = 0;
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file) const = 0;
virtual Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options);
/// \brief Open a fragment
virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema);
/// \brief Create a FileFragment for a FileSource.
Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, compute::Expression partition_expression);
/// \brief Create a FileFragment for a FileSource.
Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, std::shared_ptr<Schema> physical_schema = NULLPTR);
/// \brief Create a writer for this format.
virtual Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const = 0;
/// \brief Get default write options for this format.
virtual std::shared_ptr<FileWriteOptions> DefaultWriteOptions() = 0;
};
/// \brief A Fragment that is stored in a file with a known format
class ARROW_DS_EXPORT FileFragment : public Fragment {
public:
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options) override;
Future<util::optional<int64_t>> CountRows(
compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
std::string type_name() const override { return format_->type_name(); }
std::string ToString() const override { return source_.path(); };
const FileSource& source() const { return source_; }
const std::shared_ptr<FileFormat>& format() const { return format_; }
protected:
FileFragment(FileSource source, std::shared_ptr<FileFormat> format,
compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema)
: Fragment(std::move(partition_expression), std::move(physical_schema)),
source_(std::move(source)),
format_(std::move(format)) {}
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
FileSource source_;
std::shared_ptr<FileFormat> format_;
friend class FileFormat;
};
/// \brief A Dataset of FileFragments.
///
/// A FileSystemDataset is composed of one or more FileFragments. The fragments
/// are independent and don't need to share the same format and/or filesystem.
class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
public:
/// \brief Create a FileSystemDataset.
///
/// \param[in] schema the schema of the dataset
/// \param[in] root_partition the partition expression of the dataset
/// \param[in] format the format of each FileFragment.
/// \param[in] filesystem the filesystem of each FileFragment, or nullptr if the
/// fragments wrap buffers.
/// \param[in] fragments list of fragments to create the dataset from.
/// \param[in] partitioning the Partitioning object in case the dataset is created
/// with a known partitioning (e.g. from a discovered partitioning
/// through a DatasetFactory), or nullptr if not known.
///
/// Note that fragments wrapping files resident in differing filesystems are not
/// permitted; to work with multiple filesystems use a UnionDataset.
///
/// \return A constructed dataset.
static Result<std::shared_ptr<FileSystemDataset>> Make(
std::shared_ptr<Schema> schema, compute::Expression root_partition,
std::shared_ptr<FileFormat> format, std::shared_ptr<fs::FileSystem> filesystem,
std::vector<std::shared_ptr<FileFragment>> fragments,
std::shared_ptr<Partitioning> partitioning = NULLPTR);
/// \brief Write a dataset.
static Status Write(const FileSystemDatasetWriteOptions& write_options,
std::shared_ptr<Scanner> scanner);
/// \brief Return the type name of the dataset.
std::string type_name() const override { return "filesystem"; }
/// \brief Replace the schema of the dataset.
Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const override;
/// \brief Return the path of files.
std::vector<std::string> files() const;
/// \brief Return the format.
const std::shared_ptr<FileFormat>& format() const { return format_; }
/// \brief Return the filesystem. May be nullptr if the fragments wrap buffers.
const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
/// \brief Return the partitioning. May be nullptr if the dataset was not constructed
/// with a partitioning.
const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
std::string ToString() const;
protected:
struct FragmentSubtrees;
explicit FileSystemDataset(std::shared_ptr<Schema> schema)
: Dataset(std::move(schema)) {}
FileSystemDataset(std::shared_ptr<Schema> schema,
compute::Expression partition_expression)
: Dataset(std::move(schema), partition_expression) {}
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
void SetupSubtreePruning();
std::shared_ptr<FileFormat> format_;
std::shared_ptr<fs::FileSystem> filesystem_;
std::vector<std::shared_ptr<FileFragment>> fragments_;
std::shared_ptr<Partitioning> partitioning_;
std::shared_ptr<FragmentSubtrees> subtrees_;
};
/// \brief Options for writing a file of this format.
class ARROW_DS_EXPORT FileWriteOptions {
public:
virtual ~FileWriteOptions() = default;
const std::shared_ptr<FileFormat>& format() const { return format_; }
std::string type_name() const { return format_->type_name(); }
protected:
explicit FileWriteOptions(std::shared_ptr<FileFormat> format)
: format_(std::move(format)) {}
std::shared_ptr<FileFormat> format_;
};
/// \brief A writer for this format.
class ARROW_DS_EXPORT FileWriter {
public:
virtual ~FileWriter() = default;
/// \brief Write the given batch.
virtual Status Write(const std::shared_ptr<RecordBatch>& batch) = 0;
/// \brief Write all batches from the reader.
Status Write(RecordBatchReader* batches);
/// \brief Indicate that writing is done.
virtual Future<> Finish();
const std::shared_ptr<FileFormat>& format() const { return options_->format(); }
const std::shared_ptr<Schema>& schema() const { return schema_; }
const std::shared_ptr<FileWriteOptions>& options() const { return options_; }
const fs::FileLocator& destination() const { return destination_locator_; }
protected:
FileWriter(std::shared_ptr<Schema> schema, std::shared_ptr<FileWriteOptions> options,
std::shared_ptr<io::OutputStream> destination,
fs::FileLocator destination_locator)
: schema_(std::move(schema)),
options_(std::move(options)),
destination_(std::move(destination)),
destination_locator_(std::move(destination_locator)) {}
virtual Future<> FinishInternal() = 0;
std::shared_ptr<Schema> schema_;
std::shared_ptr<FileWriteOptions> options_;
std::shared_ptr<io::OutputStream> destination_;
fs::FileLocator destination_locator_;
};
/// \brief Options for writing a dataset.
struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
/// Options for individual fragment writing.
std::shared_ptr<FileWriteOptions> file_write_options;
/// FileSystem into which a dataset will be written.
std::shared_ptr<fs::FileSystem> filesystem;
/// Root directory into which the dataset will be written.
std::string base_dir;
/// Partitioning used to generate fragment paths.
std::shared_ptr<Partitioning> partitioning;
/// Maximum number of partitions any batch may be written into, default is 1K.
int max_partitions = 1024;
/// Template string used to generate fragment basenames.
/// {i} will be replaced by an auto incremented integer.
std::string basename_template;
/// If greater than 0 then this will limit the maximum number of files that can be left
/// open. If an attempt is made to open too many files then the least recently used file
/// will be closed. If this setting is set too low you may end up fragmenting your data
/// into many small files.
///
/// The default is 900, which also allows some number of files to be open by the
/// scanner before hitting the default Linux limit of 1024.
uint32_t max_open_files = 900;
/// If greater than 0 then this will limit how many rows are placed in any single file.
/// Otherwise there will be no limit and one file will be created in each output
/// directory unless files need to be closed to respect max_open_files
uint64_t max_rows_per_file = 0;
/// If greater than 0 then this will cause the dataset writer to batch incoming data
/// and only write the row groups to the disk when sufficient rows have accumulated.
/// The final row group size may be less than this value, and other options such as
/// `max_open_files` or `max_rows_per_file` may lead to smaller row group sizes.
uint64_t min_rows_per_group = 0;
/// If greater than 0 then the dataset writer may split up large incoming batches into
/// multiple row groups. If this value is set then min_rows_per_group should also be
/// set or else you may end up with very small row groups (e.g. if the incoming row
/// group size is just barely larger than this value).
uint64_t max_rows_per_group = 1 << 20;
/// Controls what happens if an output directory already exists.
ExistingDataBehavior existing_data_behavior = ExistingDataBehavior::kError;
/// \brief If false the dataset writer will not create directories.
/// This is mainly intended for filesystems that do not require directories, such as S3.
bool create_dir = true;
/// Callback to be invoked against all FileWriters before
/// they are finalized with FileWriter::Finish().
std::function<Status(FileWriter*)> writer_pre_finish = [](FileWriter*) {
return Status::OK();
};
/// Callback to be invoked against all FileWriters after they have
/// called FileWriter::Finish().
std::function<Status(FileWriter*)> writer_post_finish = [](FileWriter*) {
return Status::OK();
};
const std::shared_ptr<FileFormat>& format() const {
return file_write_options->format();
}
};
/// \brief Wraps FileSystemDatasetWriteOptions for consumption as compute::ExecNodeOptions
class ARROW_DS_EXPORT WriteNodeOptions : public compute::ExecNodeOptions {
public:
explicit WriteNodeOptions(
FileSystemDatasetWriteOptions options,
std::shared_ptr<const KeyValueMetadata> custom_metadata = NULLPTR)
: write_options(std::move(options)), custom_metadata(std::move(custom_metadata)) {}
/// \brief Options to control how to write the dataset
FileSystemDatasetWriteOptions write_options;
/// \brief Optional metadata to attach to written batches
std::shared_ptr<const KeyValueMetadata> custom_metadata;
};
/// @}
namespace internal {
ARROW_DS_EXPORT void InitializeDatasetWriter(
arrow::compute::ExecFactoryRegistry* registry);
}
} // namespace dataset
} // namespace arrow
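A brief sketch of a dataset write using FileSystemDatasetWriteOptions, assuming a Parquet format and an already-built Scanner; the basename template and existing-data behavior shown are illustrative choices:

#include <memory>
#include <string>
#include <utility>
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/partition.h"
#include "arrow/dataset/scanner.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/status.h"

arrow::Status WritePartitionedDataset(
    std::shared_ptr<arrow::dataset::Scanner> scanner,
    std::shared_ptr<arrow::fs::FileSystem> filesystem, const std::string& base_dir,
    std::shared_ptr<arrow::dataset::Partitioning> partitioning) {
  namespace ds = arrow::dataset;
  auto format = std::make_shared<ds::ParquetFileFormat>();
  ds::FileSystemDatasetWriteOptions write_options;
  write_options.file_write_options = format->DefaultWriteOptions();
  write_options.filesystem = std::move(filesystem);
  write_options.base_dir = base_dir;
  write_options.partitioning = std::move(partitioning);
  write_options.basename_template = "part-{i}.parquet";  // {i} is auto-incremented
  write_options.existing_data_behavior = ds::ExistingDataBehavior::kOverwriteOrIgnore;
  return ds::FileSystemDataset::Write(write_options, std::move(scanner));
}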

View File

@@ -0,0 +1,118 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include "arrow/csv/options.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/status.h"
#include "arrow/util/compression.h"
namespace arrow {
namespace dataset {
constexpr char kCsvTypeName[] = "csv";
/// \addtogroup dataset-file-formats
///
/// @{
/// \brief A FileFormat implementation that reads from and writes to CSV files
class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
public:
/// Options affecting the parsing of CSV files
csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
std::string type_name() const override { return kCsvTypeName; }
bool Equals(const FileFormat& other) const override;
Result<bool> IsSupported(const FileSource& source) const override;
/// \brief Return the schema of the file if possible.
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& scan_options,
const std::shared_ptr<FileFragment>& file) const override;
Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const override;
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief Per-scan options for CSV fragments
struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions {
std::string type_name() const override { return kCsvTypeName; }
/// CSV conversion options
csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults();
/// CSV reading options
///
/// Note that use_threads is always ignored.
csv::ReadOptions read_options = csv::ReadOptions::Defaults();
};
class ARROW_DS_EXPORT CsvFileWriteOptions : public FileWriteOptions {
public:
/// Options passed to csv::MakeCSVWriter.
std::shared_ptr<csv::WriteOptions> write_options;
protected:
using FileWriteOptions::FileWriteOptions;
friend class CsvFileFormat;
};
class ARROW_DS_EXPORT CsvFileWriter : public FileWriter {
public:
Status Write(const std::shared_ptr<RecordBatch>& batch) override;
private:
CsvFileWriter(std::shared_ptr<io::OutputStream> destination,
std::shared_ptr<ipc::RecordBatchWriter> writer,
std::shared_ptr<Schema> schema,
std::shared_ptr<CsvFileWriteOptions> options,
fs::FileLocator destination_locator);
Future<> FinishInternal() override;
std::shared_ptr<io::OutputStream> destination_;
std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;
friend class CsvFileFormat;
};
/// @}
} // namespace dataset
} // namespace arrow
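A small sketch of combining format-level parse options with per-scan conversion options, assuming semicolon-delimited files as an example:

#include <memory>
#include "arrow/csv/options.h"
#include "arrow/dataset/file_csv.h"

// Returns a CsvFileFormat configured for semicolon-delimited files, with
// conversion options attached as the format's default fragment scan options.
std::shared_ptr<arrow::dataset::CsvFileFormat> MakeSemicolonCsvFormat() {
  auto format = std::make_shared<arrow::dataset::CsvFileFormat>();
  format->parse_options.delimiter = ';';
  auto scan_options = std::make_shared<arrow::dataset::CsvFragmentScanOptions>();
  scan_options->convert_options.strings_can_be_null = true;
  // default_fragment_scan_options is inherited from FileFormat and may be
  // overridden at scan time.
  format->default_fragment_scan_options = std::move(scan_options);
  return format;
}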

View File

@@ -0,0 +1,120 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/type_fwd.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/result.h"
namespace arrow {
namespace dataset {
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kIpcTypeName[] = "ipc";
/// \brief A FileFormat implementation that reads from and writes to Arrow IPC files
class ARROW_DS_EXPORT IpcFileFormat : public FileFormat {
public:
std::string type_name() const override { return kIpcTypeName; }
bool Equals(const FileFormat& other) const override {
return type_name() == other.type_name();
}
Result<bool> IsSupported(const FileSource& source) const override;
/// \brief Return the schema of the file if possible.
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file) const override;
Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const override;
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief Per-scan options for IPC fragments
class ARROW_DS_EXPORT IpcFragmentScanOptions : public FragmentScanOptions {
public:
std::string type_name() const override { return kIpcTypeName; }
/// Options passed to the IPC file reader.
/// included_fields, memory_pool, and use_threads are ignored.
std::shared_ptr<ipc::IpcReadOptions> options;
/// If present, the async scanner will enable I/O coalescing.
/// This is ignored by the sync scanner.
std::shared_ptr<io::CacheOptions> cache_options;
};
class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions {
public:
/// Options passed to ipc::MakeFileWriter. use_threads is ignored
std::shared_ptr<ipc::IpcWriteOptions> options;
/// custom_metadata written to the file's footer
std::shared_ptr<const KeyValueMetadata> metadata;
protected:
using FileWriteOptions::FileWriteOptions;
friend class IpcFileFormat;
};
class ARROW_DS_EXPORT IpcFileWriter : public FileWriter {
public:
Status Write(const std::shared_ptr<RecordBatch>& batch) override;
private:
IpcFileWriter(std::shared_ptr<io::OutputStream> destination,
std::shared_ptr<ipc::RecordBatchWriter> writer,
std::shared_ptr<Schema> schema,
std::shared_ptr<IpcFileWriteOptions> options,
fs::FileLocator destination_locator);
Future<> FinishInternal() override;
std::shared_ptr<io::OutputStream> destination_;
std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;
friend class IpcFileFormat;
};
/// @}
} // namespace dataset
} // namespace arrow
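A small sketch of per-scan IPC options with I/O coalescing enabled, assuming the defaults from arrow/ipc/options.h and arrow/io/caching.h:

#include <memory>
#include "arrow/dataset/file_ipc.h"
#include "arrow/io/caching.h"
#include "arrow/ipc/options.h"

// Configure IPC scanning with lazily coalesced reads; as noted above, the
// cache options are consulted by the async scanner only.
std::shared_ptr<arrow::dataset::IpcFragmentScanOptions> MakeIpcScanOptions() {
  auto scan_options = std::make_shared<arrow::dataset::IpcFragmentScanOptions>();
  scan_options->options = std::make_shared<arrow::ipc::IpcReadOptions>(
      arrow::ipc::IpcReadOptions::Defaults());
  scan_options->cache_options = std::make_shared<arrow::io::CacheOptions>(
      arrow::io::CacheOptions::LazyDefaults());
  return scan_options;
}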

View File

@@ -0,0 +1,73 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/type_fwd.h"
#include "arrow/result.h"
namespace arrow {
namespace dataset {
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kOrcTypeName[] = "orc";
/// \brief A FileFormat implementation that reads from and writes to ORC files
class ARROW_DS_EXPORT OrcFileFormat : public FileFormat {
public:
std::string type_name() const override { return kOrcTypeName; }
bool Equals(const FileFormat& other) const override {
return type_name() == other.type_name();
}
Result<bool> IsSupported(const FileSource& source) const override;
/// \brief Return the schema of the file if possible.
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file) const override;
Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const override;
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,373 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/caching.h"
#include "arrow/util/optional.h"
namespace parquet {
class ParquetFileReader;
class Statistics;
class ColumnChunkMetaData;
class RowGroupMetaData;
class FileMetaData;
class FileDecryptionProperties;
class FileEncryptionProperties;
class ReaderProperties;
class ArrowReaderProperties;
class WriterProperties;
class ArrowWriterProperties;
namespace arrow {
class FileReader;
class FileWriter;
struct SchemaManifest;
} // namespace arrow
} // namespace parquet
namespace arrow {
namespace dataset {
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kParquetTypeName[] = "parquet";
/// \brief A FileFormat implementation that reads from and writes to Parquet files
class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
public:
ParquetFileFormat() = default;
/// Convenience constructor which copies properties from a parquet::ReaderProperties.
/// memory_pool will be ignored.
explicit ParquetFileFormat(const parquet::ReaderProperties& reader_properties);
std::string type_name() const override { return kParquetTypeName; }
bool Equals(const FileFormat& other) const override;
struct ReaderOptions {
/// \defgroup parquet-file-format-arrow-reader-properties properties which correspond
/// to members of parquet::ArrowReaderProperties.
///
/// We don't embed parquet::ReaderProperties directly because column names (rather
/// than indices) are used to indicate dictionary columns, and other options are
/// deferred to scan time.
///
/// @{
std::unordered_set<std::string> dict_columns;
arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO;
/// @}
} reader_options;
Result<bool> IsSupported(const FileSource& source) const override;
/// \brief Return the schema of the file if possible.
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file) const override;
Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
using FileFormat::MakeFragment;
/// \brief Create a Fragment targeting all RowGroups.
Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema) override;
/// \brief Create a Fragment, restricted to the specified row groups.
Result<std::shared_ptr<ParquetFileFragment>> MakeFragment(
FileSource source, compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema, std::vector<int> row_groups);
/// \brief Return a FileReader on the given source.
Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;
Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;
Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const override;
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief A FileFragment with parquet logic.
///
/// ParquetFileFragment provides a lazy (with respect to IO) interface to
/// scan parquet files. Any heavy IO calls are deferred to the Scan() method.
///
/// The caller can provide an optional list of selected RowGroups to limit the
/// number of scanned RowGroups, or to partition the scans across multiple
/// threads.
///
/// Metadata can be explicitly provided, enabling predicate pushdown benefits without
/// the potentially heavy IO of loading metadata from the file system. This can yield a
/// significant performance boost when scanning high-latency file systems.
class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
public:
Result<FragmentVector> SplitByRowGroup(compute::Expression predicate);
/// \brief Return the RowGroups selected by this fragment.
const std::vector<int>& row_groups() const {
if (row_groups_) return *row_groups_;
static std::vector<int> empty;
return empty;
}
/// \brief Return the FileMetaData associated with this fragment.
const std::shared_ptr<parquet::FileMetaData>& metadata() const { return metadata_; }
/// \brief Ensure this fragment's FileMetaData is in memory.
Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR);
/// \brief Return a fragment which selects a filtered subset of this fragment's RowGroups.
Result<std::shared_ptr<Fragment>> Subset(compute::Expression predicate);
Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);
private:
ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema,
util::optional<std::vector<int>> row_groups);
Status SetMetadata(std::shared_ptr<parquet::FileMetaData> metadata,
std::shared_ptr<parquet::arrow::SchemaManifest> manifest);
// Overridden to opportunistically set metadata since a reader must be opened anyway.
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override {
ARROW_RETURN_NOT_OK(EnsureCompleteMetadata());
return physical_schema_;
}
/// Return a filtered subset of row group indices.
Result<std::vector<int>> FilterRowGroups(compute::Expression predicate);
/// Simplify the predicate against the statistics of each row group.
Result<std::vector<compute::Expression>> TestRowGroups(compute::Expression predicate);
/// Try to count rows matching the predicate using metadata. Expects
/// metadata to be present, and expects the predicate to have been
/// simplified against the partition expression already.
Result<util::optional<int64_t>> TryCountRows(compute::Expression predicate);
ParquetFileFormat& parquet_format_;
/// Indices of row groups selected by this fragment,
/// or util::nullopt if all row groups are selected.
util::optional<std::vector<int>> row_groups_;
std::vector<compute::Expression> statistics_expressions_;
std::vector<bool> statistics_expressions_complete_;
std::shared_ptr<parquet::FileMetaData> metadata_;
std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
friend class ParquetFileFormat;
friend class ParquetDatasetFactory;
};
/// \brief Per-scan options for Parquet fragments
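///
/// \par Example
/// A minimal sketch of tuning Parquet-specific reads; it assumes the properties
/// below are initialized to non-null defaults by the constructor.
/// \code{.cpp}
/// auto parquet_scan_options = std::make_shared<ParquetFragmentScanOptions>();
/// // Read through a buffered input stream.
/// parquet_scan_options->reader_properties->enable_buffered_stream();
/// // Let the Parquet reader decode columns in parallel.
/// parquet_scan_options->arrow_reader_properties->set_use_threads(true);
/// \endcode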
class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions {
public:
ParquetFragmentScanOptions();
std::string type_name() const override { return kParquetTypeName; }
/// Reader properties. Not all properties are respected: memory_pool comes from
/// ScanOptions.
std::shared_ptr<parquet::ReaderProperties> reader_properties;
/// Arrow reader properties. Not all properties are respected: batch_size comes from
/// ScanOptions. Additionally, dictionary columns come from
/// ParquetFileFormat::ReaderOptions::dict_columns.
std::shared_ptr<parquet::ArrowReaderProperties> arrow_reader_properties;
};
class ARROW_DS_EXPORT ParquetFileWriteOptions : public FileWriteOptions {
public:
/// \brief Parquet writer properties.
std::shared_ptr<parquet::WriterProperties> writer_properties;
/// \brief Parquet Arrow writer properties.
std::shared_ptr<parquet::ArrowWriterProperties> arrow_writer_properties;
protected:
using FileWriteOptions::FileWriteOptions;
friend class ParquetFileFormat;
};
class ARROW_DS_EXPORT ParquetFileWriter : public FileWriter {
public:
const std::shared_ptr<parquet::arrow::FileWriter>& parquet_writer() const {
return parquet_writer_;
}
Status Write(const std::shared_ptr<RecordBatch>& batch) override;
private:
ParquetFileWriter(std::shared_ptr<io::OutputStream> destination,
std::shared_ptr<parquet::arrow::FileWriter> writer,
std::shared_ptr<ParquetFileWriteOptions> options,
fs::FileLocator destination_locator);
Future<> FinishInternal() override;
std::shared_ptr<parquet::arrow::FileWriter> parquet_writer_;
friend class ParquetFileFormat;
};
/// \brief Options for making a FileSystemDataset from a Parquet _metadata file.
struct ParquetFactoryOptions {
/// Either an explicit Partitioning or a PartitioningFactory to discover one.
///
/// If a factory is provided, it will be used to infer a schema for partition fields
/// based on file and directory paths, and then construct a Partitioning. The default
/// is a Partitioning which will yield no partition information.
///
/// The (explicit or discovered) partitioning will be applied to discovered files
/// and the resulting partition information embedded in the Dataset.
PartitioningOrFactory partitioning{Partitioning::Default()};
/// For the purposes of applying the partitioning, paths will be stripped
/// of the partition_base_dir. Files not matching the partition_base_dir
/// prefix will be skipped for partition discovery. The ignored files will still
/// be part of the Dataset, but will not have partition information.
///
/// Example:
/// partition_base_dir = "/dataset";
///
/// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
///
/// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
///
/// This is useful for partitionings which parse directory names where ordering
/// is important, e.g. DirectoryPartitioning.
std::string partition_base_dir;
/// Assert that all ColumnChunk paths are consistent. The parquet spec allows for
/// ColumnChunk data to be stored in multiple files, but ParquetDatasetFactory
/// supports only a single file with all ColumnChunk data. If this flag is set
/// construction of a ParquetDatasetFactory will raise an error if ColumnChunk
/// data is not resident in a single file.
bool validate_column_chunk_paths = false;
};
/// \brief Create FileSystemDataset from custom `_metadata` cache file.
///
/// Dask and other systems will generate a cache metadata file by concatenating
/// the RowGroupMetaData of multiple parquet files into a single parquet file
/// that only contains metadata and no ColumnChunk data.
///
/// ParquetDatasetFactory creates a FileSystemDataset composed of
/// ParquetFileFragments, where each fragment is pre-populated with the exact
/// number of row groups and statistics for each column.
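///
/// \par Example
/// A minimal sketch, assuming a local filesystem and a Dask-style `_metadata` file;
/// the paths are hypothetical and error handling is abbreviated.
/// \code{.cpp}
/// auto fs = std::make_shared<fs::LocalFileSystem>();
/// auto format = std::make_shared<ParquetFileFormat>();
/// ParquetFactoryOptions factory_options;
/// factory_options.partition_base_dir = "/data/sales";
/// ARROW_ASSIGN_OR_RAISE(auto factory,
///                       ParquetDatasetFactory::Make("/data/sales/_metadata", fs,
///                                                   format, factory_options));
/// ARROW_ASSIGN_OR_RAISE(auto dataset, factory->Finish());
/// \endcode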
class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory {
public:
/// \brief Create a ParquetDatasetFactory from a metadata path.
///
/// The `metadata_path` will be read from `filesystem`. Each RowGroup
/// contained in the metadata file will be relative to `dirname(metadata_path)`.
///
/// \param[in] metadata_path path of the metadata parquet file
/// \param[in] filesystem from which to open/read the path
/// \param[in] format to read the file with.
/// \param[in] options see ParquetFactoryOptions
static Result<std::shared_ptr<DatasetFactory>> Make(
const std::string& metadata_path, std::shared_ptr<fs::FileSystem> filesystem,
std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);
/// \brief Create a ParquetDatasetFactory from a metadata source.
///
/// Similar to the previous Make definition, but the metadata can be any FileSource
/// (e.g. backed by a Buffer) and the base_path is given explicitly instead of being
/// inferred from the metadata path.
///
/// \param[in] metadata source to open the metadata parquet file from
/// \param[in] base_path used as the prefix of every parquet file referenced
/// \param[in] filesystem from which to read the files referenced.
/// \param[in] format to read the file with.
/// \param[in] options see ParquetFactoryOptions
static Result<std::shared_ptr<DatasetFactory>> Make(
const FileSource& metadata, const std::string& base_path,
std::shared_ptr<fs::FileSystem> filesystem,
std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) override;
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
protected:
ParquetDatasetFactory(
std::shared_ptr<fs::FileSystem> filesystem,
std::shared_ptr<ParquetFileFormat> format,
std::shared_ptr<parquet::FileMetaData> metadata,
std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
std::shared_ptr<Schema> physical_schema, std::string base_path,
ParquetFactoryOptions options,
std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids)
: filesystem_(std::move(filesystem)),
format_(std::move(format)),
metadata_(std::move(metadata)),
manifest_(std::move(manifest)),
physical_schema_(std::move(physical_schema)),
base_path_(std::move(base_path)),
options_(std::move(options)),
paths_with_row_group_ids_(std::move(paths_with_row_group_ids)) {}
std::shared_ptr<fs::FileSystem> filesystem_;
std::shared_ptr<ParquetFileFormat> format_;
std::shared_ptr<parquet::FileMetaData> metadata_;
std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
std::shared_ptr<Schema> physical_schema_;
std::string base_path_;
ParquetFactoryOptions options_;
std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids_;
private:
Result<std::vector<std::shared_ptr<FileFragment>>> CollectParquetFragments(
const Partitioning& partitioning);
Result<std::shared_ptr<Schema>> PartitionSchema();
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,409 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <iosfwd>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "arrow/compute/exec/expression.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/util/optional.h"
namespace arrow {
namespace dataset {
constexpr char kFilenamePartitionSep = '_';
// ----------------------------------------------------------------------
// Partitioning
/// \defgroup dataset-partitioning Partitioning API
///
/// @{
/// \brief Interface for parsing partition expressions from string partition
/// identifiers.
///
/// For example, the identifier "foo=5" might be parsed to an equality expression
/// between the "foo" field and the value 5.
///
/// Some partitionings may store the field names in a metadata
/// store instead of in file paths, for example
/// dataset_root/2009/11/... could be used when the partition fields
/// are "year" and "month"
///
/// Paths are consumed from left to right. Paths must be relative to
/// the root of a partition; path prefixes must be removed before passing
/// the path to a partitioning for parsing.
class ARROW_DS_EXPORT Partitioning {
public:
virtual ~Partitioning() = default;
/// \brief The name identifying the kind of partitioning
virtual std::string type_name() const = 0;
/// \brief If the input batch shares any fields with this partitioning,
/// produce sub-batches which satisfy mutually exclusive Expressions.
struct PartitionedBatches {
RecordBatchVector batches;
std::vector<compute::Expression> expressions;
};
virtual Result<PartitionedBatches> Partition(
const std::shared_ptr<RecordBatch>& batch) const = 0;
/// \brief Parse a path into a partition expression
virtual Result<compute::Expression> Parse(const std::string& path) const = 0;
struct PartitionPathFormat {
std::string directory, prefix;
};
virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;
/// \brief A default Partitioning which always yields scalar(true)
static std::shared_ptr<Partitioning> Default();
/// \brief The partition schema.
const std::shared_ptr<Schema>& schema() { return schema_; }
protected:
explicit Partitioning(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
std::shared_ptr<Schema> schema_;
};
/// \brief The encoding of partition segments.
enum class SegmentEncoding : int8_t {
/// No encoding.
None = 0,
/// Segment values are URL-encoded.
Uri = 1,
};
ARROW_DS_EXPORT
std::ostream& operator<<(std::ostream& os, SegmentEncoding segment_encoding);
/// \brief Options for key-value based partitioning (hive/directory).
struct ARROW_DS_EXPORT KeyValuePartitioningOptions {
/// After splitting a path into components, decode the path components
/// before parsing according to this scheme.
SegmentEncoding segment_encoding = SegmentEncoding::Uri;
};
/// \brief Options for inferring a partitioning.
struct ARROW_DS_EXPORT PartitioningFactoryOptions {
/// When inferring a schema for partition fields, yield dictionary encoded types
/// instead of plain. This can be more efficient when materializing virtual
/// columns, and Expressions parsed by the finished Partitioning will include
/// dictionaries of all unique inspected values for each field.
bool infer_dictionary = false;
/// Optionally, an expected schema can be provided, in which case inference
/// will only check discovered fields against the schema and update internal
/// state (such as dictionaries).
std::shared_ptr<Schema> schema;
/// After splitting a path into components, decode the path components
/// before parsing according to this scheme.
SegmentEncoding segment_encoding = SegmentEncoding::Uri;
KeyValuePartitioningOptions AsPartitioningOptions() const;
};
/// \brief Options for inferring a hive-style partitioning.
struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions {
/// The hive partitioning scheme maps null to a hard coded fallback string.
std::string null_fallback;
HivePartitioningOptions AsHivePartitioningOptions() const;
};
/// \brief PartitioningFactory provides creation of a partitioning when the
/// specific schema must be inferred from available paths (no explicit schema is known).
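///
/// \par Example
/// A minimal sketch of inferring a directory partitioning from observed paths; the
/// paths are hypothetical and assumed to be relative to the partition root.
/// \code{.cpp}
/// auto factory = DirectoryPartitioning::MakeFactory({"year", "month"});
/// std::vector<std::string> paths = {"2009/11", "2009/12", "2010/01"};
/// // Infer the partition field types from the paths...
/// ARROW_ASSIGN_OR_RAISE(auto inferred_schema, factory->Inspect(paths));
/// // ...then freeze the result into a usable Partitioning.
/// ARROW_ASSIGN_OR_RAISE(auto partitioning, factory->Finish(inferred_schema));
/// \endcode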
class ARROW_DS_EXPORT PartitioningFactory {
public:
virtual ~PartitioningFactory() = default;
/// \brief The name identifying the kind of partitioning
virtual std::string type_name() const = 0;
/// Get the schema for the resulting Partitioning.
/// This may reset internal state, for example dictionaries of unique representations.
virtual Result<std::shared_ptr<Schema>> Inspect(
const std::vector<std::string>& paths) = 0;
/// Create a partitioning using the provided schema
/// (fields may be dropped).
virtual Result<std::shared_ptr<Partitioning>> Finish(
const std::shared_ptr<Schema>& schema) const = 0;
};
/// \brief Subclass for the common case of a partitioning which yields an equality
/// expression for each segment
class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
public:
/// An unconverted equality expression consisting of a field name and the representation
/// of a scalar value
struct Key {
std::string name;
util::optional<std::string> value;
};
Result<PartitionedBatches> Partition(
const std::shared_ptr<RecordBatch>& batch) const override;
Result<compute::Expression> Parse(const std::string& path) const override;
Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;
const ArrayVector& dictionaries() const { return dictionaries_; }
protected:
KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
KeyValuePartitioningOptions options)
: Partitioning(std::move(schema)),
dictionaries_(std::move(dictionaries)),
options_(options) {
if (dictionaries_.empty()) {
dictionaries_.resize(schema_->num_fields());
}
}
virtual Result<std::vector<Key>> ParseKeys(const std::string& path) const = 0;
virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;
/// Convert a Key to a full expression.
Result<compute::Expression> ConvertKey(const Key& key) const;
Result<std::vector<std::string>> FormatPartitionSegments(
const ScalarVector& values) const;
Result<std::vector<Key>> ParsePartitionSegments(
const std::vector<std::string>& segments) const;
ArrayVector dictionaries_;
KeyValuePartitioningOptions options_;
};
/// \brief DirectoryPartitioning parses one segment of a path for each field in its
/// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse
/// must contain segments for each field.
///
/// For example given schema<year:int16, month:int8> the path "/2009/11" would be
/// parsed to ("year"_ == 2009 and "month"_ == 11)
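///
/// \par Example
/// A minimal sketch matching the schema above; error handling is abbreviated.
/// \code{.cpp}
/// DirectoryPartitioning partitioning(
///     schema({field("year", int16()), field("month", int8())}));
/// // Yields ("year"_ == 2009 and "month"_ == 11).
/// ARROW_ASSIGN_OR_RAISE(auto expr, partitioning.Parse("/2009/11"));
/// \endcode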
class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
public:
/// If a field in schema is of dictionary type, the corresponding element of
/// dictionaries must contain the dictionary of values for that field.
explicit DirectoryPartitioning(std::shared_ptr<Schema> schema,
ArrayVector dictionaries = {},
KeyValuePartitioningOptions options = {});
std::string type_name() const override { return "directory"; }
/// \brief Create a factory for a directory partitioning.
///
/// \param[in] field_names The names for the partition fields. Types will be
/// inferred.
static std::shared_ptr<PartitioningFactory> MakeFactory(
std::vector<std::string> field_names, PartitioningFactoryOptions = {});
private:
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief The default fallback used for null values in a Hive-style partitioning.
static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__";
struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions {
std::string null_fallback = kDefaultHiveNullFallback;
static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) {
HivePartitioningOptions options;
options.null_fallback = std::move(fallback);
return options;
}
};
/// \brief Multi-level, directory based partitioning
/// originating from Apache Hive with all data files stored in the
/// leaf directories. Data is partitioned by static values of a
/// particular column in the schema. Partition keys are represented in
/// the form $key=$value in directory names.
/// Field order is ignored, as are missing or unrecognized field names.
///
/// For example given schema<year:int16, month:int8, day:int8> the path
/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321)
class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
public:
/// If a field in schema is of dictionary type, the corresponding element of
/// dictionaries must contain the dictionary of values for that field.
explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries = {},
std::string null_fallback = kDefaultHiveNullFallback)
: KeyValuePartitioning(std::move(schema), std::move(dictionaries),
KeyValuePartitioningOptions()),
hive_options_(
HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) {
}
explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
HivePartitioningOptions options)
: KeyValuePartitioning(std::move(schema), std::move(dictionaries), options),
hive_options_(options) {}
std::string type_name() const override { return "hive"; }
std::string null_fallback() const { return hive_options_.null_fallback; }
const HivePartitioningOptions& options() const { return hive_options_; }
static Result<util::optional<Key>> ParseKey(const std::string& segment,
const HivePartitioningOptions& options);
/// \brief Create a factory for a hive partitioning.
static std::shared_ptr<PartitioningFactory> MakeFactory(
HivePartitioningFactoryOptions = {});
private:
const HivePartitioningOptions hive_options_;
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief Implementation provided by lambda or other callable
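///
/// \par Example
/// A minimal sketch of a partitioning whose parsing logic is supplied by a lambda;
/// the single-segment "<year>" path layout is hypothetical.
/// \code{.cpp}
/// auto partitioning = std::make_shared<FunctionPartitioning>(
///     schema({field("year", int32())}),
///     [](const std::string& path) -> Result<compute::Expression> {
///       // Interpret the path segment as a year.
///       int32_t year = static_cast<int32_t>(std::atoi(path.c_str()));
///       return compute::equal(compute::field_ref("year"), compute::literal(year));
///     });
/// \endcode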
class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
public:
using ParseImpl = std::function<Result<compute::Expression>(const std::string&)>;
using FormatImpl =
std::function<Result<PartitionPathFormat>(const compute::Expression&)>;
FunctionPartitioning(std::shared_ptr<Schema> schema, ParseImpl parse_impl,
FormatImpl format_impl = NULLPTR, std::string name = "function")
: Partitioning(std::move(schema)),
parse_impl_(std::move(parse_impl)),
format_impl_(std::move(format_impl)),
name_(std::move(name)) {}
std::string type_name() const override { return name_; }
Result<compute::Expression> Parse(const std::string& path) const override {
return parse_impl_(path);
}
Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
if (format_impl_) {
return format_impl_(expr);
}
return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning");
}
Result<PartitionedBatches> Partition(
const std::shared_ptr<RecordBatch>& batch) const override {
return Status::NotImplemented("partitioning batches from ", type_name(),
" Partitioning");
}
private:
ParseImpl parse_impl_;
FormatImpl format_impl_;
std::string name_;
};
class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
public:
/// \brief Construct a FilenamePartitioning from its components.
///
/// If a field in schema is of dictionary type, the corresponding element of
/// dictionaries must contain the dictionary of values for that field.
explicit FilenamePartitioning(std::shared_ptr<Schema> schema,
ArrayVector dictionaries = {},
KeyValuePartitioningOptions options = {});
std::string type_name() const override { return "filename"; }
/// \brief Create a factory for a filename partitioning.
///
/// \param[in] field_names The names for the partition fields. Types will be
/// inferred.
static std::shared_ptr<PartitioningFactory> MakeFactory(
std::vector<std::string> field_names, PartitioningFactoryOptions = {});
private:
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief Remove a prefix and the filename of a path.
///
/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"`
ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
const std::string& prefix);
/// \brief Vector version of StripPrefixAndFilename.
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
const std::vector<std::string>& paths, const std::string& prefix);
/// \brief Vector version of StripPrefixAndFilename.
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
const std::vector<fs::FileInfo>& files, const std::string& prefix);
/// \brief Either a Partitioning or a PartitioningFactory
class ARROW_DS_EXPORT PartitioningOrFactory {
public:
explicit PartitioningOrFactory(std::shared_ptr<Partitioning> partitioning)
: partitioning_(std::move(partitioning)) {}
explicit PartitioningOrFactory(std::shared_ptr<PartitioningFactory> factory)
: factory_(std::move(factory)) {}
PartitioningOrFactory& operator=(std::shared_ptr<Partitioning> partitioning) {
return *this = PartitioningOrFactory(std::move(partitioning));
}
PartitioningOrFactory& operator=(std::shared_ptr<PartitioningFactory> factory) {
return *this = PartitioningOrFactory(std::move(factory));
}
/// \brief The partitioning (if given).
const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
/// \brief The partition factory (if given).
const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }
/// \brief Get the partition schema, inferring it with the given factory if needed.
Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);
private:
std::shared_ptr<PartitioningFactory> factory_;
std::shared_ptr<Partitioning> partitioning_;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,27 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Often-used headers, for precompiling.
// If updating this header, please make sure you check compilation speed
// before checking in. Adding headers which are not used extremely often
// may incur a slowdown, since it makes the precompiled header heavier to load.
// This API is EXPERIMENTAL.
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/scanner.h"
#include "arrow/pch.h"

View File

@@ -0,0 +1,33 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#include "arrow/dataset/visibility.h"
namespace arrow {
namespace dataset {
namespace internal {
/// Register dataset-based exec nodes with the exec node registry
///
/// This function must be called before using dataset ExecNode factories
ARROW_DS_EXPORT void Initialize();
} // namespace internal
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,32 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include "arrow/dataset/visibility.h"
#include "arrow/type_fwd.h"
namespace arrow {
namespace dataset {
// FIXME this is superseded by compute::Expression::Bind
ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to);
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,432 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/compute/exec/expression.h"
#include "arrow/compute/exec/options.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/projector.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/interfaces.h"
#include "arrow/memory_pool.h"
#include "arrow/type_fwd.h"
#include "arrow/util/async_generator.h"
#include "arrow/util/iterator.h"
#include "arrow/util/thread_pool.h"
#include "arrow/util/type_fwd.h"
namespace arrow {
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
namespace dataset {
/// \defgroup dataset-scanning Scanning API
///
/// @{
constexpr int64_t kDefaultBatchSize = 1 << 17; // 128Ki rows
// Together with the readahead defaults below, up to 64 batches (~8Mi rows) may be buffered
constexpr int32_t kDefaultBatchReadahead = 16;
constexpr int32_t kDefaultFragmentReadahead = 4;
/// Scan-specific options, which can be changed between scans of the same dataset.
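///
/// \par Example
/// A minimal sketch of options a caller might set before a scan; the column name
/// "a" is hypothetical.
/// \code{.cpp}
/// auto options = std::make_shared<ScanOptions>();
/// options->filter = compute::greater(compute::field_ref("a"), compute::literal(1));
/// options->batch_size = 64 * 1024;
/// options->use_threads = true;
/// \endcode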
struct ARROW_DS_EXPORT ScanOptions {
/// A row filter (which will be pushed down to partitioning/reading if supported).
compute::Expression filter = compute::literal(true);
/// A projection expression (which can add/remove/rename columns).
compute::Expression projection;
/// Schema with which batches will be read from fragments. This is also known as the
/// "reader schema" it will be used (for example) in constructing CSV file readers to
/// identify column types for parsing. Usually only a subset of its fields (see
/// MaterializedFields) will be materialized during a scan.
std::shared_ptr<Schema> dataset_schema;
/// Schema of projected record batches. This is independent of dataset_schema as its
/// fields are derived from the projection. For example, let
///
/// dataset_schema = {"a": int32, "b": int32, "id": utf8}
/// projection = project({equal(field_ref("a"), field_ref("b"))}, {"a_plus_b"})
///
/// (no filter specified). In this case, the projected_schema would be
///
/// {"a_plus_b": int32}
std::shared_ptr<Schema> projected_schema;
/// Maximum row count for scanned batches.
int64_t batch_size = kDefaultBatchSize;
/// How many batches to read ahead within a file
///
/// Set to 0 to disable batch readahead
///
/// Note: May not be supported by all formats
/// Note: Will be ignored if use_threads is set to false
int32_t batch_readahead = kDefaultBatchReadahead;
/// How many files to read ahead
///
/// Set to 0 to disable fragment readahead
///
/// Note: May not be enforced by all scanners
/// Note: Will be ignored if use_threads is set to false
int32_t fragment_readahead = kDefaultFragmentReadahead;
/// A pool from which materialized and scanned arrays will be allocated.
MemoryPool* pool = arrow::default_memory_pool();
/// IOContext for any IO tasks
///
/// Note: The IOContext executor will be ignored if use_threads is set to false
io::IOContext io_context;
/// If true the scanner will scan in parallel
///
/// Note: If true, this will use threads from both the cpu_executor and the
/// io_context.executor
/// Note: This must be true in order for any readahead to happen
bool use_threads = false;
/// Fragment-specific scan options.
std::shared_ptr<FragmentScanOptions> fragment_scan_options;
/// Return a vector of FieldRefs that require materialization.
///
/// This is usually the union of the fields referenced in the projection and the
/// filter expression. Examples:
///
/// - `SELECT a, b WHERE a < 2 && c > 1` => ["a", "b", "a", "c"]
/// - `SELECT a + b < 3 WHERE a > 1` => ["a", "b"]
///
/// This is needed for expression where a field may not be directly
/// used in the final projection but is still required to evaluate the
/// expression.
///
/// This is used by Fragment implementations to apply the column
/// sub-selection optimization.
std::vector<FieldRef> MaterializedFields() const;
/// Parameters which control when the plan should pause for a slow consumer
compute::BackpressureOptions backpressure =
compute::BackpressureOptions::DefaultBackpressure();
};
/// \brief Describes a projection
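///
/// \par Example
/// A minimal sketch of building a projection from column names and installing it on
/// ScanOptions; `dataset_schema`, `options` and the column names are assumed to
/// exist elsewhere.
/// \code{.cpp}
/// ARROW_ASSIGN_OR_RAISE(auto projection,
///                       ProjectionDescr::FromNames({"a", "b"}, *dataset_schema));
/// SetProjection(options.get(), std::move(projection));
/// \endcode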
struct ARROW_DS_EXPORT ProjectionDescr {
/// \brief The projection expression itself
/// This expression must be a call to make_struct
compute::Expression expression;
/// \brief The output schema of the projection.
/// This can be calculated from the input schema and the expression but it
/// is cached here for convenience.
std::shared_ptr<Schema> schema;
/// \brief Create a ProjectionDescr by binding an expression to the dataset schema
///
/// expression must return a struct type
static Result<ProjectionDescr> FromStructExpression(
const compute::Expression& expression, const Schema& dataset_schema);
/// \brief Create a ProjectionDescr from expressions/names for each field
static Result<ProjectionDescr> FromExpressions(std::vector<compute::Expression> exprs,
std::vector<std::string> names,
const Schema& dataset_schema);
/// \brief Create a default projection referencing fields in the dataset schema
static Result<ProjectionDescr> FromNames(std::vector<std::string> names,
const Schema& dataset_schema);
/// \brief Make a projection that projects every field in the dataset schema
static Result<ProjectionDescr> Default(const Schema& dataset_schema);
};
/// \brief Utility method to set the projection expression and schema
ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr projection);
/// \brief Combines a record batch with the fragment that the record batch originated
/// from
///
/// Knowing the source fragment can be useful for debugging & understanding loaded
/// data
struct TaggedRecordBatch {
std::shared_ptr<RecordBatch> record_batch;
std::shared_ptr<Fragment> fragment;
};
using TaggedRecordBatchGenerator = std::function<Future<TaggedRecordBatch>()>;
using TaggedRecordBatchIterator = Iterator<TaggedRecordBatch>;
/// \brief Combines a tagged batch with positional information
///
/// This is returned when scanning batches in an unordered fashion. This information is
/// needed if you ever want to reassemble the batches in order
struct EnumeratedRecordBatch {
Enumerated<std::shared_ptr<RecordBatch>> record_batch;
Enumerated<std::shared_ptr<Fragment>> fragment;
};
using EnumeratedRecordBatchGenerator = std::function<Future<EnumeratedRecordBatch>()>;
using EnumeratedRecordBatchIterator = Iterator<EnumeratedRecordBatch>;
/// @}
} // namespace dataset
template <>
struct IterationTraits<dataset::TaggedRecordBatch> {
static dataset::TaggedRecordBatch End() {
return dataset::TaggedRecordBatch{NULLPTR, NULLPTR};
}
static bool IsEnd(const dataset::TaggedRecordBatch& val) {
return val.record_batch == NULLPTR;
}
};
template <>
struct IterationTraits<dataset::EnumeratedRecordBatch> {
static dataset::EnumeratedRecordBatch End() {
return dataset::EnumeratedRecordBatch{
IterationEnd<Enumerated<std::shared_ptr<RecordBatch>>>(),
IterationEnd<Enumerated<std::shared_ptr<dataset::Fragment>>>()};
}
static bool IsEnd(const dataset::EnumeratedRecordBatch& val) {
return IsIterationEnd(val.fragment);
}
};
namespace dataset {
/// \defgroup dataset-scanning Scanning API
///
/// @{
/// \brief A scanner glues together several dataset classes to load in data.
/// The dataset contains a collection of fragments and partitioning rules.
///
/// The fragments identify independently loadable units of data (i.e. each fragment
/// has a potentially unique schema and possibly even format; it should be possible
/// to read fragments in parallel if desired).
///
/// The fragment's format contains the logic necessary to actually create a task to load
/// the fragment into memory. That task may or may not support parallel execution of
/// its own.
///
/// The scanner is then responsible for creating scan tasks from every fragment in the
/// dataset and (potentially) sequencing the loaded record batches together.
///
/// The scanner should not buffer the entire dataset in memory (unless asked);
/// instead it yields record batches as soon as they are ready to scan. Various
/// readahead properties control how much data is allowed to be scanned before
/// pausing to let a slow consumer catch up.
///
/// Today the scanner also handles projection & filtering although that may change in
/// the future.
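///
/// \par Example
/// A minimal sketch of consuming a scan batch by batch, assuming `scanner` was
/// produced by a ScannerBuilder; error handling is abbreviated.
/// \code{.cpp}
/// ARROW_ASSIGN_OR_RAISE(auto batch_it, scanner->ScanBatches());
/// while (true) {
///   ARROW_ASSIGN_OR_RAISE(TaggedRecordBatch tagged, batch_it.Next());
///   if (IsIterationEnd(tagged)) break;
///   // tagged.record_batch holds the data, tagged.fragment its source.
/// }
/// // Or, when the whole result comfortably fits in memory:
/// ARROW_ASSIGN_OR_RAISE(auto table, scanner->ToTable());
/// \endcode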
class ARROW_DS_EXPORT Scanner {
public:
virtual ~Scanner() = default;
/// \brief Apply a visitor to each RecordBatch as it is scanned. If multiple threads
/// are used (via use_threads), the visitor will be invoked from those threads and is
/// responsible for any synchronization.
virtual Status Scan(std::function<Status(TaggedRecordBatch)> visitor) = 0;
/// \brief Convert a Scanner into a Table.
///
/// Use this convenience utility with care. This will serially materialize the
/// Scan result in memory before creating the Table.
virtual Result<std::shared_ptr<Table>> ToTable() = 0;
/// \brief Scan the dataset into a stream of record batches. Each batch is tagged
/// with the fragment it originated from. The batches will arrive in order. The
/// order of fragments is determined by the dataset.
///
/// Note: The scanner will perform some readahead but will avoid materializing too
/// much in memory (this is governed by the readahead options and use_threads option).
/// If the readahead queue fills up then I/O will pause until the calling thread catches
/// up.
virtual Result<TaggedRecordBatchIterator> ScanBatches() = 0;
virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync() = 0;
virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync(
::arrow::internal::Executor* cpu_thread_pool) = 0;
/// \brief Scan the dataset into a stream of record batches. Unlike ScanBatches this
/// method may allow record batches to be returned out of order. This allows for more
/// efficient scanning: some fragments may be accessed more quickly than others (e.g.
/// may be cached in RAM or just happen to get scheduled earlier by the I/O layer).
///
/// To make up for the out-of-order iteration each batch is further tagged with
/// positional information.
virtual Result<EnumeratedRecordBatchIterator> ScanBatchesUnordered() = 0;
virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync() = 0;
virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync(
::arrow::internal::Executor* cpu_thread_pool) = 0;
/// \brief A convenience to synchronously load the given rows by index.
///
/// Will only consume as many batches as needed from ScanBatches().
virtual Result<std::shared_ptr<Table>> TakeRows(const Array& indices) = 0;
/// \brief Get the first N rows.
virtual Result<std::shared_ptr<Table>> Head(int64_t num_rows) = 0;
/// \brief Count rows matching a predicate.
///
/// This method will push down the predicate and compute the result based on fragment
/// metadata if possible.
virtual Result<int64_t> CountRows() = 0;
/// \brief Convert the Scanner to a RecordBatchReader so it can be
/// easily used with APIs that expect a reader.
virtual Result<std::shared_ptr<RecordBatchReader>> ToRecordBatchReader() = 0;
/// \brief Get the options for this scan.
const std::shared_ptr<ScanOptions>& options() const { return scan_options_; }
/// \brief Get the dataset that this scanner will scan
virtual const std::shared_ptr<Dataset>& dataset() const = 0;
protected:
explicit Scanner(std::shared_ptr<ScanOptions> scan_options)
: scan_options_(std::move(scan_options)) {}
Result<EnumeratedRecordBatchIterator> AddPositioningToInOrderScan(
TaggedRecordBatchIterator scan);
const std::shared_ptr<ScanOptions> scan_options_;
};
/// \brief ScannerBuilder is a factory class to construct a Scanner. It is used
/// to pass information, notably a potential filter expression and a subset of
/// columns to materialize.
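///
/// \par Example
/// A minimal sketch, assuming `dataset` exists and uses the hypothetical columns
/// "a" and "b"; error handling is abbreviated.
/// \code{.cpp}
/// ScannerBuilder builder(dataset);
/// ARROW_RETURN_NOT_OK(builder.Project({"a", "b"}));
/// ARROW_RETURN_NOT_OK(builder.Filter(compute::greater(compute::field_ref("a"),
///                                                     compute::literal(1))));
/// ARROW_RETURN_NOT_OK(builder.UseThreads(true));
/// ARROW_ASSIGN_OR_RAISE(auto scanner, builder.Finish());
/// \endcode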
class ARROW_DS_EXPORT ScannerBuilder {
public:
explicit ScannerBuilder(std::shared_ptr<Dataset> dataset);
ScannerBuilder(std::shared_ptr<Dataset> dataset,
std::shared_ptr<ScanOptions> scan_options);
ScannerBuilder(std::shared_ptr<Schema> schema, std::shared_ptr<Fragment> fragment,
std::shared_ptr<ScanOptions> scan_options);
/// \brief Make a scanner from a record batch reader.
///
/// The resulting scanner can be scanned only once. This is intended
/// to support writing data from streaming sources or other sources
/// that can be iterated only once.
static std::shared_ptr<ScannerBuilder> FromRecordBatchReader(
std::shared_ptr<RecordBatchReader> reader);
/// \brief Set the subset of columns to materialize.
///
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] columns list of columns to project. Order and duplicates will
/// be preserved.
///
/// \return Failure if any column name does not exist in the dataset's
/// Schema.
Status Project(std::vector<std::string> columns);
/// \brief Set expressions which will be evaluated to produce the materialized
/// columns.
///
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] exprs expressions to evaluate to produce columns.
/// \param[in] names list of names for the resulting columns.
///
/// \return Failure if any referenced column does not exist in the dataset's
/// Schema.
Status Project(std::vector<compute::Expression> exprs, std::vector<std::string> names);
/// \brief Set the filter expression to return only rows matching the filter.
///
/// The predicate will be passed down to Sources and corresponding
/// Fragments to exploit predicate pushdown if possible using
/// partition information or Fragment internal metadata, e.g. Parquet statistics.
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] filter expression to filter rows with.
///
/// \return Failure if any referenced column does not exist in the dataset's
/// Schema.
Status Filter(const compute::Expression& filter);
/// \brief Indicate if the Scanner should make use of the available
/// ThreadPool found in ScanOptions.
Status UseThreads(bool use_threads = true);
/// \brief Limit how many fragments the scanner will read at once
Status FragmentReadahead(int fragment_readahead);
/// \brief Set the maximum number of rows per RecordBatch.
///
/// \param[in] batch_size the maximum number of rows.
/// \returns An error if the batch size is not greater than 0.
///
/// This option provides a control limiting the memory owned by any RecordBatch.
Status BatchSize(int64_t batch_size);
/// \brief Set the pool from which materialized and scanned arrays will be allocated.
Status Pool(MemoryPool* pool);
/// \brief Set fragment-specific scan options.
Status FragmentScanOptions(std::shared_ptr<FragmentScanOptions> fragment_scan_options);
/// \brief Override default backpressure configuration
Status Backpressure(compute::BackpressureOptions backpressure);
/// \brief Return the constructed now-immutable Scanner object
Result<std::shared_ptr<Scanner>> Finish();
const std::shared_ptr<Schema>& schema() const;
const std::shared_ptr<Schema>& projected_schema() const;
private:
std::shared_ptr<Dataset> dataset_;
std::shared_ptr<ScanOptions> scan_options_ = std::make_shared<ScanOptions>();
};
/// \brief Construct a source ExecNode which yields batches from a dataset scan.
///
/// Does not construct associated filter or project nodes.
/// Yielded batches will be augmented with fragment/batch indices to enable stable
/// ordering for simple ExecPlans.
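///
/// \par Example
/// A minimal sketch, assuming an existing ExecPlan `plan`, a Dataset `dataset`, a
/// ScanOptions `scan_options`, and that the dataset exec nodes were registered via
/// dataset::internal::Initialize().
/// \code{.cpp}
/// ARROW_ASSIGN_OR_RAISE(
///     auto scan_node,
///     compute::MakeExecNode("scan", plan.get(), {},
///                           ScanNodeOptions{dataset, scan_options}));
/// \endcode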
class ARROW_DS_EXPORT ScanNodeOptions : public compute::ExecNodeOptions {
public:
explicit ScanNodeOptions(std::shared_ptr<Dataset> dataset,
std::shared_ptr<ScanOptions> scan_options,
bool require_sequenced_output = false)
: dataset(std::move(dataset)),
scan_options(std::move(scan_options)),
require_sequenced_output(require_sequenced_output) {}
std::shared_ptr<Dataset> dataset;
std::shared_ptr<ScanOptions> scan_options;
bool require_sequenced_output;
};
/// @}
namespace internal {
ARROW_DS_EXPORT void InitializeScanner(arrow::compute::ExecFactoryRegistry* registry);
} // namespace internal
} // namespace dataset
} // namespace arrow

File diff suppressed because it is too large

View File

@@ -0,0 +1,106 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <vector>
#include "arrow/compute/type_fwd.h" // IWYU pragma: export
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/type_fwd.h" // IWYU pragma: export
#include "arrow/type_fwd.h" // IWYU pragma: export
namespace arrow {
namespace dataset {
class Dataset;
class DatasetFactory;
using DatasetVector = std::vector<std::shared_ptr<Dataset>>;
class UnionDataset;
class UnionDatasetFactory;
class Fragment;
using FragmentIterator = Iterator<std::shared_ptr<Fragment>>;
using FragmentVector = std::vector<std::shared_ptr<Fragment>>;
class FragmentScanOptions;
class FileSource;
class FileFormat;
class FileFragment;
class FileWriter;
class FileWriteOptions;
class FileSystemDataset;
class FileSystemDatasetFactory;
struct FileSystemDatasetWriteOptions;
/// \brief Controls what happens if files exist in an output directory during a dataset
/// write
enum class ExistingDataBehavior : int8_t {
/// Deletes all files in a directory the first time that directory is encountered
kDeleteMatchingPartitions,
/// Ignores existing files, overwriting any that happen to have the same name as an
/// output file
kOverwriteOrIgnore,
/// Returns an error if there are any files or subdirectories in the output directory
kError,
};
class InMemoryDataset;
class CsvFileFormat;
class CsvFileWriter;
class CsvFileWriteOptions;
struct CsvFragmentScanOptions;
class IpcFileFormat;
class IpcFileWriter;
class IpcFileWriteOptions;
class IpcFragmentScanOptions;
class ParquetFileFormat;
class ParquetFileFragment;
class ParquetFragmentScanOptions;
class ParquetFileWriter;
class ParquetFileWriteOptions;
class Partitioning;
class PartitioningFactory;
class PartitioningOrFactory;
struct KeyValuePartitioningOptions;
class DirectoryPartitioning;
class HivePartitioning;
struct HivePartitioningOptions;
class FilenamePartitioning;
struct FilenamePartitioningOptions;
struct ScanOptions;
class Scanner;
class ScannerBuilder;
class ScanTask;
using ScanTaskVector = std::vector<std::shared_ptr<ScanTask>>;
using ScanTaskIterator = Iterator<std::shared_ptr<ScanTask>>;
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,50 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#if defined(_WIN32) || defined(__CYGWIN__)
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4251)
#else
#pragma GCC diagnostic ignored "-Wattributes"
#endif
#ifdef ARROW_DS_STATIC
#define ARROW_DS_EXPORT
#elif defined(ARROW_DS_EXPORTING)
#define ARROW_DS_EXPORT __declspec(dllexport)
#else
#define ARROW_DS_EXPORT __declspec(dllimport)
#endif
#define ARROW_DS_NO_EXPORT
#else // Not Windows
#ifndef ARROW_DS_EXPORT
#define ARROW_DS_EXPORT __attribute__((visibility("default")))
#endif
#ifndef ARROW_DS_NO_EXPORT
#define ARROW_DS_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif // Non-Windows
#if defined(_MSC_VER)
#pragma warning(pop)
#endif