first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include "arrow/compute/exec/expression.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_csv.h"
#include "arrow/dataset/file_ipc.h"
#include "arrow/dataset/file_orc.h"
#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/scanner.h"

View File

@@ -0,0 +1,251 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/compute/exec/expression.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/util/macros.h"
#include "arrow/util/mutex.h"
#include "arrow/util/optional.h"
namespace arrow {
namespace dataset {
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
/// \brief A granular piece of a Dataset, such as an individual file.
///
/// A Fragment can be read/scanned separately from other fragments. It yields a
/// collection of RecordBatches when scanned, encapsulated in one or more
/// ScanTasks.
///
/// Note that Fragments have well defined physical schemas which are reconciled by
/// the Datasets which contain them; these physical schemas may differ from a parent
/// Dataset's schema and the physical schemas of sibling Fragments.
class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
public:
/// \brief Return the physical schema of the Fragment.
///
/// The physical schema is also called the writer schema.
/// This method is blocking and may incur high latency on slow filesystems.
/// The schema is cached after being read once, or may be specified at construction.
Result<std::shared_ptr<Schema>> ReadPhysicalSchema();
/// An asynchronous version of Scan
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options) = 0;
/// \brief Count the number of rows in this fragment matching the filter using metadata
/// only. That is, this method may perform I/O, but will not load data.
///
/// If this is not possible, resolve with an empty optional. The fragment can perform
/// I/O (e.g. to read metadata) before deciding whether it can satisfy the request.
virtual Future<util::optional<int64_t>> CountRows(
compute::Expression predicate, const std::shared_ptr<ScanOptions>& options);
virtual std::string type_name() const = 0;
virtual std::string ToString() const { return type_name(); }
/// \brief An expression which evaluates to true for all data viewed by this
/// Fragment.
const compute::Expression& partition_expression() const {
return partition_expression_;
}
virtual ~Fragment() = default;
protected:
Fragment() = default;
explicit Fragment(compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema);
virtual Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() = 0;
util::Mutex physical_schema_mutex_;
compute::Expression partition_expression_ = compute::literal(true);
std::shared_ptr<Schema> physical_schema_;
};
/// \brief Per-scan options for fragment(s) in a dataset.
///
/// These options are not intrinsic to the format or fragment itself, but do affect
/// the results of a scan. These are options which make sense to change between
/// repeated reads of the same dataset, such as format-specific conversion options
/// (that do not affect the schema).
///
/// \ingroup dataset-scanning
class ARROW_DS_EXPORT FragmentScanOptions {
public:
virtual std::string type_name() const = 0;
virtual std::string ToString() const { return type_name(); }
virtual ~FragmentScanOptions() = default;
};
/// \defgroup dataset-implementations Concrete implementations
///
/// @{
/// \brief A trivial Fragment that yields ScanTasks out of a fixed set of
/// RecordBatches.
class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
public:
InMemoryFragment(std::shared_ptr<Schema> schema, RecordBatchVector record_batches,
compute::Expression = compute::literal(true));
explicit InMemoryFragment(RecordBatchVector record_batches,
compute::Expression = compute::literal(true));
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options) override;
Future<util::optional<int64_t>> CountRows(
compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
std::string type_name() const override { return "in-memory"; }
protected:
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
RecordBatchVector record_batches_;
};
/// @}
/// \brief A container of zero or more Fragments.
///
/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a
/// directory. A Dataset has a schema to which Fragments must align during a
/// scan operation. This is analogous to Avro's reader and writer schema.
class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
public:
/// \brief Begin to build a new Scan operation against this Dataset
Result<std::shared_ptr<ScannerBuilder>> NewScan();
/// \brief GetFragments returns an iterator of Fragments given a predicate.
Result<FragmentIterator> GetFragments(compute::Expression predicate);
Result<FragmentIterator> GetFragments();
const std::shared_ptr<Schema>& schema() const { return schema_; }
/// \brief An expression which evaluates to true for all data viewed by this Dataset.
/// May be null, which indicates no information is available.
const compute::Expression& partition_expression() const {
return partition_expression_;
}
/// \brief The name identifying the kind of Dataset
virtual std::string type_name() const = 0;
/// \brief Return a copy of this Dataset with a different schema.
///
/// The copy will view the same Fragments. If the new schema is not compatible with the
/// original dataset's schema then an error will be raised.
virtual Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const = 0;
virtual ~Dataset() = default;
protected:
explicit Dataset(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression);
virtual Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) = 0;
std::shared_ptr<Schema> schema_;
compute::Expression partition_expression_ = compute::literal(true);
};
/// \addtogroup dataset-implementations
///
/// @{
/// \brief A Dataset which yields fragments wrapping a stream of record batches.
///
/// The record batches must match the schema provided at construction.
class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
public:
class RecordBatchGenerator {
public:
virtual ~RecordBatchGenerator() = default;
virtual RecordBatchIterator Get() const = 0;
};
/// Construct a dataset from a schema and a factory of record batch iterators.
InMemoryDataset(std::shared_ptr<Schema> schema,
std::shared_ptr<RecordBatchGenerator> get_batches)
: Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {}
/// Convenience constructor taking a fixed list of batches
InMemoryDataset(std::shared_ptr<Schema> schema, RecordBatchVector batches);
/// Convenience constructor taking a Table
explicit InMemoryDataset(std::shared_ptr<Table> table);
std::string type_name() const override { return "in-memory"; }
Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const override;
protected:
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
std::shared_ptr<RecordBatchGenerator> get_batches_;
};
/// \brief A Dataset wrapping child Datasets.
class ARROW_DS_EXPORT UnionDataset : public Dataset {
public:
/// \brief Construct a UnionDataset wrapping child Datasets.
///
/// \param[in] schema the schema of the resulting dataset.
/// \param[in] children one or more child Datasets. Their schemas must be identical to
/// schema.
static Result<std::shared_ptr<UnionDataset>> Make(std::shared_ptr<Schema> schema,
DatasetVector children);
const DatasetVector& children() const { return children_; }
std::string type_name() const override { return "union"; }
Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const override;
protected:
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
explicit UnionDataset(std::shared_ptr<Schema> schema, DatasetVector children)
: Dataset(std::move(schema)), children_(std::move(children)) {}
DatasetVector children_;
friend class UnionDatasetFactory;
};
/// @}
} // namespace dataset
} // namespace arrow
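A minimal sketch of how the pieces above compose, assuming the ScannerBuilder and Scanner declarations from arrow/dataset/scanner.h and an existing arrow::Table; the function name is only illustrative:

#include <memory>
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/scanner.h"
#include "arrow/result.h"
#include "arrow/table.h"

arrow::Result<std::shared_ptr<arrow::Table>> ScanInMemory(
    const std::shared_ptr<arrow::Table>& table) {
  // Wrap the Table in a Dataset; the Table's schema becomes the dataset schema.
  auto dataset = std::make_shared<arrow::dataset::InMemoryDataset>(table);
  // NewScan() yields a ScannerBuilder which can be configured before Finish().
  ARROW_ASSIGN_OR_RAISE(auto builder, dataset->NewScan());
  ARROW_ASSIGN_OR_RAISE(auto scanner, builder->Finish());
  // Materialize every fragment back into a single Table.
  return scanner->ToTable();
}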

View File

@@ -0,0 +1,98 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/util/async_util.h"
#include "arrow/util/future.h"
namespace arrow {
namespace dataset {
namespace internal {
// This lines up with our other defaults in the scanner and execution plan
constexpr uint64_t kDefaultDatasetWriterMaxRowsQueued = 8 * 1024 * 1024;
/// \brief Utility class that manages a set of writers to different paths
///
/// Writers may be closed and reopened (and a new file created) based on the dataset
/// write options (for example, max_rows_per_file or max_open_files)
///
/// The dataset writer enforces its own back pressure based on the # of rows (as opposed
/// to # of batches which is how it is typically enforced elsewhere) and # of files.
class ARROW_DS_EXPORT DatasetWriter {
public:
/// \brief Create a dataset writer
///
/// Will fail if basename_template is invalid or if there is existing data and
/// existing_data_behavior is kError
///
/// \param write_options options to control how the data should be written
/// \param max_rows_queued max # of rows allowed to be queued before the dataset_writer
/// will ask for backpressure
static Result<std::unique_ptr<DatasetWriter>> Make(
FileSystemDatasetWriteOptions write_options,
uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
~DatasetWriter();
/// \brief Write a batch to the dataset
/// \param[in] batch The batch to write
/// \param[in] directory The directory to write to
///
/// Note: The written filename will be {directory}/{filename_factory(i)} where i is a
/// counter controlled by `max_open_files` and `max_rows_per_file`
///
/// If multiple WriteRecordBatch calls arrive with the same `directory` then the batches
/// may be written to the same file.
///
/// The returned future will be marked finished when the record batch has been queued
/// to be written. If the returned future is unfinished then this indicates the dataset
/// writer's queue is full and the data provider should pause.
///
/// This method is NOT async reentrant. The returned future will only be unfinished
/// if back pressure needs to be applied. Async reentrancy is not necessary for
/// concurrent writes to happen. Calling this method again before the previous future
/// completes will not only violate max_rows_queued but will likely lead to race conditions.
///
/// One thing to note is that the ordering of your data can affect your maximum
/// potential parallelism. If this seems odd then consider a dataset where the first
/// 1000 batches go to the same directory and then the 1001st batch goes to a different
/// directory. The only way to get two parallel writes immediately would be to queue
/// all 1000 pending writes to the first directory.
Future<> WriteRecordBatch(std::shared_ptr<RecordBatch> batch,
const std::string& directory, const std::string& prefix = "");
/// Finish all pending writes and close any open files
Future<> Finish();
protected:
DatasetWriter(FileSystemDatasetWriteOptions write_options,
uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
class DatasetWriterImpl;
std::unique_ptr<DatasetWriterImpl, util::DestroyingDeleter<DatasetWriterImpl>> impl_;
};
} // namespace internal
} // namespace dataset
} // namespace arrow
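A rough sketch of the back-pressure contract described above, assuming this header is installed as arrow/dataset/dataset_writer.h; the function name, the caller-prepared write_options and batches, and the empty directory string are illustrative assumptions:

#include <memory>
#include <vector>
#include "arrow/dataset/dataset_writer.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status WriteAllBatches(
    arrow::dataset::FileSystemDatasetWriteOptions write_options,
    const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches) {
  using arrow::dataset::internal::DatasetWriter;
  ARROW_ASSIGN_OR_RAISE(auto writer, DatasetWriter::Make(std::move(write_options)));
  for (const auto& batch : batches) {
    // WriteRecordBatch is not async reentrant: wait for the queueing future,
    // which is only unfinished while back pressure is being applied.
    ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(batch, /*directory=*/"").status());
  }
  // Flush all pending writes and close any open files.
  return writer->Finish().status();
}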

View File

@@ -0,0 +1,271 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
/// Logic for automatically determining the structure of a multi-file
/// dataset, with partition information discovered according to the
/// available partitioning scheme.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/dataset/partition.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/type_fwd.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/variant.h"
namespace arrow {
namespace dataset {
/// \defgroup dataset-discovery Discovery API
///
/// @{
struct InspectOptions {
/// See `fragments` property.
static constexpr int kInspectAllFragments = -1;
/// Indicate how many fragments should be inspected to infer the unified dataset
/// schema. Limiting the number of fragments accessed improves the latency of
/// the discovery process when dealing with a high number of fragments and/or
/// high latency file systems.
///
/// The default value of `1` inspects the schema of the first (in no particular
/// order) fragment only. If the dataset has a uniform schema for all fragments,
/// this default is the optimal value. In order to inspect all fragments and
/// robustly unify their potentially varying schemas, set this option to
/// `kInspectAllFragments`. A value of `0` disables inspection of fragments
/// altogether so only the partitioning schema will be inspected.
int fragments = 1;
};
struct FinishOptions {
/// Finalize the dataset with this given schema. If the schema is not
/// provided, it is inferred via Inspect; see the `inspect_options`
/// property.
std::shared_ptr<Schema> schema = NULLPTR;
/// If the schema is not provided, it will be discovered by passing the
/// following options to `DatasetDiscovery::Inspect`.
InspectOptions inspect_options{};
/// Indicate if the given Schema (when specified), should be validated against
/// the fragments' schemas. `inspect_options` will control how many fragments
/// are checked.
bool validate_fragments = false;
};
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected
/// schema before materializing said Dataset.
class ARROW_DS_EXPORT DatasetFactory {
public:
/// \brief Get the schemas of the Fragments and Partitioning.
virtual Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) = 0;
/// \brief Get unified schema for the resulting Dataset.
Result<std::shared_ptr<Schema>> Inspect(InspectOptions options = {});
/// \brief Create a Dataset
Result<std::shared_ptr<Dataset>> Finish();
/// \brief Create a Dataset with the given schema (see \a FinishOptions::schema)
Result<std::shared_ptr<Dataset>> Finish(std::shared_ptr<Schema> schema);
/// \brief Create a Dataset with the given options
virtual Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) = 0;
/// \brief Optional root partition for the resulting Dataset.
const compute::Expression& root_partition() const { return root_partition_; }
/// \brief Set the root partition for the resulting Dataset.
Status SetRootPartition(compute::Expression partition) {
root_partition_ = std::move(partition);
return Status::OK();
}
virtual ~DatasetFactory() = default;
protected:
DatasetFactory();
compute::Expression root_partition_;
};
/// @}
/// \brief A DatasetFactory which aggregates the Datasets produced by a set of
/// child DatasetFactories.
/// \ingroup dataset-implementations
class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory {
public:
static Result<std::shared_ptr<DatasetFactory>> Make(
std::vector<std::shared_ptr<DatasetFactory>> factories);
/// \brief Return the list of child DatasetFactory
const std::vector<std::shared_ptr<DatasetFactory>>& factories() const {
return factories_;
}
/// \brief Get the schemas of the Datasets.
///
/// Instead of applying the options globally, they are applied to each child
/// factory individually. This will not respect `options.fragments` exactly, but
/// preserves the spirit of peeking at the first fragments, or at all of them.
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) override;
/// \brief Create a Dataset.
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
protected:
explicit UnionDatasetFactory(std::vector<std::shared_ptr<DatasetFactory>> factories);
std::vector<std::shared_ptr<DatasetFactory>> factories_;
};
/// \ingroup dataset-filesystem
struct FileSystemFactoryOptions {
/// Either an explicit Partitioning or a PartitioningFactory to discover one.
///
/// If a factory is provided, it will be used to infer a schema for partition fields
/// based on file and directory paths, and then to construct a Partitioning. The
/// default is a Partitioning which will yield no partition information.
///
/// The (explicit or discovered) partitioning will be applied to discovered files
/// and the resulting partition information embedded in the Dataset.
PartitioningOrFactory partitioning{Partitioning::Default()};
/// For the purposes of applying the partitioning, paths will be stripped
/// of the partition_base_dir. Files not matching the partition_base_dir
/// prefix will be skipped for partition discovery. The ignored files will still
/// be part of the Dataset, but will not have partition information.
///
/// Example:
/// partition_base_dir = "/dataset";
///
/// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
///
/// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
///
/// This is useful for partitionings which parse directory names and for which
/// ordering is important, e.g. DirectoryPartitioning.
std::string partition_base_dir;
/// Invalid files (via selector or explicitly) will be excluded by checking
/// with the FileFormat::IsSupported method. This will incur IO for each file
/// in a serial, single-threaded fashion. Disabling this feature will skip the
/// IO, but unsupported files may be present in the Dataset
/// (resulting in an error at scan time).
bool exclude_invalid_files = false;
/// When discovering from a Selector (and not from an explicit file list), ignore
/// files and directories matching any of these prefixes.
///
/// Example (with selector = "/dataset/**"):
/// selector_ignore_prefixes = {"_", ".DS_STORE" };
///
/// - "/dataset/data.csv" -> not ignored
/// - "/dataset/_metadata" -> ignored
/// - "/dataset/.DS_STORE" -> ignored
/// - "/dataset/_hidden/dat" -> ignored
/// - "/dataset/nested/.DS_STORE" -> ignored
std::vector<std::string> selector_ignore_prefixes = {
".",
"_",
};
};
/// \brief FileSystemDatasetFactory creates a Dataset from a vector of
/// fs::FileInfo or a fs::FileSelector.
/// \ingroup dataset-filesystem
class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {
public:
/// \brief Build a FileSystemDatasetFactory from an explicit list of
/// paths.
///
/// \param[in] filesystem passed to FileSystemDataset
/// \param[in] paths passed to FileSystemDataset
/// \param[in] format passed to FileSystemDataset
/// \param[in] options see FileSystemFactoryOptions for more information.
static Result<std::shared_ptr<DatasetFactory>> Make(
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
/// \brief Build a FileSystemDatasetFactory from a fs::FileSelector.
///
/// The selector will expand to a vector of FileInfo. The expansion/crawling
/// is performed in this function call. Thus, the finalized Dataset works
/// with a snapshot of the filesystem.
///
/// If options.partition_base_dir is not provided, it will be overwritten
/// with selector.base_dir.
///
/// \param[in] filesystem passed to FileSystemDataset
/// \param[in] selector used to crawl and search files
/// \param[in] format passed to FileSystemDataset
/// \param[in] options see FileSystemFactoryOptions for more information.
static Result<std::shared_ptr<DatasetFactory>> Make(
std::shared_ptr<fs::FileSystem> filesystem, fs::FileSelector selector,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
/// \brief Build a FileSystemDatasetFactory from a URI that includes filesystem
/// information.
///
/// \param[in] uri passed to FileSystemDataset
/// \param[in] format passed to FileSystemDataset
/// \param[in] options see FileSystemFactoryOptions for more information.
static Result<std::shared_ptr<DatasetFactory>> Make(std::string uri,
std::shared_ptr<FileFormat> format,
FileSystemFactoryOptions options);
/// \brief Build a FileSystemDatasetFactory from an explicit list of
/// file information.
///
/// \param[in] filesystem passed to FileSystemDataset
/// \param[in] files passed to FileSystemDataset
/// \param[in] format passed to FileSystemDataset
/// \param[in] options see FileSystemFactoryOptions for more information.
static Result<std::shared_ptr<DatasetFactory>> Make(
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<fs::FileInfo>& files,
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) override;
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
protected:
FileSystemDatasetFactory(std::vector<fs::FileInfo> files,
std::shared_ptr<fs::FileSystem> filesystem,
std::shared_ptr<FileFormat> format,
FileSystemFactoryOptions options);
Result<std::shared_ptr<Schema>> PartitionSchema();
std::vector<fs::FileInfo> files_;
std::shared_ptr<fs::FileSystem> fs_;
std::shared_ptr<FileFormat> format_;
FileSystemFactoryOptions options_;
};
} // namespace dataset
} // namespace arrow
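A short sketch of discovery from a fs::FileSelector, assuming IPC-formatted files and Hive-style directory partitioning; both the format and the partitioning factory are illustrative choices, not implied by this header:

#include <memory>
#include <string>
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_ipc.h"
#include "arrow/dataset/partition.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/result.h"

arrow::Result<std::shared_ptr<arrow::dataset::Dataset>> DiscoverDataset(
    std::shared_ptr<arrow::fs::FileSystem> fs, const std::string& base_dir) {
  namespace ds = arrow::dataset;
  arrow::fs::FileSelector selector;
  selector.base_dir = base_dir;
  selector.recursive = true;
  ds::FileSystemFactoryOptions options;
  // Discover Hive-style key=value directories as partition fields.
  options.partitioning = ds::HivePartitioning::MakeFactory();
  ARROW_ASSIGN_OR_RAISE(
      auto factory,
      ds::FileSystemDatasetFactory::Make(std::move(fs), std::move(selector),
                                         std::make_shared<ds::IpcFileFormat>(), options));
  // Inspect() unifies the fragment schema(s) with the partitioning schema.
  ARROW_ASSIGN_OR_RAISE(auto schema, factory->Inspect());
  return factory->Finish(std::move(schema));
}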

View File

@@ -0,0 +1,433 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/partition.h"
#include "arrow/dataset/scanner.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/io/file.h"
#include "arrow/util/compression.h"
namespace arrow {
namespace dataset {
/// \defgroup dataset-file-formats File formats for reading and writing datasets
/// \defgroup dataset-filesystem File system datasets
///
/// @{
/// \brief The path and filesystem where an actual file is located, or a buffer which can
/// be read like a file.
class ARROW_DS_EXPORT FileSource {
public:
FileSource(std::string path, std::shared_ptr<fs::FileSystem> filesystem,
Compression::type compression = Compression::UNCOMPRESSED)
: file_info_(std::move(path)),
filesystem_(std::move(filesystem)),
compression_(compression) {}
FileSource(fs::FileInfo info, std::shared_ptr<fs::FileSystem> filesystem,
Compression::type compression = Compression::UNCOMPRESSED)
: file_info_(std::move(info)),
filesystem_(std::move(filesystem)),
compression_(compression) {}
explicit FileSource(std::shared_ptr<Buffer> buffer,
Compression::type compression = Compression::UNCOMPRESSED)
: buffer_(std::move(buffer)), compression_(compression) {}
using CustomOpen = std::function<Result<std::shared_ptr<io::RandomAccessFile>>()>;
explicit FileSource(CustomOpen open) : custom_open_(std::move(open)) {}
using CustomOpenWithCompression =
std::function<Result<std::shared_ptr<io::RandomAccessFile>>(Compression::type)>;
explicit FileSource(CustomOpenWithCompression open_with_compression,
Compression::type compression = Compression::UNCOMPRESSED)
: custom_open_(std::bind(std::move(open_with_compression), compression)),
compression_(compression) {}
explicit FileSource(std::shared_ptr<io::RandomAccessFile> file,
Compression::type compression = Compression::UNCOMPRESSED)
: custom_open_([=] { return ToResult(file); }), compression_(compression) {}
FileSource() : custom_open_(CustomOpen{&InvalidOpen}) {}
static std::vector<FileSource> FromPaths(const std::shared_ptr<fs::FileSystem>& fs,
std::vector<std::string> paths) {
std::vector<FileSource> sources;
for (auto&& path : paths) {
sources.emplace_back(std::move(path), fs);
}
return sources;
}
/// \brief Return the type of raw compression on the file, if any.
Compression::type compression() const { return compression_; }
/// \brief Return the file path, if any. Only valid when file source wraps a path.
const std::string& path() const {
static std::string buffer_path = "<Buffer>";
static std::string custom_open_path = "<Custom Open>";
return filesystem_ ? file_info_.path() : buffer_ ? buffer_path : custom_open_path;
}
/// \brief Return the filesystem, if any. Otherwise returns nullptr
const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
/// \brief Return the buffer containing the file, if any. Otherwise returns nullptr
const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
/// \brief Get a RandomAccessFile which views this file source
Result<std::shared_ptr<io::RandomAccessFile>> Open() const;
/// \brief Get an InputStream which views this file source (and decompresses if needed)
/// \param[in] compression If nullopt, guess the compression scheme from the
/// filename, else decompress with the given codec
Result<std::shared_ptr<io::InputStream>> OpenCompressed(
util::optional<Compression::type> compression = util::nullopt) const;
private:
static Result<std::shared_ptr<io::RandomAccessFile>> InvalidOpen() {
return Status::Invalid("Called Open() on an uninitialized FileSource");
}
fs::FileInfo file_info_;
std::shared_ptr<fs::FileSystem> filesystem_;
std::shared_ptr<Buffer> buffer_;
CustomOpen custom_open_;
Compression::type compression_ = Compression::UNCOMPRESSED;
};
/// \brief Base class for file format implementation
class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileFormat> {
public:
/// Options affecting how this format is scanned.
///
/// The options here can be overridden at scan time.
std::shared_ptr<FragmentScanOptions> default_fragment_scan_options;
virtual ~FileFormat() = default;
/// \brief The name identifying the kind of file format
virtual std::string type_name() const = 0;
virtual bool Equals(const FileFormat& other) const = 0;
/// \brief Indicate if the FileSource is supported/readable by this format.
virtual Result<bool> IsSupported(const FileSource& source) const = 0;
/// \brief Return the schema of the file if possible.
virtual Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const = 0;
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file) const = 0;
virtual Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options);
/// \brief Open a fragment
virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema);
/// \brief Create a FileFragment for a FileSource.
Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, compute::Expression partition_expression);
/// \brief Create a FileFragment for a FileSource.
Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, std::shared_ptr<Schema> physical_schema = NULLPTR);
/// \brief Create a writer for this format.
virtual Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const = 0;
/// \brief Get default write options for this format.
virtual std::shared_ptr<FileWriteOptions> DefaultWriteOptions() = 0;
};
/// \brief A Fragment that is stored in a file with a known format
class ARROW_DS_EXPORT FileFragment : public Fragment {
public:
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options) override;
Future<util::optional<int64_t>> CountRows(
compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
std::string type_name() const override { return format_->type_name(); }
std::string ToString() const override { return source_.path(); };
const FileSource& source() const { return source_; }
const std::shared_ptr<FileFormat>& format() const { return format_; }
protected:
FileFragment(FileSource source, std::shared_ptr<FileFormat> format,
compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema)
: Fragment(std::move(partition_expression), std::move(physical_schema)),
source_(std::move(source)),
format_(std::move(format)) {}
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
FileSource source_;
std::shared_ptr<FileFormat> format_;
friend class FileFormat;
};
/// \brief A Dataset of FileFragments.
///
/// A FileSystemDataset is composed of one or more FileFragments. The fragments
/// are independent and don't need to share the same format and/or filesystem.
class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
public:
/// \brief Create a FileSystemDataset.
///
/// \param[in] schema the schema of the dataset
/// \param[in] root_partition the partition expression of the dataset
/// \param[in] format the format of each FileFragment.
/// \param[in] filesystem the filesystem of each FileFragment, or nullptr if the
/// fragments wrap buffers.
/// \param[in] fragments list of fragments to create the dataset from.
/// \param[in] partitioning the Partitioning object in case the dataset is created
/// with a known partitioning (e.g. from a discovered partitioning
/// through a DatasetFactory), or nullptr if not known.
///
/// Note that fragments wrapping files resident in differing filesystems are not
/// permitted; to work with multiple filesystems use a UnionDataset.
///
/// \return A constructed dataset.
static Result<std::shared_ptr<FileSystemDataset>> Make(
std::shared_ptr<Schema> schema, compute::Expression root_partition,
std::shared_ptr<FileFormat> format, std::shared_ptr<fs::FileSystem> filesystem,
std::vector<std::shared_ptr<FileFragment>> fragments,
std::shared_ptr<Partitioning> partitioning = NULLPTR);
/// \brief Write a dataset.
static Status Write(const FileSystemDatasetWriteOptions& write_options,
std::shared_ptr<Scanner> scanner);
/// \brief Return the type name of the dataset.
std::string type_name() const override { return "filesystem"; }
/// \brief Replace the schema of the dataset.
Result<std::shared_ptr<Dataset>> ReplaceSchema(
std::shared_ptr<Schema> schema) const override;
/// \brief Return the path of files.
std::vector<std::string> files() const;
/// \brief Return the format.
const std::shared_ptr<FileFormat>& format() const { return format_; }
/// \brief Return the filesystem. May be nullptr if the fragments wrap buffers.
const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
/// \brief Return the partitioning. May be nullptr if the dataset was not constructed
/// with a partitioning.
const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
std::string ToString() const;
protected:
struct FragmentSubtrees;
explicit FileSystemDataset(std::shared_ptr<Schema> schema)
: Dataset(std::move(schema)) {}
FileSystemDataset(std::shared_ptr<Schema> schema,
compute::Expression partition_expression)
: Dataset(std::move(schema), partition_expression) {}
Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
void SetupSubtreePruning();
std::shared_ptr<FileFormat> format_;
std::shared_ptr<fs::FileSystem> filesystem_;
std::vector<std::shared_ptr<FileFragment>> fragments_;
std::shared_ptr<Partitioning> partitioning_;
std::shared_ptr<FragmentSubtrees> subtrees_;
};
/// \brief Options for writing a file of this format.
class ARROW_DS_EXPORT FileWriteOptions {
public:
virtual ~FileWriteOptions() = default;
const std::shared_ptr<FileFormat>& format() const { return format_; }
std::string type_name() const { return format_->type_name(); }
protected:
explicit FileWriteOptions(std::shared_ptr<FileFormat> format)
: format_(std::move(format)) {}
std::shared_ptr<FileFormat> format_;
};
/// \brief A writer for this format.
class ARROW_DS_EXPORT FileWriter {
public:
virtual ~FileWriter() = default;
/// \brief Write the given batch.
virtual Status Write(const std::shared_ptr<RecordBatch>& batch) = 0;
/// \brief Write all batches from the reader.
Status Write(RecordBatchReader* batches);
/// \brief Indicate that writing is done.
virtual Future<> Finish();
const std::shared_ptr<FileFormat>& format() const { return options_->format(); }
const std::shared_ptr<Schema>& schema() const { return schema_; }
const std::shared_ptr<FileWriteOptions>& options() const { return options_; }
const fs::FileLocator& destination() const { return destination_locator_; }
protected:
FileWriter(std::shared_ptr<Schema> schema, std::shared_ptr<FileWriteOptions> options,
std::shared_ptr<io::OutputStream> destination,
fs::FileLocator destination_locator)
: schema_(std::move(schema)),
options_(std::move(options)),
destination_(std::move(destination)),
destination_locator_(std::move(destination_locator)) {}
virtual Future<> FinishInternal() = 0;
std::shared_ptr<Schema> schema_;
std::shared_ptr<FileWriteOptions> options_;
std::shared_ptr<io::OutputStream> destination_;
fs::FileLocator destination_locator_;
};
/// \brief Options for writing a dataset.
struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
/// Options for individual fragment writing.
std::shared_ptr<FileWriteOptions> file_write_options;
/// FileSystem into which a dataset will be written.
std::shared_ptr<fs::FileSystem> filesystem;
/// Root directory into which the dataset will be written.
std::string base_dir;
/// Partitioning used to generate fragment paths.
std::shared_ptr<Partitioning> partitioning;
/// Maximum number of partitions any batch may be written into, default is 1K.
int max_partitions = 1024;
/// Template string used to generate fragment basenames.
/// {i} will be replaced by an auto incremented integer.
std::string basename_template;
/// If greater than 0 then this will limit the maximum number of files that can be left
/// open. If an attempt is made to open too many files then the least recently used file
/// will be closed. If this setting is set too low you may end up fragmenting your data
/// into many small files.
///
/// The default is 900, which also allows some number of files to be open by the
/// scanner before hitting the default Linux limit of 1024.
uint32_t max_open_files = 900;
/// If greater than 0 then this will limit how many rows are placed in any single file.
/// Otherwise there will be no limit and one file will be created in each output
/// directory unless files need to be closed to respect max_open_files
uint64_t max_rows_per_file = 0;
/// If greater than 0 then this will cause the dataset writer to batch incoming data
/// and only write the row groups to the disk when sufficient rows have accumulated.
/// The final row group size may be less than this value, and other options such as
/// `max_open_files` or `max_rows_per_file` may lead to smaller row group sizes.
uint64_t min_rows_per_group = 0;
/// If greater than 0 then the dataset writer may split up large incoming batches into
/// multiple row groups. If this value is set then min_rows_per_group should also be
/// set or else you may end up with very small row groups (e.g. if the incoming row
/// group size is just barely larger than this value).
uint64_t max_rows_per_group = 1 << 20;
/// Controls what happens if an output directory already exists.
ExistingDataBehavior existing_data_behavior = ExistingDataBehavior::kError;
/// \brief If false the dataset writer will not create directories.
/// This is mainly intended for filesystems that do not require directories, such as S3.
bool create_dir = true;
/// Callback to be invoked against all FileWriters before
/// they are finalized with FileWriter::Finish().
std::function<Status(FileWriter*)> writer_pre_finish = [](FileWriter*) {
return Status::OK();
};
/// Callback to be invoked against all FileWriters after they have
/// called FileWriter::Finish().
std::function<Status(FileWriter*)> writer_post_finish = [](FileWriter*) {
return Status::OK();
};
const std::shared_ptr<FileFormat>& format() const {
return file_write_options->format();
}
};
/// \brief Wraps FileSystemDatasetWriteOptions for consumption as compute::ExecNodeOptions
class ARROW_DS_EXPORT WriteNodeOptions : public compute::ExecNodeOptions {
public:
explicit WriteNodeOptions(
FileSystemDatasetWriteOptions options,
std::shared_ptr<const KeyValueMetadata> custom_metadata = NULLPTR)
: write_options(std::move(options)), custom_metadata(std::move(custom_metadata)) {}
/// \brief Options to control how to write the dataset
FileSystemDatasetWriteOptions write_options;
/// \brief Optional metadata to attach to written batches
std::shared_ptr<const KeyValueMetadata> custom_metadata;
};
/// @}
namespace internal {
ARROW_DS_EXPORT void InitializeDatasetWriter(
arrow::compute::ExecFactoryRegistry* registry);
}
} // namespace dataset
} // namespace arrow
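A brief sketch of a dataset write using FileSystemDatasetWriteOptions, assuming a Parquet format and an already-built Scanner; the basename template and existing-data behavior shown are illustrative choices:

#include <memory>
#include <string>
#include <utility>
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/partition.h"
#include "arrow/dataset/scanner.h"
#include "arrow/filesystem/filesystem.h"
#include "arrow/status.h"

arrow::Status WritePartitionedDataset(
    std::shared_ptr<arrow::dataset::Scanner> scanner,
    std::shared_ptr<arrow::fs::FileSystem> filesystem, const std::string& base_dir,
    std::shared_ptr<arrow::dataset::Partitioning> partitioning) {
  namespace ds = arrow::dataset;
  auto format = std::make_shared<ds::ParquetFileFormat>();
  ds::FileSystemDatasetWriteOptions write_options;
  write_options.file_write_options = format->DefaultWriteOptions();
  write_options.filesystem = std::move(filesystem);
  write_options.base_dir = base_dir;
  write_options.partitioning = std::move(partitioning);
  write_options.basename_template = "part-{i}.parquet";  // {i} is auto-incremented
  write_options.existing_data_behavior = ds::ExistingDataBehavior::kOverwriteOrIgnore;
  return ds::FileSystemDataset::Write(write_options, std::move(scanner));
}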

View File

@@ -0,0 +1,118 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include "arrow/csv/options.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/status.h"
#include "arrow/util/compression.h"
namespace arrow {
namespace dataset {
constexpr char kCsvTypeName[] = "csv";
/// \addtogroup dataset-file-formats
///
/// @{
/// \brief A FileFormat implementation that reads from and writes to CSV files
class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
public:
/// Options affecting the parsing of CSV files
csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
std::string type_name() const override { return kCsvTypeName; }
bool Equals(const FileFormat& other) const override;
Result<bool> IsSupported(const FileSource& source) const override;
/// \brief Return the schema of the file if possible.
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& scan_options,
const std::shared_ptr<FileFragment>& file) const override;
Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const override;
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief Per-scan options for CSV fragments
struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions {
std::string type_name() const override { return kCsvTypeName; }
/// CSV conversion options
csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults();
/// CSV reading options
///
/// Note that use_threads is always ignored.
csv::ReadOptions read_options = csv::ReadOptions::Defaults();
};
class ARROW_DS_EXPORT CsvFileWriteOptions : public FileWriteOptions {
public:
/// Options passed to csv::MakeCSVWriter.
std::shared_ptr<csv::WriteOptions> write_options;
protected:
using FileWriteOptions::FileWriteOptions;
friend class CsvFileFormat;
};
class ARROW_DS_EXPORT CsvFileWriter : public FileWriter {
public:
Status Write(const std::shared_ptr<RecordBatch>& batch) override;
private:
CsvFileWriter(std::shared_ptr<io::OutputStream> destination,
std::shared_ptr<ipc::RecordBatchWriter> writer,
std::shared_ptr<Schema> schema,
std::shared_ptr<CsvFileWriteOptions> options,
fs::FileLocator destination_locator);
Future<> FinishInternal() override;
std::shared_ptr<io::OutputStream> destination_;
std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;
friend class CsvFileFormat;
};
/// @}
} // namespace dataset
} // namespace arrow
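A small sketch of combining format-level parse options with per-scan conversion options, assuming semicolon-delimited files as an example:

#include <memory>
#include "arrow/csv/options.h"
#include "arrow/dataset/file_csv.h"

// Returns a CsvFileFormat configured for semicolon-delimited files, with
// conversion options attached as the format's default fragment scan options.
std::shared_ptr<arrow::dataset::CsvFileFormat> MakeSemicolonCsvFormat() {
  auto format = std::make_shared<arrow::dataset::CsvFileFormat>();
  format->parse_options.delimiter = ';';
  auto scan_options = std::make_shared<arrow::dataset::CsvFragmentScanOptions>();
  scan_options->convert_options.strings_can_be_null = true;
  // default_fragment_scan_options is inherited from FileFormat and may be
  // overridden at scan time.
  format->default_fragment_scan_options = std::move(scan_options);
  return format;
}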

View File

@@ -0,0 +1,120 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/type_fwd.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/result.h"
namespace arrow {
namespace dataset {
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kIpcTypeName[] = "ipc";
/// \brief A FileFormat implementation that reads from and writes to Arrow IPC files
class ARROW_DS_EXPORT IpcFileFormat : public FileFormat {
public:
std::string type_name() const override { return kIpcTypeName; }
bool Equals(const FileFormat& other) const override {
return type_name() == other.type_name();
}
Result<bool> IsSupported(const FileSource& source) const override;
/// \brief Return the schema of the file if possible.
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file) const override;
Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const override;
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief Per-scan options for IPC fragments
class ARROW_DS_EXPORT IpcFragmentScanOptions : public FragmentScanOptions {
public:
std::string type_name() const override { return kIpcTypeName; }
/// Options passed to the IPC file reader.
/// included_fields, memory_pool, and use_threads are ignored.
std::shared_ptr<ipc::IpcReadOptions> options;
/// If present, the async scanner will enable I/O coalescing.
/// This is ignored by the sync scanner.
std::shared_ptr<io::CacheOptions> cache_options;
};
class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions {
public:
/// Options passed to ipc::MakeFileWriter. use_threads is ignored
std::shared_ptr<ipc::IpcWriteOptions> options;
/// custom_metadata written to the file's footer
std::shared_ptr<const KeyValueMetadata> metadata;
protected:
using FileWriteOptions::FileWriteOptions;
friend class IpcFileFormat;
};
class ARROW_DS_EXPORT IpcFileWriter : public FileWriter {
public:
Status Write(const std::shared_ptr<RecordBatch>& batch) override;
private:
IpcFileWriter(std::shared_ptr<io::OutputStream> destination,
std::shared_ptr<ipc::RecordBatchWriter> writer,
std::shared_ptr<Schema> schema,
std::shared_ptr<IpcFileWriteOptions> options,
fs::FileLocator destination_locator);
Future<> FinishInternal() override;
std::shared_ptr<io::OutputStream> destination_;
std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;
friend class IpcFileFormat;
};
/// @}
} // namespace dataset
} // namespace arrow
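A small sketch of per-scan IPC options with I/O coalescing enabled, assuming the defaults from arrow/ipc/options.h and arrow/io/caching.h:

#include <memory>
#include "arrow/dataset/file_ipc.h"
#include "arrow/io/caching.h"
#include "arrow/ipc/options.h"

// Configure IPC scanning with lazily coalesced reads; as noted above, the
// cache options are consulted by the async scanner only.
std::shared_ptr<arrow::dataset::IpcFragmentScanOptions> MakeIpcScanOptions() {
  auto scan_options = std::make_shared<arrow::dataset::IpcFragmentScanOptions>();
  scan_options->options = std::make_shared<arrow::ipc::IpcReadOptions>(
      arrow::ipc::IpcReadOptions::Defaults());
  scan_options->cache_options = std::make_shared<arrow::io::CacheOptions>(
      arrow::io::CacheOptions::LazyDefaults());
  return scan_options;
}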

View File

@@ -0,0 +1,73 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/type_fwd.h"
#include "arrow/result.h"
namespace arrow {
namespace dataset {
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kOrcTypeName[] = "orc";
/// \brief A FileFormat implementation that reads from and writes to ORC files
class ARROW_DS_EXPORT OrcFileFormat : public FileFormat {
public:
std::string type_name() const override { return kOrcTypeName; }
bool Equals(const FileFormat& other) const override {
return type_name() == other.type_name();
}
Result<bool> IsSupported(const FileSource& source) const override;
/// \brief Return the schema of the file if possible.
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file) const override;
Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const override;
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,373 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/caching.h"
#include "arrow/util/optional.h"
namespace parquet {
class ParquetFileReader;
class Statistics;
class ColumnChunkMetaData;
class RowGroupMetaData;
class FileMetaData;
class FileDecryptionProperties;
class FileEncryptionProperties;
class ReaderProperties;
class ArrowReaderProperties;
class WriterProperties;
class ArrowWriterProperties;
namespace arrow {
class FileReader;
class FileWriter;
struct SchemaManifest;
} // namespace arrow
} // namespace parquet
namespace arrow {
namespace dataset {
/// \addtogroup dataset-file-formats
///
/// @{
constexpr char kParquetTypeName[] = "parquet";
/// \brief A FileFormat implementation that reads from and writes to Parquet files
class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
public:
ParquetFileFormat() = default;
/// Convenience constructor which copies properties from a parquet::ReaderProperties.
/// memory_pool will be ignored.
explicit ParquetFileFormat(const parquet::ReaderProperties& reader_properties);
std::string type_name() const override { return kParquetTypeName; }
bool Equals(const FileFormat& other) const override;
struct ReaderOptions {
/// \defgroup parquet-file-format-arrow-reader-properties properties which correspond
/// to members of parquet::ArrowReaderProperties.
///
/// We don't embed parquet::ReaderProperties directly because column names (rather
/// than indices) are used to indicate dictionary columns, and other options are
/// deferred to scan time.
///
/// @{
std::unordered_set<std::string> dict_columns;
arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO;
/// @}
} reader_options;
Result<bool> IsSupported(const FileSource& source) const override;
/// \brief Return the schema of the file if possible.
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
Result<RecordBatchGenerator> ScanBatchesAsync(
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file) const override;
Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;
using FileFormat::MakeFragment;
/// \brief Create a Fragment targeting all RowGroups.
Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema) override;
/// \brief Create a Fragment, restricted to the specified row groups.
Result<std::shared_ptr<ParquetFileFragment>> MakeFragment(
FileSource source, compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema, std::vector<int> row_groups);
/// \brief Return a FileReader on the given source.
Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;
Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;
Result<std::shared_ptr<FileWriter>> MakeWriter(
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
std::shared_ptr<FileWriteOptions> options,
fs::FileLocator destination_locator) const override;
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
};
/// \brief A FileFragment with parquet logic.
///
/// ParquetFileFragment provides a lazy (with respect to IO) interface to
/// scan parquet files. Any heavy IO calls are deferred to the Scan() method.
///
/// The caller can provide an optional list of selected RowGroups to limit the
/// number of scanned RowGroups, or to partition the scans across multiple
/// threads.
///
/// Metadata can be explicitly provided, enabling predicate pushdown benefits without
/// the potentially heavy IO of loading metadata from the file system. This can yield a
/// significant performance boost when scanning high-latency file systems.
class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
public:
Result<FragmentVector> SplitByRowGroup(compute::Expression predicate);
/// \brief Return the RowGroups selected by this fragment.
const std::vector<int>& row_groups() const {
if (row_groups_) return *row_groups_;
static std::vector<int> empty;
return empty;
}
/// \brief Return the FileMetaData associated with this fragment.
const std::shared_ptr<parquet::FileMetaData>& metadata() const { return metadata_; }
/// \brief Ensure this fragment's FileMetaData is in memory.
Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR);
/// \brief Return a fragment which selects a filtered subset of this fragment's RowGroups.
Result<std::shared_ptr<Fragment>> Subset(compute::Expression predicate);
Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);
private:
ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema,
util::optional<std::vector<int>> row_groups);
Status SetMetadata(std::shared_ptr<parquet::FileMetaData> metadata,
std::shared_ptr<parquet::arrow::SchemaManifest> manifest);
// Overridden to opportunistically set metadata since a reader must be opened anyway.
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override {
ARROW_RETURN_NOT_OK(EnsureCompleteMetadata());
return physical_schema_;
}
/// Return a filtered subset of row group indices.
Result<std::vector<int>> FilterRowGroups(compute::Expression predicate);
/// Simplify the predicate against the statistics of each row group.
Result<std::vector<compute::Expression>> TestRowGroups(compute::Expression predicate);
/// Try to count rows matching the predicate using metadata. Expects
/// metadata to be present, and expects the predicate to have been
/// simplified against the partition expression already.
Result<util::optional<int64_t>> TryCountRows(compute::Expression predicate);
ParquetFileFormat& parquet_format_;
/// Indices of row groups selected by this fragment,
/// or util::nullopt if all row groups are selected.
util::optional<std::vector<int>> row_groups_;
std::vector<compute::Expression> statistics_expressions_;
std::vector<bool> statistics_expressions_complete_;
std::shared_ptr<parquet::FileMetaData> metadata_;
std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
friend class ParquetFileFormat;
friend class ParquetDatasetFactory;
};
/// \brief Per-scan options for Parquet fragments
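///
/// \par Example
/// A minimal sketch of tuning Parquet-specific reads; it assumes the properties
/// below are initialized to non-null defaults by the constructor.
/// \code{.cpp}
/// auto parquet_scan_options = std::make_shared<ParquetFragmentScanOptions>();
/// // Read through a buffered input stream.
/// parquet_scan_options->reader_properties->enable_buffered_stream();
/// // Let the Parquet reader decode columns in parallel.
/// parquet_scan_options->arrow_reader_properties->set_use_threads(true);
/// \endcode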
class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions {
public:
ParquetFragmentScanOptions();
std::string type_name() const override { return kParquetTypeName; }
/// Reader properties. Not all properties are respected: memory_pool comes from
/// ScanOptions.
std::shared_ptr<parquet::ReaderProperties> reader_properties;
/// Arrow reader properties. Not all properties are respected: batch_size comes from
/// ScanOptions. Additionally, dictionary columns come from
/// ParquetFileFormat::ReaderOptions::dict_columns.
std::shared_ptr<parquet::ArrowReaderProperties> arrow_reader_properties;
};
class ARROW_DS_EXPORT ParquetFileWriteOptions : public FileWriteOptions {
public:
/// \brief Parquet writer properties.
std::shared_ptr<parquet::WriterProperties> writer_properties;
/// \brief Parquet Arrow writer properties.
std::shared_ptr<parquet::ArrowWriterProperties> arrow_writer_properties;
protected:
using FileWriteOptions::FileWriteOptions;
friend class ParquetFileFormat;
};
class ARROW_DS_EXPORT ParquetFileWriter : public FileWriter {
public:
const std::shared_ptr<parquet::arrow::FileWriter>& parquet_writer() const {
return parquet_writer_;
}
Status Write(const std::shared_ptr<RecordBatch>& batch) override;
private:
ParquetFileWriter(std::shared_ptr<io::OutputStream> destination,
std::shared_ptr<parquet::arrow::FileWriter> writer,
std::shared_ptr<ParquetFileWriteOptions> options,
fs::FileLocator destination_locator);
Future<> FinishInternal() override;
std::shared_ptr<parquet::arrow::FileWriter> parquet_writer_;
friend class ParquetFileFormat;
};
/// \brief Options for making a FileSystemDataset from a Parquet _metadata file.
struct ParquetFactoryOptions {
/// Either an explicit Partitioning or a PartitioningFactory to discover one.
///
/// If a factory is provided, it will be used to infer a schema for partition fields
/// based on file and directory paths, and then construct a Partitioning. The default
/// is a Partitioning which will yield no partition information.
///
/// The (explicit or discovered) partitioning will be applied to discovered files
/// and the resulting partition information embedded in the Dataset.
PartitioningOrFactory partitioning{Partitioning::Default()};
/// For the purposes of applying the partitioning, paths will be stripped
/// of the partition_base_dir. Files not matching the partition_base_dir
/// prefix will be skipped for partition discovery. The ignored files will still
/// be part of the Dataset, but will not have partition information.
///
/// Example:
/// partition_base_dir = "/dataset";
///
/// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
///
/// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
///
/// This is useful for partitionings which parse directory names where ordering
/// is important, e.g. DirectoryPartitioning.
std::string partition_base_dir;
/// Assert that all ColumnChunk paths are consistent. The parquet spec allows for
/// ColumnChunk data to be stored in multiple files, but ParquetDatasetFactory
/// supports only a single file with all ColumnChunk data. If this flag is set
/// construction of a ParquetDatasetFactory will raise an error if ColumnChunk
/// data is not resident in a single file.
bool validate_column_chunk_paths = false;
};
/// \brief Create FileSystemDataset from custom `_metadata` cache file.
///
/// Dask and other systems will generate a cache metadata file by concatenating
/// the RowGroupMetaData of multiple parquet files into a single parquet file
/// that only contains metadata and no ColumnChunk data.
///
/// ParquetDatasetFactory creates a FileSystemDataset composed of
/// ParquetFileFragments, where each fragment is pre-populated with the exact
/// number of row groups and statistics for each column.
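///
/// \par Example
/// A minimal sketch, assuming a local filesystem and a Dask-style `_metadata` file;
/// the paths are hypothetical and error handling is abbreviated.
/// \code{.cpp}
/// auto fs = std::make_shared<fs::LocalFileSystem>();
/// auto format = std::make_shared<ParquetFileFormat>();
/// ParquetFactoryOptions factory_options;
/// factory_options.partition_base_dir = "/data/sales";
/// ARROW_ASSIGN_OR_RAISE(auto factory,
///                       ParquetDatasetFactory::Make("/data/sales/_metadata", fs,
///                                                   format, factory_options));
/// ARROW_ASSIGN_OR_RAISE(auto dataset, factory->Finish());
/// \endcode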
class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory {
public:
/// \brief Create a ParquetDatasetFactory from a metadata path.
///
/// The `metadata_path` will be read from `filesystem`. Each RowGroup
/// contained in the metadata file will be relative to `dirname(metadata_path)`.
///
/// \param[in] metadata_path path of the metadata parquet file
/// \param[in] filesystem from which to open/read the path
/// \param[in] format to read the file with.
/// \param[in] options see ParquetFactoryOptions
static Result<std::shared_ptr<DatasetFactory>> Make(
const std::string& metadata_path, std::shared_ptr<fs::FileSystem> filesystem,
std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);
/// \brief Create a ParquetDatasetFactory from a metadata source.
///
/// Similar to the previous Make definition, but the metadata can be any FileSource
/// (e.g. backed by a Buffer) and the base_path is given explicitly instead of being
/// inferred from the metadata path.
///
/// \param[in] metadata source to open the metadata parquet file from
/// \param[in] base_path used as the prefix of every parquet file referenced
/// \param[in] filesystem from which to read the files referenced.
/// \param[in] format to read the file with.
/// \param[in] options see ParquetFactoryOptions
static Result<std::shared_ptr<DatasetFactory>> Make(
const FileSource& metadata, const std::string& base_path,
std::shared_ptr<fs::FileSystem> filesystem,
std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
InspectOptions options) override;
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
protected:
ParquetDatasetFactory(
std::shared_ptr<fs::FileSystem> filesystem,
std::shared_ptr<ParquetFileFormat> format,
std::shared_ptr<parquet::FileMetaData> metadata,
std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
std::shared_ptr<Schema> physical_schema, std::string base_path,
ParquetFactoryOptions options,
std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids)
: filesystem_(std::move(filesystem)),
format_(std::move(format)),
metadata_(std::move(metadata)),
manifest_(std::move(manifest)),
physical_schema_(std::move(physical_schema)),
base_path_(std::move(base_path)),
options_(std::move(options)),
paths_with_row_group_ids_(std::move(paths_with_row_group_ids)) {}
std::shared_ptr<fs::FileSystem> filesystem_;
std::shared_ptr<ParquetFileFormat> format_;
std::shared_ptr<parquet::FileMetaData> metadata_;
std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
std::shared_ptr<Schema> physical_schema_;
std::string base_path_;
ParquetFactoryOptions options_;
std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids_;
private:
Result<std::vector<std::shared_ptr<FileFragment>>> CollectParquetFragments(
const Partitioning& partitioning);
Result<std::shared_ptr<Schema>> PartitionSchema();
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,409 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <iosfwd>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "arrow/compute/exec/expression.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/util/optional.h"
namespace arrow {
namespace dataset {
constexpr char kFilenamePartitionSep = '_';
// ----------------------------------------------------------------------
// Partitioning
/// \defgroup dataset-partitioning Partitioning API
///
/// @{
/// \brief Interface for parsing partition expressions from string partition
/// identifiers.
///
/// For example, the identifier "foo=5" might be parsed to an equality expression
/// between the "foo" field and the value 5.
///
/// Some partitionings may store the field names in a metadata
/// store instead of in file paths, for example
/// dataset_root/2009/11/... could be used when the partition fields
/// are "year" and "month"
///
/// Paths are consumed from left to right. Paths must be relative to
/// the root of a partition; path prefixes must be removed before passing
/// the path to a partitioning for parsing.
class ARROW_DS_EXPORT Partitioning {
public:
virtual ~Partitioning() = default;
/// \brief The name identifying the kind of partitioning
virtual std::string type_name() const = 0;
/// \brief If the input batch shares any fields with this partitioning,
/// produce sub-batches which satisfy mutually exclusive Expressions.
struct PartitionedBatches {
RecordBatchVector batches;
std::vector<compute::Expression> expressions;
};
virtual Result<PartitionedBatches> Partition(
const std::shared_ptr<RecordBatch>& batch) const = 0;
/// \brief Parse a path into a partition expression
virtual Result<compute::Expression> Parse(const std::string& path) const = 0;
struct PartitionPathFormat {
std::string directory, prefix;
};
virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;
/// \brief A default Partitioning which always yields scalar(true)
static std::shared_ptr<Partitioning> Default();
/// \brief The partition schema.
const std::shared_ptr<Schema>& schema() { return schema_; }
protected:
explicit Partitioning(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
std::shared_ptr<Schema> schema_;
};
/// \brief The encoding of partition segments.
enum class SegmentEncoding : int8_t {
/// No encoding.
None = 0,
/// Segment values are URL-encoded.
Uri = 1,
};
ARROW_DS_EXPORT
std::ostream& operator<<(std::ostream& os, SegmentEncoding segment_encoding);
/// \brief Options for key-value based partitioning (hive/directory).
struct ARROW_DS_EXPORT KeyValuePartitioningOptions {
/// After splitting a path into components, decode the path components
/// before parsing according to this scheme.
SegmentEncoding segment_encoding = SegmentEncoding::Uri;
};
/// \brief Options for inferring a partitioning.
struct ARROW_DS_EXPORT PartitioningFactoryOptions {
/// When inferring a schema for partition fields, yield dictionary encoded types
/// instead of plain. This can be more efficient when materializing virtual
/// columns, and Expressions parsed by the finished Partitioning will include
/// dictionaries of all unique inspected values for each field.
bool infer_dictionary = false;
/// Optionally, an expected schema can be provided, in which case inference
/// will only check discovered fields against the schema and update internal
/// state (such as dictionaries).
std::shared_ptr<Schema> schema;
/// After splitting a path into components, decode the path components
/// before parsing according to this scheme.
SegmentEncoding segment_encoding = SegmentEncoding::Uri;
KeyValuePartitioningOptions AsPartitioningOptions() const;
};
/// \brief Options for inferring a hive-style partitioning.
struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions {
/// The hive partitioning scheme maps null to a hard coded fallback string.
std::string null_fallback;
HivePartitioningOptions AsHivePartitioningOptions() const;
};
/// \brief PartitioningFactory provides creation of a partitioning when the
/// specific schema must be inferred from available paths (no explicit schema is known).
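///
/// \par Example
/// A minimal sketch of inferring a directory partitioning from observed paths; the
/// paths are hypothetical and assumed to be relative to the partition root.
/// \code{.cpp}
/// auto factory = DirectoryPartitioning::MakeFactory({"year", "month"});
/// std::vector<std::string> paths = {"2009/11", "2009/12", "2010/01"};
/// // Infer the partition field types from the paths...
/// ARROW_ASSIGN_OR_RAISE(auto inferred_schema, factory->Inspect(paths));
/// // ...then freeze the result into a usable Partitioning.
/// ARROW_ASSIGN_OR_RAISE(auto partitioning, factory->Finish(inferred_schema));
/// \endcode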
class ARROW_DS_EXPORT PartitioningFactory {
public:
virtual ~PartitioningFactory() = default;
/// \brief The name identifying the kind of partitioning
virtual std::string type_name() const = 0;
/// Get the schema for the resulting Partitioning.
/// This may reset internal state, for example dictionaries of unique representations.
virtual Result<std::shared_ptr<Schema>> Inspect(
const std::vector<std::string>& paths) = 0;
/// Create a partitioning using the provided schema
/// (fields may be dropped).
virtual Result<std::shared_ptr<Partitioning>> Finish(
const std::shared_ptr<Schema>& schema) const = 0;
};
/// \brief Subclass for the common case of a partitioning which yields an equality
/// expression for each segment
class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
public:
/// An unconverted equality expression consisting of a field name and the representation
/// of a scalar value
struct Key {
std::string name;
util::optional<std::string> value;
};
Result<PartitionedBatches> Partition(
const std::shared_ptr<RecordBatch>& batch) const override;
Result<compute::Expression> Parse(const std::string& path) const override;
Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;
const ArrayVector& dictionaries() const { return dictionaries_; }
protected:
KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
KeyValuePartitioningOptions options)
: Partitioning(std::move(schema)),
dictionaries_(std::move(dictionaries)),
options_(options) {
if (dictionaries_.empty()) {
dictionaries_.resize(schema_->num_fields());
}
}
virtual Result<std::vector<Key>> ParseKeys(const std::string& path) const = 0;
virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;
/// Convert a Key to a full expression.
Result<compute::Expression> ConvertKey(const Key& key) const;
Result<std::vector<std::string>> FormatPartitionSegments(
const ScalarVector& values) const;
Result<std::vector<Key>> ParsePartitionSegments(
const std::vector<std::string>& segments) const;
ArrayVector dictionaries_;
KeyValuePartitioningOptions options_;
};
/// \brief DirectoryPartitioning parses one segment of a path for each field in its
/// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse
/// must contain segments for each field.
///
/// For example given schema<year:int16, month:int8> the path "/2009/11" would be
/// parsed to ("year"_ == 2009 and "month"_ == 11)
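///
/// \par Example
/// A minimal sketch matching the schema above; error handling is abbreviated.
/// \code{.cpp}
/// DirectoryPartitioning partitioning(
///     schema({field("year", int16()), field("month", int8())}));
/// // Yields ("year"_ == 2009 and "month"_ == 11).
/// ARROW_ASSIGN_OR_RAISE(auto expr, partitioning.Parse("/2009/11"));
/// \endcode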
class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
public:
/// If a field in schema is of dictionary type, the corresponding element of
/// dictionaries must contain the dictionary of values for that field.
explicit DirectoryPartitioning(std::shared_ptr<Schema> schema,
ArrayVector dictionaries = {},
KeyValuePartitioningOptions options = {});
std::string type_name() const override { return "directory"; }
/// \brief Create a factory for a directory partitioning.
///
/// \param[in] field_names The names for the partition fields. Types will be
/// inferred.
static std::shared_ptr<PartitioningFactory> MakeFactory(
std::vector<std::string> field_names, PartitioningFactoryOptions = {});
private:
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief The default fallback used for null values in a Hive-style partitioning.
static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__";
struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions {
std::string null_fallback = kDefaultHiveNullFallback;
static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) {
HivePartitioningOptions options;
options.null_fallback = std::move(fallback);
return options;
}
};
/// \brief Multi-level, directory based partitioning
/// originating from Apache Hive with all data files stored in the
/// leaf directories. Data is partitioned by static values of a
/// particular column in the schema. Partition keys are represented in
/// the form $key=$value in directory names.
/// Field order is ignored, as are missing or unrecognized field names.
///
/// For example given schema<year:int16, month:int8, day:int8> the path
/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321)
class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
public:
/// If a field in schema is of dictionary type, the corresponding element of
/// dictionaries must contain the dictionary of values for that field.
explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries = {},
std::string null_fallback = kDefaultHiveNullFallback)
: KeyValuePartitioning(std::move(schema), std::move(dictionaries),
KeyValuePartitioningOptions()),
hive_options_(
HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) {
}
explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
HivePartitioningOptions options)
: KeyValuePartitioning(std::move(schema), std::move(dictionaries), options),
hive_options_(options) {}
std::string type_name() const override { return "hive"; }
std::string null_fallback() const { return hive_options_.null_fallback; }
const HivePartitioningOptions& options() const { return hive_options_; }
static Result<util::optional<Key>> ParseKey(const std::string& segment,
const HivePartitioningOptions& options);
/// \brief Create a factory for a hive partitioning.
static std::shared_ptr<PartitioningFactory> MakeFactory(
HivePartitioningFactoryOptions = {});
private:
const HivePartitioningOptions hive_options_;
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief Implementation provided by lambda or other callable
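///
/// \par Example
/// A minimal sketch of a partitioning whose parsing logic is supplied by a lambda;
/// the single-segment "<year>" path layout is hypothetical.
/// \code{.cpp}
/// auto partitioning = std::make_shared<FunctionPartitioning>(
///     schema({field("year", int32())}),
///     [](const std::string& path) -> Result<compute::Expression> {
///       // Interpret the path segment as a year.
///       int32_t year = static_cast<int32_t>(std::atoi(path.c_str()));
///       return compute::equal(compute::field_ref("year"), compute::literal(year));
///     });
/// \endcode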
class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
public:
using ParseImpl = std::function<Result<compute::Expression>(const std::string&)>;
using FormatImpl =
std::function<Result<PartitionPathFormat>(const compute::Expression&)>;
FunctionPartitioning(std::shared_ptr<Schema> schema, ParseImpl parse_impl,
FormatImpl format_impl = NULLPTR, std::string name = "function")
: Partitioning(std::move(schema)),
parse_impl_(std::move(parse_impl)),
format_impl_(std::move(format_impl)),
name_(std::move(name)) {}
std::string type_name() const override { return name_; }
Result<compute::Expression> Parse(const std::string& path) const override {
return parse_impl_(path);
}
Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
if (format_impl_) {
return format_impl_(expr);
}
return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning");
}
Result<PartitionedBatches> Partition(
const std::shared_ptr<RecordBatch>& batch) const override {
return Status::NotImplemented("partitioning batches from ", type_name(),
" Partitioning");
}
private:
ParseImpl parse_impl_;
FormatImpl format_impl_;
std::string name_;
};
class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
public:
/// \brief Construct a FilenamePartitioning from its components.
///
/// If a field in schema is of dictionary type, the corresponding element of
/// dictionaries must contain the dictionary of values for that field.
explicit FilenamePartitioning(std::shared_ptr<Schema> schema,
ArrayVector dictionaries = {},
KeyValuePartitioningOptions options = {});
std::string type_name() const override { return "filename"; }
/// \brief Create a factory for a filename partitioning.
///
/// \param[in] field_names The names for the partition fields. Types will be
/// inferred.
static std::shared_ptr<PartitioningFactory> MakeFactory(
std::vector<std::string> field_names, PartitioningFactoryOptions = {});
private:
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
/// \brief Remove a prefix and the filename of a path.
///
/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"`
ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
const std::string& prefix);
/// \brief Vector version of StripPrefixAndFilename.
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
const std::vector<std::string>& paths, const std::string& prefix);
/// \brief Vector version of StripPrefixAndFilename.
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
const std::vector<fs::FileInfo>& files, const std::string& prefix);
/// \brief Either a Partitioning or a PartitioningFactory
class ARROW_DS_EXPORT PartitioningOrFactory {
public:
explicit PartitioningOrFactory(std::shared_ptr<Partitioning> partitioning)
: partitioning_(std::move(partitioning)) {}
explicit PartitioningOrFactory(std::shared_ptr<PartitioningFactory> factory)
: factory_(std::move(factory)) {}
PartitioningOrFactory& operator=(std::shared_ptr<Partitioning> partitioning) {
return *this = PartitioningOrFactory(std::move(partitioning));
}
PartitioningOrFactory& operator=(std::shared_ptr<PartitioningFactory> factory) {
return *this = PartitioningOrFactory(std::move(factory));
}
/// \brief The partitioning (if given).
const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
/// \brief The partition factory (if given).
const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }
/// \brief Get the partition schema, inferring it with the given factory if needed.
Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);
private:
std::shared_ptr<PartitioningFactory> factory_;
std::shared_ptr<Partitioning> partitioning_;
};
/// @}
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,27 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Often-used headers, for precompiling.
// If updating this header, please make sure you check compilation speed
// before checking in. Adding headers which are not used extremely often
// may incur a slowdown, since it makes the precompiled header heavier to load.
// This API is EXPERIMENTAL.
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/scanner.h"
#include "arrow/pch.h"

View File

@@ -0,0 +1,33 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#include "arrow/dataset/visibility.h"
namespace arrow {
namespace dataset {
namespace internal {
/// Register dataset-based exec nodes with the exec node registry
///
/// This function must be called before using dataset ExecNode factories
ARROW_DS_EXPORT void Initialize();
} // namespace internal
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,32 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include "arrow/dataset/visibility.h"
#include "arrow/type_fwd.h"
namespace arrow {
namespace dataset {
// FIXME this is superseded by compute::Expression::Bind
ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to);
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,432 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/compute/exec/expression.h"
#include "arrow/compute/exec/options.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/dataset/dataset.h"
#include "arrow/dataset/projector.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/interfaces.h"
#include "arrow/memory_pool.h"
#include "arrow/type_fwd.h"
#include "arrow/util/async_generator.h"
#include "arrow/util/iterator.h"
#include "arrow/util/thread_pool.h"
#include "arrow/util/type_fwd.h"
namespace arrow {
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
namespace dataset {
/// \defgroup dataset-scanning Scanning API
///
/// @{
constexpr int64_t kDefaultBatchSize = 1 << 17; // 128Ki rows
// Together with the readahead defaults below, up to 64 batches (~8Mi rows) may be buffered
constexpr int32_t kDefaultBatchReadahead = 16;
constexpr int32_t kDefaultFragmentReadahead = 4;
/// Scan-specific options, which can be changed between scans of the same dataset.
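///
/// \par Example
/// A minimal sketch of options a caller might set before a scan; the column name
/// "a" is hypothetical.
/// \code{.cpp}
/// auto options = std::make_shared<ScanOptions>();
/// options->filter = compute::greater(compute::field_ref("a"), compute::literal(1));
/// options->batch_size = 64 * 1024;
/// options->use_threads = true;
/// \endcode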
struct ARROW_DS_EXPORT ScanOptions {
/// A row filter (which will be pushed down to partitioning/reading if supported).
compute::Expression filter = compute::literal(true);
/// A projection expression (which can add/remove/rename columns).
compute::Expression projection;
/// Schema with which batches will be read from fragments. This is also known as the
/// "reader schema" it will be used (for example) in constructing CSV file readers to
/// identify column types for parsing. Usually only a subset of its fields (see
/// MaterializedFields) will be materialized during a scan.
std::shared_ptr<Schema> dataset_schema;
/// Schema of projected record batches. This is independent of dataset_schema as its
/// fields are derived from the projection. For example, let
///
/// dataset_schema = {"a": int32, "b": int32, "id": utf8}
/// projection = project({equal(field_ref("a"), field_ref("b"))}, {"a_plus_b"})
///
/// (no filter specified). In this case, the projected_schema would be
///
/// {"a_plus_b": int32}
std::shared_ptr<Schema> projected_schema;
/// Maximum row count for scanned batches.
int64_t batch_size = kDefaultBatchSize;
/// How many batches to read ahead within a file
///
/// Set to 0 to disable batch readahead
///
/// Note: May not be supported by all formats
/// Note: Will be ignored if use_threads is set to false
int32_t batch_readahead = kDefaultBatchReadahead;
/// How many files to read ahead
///
/// Set to 0 to disable fragment readahead
///
/// Note: May not be enforced by all scanners
/// Note: Will be ignored if use_threads is set to false
int32_t fragment_readahead = kDefaultFragmentReadahead;
/// A pool from which materialized and scanned arrays will be allocated.
MemoryPool* pool = arrow::default_memory_pool();
/// IOContext for any IO tasks
///
/// Note: The IOContext executor will be ignored if use_threads is set to false
io::IOContext io_context;
/// If true the scanner will scan in parallel
///
/// Note: If true, this will use threads from both the cpu_executor and the
/// io_context.executor
/// Note: This must be true in order for any readahead to happen
bool use_threads = false;
/// Fragment-specific scan options.
std::shared_ptr<FragmentScanOptions> fragment_scan_options;
/// Return a vector of FieldRefs that require materialization.
///
/// This is usually the union of the fields referenced in the projection and the
/// filter expression. Examples:
///
/// - `SELECT a, b WHERE a < 2 && c > 1` => ["a", "b", "a", "c"]
/// - `SELECT a + b < 3 WHERE a > 1` => ["a", "b"]
///
/// This is needed for expression where a field may not be directly
/// used in the final projection but is still required to evaluate the
/// expression.
///
/// This is used by Fragment implementations to apply the column
/// sub-selection optimization.
std::vector<FieldRef> MaterializedFields() const;
/// Parameters which control when the plan should pause for a slow consumer
compute::BackpressureOptions backpressure =
compute::BackpressureOptions::DefaultBackpressure();
};
/// \brief Describes a projection
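///
/// \par Example
/// A minimal sketch of building a projection from column names and installing it on
/// ScanOptions; `dataset_schema`, `options` and the column names are assumed to
/// exist elsewhere.
/// \code{.cpp}
/// ARROW_ASSIGN_OR_RAISE(auto projection,
///                       ProjectionDescr::FromNames({"a", "b"}, *dataset_schema));
/// SetProjection(options.get(), std::move(projection));
/// \endcode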
struct ARROW_DS_EXPORT ProjectionDescr {
/// \brief The projection expression itself
/// This expression must be a call to make_struct
compute::Expression expression;
/// \brief The output schema of the projection.
/// This can be calculated from the input schema and the expression but it
/// is cached here for convenience.
std::shared_ptr<Schema> schema;
/// \brief Create a ProjectionDescr by binding an expression to the dataset schema
///
/// expression must return a struct type
static Result<ProjectionDescr> FromStructExpression(
const compute::Expression& expression, const Schema& dataset_schema);
/// \brief Create a ProjectionDescr from expressions/names for each field
static Result<ProjectionDescr> FromExpressions(std::vector<compute::Expression> exprs,
std::vector<std::string> names,
const Schema& dataset_schema);
/// \brief Create a default projection referencing fields in the dataset schema
static Result<ProjectionDescr> FromNames(std::vector<std::string> names,
const Schema& dataset_schema);
/// \brief Make a projection that projects every field in the dataset schema
static Result<ProjectionDescr> Default(const Schema& dataset_schema);
};
/// \brief Utility method to set the projection expression and schema
ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr projection);
/// \brief Combines a record batch with the fragment that the record batch originated
/// from
///
/// Knowing the source fragment can be useful for debugging & understanding loaded
/// data
struct TaggedRecordBatch {
std::shared_ptr<RecordBatch> record_batch;
std::shared_ptr<Fragment> fragment;
};
using TaggedRecordBatchGenerator = std::function<Future<TaggedRecordBatch>()>;
using TaggedRecordBatchIterator = Iterator<TaggedRecordBatch>;
/// \brief Combines a tagged batch with positional information
///
/// This is returned when scanning batches in an unordered fashion. This information is
/// needed if you ever want to reassemble the batches in order
struct EnumeratedRecordBatch {
Enumerated<std::shared_ptr<RecordBatch>> record_batch;
Enumerated<std::shared_ptr<Fragment>> fragment;
};
using EnumeratedRecordBatchGenerator = std::function<Future<EnumeratedRecordBatch>()>;
using EnumeratedRecordBatchIterator = Iterator<EnumeratedRecordBatch>;
/// @}
} // namespace dataset
template <>
struct IterationTraits<dataset::TaggedRecordBatch> {
static dataset::TaggedRecordBatch End() {
return dataset::TaggedRecordBatch{NULLPTR, NULLPTR};
}
static bool IsEnd(const dataset::TaggedRecordBatch& val) {
return val.record_batch == NULLPTR;
}
};
template <>
struct IterationTraits<dataset::EnumeratedRecordBatch> {
static dataset::EnumeratedRecordBatch End() {
return dataset::EnumeratedRecordBatch{
IterationEnd<Enumerated<std::shared_ptr<RecordBatch>>>(),
IterationEnd<Enumerated<std::shared_ptr<dataset::Fragment>>>()};
}
static bool IsEnd(const dataset::EnumeratedRecordBatch& val) {
return IsIterationEnd(val.fragment);
}
};
namespace dataset {
/// \defgroup dataset-scanning Scanning API
///
/// @{
/// \brief A scanner glues together several dataset classes to load in data.
/// The dataset contains a collection of fragments and partitioning rules.
///
/// The fragments identify independently loadable units of data (i.e. each fragment
/// has a potentially unique schema and possibly even format; it should be possible
/// to read fragments in parallel if desired).
///
/// The fragment's format contains the logic necessary to actually create a task to load
/// the fragment into memory. That task may or may not support parallel execution of
/// its own.
///
/// The scanner is then responsible for creating scan tasks from every fragment in the
/// dataset and (potentially) sequencing the loaded record batches together.
///
/// The scanner should not buffer the entire dataset in memory (unless asked);
/// instead it yields record batches as soon as they are ready to scan. Various
/// readahead properties control how much data is allowed to be scanned before
/// pausing to let a slow consumer catch up.
///
/// Today the scanner also handles projection & filtering although that may change in
/// the future.
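///
/// \par Example
/// A minimal sketch of consuming a scan batch by batch, assuming `scanner` was
/// produced by a ScannerBuilder; error handling is abbreviated.
/// \code{.cpp}
/// ARROW_ASSIGN_OR_RAISE(auto batch_it, scanner->ScanBatches());
/// while (true) {
///   ARROW_ASSIGN_OR_RAISE(TaggedRecordBatch tagged, batch_it.Next());
///   if (IsIterationEnd(tagged)) break;
///   // tagged.record_batch holds the data, tagged.fragment its source.
/// }
/// // Or, when the whole result comfortably fits in memory:
/// ARROW_ASSIGN_OR_RAISE(auto table, scanner->ToTable());
/// \endcode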
class ARROW_DS_EXPORT Scanner {
public:
virtual ~Scanner() = default;
/// \brief Apply a visitor to each RecordBatch as it is scanned. If multiple threads
/// are used (via use_threads), the visitor will be invoked from those threads and is
/// responsible for any synchronization.
virtual Status Scan(std::function<Status(TaggedRecordBatch)> visitor) = 0;
/// \brief Convert a Scanner into a Table.
///
/// Use this convenience utility with care. This will serially materialize the
/// Scan result in memory before creating the Table.
virtual Result<std::shared_ptr<Table>> ToTable() = 0;
/// \brief Scan the dataset into a stream of record batches. Each batch is tagged
/// with the fragment it originated from. The batches will arrive in order. The
/// order of fragments is determined by the dataset.
///
/// Note: The scanner will perform some readahead but will avoid materializing too
/// much in memory (this is governed by the readahead options and use_threads option).
/// If the readahead queue fills up then I/O will pause until the calling thread catches
/// up.
virtual Result<TaggedRecordBatchIterator> ScanBatches() = 0;
virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync() = 0;
virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync(
::arrow::internal::Executor* cpu_thread_pool) = 0;
/// \brief Scan the dataset into a stream of record batches. Unlike ScanBatches this
/// method may allow record batches to be returned out of order. This allows for more
/// efficient scanning: some fragments may be accessed more quickly than others (e.g.
/// may be cached in RAM or just happen to get scheduled earlier by the I/O layer).
///
/// To make up for the out-of-order iteration each batch is further tagged with
/// positional information.
virtual Result<EnumeratedRecordBatchIterator> ScanBatchesUnordered() = 0;
virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync() = 0;
virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync(
::arrow::internal::Executor* cpu_thread_pool) = 0;
/// \brief A convenience to synchronously load the given rows by index.
///
/// Will only consume as many batches as needed from ScanBatches().
virtual Result<std::shared_ptr<Table>> TakeRows(const Array& indices) = 0;
/// \brief Get the first N rows.
virtual Result<std::shared_ptr<Table>> Head(int64_t num_rows) = 0;
/// \brief Count rows matching a predicate.
///
/// This method will push down the predicate and compute the result based on fragment
/// metadata if possible.
virtual Result<int64_t> CountRows() = 0;
/// \brief Convert the Scanner to a RecordBatchReader so it can be
/// easily used with APIs that expect a reader.
virtual Result<std::shared_ptr<RecordBatchReader>> ToRecordBatchReader() = 0;
/// \brief Get the options for this scan.
const std::shared_ptr<ScanOptions>& options() const { return scan_options_; }
/// \brief Get the dataset that this scanner will scan
virtual const std::shared_ptr<Dataset>& dataset() const = 0;
protected:
explicit Scanner(std::shared_ptr<ScanOptions> scan_options)
: scan_options_(std::move(scan_options)) {}
Result<EnumeratedRecordBatchIterator> AddPositioningToInOrderScan(
TaggedRecordBatchIterator scan);
const std::shared_ptr<ScanOptions> scan_options_;
};
/// \brief ScannerBuilder is a factory class to construct a Scanner. It is used
/// to pass information, notably a potential filter expression and a subset of
/// columns to materialize.
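///
/// \par Example
/// A minimal sketch, assuming `dataset` exists and uses the hypothetical columns
/// "a" and "b"; error handling is abbreviated.
/// \code{.cpp}
/// ScannerBuilder builder(dataset);
/// ARROW_RETURN_NOT_OK(builder.Project({"a", "b"}));
/// ARROW_RETURN_NOT_OK(builder.Filter(compute::greater(compute::field_ref("a"),
///                                                     compute::literal(1))));
/// ARROW_RETURN_NOT_OK(builder.UseThreads(true));
/// ARROW_ASSIGN_OR_RAISE(auto scanner, builder.Finish());
/// \endcode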
class ARROW_DS_EXPORT ScannerBuilder {
public:
explicit ScannerBuilder(std::shared_ptr<Dataset> dataset);
ScannerBuilder(std::shared_ptr<Dataset> dataset,
std::shared_ptr<ScanOptions> scan_options);
ScannerBuilder(std::shared_ptr<Schema> schema, std::shared_ptr<Fragment> fragment,
std::shared_ptr<ScanOptions> scan_options);
/// \brief Make a scanner from a record batch reader.
///
/// The resulting scanner can be scanned only once. This is intended
/// to support writing data from streaming sources or other sources
/// that can be iterated only once.
static std::shared_ptr<ScannerBuilder> FromRecordBatchReader(
std::shared_ptr<RecordBatchReader> reader);
/// \brief Set the subset of columns to materialize.
///
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] columns list of columns to project. Order and duplicates will
/// be preserved.
///
/// \return Failure if any column name does not exist in the dataset's
/// Schema.
Status Project(std::vector<std::string> columns);
/// \brief Set expressions which will be evaluated to produce the materialized
/// columns.
///
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] exprs expressions to evaluate to produce columns.
/// \param[in] names list of names for the resulting columns.
///
/// \return Failure if any referenced column does not exist in the dataset's
/// Schema.
Status Project(std::vector<compute::Expression> exprs, std::vector<std::string> names);
/// \brief Set the filter expression to return only rows matching the filter.
///
/// The predicate will be passed down to Sources and corresponding
/// Fragments to exploit predicate pushdown if possible using
/// partition information or Fragment internal metadata, e.g. Parquet statistics.
/// Columns which are not referenced may not be read from fragments.
///
/// \param[in] filter expression to filter rows with.
///
/// \return Failure if any referenced column does not exist in the dataset's
/// Schema.
Status Filter(const compute::Expression& filter);
/// \brief Indicate if the Scanner should make use of the available
/// ThreadPool found in ScanOptions.
Status UseThreads(bool use_threads = true);
/// \brief Limit how many fragments the scanner will read at once
Status FragmentReadahead(int fragment_readahead);
/// \brief Set the maximum number of rows per RecordBatch.
///
/// \param[in] batch_size the maximum number of rows.
/// \returns An error if the batch size is not greater than 0.
///
/// This option provides a control limiting the memory owned by any RecordBatch.
Status BatchSize(int64_t batch_size);
/// \brief Set the pool from which materialized and scanned arrays will be allocated.
Status Pool(MemoryPool* pool);
/// \brief Set fragment-specific scan options.
Status FragmentScanOptions(std::shared_ptr<FragmentScanOptions> fragment_scan_options);
/// \brief Override default backpressure configuration
Status Backpressure(compute::BackpressureOptions backpressure);
/// \brief Return the constructed now-immutable Scanner object
Result<std::shared_ptr<Scanner>> Finish();
const std::shared_ptr<Schema>& schema() const;
const std::shared_ptr<Schema>& projected_schema() const;
private:
std::shared_ptr<Dataset> dataset_;
std::shared_ptr<ScanOptions> scan_options_ = std::make_shared<ScanOptions>();
};
/// \brief Construct a source ExecNode which yields batches from a dataset scan.
///
/// Does not construct associated filter or project nodes.
/// Yielded batches will be augmented with fragment/batch indices to enable stable
/// ordering for simple ExecPlans.
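///
/// \par Example
/// A minimal sketch, assuming an existing ExecPlan `plan`, a Dataset `dataset`, a
/// ScanOptions `scan_options`, and that the dataset exec nodes were registered via
/// dataset::internal::Initialize().
/// \code{.cpp}
/// ARROW_ASSIGN_OR_RAISE(
///     auto scan_node,
///     compute::MakeExecNode("scan", plan.get(), {},
///                           ScanNodeOptions{dataset, scan_options}));
/// \endcode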
class ARROW_DS_EXPORT ScanNodeOptions : public compute::ExecNodeOptions {
public:
explicit ScanNodeOptions(std::shared_ptr<Dataset> dataset,
std::shared_ptr<ScanOptions> scan_options,
bool require_sequenced_output = false)
: dataset(std::move(dataset)),
scan_options(std::move(scan_options)),
require_sequenced_output(require_sequenced_output) {}
std::shared_ptr<Dataset> dataset;
std::shared_ptr<ScanOptions> scan_options;
bool require_sequenced_output;
};
/// @}
namespace internal {
ARROW_DS_EXPORT void InitializeScanner(arrow::compute::ExecFactoryRegistry* registry);
} // namespace internal
} // namespace dataset
} // namespace arrow

File diff suppressed because it is too large

View File

@@ -0,0 +1,106 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#include <memory>
#include <vector>
#include "arrow/compute/type_fwd.h" // IWYU pragma: export
#include "arrow/dataset/visibility.h"
#include "arrow/filesystem/type_fwd.h" // IWYU pragma: export
#include "arrow/type_fwd.h" // IWYU pragma: export
namespace arrow {
namespace dataset {
class Dataset;
class DatasetFactory;
using DatasetVector = std::vector<std::shared_ptr<Dataset>>;
class UnionDataset;
class UnionDatasetFactory;
class Fragment;
using FragmentIterator = Iterator<std::shared_ptr<Fragment>>;
using FragmentVector = std::vector<std::shared_ptr<Fragment>>;
class FragmentScanOptions;
class FileSource;
class FileFormat;
class FileFragment;
class FileWriter;
class FileWriteOptions;
class FileSystemDataset;
class FileSystemDatasetFactory;
struct FileSystemDatasetWriteOptions;
/// \brief Controls what happens if files exist in an output directory during a dataset
/// write
enum class ExistingDataBehavior : int8_t {
/// Deletes all files in a directory the first time that directory is encountered
kDeleteMatchingPartitions,
/// Ignores existing files, overwriting any that happen to have the same name as an
/// output file
kOverwriteOrIgnore,
/// Returns an error if there are any files or subdirectories in the output directory
kError,
};
class InMemoryDataset;
class CsvFileFormat;
class CsvFileWriter;
class CsvFileWriteOptions;
struct CsvFragmentScanOptions;
class IpcFileFormat;
class IpcFileWriter;
class IpcFileWriteOptions;
class IpcFragmentScanOptions;
class ParquetFileFormat;
class ParquetFileFragment;
class ParquetFragmentScanOptions;
class ParquetFileWriter;
class ParquetFileWriteOptions;
class Partitioning;
class PartitioningFactory;
class PartitioningOrFactory;
struct KeyValuePartitioningOptions;
class DirectoryPartitioning;
class HivePartitioning;
struct HivePartitioningOptions;
class FilenamePartitioning;
struct FilenamePartitioningOptions;
struct ScanOptions;
class Scanner;
class ScannerBuilder;
class ScanTask;
using ScanTaskVector = std::vector<std::shared_ptr<ScanTask>>;
using ScanTaskIterator = Iterator<std::shared_ptr<ScanTask>>;
} // namespace dataset
} // namespace arrow

View File

@@ -0,0 +1,50 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This API is EXPERIMENTAL.
#pragma once
#if defined(_WIN32) || defined(__CYGWIN__)
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4251)
#else
#pragma GCC diagnostic ignored "-Wattributes"
#endif
#ifdef ARROW_DS_STATIC
#define ARROW_DS_EXPORT
#elif defined(ARROW_DS_EXPORTING)
#define ARROW_DS_EXPORT __declspec(dllexport)
#else
#define ARROW_DS_EXPORT __declspec(dllimport)
#endif
#define ARROW_DS_NO_EXPORT
#else // Not Windows
#ifndef ARROW_DS_EXPORT
#define ARROW_DS_EXPORT __attribute__((visibility("default")))
#endif
#ifndef ARROW_DS_NO_EXPORT
#define ARROW_DS_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif // Non-Windows
#if defined(_MSC_VER)
#pragma warning(pop)
#endif