// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // This API is EXPERIMENTAL. #pragma once #include #include #include #include #include #include #include #include "arrow/compute/exec/expression.h" #include "arrow/dataset/type_fwd.h" #include "arrow/dataset/visibility.h" #include "arrow/util/optional.h" namespace arrow { namespace dataset { constexpr char kFilenamePartitionSep = '_'; // ---------------------------------------------------------------------- // Partitioning /// \defgroup dataset-partitioning Partitioning API /// /// @{ /// \brief Interface for parsing partition expressions from string partition /// identifiers. /// /// For example, the identifier "foo=5" might be parsed to an equality expression /// between the "foo" field and the value 5. /// /// Some partitionings may store the field names in a metadata /// store instead of in file paths, for example /// dataset_root/2009/11/... could be used when the partition fields /// are "year" and "month" /// /// Paths are consumed from left to right. Paths must be relative to /// the root of a partition; path prefixes must be removed before passing /// the path to a partitioning for parsing. class ARROW_DS_EXPORT Partitioning { public: virtual ~Partitioning() = default; /// \brief The name identifying the kind of partitioning virtual std::string type_name() const = 0; /// \brief If the input batch shares any fields with this partitioning, /// produce sub-batches which satisfy mutually exclusive Expressions. struct PartitionedBatches { RecordBatchVector batches; std::vector expressions; }; virtual Result Partition( const std::shared_ptr& batch) const = 0; /// \brief Parse a path into a partition expression virtual Result Parse(const std::string& path) const = 0; struct PartitionPathFormat { std::string directory, prefix; }; virtual Result Format(const compute::Expression& expr) const = 0; /// \brief A default Partitioning which always yields scalar(true) static std::shared_ptr Default(); /// \brief The partition schema. const std::shared_ptr& schema() { return schema_; } protected: explicit Partitioning(std::shared_ptr schema) : schema_(std::move(schema)) {} std::shared_ptr schema_; }; /// \brief The encoding of partition segments. enum class SegmentEncoding : int8_t { /// No encoding. None = 0, /// Segment values are URL-encoded. Uri = 1, }; ARROW_DS_EXPORT std::ostream& operator<<(std::ostream& os, SegmentEncoding segment_encoding); /// \brief Options for key-value based partitioning (hive/directory). struct ARROW_DS_EXPORT KeyValuePartitioningOptions { /// After splitting a path into components, decode the path components /// before parsing according to this scheme. SegmentEncoding segment_encoding = SegmentEncoding::Uri; }; /// \brief Options for inferring a partitioning. struct ARROW_DS_EXPORT PartitioningFactoryOptions { /// When inferring a schema for partition fields, yield dictionary encoded types /// instead of plain. This can be more efficient when materializing virtual /// columns, and Expressions parsed by the finished Partitioning will include /// dictionaries of all unique inspected values for each field. bool infer_dictionary = false; /// Optionally, an expected schema can be provided, in which case inference /// will only check discovered fields against the schema and update internal /// state (such as dictionaries). std::shared_ptr schema; /// After splitting a path into components, decode the path components /// before parsing according to this scheme. SegmentEncoding segment_encoding = SegmentEncoding::Uri; KeyValuePartitioningOptions AsPartitioningOptions() const; }; /// \brief Options for inferring a hive-style partitioning. struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions { /// The hive partitioning scheme maps null to a hard coded fallback string. std::string null_fallback; HivePartitioningOptions AsHivePartitioningOptions() const; }; /// \brief PartitioningFactory provides creation of a partitioning when the /// specific schema must be inferred from available paths (no explicit schema is known). class ARROW_DS_EXPORT PartitioningFactory { public: virtual ~PartitioningFactory() = default; /// \brief The name identifying the kind of partitioning virtual std::string type_name() const = 0; /// Get the schema for the resulting Partitioning. /// This may reset internal state, for example dictionaries of unique representations. virtual Result> Inspect( const std::vector& paths) = 0; /// Create a partitioning using the provided schema /// (fields may be dropped). virtual Result> Finish( const std::shared_ptr& schema) const = 0; }; /// \brief Subclass for the common case of a partitioning which yields an equality /// expression for each segment class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { public: /// An unconverted equality expression consisting of a field name and the representation /// of a scalar value struct Key { std::string name; util::optional value; }; Result Partition( const std::shared_ptr& batch) const override; Result Parse(const std::string& path) const override; Result Format(const compute::Expression& expr) const override; const ArrayVector& dictionaries() const { return dictionaries_; } protected: KeyValuePartitioning(std::shared_ptr schema, ArrayVector dictionaries, KeyValuePartitioningOptions options) : Partitioning(std::move(schema)), dictionaries_(std::move(dictionaries)), options_(options) { if (dictionaries_.empty()) { dictionaries_.resize(schema_->num_fields()); } } virtual Result> ParseKeys(const std::string& path) const = 0; virtual Result FormatValues(const ScalarVector& values) const = 0; /// Convert a Key to a full expression. Result ConvertKey(const Key& key) const; Result> FormatPartitionSegments( const ScalarVector& values) const; Result> ParsePartitionSegments( const std::vector& segments) const; ArrayVector dictionaries_; KeyValuePartitioningOptions options_; }; /// \brief DirectoryPartitioning parses one segment of a path for each field in its /// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse /// must contain segments for each field. /// /// For example given schema the path "/2009/11" would be /// parsed to ("year"_ == 2009 and "month"_ == 11) class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning { public: /// If a field in schema is of dictionary type, the corresponding element of /// dictionaries must be contain the dictionary of values for that field. explicit DirectoryPartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}, KeyValuePartitioningOptions options = {}); std::string type_name() const override { return "directory"; } /// \brief Create a factory for a directory partitioning. /// /// \param[in] field_names The names for the partition fields. Types will be /// inferred. static std::shared_ptr MakeFactory( std::vector field_names, PartitioningFactoryOptions = {}); private: Result> ParseKeys(const std::string& path) const override; Result FormatValues(const ScalarVector& values) const override; }; /// \brief The default fallback used for null values in a Hive-style partitioning. static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__"; struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions { std::string null_fallback = kDefaultHiveNullFallback; static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) { HivePartitioningOptions options; options.null_fallback = std::move(fallback); return options; } }; /// \brief Multi-level, directory based partitioning /// originating from Apache Hive with all data files stored in the /// leaf directories. Data is partitioned by static values of a /// particular column in the schema. Partition keys are represented in /// the form $key=$value in directory names. /// Field order is ignored, as are missing or unrecognized field names. /// /// For example given schema the path /// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321) class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { public: /// If a field in schema is of dictionary type, the corresponding element of /// dictionaries must be contain the dictionary of values for that field. explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}, std::string null_fallback = kDefaultHiveNullFallback) : KeyValuePartitioning(std::move(schema), std::move(dictionaries), KeyValuePartitioningOptions()), hive_options_( HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) { } explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries, HivePartitioningOptions options) : KeyValuePartitioning(std::move(schema), std::move(dictionaries), options), hive_options_(options) {} std::string type_name() const override { return "hive"; } std::string null_fallback() const { return hive_options_.null_fallback; } const HivePartitioningOptions& options() const { return hive_options_; } static Result> ParseKey(const std::string& segment, const HivePartitioningOptions& options); /// \brief Create a factory for a hive partitioning. static std::shared_ptr MakeFactory( HivePartitioningFactoryOptions = {}); private: const HivePartitioningOptions hive_options_; Result> ParseKeys(const std::string& path) const override; Result FormatValues(const ScalarVector& values) const override; }; /// \brief Implementation provided by lambda or other callable class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning { public: using ParseImpl = std::function(const std::string&)>; using FormatImpl = std::function(const compute::Expression&)>; FunctionPartitioning(std::shared_ptr schema, ParseImpl parse_impl, FormatImpl format_impl = NULLPTR, std::string name = "function") : Partitioning(std::move(schema)), parse_impl_(std::move(parse_impl)), format_impl_(std::move(format_impl)), name_(std::move(name)) {} std::string type_name() const override { return name_; } Result Parse(const std::string& path) const override { return parse_impl_(path); } Result Format(const compute::Expression& expr) const override { if (format_impl_) { return format_impl_(expr); } return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning"); } Result Partition( const std::shared_ptr& batch) const override { return Status::NotImplemented("partitioning batches from ", type_name(), " Partitioning"); } private: ParseImpl parse_impl_; FormatImpl format_impl_; std::string name_; }; class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { public: /// \brief Construct a FilenamePartitioning from its components. /// /// If a field in schema is of dictionary type, the corresponding element of /// dictionaries must be contain the dictionary of values for that field. explicit FilenamePartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}, KeyValuePartitioningOptions options = {}); std::string type_name() const override { return "filename"; } /// \brief Create a factory for a filename partitioning. /// /// \param[in] field_names The names for the partition fields. Types will be /// inferred. static std::shared_ptr MakeFactory( std::vector field_names, PartitioningFactoryOptions = {}); private: Result> ParseKeys(const std::string& path) const override; Result FormatValues(const ScalarVector& values) const override; }; /// \brief Remove a prefix and the filename of a path. /// /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"` ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& paths, const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& files, const std::string& prefix); /// \brief Either a Partitioning or a PartitioningFactory class ARROW_DS_EXPORT PartitioningOrFactory { public: explicit PartitioningOrFactory(std::shared_ptr partitioning) : partitioning_(std::move(partitioning)) {} explicit PartitioningOrFactory(std::shared_ptr factory) : factory_(std::move(factory)) {} PartitioningOrFactory& operator=(std::shared_ptr partitioning) { return *this = PartitioningOrFactory(std::move(partitioning)); } PartitioningOrFactory& operator=(std::shared_ptr factory) { return *this = PartitioningOrFactory(std::move(factory)); } /// \brief The partitioning (if given). const std::shared_ptr& partitioning() const { return partitioning_; } /// \brief The partition factory (if given). const std::shared_ptr& factory() const { return factory_; } /// \brief Get the partition schema, inferring it with the given factory if needed. Result> GetOrInferSchema(const std::vector& paths); private: std::shared_ptr factory_; std::shared_ptr partitioning_; }; /// @} } // namespace dataset } // namespace arrow