mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-23 02:40:43 +00:00
410 lines
16 KiB
C++
410 lines
16 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
// This API is EXPERIMENTAL.
|
|
|
|
#pragma once
|
|
|
|
#include <functional>
|
|
#include <iosfwd>
|
|
#include <memory>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "arrow/compute/exec/expression.h"
|
|
#include "arrow/dataset/type_fwd.h"
|
|
#include "arrow/dataset/visibility.h"
|
|
#include "arrow/util/optional.h"
|
|
|
|
namespace arrow {
|
|
|
|
namespace dataset {
|
|
|
|
/// Separator character associated with filename-based partitioning.
/// NOTE(review): presumably the delimiter between per-field segments inside a
/// file name handled by FilenamePartitioning -- confirm against the implementation.
constexpr char kFilenamePartitionSep = '_';
|
|
|
|
// ----------------------------------------------------------------------
|
|
// Partitioning
|
|
|
|
/// \defgroup dataset-partitioning Partitioning API
|
|
///
|
|
/// @{
|
|
|
|
/// \brief Interface for parsing partition expressions from string partition
|
|
/// identifiers.
|
|
///
|
|
/// For example, the identifier "foo=5" might be parsed to an equality expression
|
|
/// between the "foo" field and the value 5.
|
|
///
|
|
/// Some partitionings may store the field names in a metadata
|
|
/// store instead of in file paths, for example
|
|
/// dataset_root/2009/11/... could be used when the partition fields
|
|
/// are "year" and "month"
|
|
///
|
|
/// Paths are consumed from left to right. Paths must be relative to
|
|
/// the root of a partition; path prefixes must be removed before passing
|
|
/// the path to a partitioning for parsing.
|
|
class ARROW_DS_EXPORT Partitioning {
|
|
public:
|
|
virtual ~Partitioning() = default;
|
|
|
|
/// \brief The name identifying the kind of partitioning
|
|
virtual std::string type_name() const = 0;
|
|
|
|
/// \brief If the input batch shares any fields with this partitioning,
|
|
/// produce sub-batches which satisfy mutually exclusive Expressions.
|
|
struct PartitionedBatches {
|
|
RecordBatchVector batches;
|
|
std::vector<compute::Expression> expressions;
|
|
};
|
|
virtual Result<PartitionedBatches> Partition(
|
|
const std::shared_ptr<RecordBatch>& batch) const = 0;
|
|
|
|
/// \brief Parse a path into a partition expression
|
|
virtual Result<compute::Expression> Parse(const std::string& path) const = 0;
|
|
|
|
struct PartitionPathFormat {
|
|
std::string directory, prefix;
|
|
};
|
|
|
|
virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;
|
|
|
|
/// \brief A default Partitioning which always yields scalar(true)
|
|
static std::shared_ptr<Partitioning> Default();
|
|
|
|
/// \brief The partition schema.
|
|
const std::shared_ptr<Schema>& schema() { return schema_; }
|
|
|
|
protected:
|
|
explicit Partitioning(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
|
|
|
|
std::shared_ptr<Schema> schema_;
|
|
};
|
|
|
|
/// \brief The encoding of partition segments.
///
/// The underlying type is int8_t and the enumerator values are fixed
/// (None == 0, Uri == 1).
enum class SegmentEncoding : int8_t {
  /// No encoding.
  None = 0,
  /// Segment values are URL-encoded.
  Uri = 1,
};
|
|
|
|
/// \brief Stream insertion for SegmentEncoding.
/// NOTE(review): the textual form written is defined out of line and is not
/// visible from this header.
ARROW_DS_EXPORT std::ostream& operator<<(std::ostream& os,
                                         SegmentEncoding segment_encoding);
|
|
|
|
/// \brief Options for key-value based partitioning (hive/directory).
|
|
struct ARROW_DS_EXPORT KeyValuePartitioningOptions {
|
|
/// After splitting a path into components, decode the path components
|
|
/// before parsing according to this scheme.
|
|
SegmentEncoding segment_encoding = SegmentEncoding::Uri;
|
|
};
|
|
|
|
/// \brief Options for inferring a partitioning.
|
|
struct ARROW_DS_EXPORT PartitioningFactoryOptions {
|
|
/// When inferring a schema for partition fields, yield dictionary encoded types
|
|
/// instead of plain. This can be more efficient when materializing virtual
|
|
/// columns, and Expressions parsed by the finished Partitioning will include
|
|
/// dictionaries of all unique inspected values for each field.
|
|
bool infer_dictionary = false;
|
|
/// Optionally, an expected schema can be provided, in which case inference
|
|
/// will only check discovered fields against the schema and update internal
|
|
/// state (such as dictionaries).
|
|
std::shared_ptr<Schema> schema;
|
|
/// After splitting a path into components, decode the path components
|
|
/// before parsing according to this scheme.
|
|
SegmentEncoding segment_encoding = SegmentEncoding::Uri;
|
|
|
|
KeyValuePartitioningOptions AsPartitioningOptions() const;
|
|
};
|
|
|
|
/// \brief Options for inferring a hive-style partitioning.
|
|
struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions {
|
|
/// The hive partitioning scheme maps null to a hard coded fallback string.
|
|
std::string null_fallback;
|
|
|
|
HivePartitioningOptions AsHivePartitioningOptions() const;
|
|
};
|
|
|
|
/// \brief PartitioningFactory provides creation of a partitioning when the
|
|
/// specific schema must be inferred from available paths (no explicit schema is known).
|
|
class ARROW_DS_EXPORT PartitioningFactory {
|
|
public:
|
|
virtual ~PartitioningFactory() = default;
|
|
|
|
/// \brief The name identifying the kind of partitioning
|
|
virtual std::string type_name() const = 0;
|
|
|
|
/// Get the schema for the resulting Partitioning.
|
|
/// This may reset internal state, for example dictionaries of unique representations.
|
|
virtual Result<std::shared_ptr<Schema>> Inspect(
|
|
const std::vector<std::string>& paths) = 0;
|
|
|
|
/// Create a partitioning using the provided schema
|
|
/// (fields may be dropped).
|
|
virtual Result<std::shared_ptr<Partitioning>> Finish(
|
|
const std::shared_ptr<Schema>& schema) const = 0;
|
|
};
|
|
|
|
/// \brief Subclass for the common case of a partitioning which yields an equality
|
|
/// expression for each segment
|
|
class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
|
|
public:
|
|
/// An unconverted equality expression consisting of a field name and the representation
|
|
/// of a scalar value
|
|
struct Key {
|
|
std::string name;
|
|
util::optional<std::string> value;
|
|
};
|
|
|
|
Result<PartitionedBatches> Partition(
|
|
const std::shared_ptr<RecordBatch>& batch) const override;
|
|
|
|
Result<compute::Expression> Parse(const std::string& path) const override;
|
|
|
|
Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;
|
|
|
|
const ArrayVector& dictionaries() const { return dictionaries_; }
|
|
|
|
protected:
|
|
KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
|
|
KeyValuePartitioningOptions options)
|
|
: Partitioning(std::move(schema)),
|
|
dictionaries_(std::move(dictionaries)),
|
|
options_(options) {
|
|
if (dictionaries_.empty()) {
|
|
dictionaries_.resize(schema_->num_fields());
|
|
}
|
|
}
|
|
|
|
virtual Result<std::vector<Key>> ParseKeys(const std::string& path) const = 0;
|
|
|
|
virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;
|
|
|
|
/// Convert a Key to a full expression.
|
|
Result<compute::Expression> ConvertKey(const Key& key) const;
|
|
|
|
Result<std::vector<std::string>> FormatPartitionSegments(
|
|
const ScalarVector& values) const;
|
|
Result<std::vector<Key>> ParsePartitionSegments(
|
|
const std::vector<std::string>& segments) const;
|
|
|
|
ArrayVector dictionaries_;
|
|
KeyValuePartitioningOptions options_;
|
|
};
|
|
|
|
/// \brief DirectoryPartitioning parses one segment of a path for each field in its
|
|
/// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse
|
|
/// must contain segments for each field.
|
|
///
|
|
/// For example given schema<year:int16, month:int8> the path "/2009/11" would be
|
|
/// parsed to ("year"_ == 2009 and "month"_ == 11)
|
|
class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
|
|
public:
|
|
/// If a field in schema is of dictionary type, the corresponding element of
|
|
/// dictionaries must be contain the dictionary of values for that field.
|
|
explicit DirectoryPartitioning(std::shared_ptr<Schema> schema,
|
|
ArrayVector dictionaries = {},
|
|
KeyValuePartitioningOptions options = {});
|
|
|
|
std::string type_name() const override { return "directory"; }
|
|
|
|
/// \brief Create a factory for a directory partitioning.
|
|
///
|
|
/// \param[in] field_names The names for the partition fields. Types will be
|
|
/// inferred.
|
|
static std::shared_ptr<PartitioningFactory> MakeFactory(
|
|
std::vector<std::string> field_names, PartitioningFactoryOptions = {});
|
|
|
|
private:
|
|
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
|
|
|
|
Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
|
|
};
|
|
|
|
/// \brief The default fallback used for null values in a Hive-style partitioning.
|
|
static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__";
|
|
|
|
struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions {
|
|
std::string null_fallback = kDefaultHiveNullFallback;
|
|
|
|
static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) {
|
|
HivePartitioningOptions options;
|
|
options.null_fallback = std::move(fallback);
|
|
return options;
|
|
}
|
|
};
|
|
|
|
/// \brief Multi-level, directory based partitioning
|
|
/// originating from Apache Hive with all data files stored in the
|
|
/// leaf directories. Data is partitioned by static values of a
|
|
/// particular column in the schema. Partition keys are represented in
|
|
/// the form $key=$value in directory names.
|
|
/// Field order is ignored, as are missing or unrecognized field names.
|
|
///
|
|
/// For example given schema<year:int16, month:int8, day:int8> the path
|
|
/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321)
|
|
class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
|
|
public:
|
|
/// If a field in schema is of dictionary type, the corresponding element of
|
|
/// dictionaries must be contain the dictionary of values for that field.
|
|
explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries = {},
|
|
std::string null_fallback = kDefaultHiveNullFallback)
|
|
: KeyValuePartitioning(std::move(schema), std::move(dictionaries),
|
|
KeyValuePartitioningOptions()),
|
|
hive_options_(
|
|
HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) {
|
|
}
|
|
|
|
explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
|
|
HivePartitioningOptions options)
|
|
: KeyValuePartitioning(std::move(schema), std::move(dictionaries), options),
|
|
hive_options_(options) {}
|
|
|
|
std::string type_name() const override { return "hive"; }
|
|
std::string null_fallback() const { return hive_options_.null_fallback; }
|
|
const HivePartitioningOptions& options() const { return hive_options_; }
|
|
|
|
static Result<util::optional<Key>> ParseKey(const std::string& segment,
|
|
const HivePartitioningOptions& options);
|
|
|
|
/// \brief Create a factory for a hive partitioning.
|
|
static std::shared_ptr<PartitioningFactory> MakeFactory(
|
|
HivePartitioningFactoryOptions = {});
|
|
|
|
private:
|
|
const HivePartitioningOptions hive_options_;
|
|
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
|
|
|
|
Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
|
|
};
|
|
|
|
/// \brief Implementation provided by lambda or other callable
|
|
class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
|
|
public:
|
|
using ParseImpl = std::function<Result<compute::Expression>(const std::string&)>;
|
|
|
|
using FormatImpl =
|
|
std::function<Result<PartitionPathFormat>(const compute::Expression&)>;
|
|
|
|
FunctionPartitioning(std::shared_ptr<Schema> schema, ParseImpl parse_impl,
|
|
FormatImpl format_impl = NULLPTR, std::string name = "function")
|
|
: Partitioning(std::move(schema)),
|
|
parse_impl_(std::move(parse_impl)),
|
|
format_impl_(std::move(format_impl)),
|
|
name_(std::move(name)) {}
|
|
|
|
std::string type_name() const override { return name_; }
|
|
|
|
Result<compute::Expression> Parse(const std::string& path) const override {
|
|
return parse_impl_(path);
|
|
}
|
|
|
|
Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
|
|
if (format_impl_) {
|
|
return format_impl_(expr);
|
|
}
|
|
return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning");
|
|
}
|
|
|
|
Result<PartitionedBatches> Partition(
|
|
const std::shared_ptr<RecordBatch>& batch) const override {
|
|
return Status::NotImplemented("partitioning batches from ", type_name(),
|
|
" Partitioning");
|
|
}
|
|
|
|
private:
|
|
ParseImpl parse_impl_;
|
|
FormatImpl format_impl_;
|
|
std::string name_;
|
|
};
|
|
|
|
/// \brief Key-value partitioning identified by the type name "filename".
/// NOTE(review): presumably the partition values are encoded in the file name,
/// separated by kFilenamePartitionSep -- confirm against the implementation.
class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
 public:
  /// \brief Construct a FilenamePartitioning from its components.
  ///
  /// If a field in schema is of dictionary type, the corresponding element of
  /// dictionaries must contain the dictionary of values for that field.
  explicit FilenamePartitioning(std::shared_ptr<Schema> schema,
                                ArrayVector dictionaries = {},
                                KeyValuePartitioningOptions options = {});

  /// \brief The name identifying this kind of partitioning: "filename".
  std::string type_name() const override { return "filename"; }

  /// \brief Create a factory for a filename partitioning.
  ///
  /// \param[in] field_names The names for the partition fields. Types will be
  /// inferred.
  static std::shared_ptr<PartitioningFactory> MakeFactory(
      std::vector<std::string> field_names, PartitioningFactoryOptions = {});

 private:
  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;

  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
};
|
|
|
|
/// \brief Remove a prefix and the filename of a path.
|
|
///
|
|
/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"`
|
|
ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
|
|
const std::string& prefix);
|
|
|
|
/// \brief Vector version of StripPrefixAndFilename.
|
|
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
|
|
const std::vector<std::string>& paths, const std::string& prefix);
|
|
|
|
/// \brief Vector version of StripPrefixAndFilename.
|
|
ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
|
|
const std::vector<fs::FileInfo>& files, const std::string& prefix);
|
|
|
|
/// \brief Either a Partitioning or a PartitioningFactory
|
|
class ARROW_DS_EXPORT PartitioningOrFactory {
|
|
public:
|
|
explicit PartitioningOrFactory(std::shared_ptr<Partitioning> partitioning)
|
|
: partitioning_(std::move(partitioning)) {}
|
|
|
|
explicit PartitioningOrFactory(std::shared_ptr<PartitioningFactory> factory)
|
|
: factory_(std::move(factory)) {}
|
|
|
|
PartitioningOrFactory& operator=(std::shared_ptr<Partitioning> partitioning) {
|
|
return *this = PartitioningOrFactory(std::move(partitioning));
|
|
}
|
|
|
|
PartitioningOrFactory& operator=(std::shared_ptr<PartitioningFactory> factory) {
|
|
return *this = PartitioningOrFactory(std::move(factory));
|
|
}
|
|
|
|
/// \brief The partitioning (if given).
|
|
const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
|
|
|
|
/// \brief The partition factory (if given).
|
|
const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }
|
|
|
|
/// \brief Get the partition schema, inferring it with the given factory if needed.
|
|
Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);
|
|
|
|
private:
|
|
std::shared_ptr<PartitioningFactory> factory_;
|
|
std::shared_ptr<Partitioning> partitioning_;
|
|
};
|
|
|
|
/// @}
|
|
|
|
} // namespace dataset
|
|
} // namespace arrow
|