// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // NOTE: API is EXPERIMENTAL and will change without going through a // deprecation cycle. #pragma once #include #include #include #include "arrow/compute/kernel.h" #include "arrow/compute/type_fwd.h" #include "arrow/datum.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/compare.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { namespace compute { /// \defgroup compute-functions Abstract compute function API /// /// @{ /// \brief Extension point for defining options outside libarrow (but /// still within this project). class ARROW_EXPORT FunctionOptionsType { public: virtual ~FunctionOptionsType() = default; virtual const char* type_name() const = 0; virtual std::string Stringify(const FunctionOptions&) const = 0; virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; virtual Result> Serialize(const FunctionOptions&) const; virtual Result> Deserialize( const Buffer& buffer) const; virtual std::unique_ptr Copy(const FunctionOptions&) const = 0; }; /// \brief Base class for specifying options configuring a function's behavior, /// such as error handling. class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { public: virtual ~FunctionOptions() = default; const FunctionOptionsType* options_type() const { return options_type_; } const char* type_name() const { return options_type()->type_name(); } bool Equals(const FunctionOptions& other) const; using util::EqualityComparable::Equals; using util::EqualityComparable::operator==; using util::EqualityComparable::operator!=; std::string ToString() const; std::unique_ptr Copy() const; /// \brief Serialize an options struct to a buffer. Result> Serialize() const; /// \brief Deserialize an options struct from a buffer. /// Note: this will only look for `type_name` in the default FunctionRegistry; /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then /// call FunctionOptionsType::Deserialize(). static Result> Deserialize( const std::string& type_name, const Buffer& buffer); protected: explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} const FunctionOptionsType* options_type_; }; ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); /// \brief Contains the number of required arguments for the function. /// /// Naming conventions taken from https://en.wikipedia.org/wiki/Arity. struct ARROW_EXPORT Arity { /// \brief A function taking no arguments static Arity Nullary() { return Arity(0, false); } /// \brief A function taking 1 argument static Arity Unary() { return Arity(1, false); } /// \brief A function taking 2 arguments static Arity Binary() { return Arity(2, false); } /// \brief A function taking 3 arguments static Arity Ternary() { return Arity(3, false); } /// \brief A function taking a variable number of arguments /// /// \param[in] min_args the minimum number of arguments required when /// invoking the function static Arity VarArgs(int min_args = 0) { return Arity(min_args, true); } // NOTE: the 0-argument form (default constructor) is required for Cython explicit Arity(int num_args = 0, bool is_varargs = false) : num_args(num_args), is_varargs(is_varargs) {} /// The number of required arguments (or the minimum number for varargs /// functions). int num_args; /// If true, then the num_args is the minimum number of required arguments. bool is_varargs = false; }; struct ARROW_EXPORT FunctionDoc { /// \brief A one-line summary of the function, using a verb. /// /// For example, "Add two numeric arrays or scalars". std::string summary; /// \brief A detailed description of the function, meant to follow the summary. std::string description; /// \brief Symbolic names (identifiers) for the function arguments. /// /// Some bindings may use this to generate nicer function signatures. std::vector arg_names; // TODO add argument descriptions? /// \brief Name of the options class, if any. std::string options_class; /// \brief Whether options are required for function execution /// /// If false, then either the function does not have an options class /// or there is a usable default options value. bool options_required; FunctionDoc() = default; FunctionDoc(std::string summary, std::string description, std::vector arg_names, std::string options_class = "", bool options_required = false) : summary(std::move(summary)), description(std::move(description)), arg_names(std::move(arg_names)), options_class(std::move(options_class)), options_required(options_required) {} static const FunctionDoc& Empty(); }; /// \brief Base class for compute functions. Function implementations contain a /// collection of "kernels" which are implementations of the function for /// specific argument types. Selecting a viable kernel for executing a function /// is referred to as "dispatching". class ARROW_EXPORT Function { public: /// \brief The kind of function, which indicates in what contexts it is /// valid for use. enum Kind { /// A function that performs scalar data operations on whole arrays of /// data. Can generally process Array or Scalar values. The size of the /// output will be the same as the size (or broadcasted size, in the case /// of mixing Array and Scalar inputs) of the input. SCALAR, /// A function with array input and output whose behavior depends on the /// values of the entire arrays passed, rather than the value of each scalar /// value. VECTOR, /// A function that computes scalar summary statistics from array input. SCALAR_AGGREGATE, /// A function that computes grouped summary statistics from array input /// and an array of group identifiers. HASH_AGGREGATE, /// A function that dispatches to other functions and does not contain its /// own kernels. META }; virtual ~Function() = default; /// \brief The name of the kernel. The registry enforces uniqueness of names. const std::string& name() const { return name_; } /// \brief The kind of kernel, which indicates in what contexts it is valid /// for use. Function::Kind kind() const { return kind_; } /// \brief Contains the number of arguments the function requires, or if the /// function accepts variable numbers of arguments. const Arity& arity() const { return arity_; } /// \brief Return the function documentation const FunctionDoc& doc() const { return *doc_; } /// \brief Returns the number of registered kernels for this function. virtual int num_kernels() const = 0; /// \brief Return a kernel that can execute the function given the exact /// argument types (without implicit type casts or scalar->array promotions). /// /// NB: This function is overridden in CastFunction. virtual Result DispatchExact( const std::vector& values) const; /// \brief Return a best-match kernel that can execute the function given the argument /// types, after implicit casts are applied. /// /// \param[in,out] values Argument types. An element may be modified to indicate that /// the returned kernel only approximately matches the input value descriptors; callers /// are responsible for casting inputs to the type and shape required by the kernel. virtual Result DispatchBest(std::vector* values) const; /// \brief Execute the function eagerly with the passed input arguments with /// kernel dispatch, batch iteration, and memory allocation details taken /// care of. /// /// If the `options` pointer is null, then `default_options()` will be used. /// /// This function can be overridden in subclasses. virtual Result Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const; /// \brief Returns the default options for this function. /// /// Whatever option semantics a Function has, implementations must guarantee /// that default_options() is valid to pass to Execute as options. const FunctionOptions* default_options() const { return default_options_; } virtual Status Validate() const; protected: Function(std::string name, Function::Kind kind, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options) : name_(std::move(name)), kind_(kind), arity_(arity), doc_(doc ? doc : &FunctionDoc::Empty()), default_options_(default_options) {} Status CheckArity(const std::vector&) const; Status CheckArity(const std::vector&) const; std::string name_; Function::Kind kind_; Arity arity_; const FunctionDoc* doc_; const FunctionOptions* default_options_ = NULLPTR; }; namespace detail { template class FunctionImpl : public Function { public: /// \brief Return pointers to current-available kernels for inspection std::vector kernels() const { std::vector result; for (const auto& kernel : kernels_) { result.push_back(&kernel); } return result; } int num_kernels() const override { return static_cast(kernels_.size()); } protected: FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options) : Function(std::move(name), kind, arity, doc, default_options) {} std::vector kernels_; }; /// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned. ARROW_EXPORT const Kernel* DispatchExactImpl(const Function* func, const std::vector&); /// \brief Return an error message if no Kernel is found. ARROW_EXPORT Status NoMatchingKernel(const Function* func, const std::vector&); } // namespace detail /// \brief A function that executes elementwise operations on arrays or /// scalars, and therefore whose results generally do not depend on the order /// of the values in the arguments. Accepts and returns arrays that are all of /// the same size. These functions roughly correspond to the functions used in /// SQL expressions. class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { public: using KernelType = ScalarKernel; ScalarFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl(std::move(name), Function::SCALAR, arity, doc, default_options) {} /// \brief Add a kernel with given input/output types, no required state /// initialization, preallocation for fixed-width types, and default null /// handling (intersect validity bitmaps of inputs). Status AddKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. Status AddKernel(ScalarKernel kernel); }; /// \brief A function that executes general array operations that may yield /// outputs of different sizes or have results that depend on the whole array /// contents. These functions roughly correspond to the functions found in /// non-SQL array languages like APL and its derivatives. class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { public: using KernelType = VectorKernel; VectorFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl(std::move(name), Function::VECTOR, arity, doc, default_options) {} /// \brief Add a simple kernel with given input/output types, no required /// state initialization, no data preallocation, and no preallocation of the /// validity bitmap. Status AddKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, KernelInit init = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. Status AddKernel(VectorKernel kernel); }; class ARROW_EXPORT ScalarAggregateFunction : public detail::FunctionImpl { public: using KernelType = ScalarAggregateKernel; ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl( std::move(name), Function::SCALAR_AGGREGATE, arity, doc, default_options) {} /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. Status AddKernel(ScalarAggregateKernel kernel); }; class ARROW_EXPORT HashAggregateFunction : public detail::FunctionImpl { public: using KernelType = HashAggregateKernel; HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl( std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {} /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. Status AddKernel(HashAggregateKernel kernel); }; /// \brief A function that dispatches to other functions. Must implement /// MetaFunction::ExecuteImpl. /// /// For Array, ChunkedArray, and Scalar Datum kinds, may rely on the execution /// of concrete Function types, but must handle other Datum kinds on its own. class ARROW_EXPORT MetaFunction : public Function { public: int num_kernels() const override { return 0; } Result Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override; protected: virtual Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const = 0; MetaFunction(std::string name, const Arity& arity, const FunctionDoc* doc, const FunctionOptions* default_options = NULLPTR) : Function(std::move(name), Function::META, arity, doc, default_options) {} }; /// @} } // namespace compute } // namespace arrow