// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include #include #include #include #include "arrow/array/array_base.h" #include "arrow/array/array_binary.h" #include "arrow/array/builder_adaptive.h" // IWYU pragma: export #include "arrow/array/builder_base.h" // IWYU pragma: export #include "arrow/array/builder_primitive.h" // IWYU pragma: export #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { // ---------------------------------------------------------------------- // Dictionary builder namespace internal { template struct DictionaryValue { using type = typename T::c_type; using PhysicalType = T; }; template struct DictionaryValue> { using type = util::string_view; using PhysicalType = typename std::conditional::value, BinaryType, LargeBinaryType>::type; }; template struct DictionaryValue> { using type = util::string_view; using PhysicalType = BinaryType; }; class ARROW_EXPORT DictionaryMemoTable { public: DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr& type); DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr& dictionary); ~DictionaryMemoTable(); Status GetArrayData(int64_t start_offset, std::shared_ptr* out); /// \brief Insert new memo values Status InsertValues(const Array& values); int32_t size() const; template Status GetOrInsert(typename DictionaryValue::type value, int32_t* out) { // We want to keep the DictionaryMemoTable implementation private, also we can't // use extern template classes because of compiler issues (MinGW?). Instead, // we expose explicit function overrides for each supported physical type. const typename DictionaryValue::PhysicalType* physical_type = NULLPTR; return GetOrInsert(physical_type, value, out); } private: Status GetOrInsert(const BooleanType*, bool value, int32_t* out); Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out); Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out); Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out); Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out); Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out); Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out); Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out); Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out); Status GetOrInsert(const DurationType*, int64_t value, int32_t* out); Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out); Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out); Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out); Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out); Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out); Status GetOrInsert(const MonthDayNanoIntervalType*, MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out); Status GetOrInsert(const DayTimeIntervalType*, DayTimeIntervalType::DayMilliseconds value, int32_t* out); Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out); Status GetOrInsert(const FloatType*, float value, int32_t* out); Status GetOrInsert(const DoubleType*, double value, int32_t* out); Status GetOrInsert(const BinaryType*, util::string_view value, int32_t* out); Status GetOrInsert(const LargeBinaryType*, util::string_view value, int32_t* out); class DictionaryMemoTableImpl; std::unique_ptr impl_; }; } // namespace internal /// \addtogroup dictionary-builders /// /// @{ namespace internal { /// \brief Array builder for created encoded DictionaryArray from /// dense array /// /// Unlike other builders, dictionary builder does not completely /// reset the state on Finish calls. template class DictionaryBuilderBase : public ArrayBuilder { public: using TypeClass = DictionaryType; using Value = typename DictionaryValue::type; // WARNING: the type given below is the value type, not the DictionaryType. // The DictionaryType is instantiated on the Finish() call. template DictionaryBuilderBase(uint8_t start_int_size, enable_if_t::value && !is_fixed_size_binary_type::value, const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), indices_builder_(start_int_size, pool), value_type_(value_type) {} template explicit DictionaryBuilderBase( enable_if_t::value, const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), indices_builder_(pool), value_type_(value_type) {} template explicit DictionaryBuilderBase( const std::shared_ptr& index_type, enable_if_t::value, const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), indices_builder_(index_type, pool), value_type_(value_type) {} template DictionaryBuilderBase(uint8_t start_int_size, enable_if_t::value && is_fixed_size_binary_type::value, const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(start_int_size, pool), value_type_(value_type) {} template explicit DictionaryBuilderBase( enable_if_fixed_size_binary&> value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(pool), value_type_(value_type) {} template explicit DictionaryBuilderBase( const std::shared_ptr& index_type, enable_if_fixed_size_binary&> value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(index_type, pool), value_type_(value_type) {} template explicit DictionaryBuilderBase( enable_if_parameter_free pool = default_memory_pool()) : DictionaryBuilderBase(TypeTraits::type_singleton(), pool) {} // This constructor doesn't check for errors. Use InsertMemoValues instead. explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), memo_table_(new internal::DictionaryMemoTable(pool, dictionary)), delta_offset_(0), byte_width_(-1), indices_builder_(pool), value_type_(dictionary->type()) {} ~DictionaryBuilderBase() override = default; /// \brief The current number of entries in the dictionary int64_t dictionary_length() const { return memo_table_->size(); } /// \brief The value byte width (for FixedSizeBinaryType) template enable_if_fixed_size_binary byte_width() const { return byte_width_; } /// \brief Append a scalar value Status Append(Value value) { ARROW_RETURN_NOT_OK(Reserve(1)); int32_t memo_index; ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert(value, &memo_index)); ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index)); length_ += 1; return Status::OK(); } /// \brief Append a fixed-width string (only for FixedSizeBinaryType) template enable_if_fixed_size_binary Append(const uint8_t* value) { return Append(util::string_view(reinterpret_cast(value), byte_width_)); } /// \brief Append a fixed-width string (only for FixedSizeBinaryType) template enable_if_fixed_size_binary Append(const char* value) { return Append(util::string_view(value, byte_width_)); } /// \brief Append a string (only for binary types) template enable_if_binary_like Append(const uint8_t* value, int32_t length) { return Append(reinterpret_cast(value), length); } /// \brief Append a string (only for binary types) template enable_if_binary_like Append(const char* value, int32_t length) { return Append(util::string_view(value, length)); } /// \brief Append a string (only for string types) template enable_if_string_like Append(const char* value, int32_t length) { return Append(util::string_view(value, length)); } /// \brief Append a decimal (only for Decimal128Type) template enable_if_decimal128 Append(const Decimal128& value) { uint8_t data[16]; value.ToBytes(data); return Append(data, 16); } /// \brief Append a decimal (only for Decimal128Type) template enable_if_decimal256 Append(const Decimal256& value) { uint8_t data[32]; value.ToBytes(data); return Append(data, 32); } /// \brief Append a scalar null value Status AppendNull() final { length_ += 1; null_count_ += 1; return indices_builder_.AppendNull(); } Status AppendNulls(int64_t length) final { length_ += length; null_count_ += length; return indices_builder_.AppendNulls(length); } Status AppendEmptyValue() final { length_ += 1; return indices_builder_.AppendEmptyValue(); } Status AppendEmptyValues(int64_t length) final { length_ += length; return indices_builder_.AppendEmptyValues(length); } Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override { if (!scalar.is_valid) return AppendNulls(n_repeats); const auto& dict_ty = internal::checked_cast(*scalar.type); const DictionaryScalar& dict_scalar = internal::checked_cast(scalar); const auto& dict = internal::checked_cast::ArrayType&>( *dict_scalar.value.dictionary); ARROW_RETURN_NOT_OK(Reserve(n_repeats)); switch (dict_ty.index_type()->id()) { case Type::UINT8: return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); case Type::INT8: return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); case Type::UINT16: return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); case Type::INT16: return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); case Type::UINT32: return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); case Type::INT32: return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); case Type::UINT64: return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); case Type::INT64: return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); default: return Status::TypeError("Invalid index type: ", dict_ty); } return Status::OK(); } Status AppendScalars(const ScalarVector& scalars) override { for (const auto& scalar : scalars) { ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1)); } return Status::OK(); } Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { // Visit the indices and insert the unpacked values. const auto& dict_ty = internal::checked_cast(*array.type); const typename TypeTraits::ArrayType dict(array.dictionary); ARROW_RETURN_NOT_OK(Reserve(length)); switch (dict_ty.index_type()->id()) { case Type::UINT8: return AppendArraySliceImpl(dict, array, offset, length); case Type::INT8: return AppendArraySliceImpl(dict, array, offset, length); case Type::UINT16: return AppendArraySliceImpl(dict, array, offset, length); case Type::INT16: return AppendArraySliceImpl(dict, array, offset, length); case Type::UINT32: return AppendArraySliceImpl(dict, array, offset, length); case Type::INT32: return AppendArraySliceImpl(dict, array, offset, length); case Type::UINT64: return AppendArraySliceImpl(dict, array, offset, length); case Type::INT64: return AppendArraySliceImpl(dict, array, offset, length); default: return Status::TypeError("Invalid index type: ", dict_ty); } return Status::OK(); } /// \brief Insert values into the dictionary's memo, but do not append any /// indices. Can be used to initialize a new builder with known dictionary /// values /// \param[in] values dictionary values to add to memo. Type must match /// builder type Status InsertMemoValues(const Array& values) { return memo_table_->InsertValues(values); } /// \brief Append a whole dense array to the builder template enable_if_t::value, Status> AppendArray( const Array& array) { using ArrayType = typename TypeTraits::ArrayType; #ifndef NDEBUG ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( value_type_, array, "Wrong value type of array to be appended")); #endif const auto& concrete_array = static_cast(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { ARROW_RETURN_NOT_OK(AppendNull()); } else { ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i))); } } return Status::OK(); } template enable_if_fixed_size_binary AppendArray(const Array& array) { #ifndef NDEBUG ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( value_type_, array, "Wrong value type of array to be appended")); #endif const auto& concrete_array = static_cast(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { ARROW_RETURN_NOT_OK(AppendNull()); } else { ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i))); } } return Status::OK(); } void Reset() override { // Perform a partial reset. Call ResetFull to also reset the accumulated // dictionary values ArrayBuilder::Reset(); indices_builder_.Reset(); } /// \brief Reset and also clear accumulated dictionary values in memo table void ResetFull() { Reset(); memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_)); } Status Resize(int64_t capacity) override { ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); capacity = std::max(capacity, kMinBuilderCapacity); ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); capacity_ = indices_builder_.capacity(); return Status::OK(); } /// \brief Return dictionary indices and a delta dictionary since the last /// time that Finish or FinishDelta were called, and reset state of builder /// (except the memo table) Status FinishDelta(std::shared_ptr* out_indices, std::shared_ptr* out_delta) { std::shared_ptr indices_data; std::shared_ptr delta_data; ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data)); *out_indices = MakeArray(indices_data); *out_delta = MakeArray(delta_data); return Status::OK(); } /// \cond FALSE using ArrayBuilder::Finish; /// \endcond Status Finish(std::shared_ptr* out) { return FinishTyped(out); } std::shared_ptr type() const override { return ::arrow::dictionary(indices_builder_.type(), value_type_); } protected: template Status AppendArraySliceImpl(const typename TypeTraits::ArrayType& dict, const ArrayData& array, int64_t offset, int64_t length) { const c_type* values = array.GetValues(1) + offset; return VisitBitBlocks( array.buffers[0], array.offset + offset, length, [&](const int64_t position) { const int64_t index = static_cast(values[position]); if (dict.IsValid(index)) { return Append(dict.GetView(index)); } return AppendNull(); }, [&]() { return AppendNull(); }); } template Status AppendScalarImpl(const typename TypeTraits::ArrayType& dict, const Scalar& index_scalar, int64_t n_repeats) { using ScalarType = typename TypeTraits::ScalarType; const auto index = internal::checked_cast(index_scalar).value; if (index_scalar.is_valid && dict.IsValid(index)) { const auto& value = dict.GetView(index); for (int64_t i = 0; i < n_repeats; i++) { ARROW_RETURN_NOT_OK(Append(value)); } return Status::OK(); } return AppendNulls(n_repeats); } Status FinishInternal(std::shared_ptr* out) override { std::shared_ptr dictionary; ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary)); // Set type of array data to the right dictionary type (*out)->type = type(); (*out)->dictionary = dictionary; return Status::OK(); } Status FinishWithDictOffset(int64_t dict_offset, std::shared_ptr* out_indices, std::shared_ptr* out_dictionary) { // Finalize indices array ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices)); // Generate dictionary array from hash table contents ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary)); delta_offset_ = memo_table_->size(); // Update internals for further uses of this DictionaryBuilder ArrayBuilder::Reset(); return Status::OK(); } std::unique_ptr memo_table_; // The size of the dictionary memo at last invocation of Finish, to use in // FinishDelta for computing dictionary deltas int32_t delta_offset_; // Only used for FixedSizeBinaryType int32_t byte_width_; BuilderType indices_builder_; std::shared_ptr value_type_; }; template class DictionaryBuilderBase : public ArrayBuilder { public: template DictionaryBuilderBase( enable_if_t::value, uint8_t> start_int_size, const std::shared_ptr& value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} explicit DictionaryBuilderBase(const std::shared_ptr& value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(pool) {} explicit DictionaryBuilderBase(const std::shared_ptr& index_type, const std::shared_ptr& value_type, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(index_type, pool) {} template explicit DictionaryBuilderBase( enable_if_t::value, uint8_t> start_int_size, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(pool) {} explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), indices_builder_(pool) {} /// \brief Append a scalar null value Status AppendNull() final { length_ += 1; null_count_ += 1; return indices_builder_.AppendNull(); } Status AppendNulls(int64_t length) final { length_ += length; null_count_ += length; return indices_builder_.AppendNulls(length); } Status AppendEmptyValue() final { length_ += 1; return indices_builder_.AppendEmptyValue(); } Status AppendEmptyValues(int64_t length) final { length_ += length; return indices_builder_.AppendEmptyValues(length); } /// \brief Append a whole dense array to the builder Status AppendArray(const Array& array) { #ifndef NDEBUG ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( Type::NA, array, "Wrong value type of array to be appended")); #endif for (int64_t i = 0; i < array.length(); i++) { ARROW_RETURN_NOT_OK(AppendNull()); } return Status::OK(); } Status Resize(int64_t capacity) override { ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); capacity = std::max(capacity, kMinBuilderCapacity); ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); capacity_ = indices_builder_.capacity(); return Status::OK(); } Status FinishInternal(std::shared_ptr* out) override { ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out)); (*out)->type = dictionary((*out)->type, null()); (*out)->dictionary = NullArray(0).data(); return Status::OK(); } /// \cond FALSE using ArrayBuilder::Finish; /// \endcond Status Finish(std::shared_ptr* out) { return FinishTyped(out); } std::shared_ptr type() const override { return ::arrow::dictionary(indices_builder_.type(), null()); } protected: BuilderType indices_builder_; }; } // namespace internal /// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the /// smallest index size that can accommodate the dictionary indices template class DictionaryBuilder : public internal::DictionaryBuilderBase { public: using BASE = internal::DictionaryBuilderBase; using BASE::BASE; /// \brief Append dictionary indices directly without modifying memo /// /// NOTE: Experimental API Status AppendIndices(const int64_t* values, int64_t length, const uint8_t* valid_bytes = NULLPTR) { int64_t null_count_before = this->indices_builder_.null_count(); ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); this->capacity_ = this->indices_builder_.capacity(); this->length_ += length; this->null_count_ += this->indices_builder_.null_count() - null_count_before; return Status::OK(); } }; /// \brief A DictionaryArray builder that always returns int32 dictionary /// indices so that data cast to dictionary form will have a consistent index /// type, e.g. for creating a ChunkedArray template class Dictionary32Builder : public internal::DictionaryBuilderBase { public: using BASE = internal::DictionaryBuilderBase; using BASE::BASE; /// \brief Append dictionary indices directly without modifying memo /// /// NOTE: Experimental API Status AppendIndices(const int32_t* values, int64_t length, const uint8_t* valid_bytes = NULLPTR) { int64_t null_count_before = this->indices_builder_.null_count(); ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); this->capacity_ = this->indices_builder_.capacity(); this->length_ += length; this->null_count_ += this->indices_builder_.null_count() - null_count_before; return Status::OK(); } }; // ---------------------------------------------------------------------- // Binary / Unicode builders // (compatibility aliases; those used to be derived classes with additional // Append() overloads, but they have been folded into DictionaryBuilderBase) using BinaryDictionaryBuilder = DictionaryBuilder; using StringDictionaryBuilder = DictionaryBuilder; using BinaryDictionary32Builder = Dictionary32Builder; using StringDictionary32Builder = Dictionary32Builder; /// @} } // namespace arrow