first commit

This commit is contained in:
Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

View File

@ -0,0 +1,264 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <iosfwd>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
#include "arrow/visitor.h"
namespace arrow {
// ----------------------------------------------------------------------
// User array accessor types
/// \brief Base class of every array accessor type
///
/// An Array is an immutable sequence of values that has a logical type and a
/// length. All memory is owned by the Buffer instances (or their parents)
/// referenced from the underlying ArrayData.
///
/// A validity (null) bitmap buffer is only mandatory when the null count is
/// greater than 0.
///
/// When the null count is known it can be handed to the constructor of the
/// concrete array; passing -1 instead postpones the computation until
/// null_count() is called for the first time.
class ARROW_EXPORT Array {
 public:
  virtual ~Array() = default;

  /// \brief Tell whether the slot at index i is null
  ///
  /// No bounds checking is performed.
  bool IsNull(int64_t i) const {
    if (null_bitmap_data_ == NULLPTR) {
      // No bitmap pointer cached: the answer is the same for every slot and
      // is "null" only when null_count equals the array length.
      return data_->null_count == data_->length;
    }
    return !bit_util::GetBit(null_bitmap_data_, i + data_->offset);
  }

  /// \brief Tell whether the slot at index i is valid (i.e. not null)
  ///
  /// No bounds checking is performed.
  bool IsValid(int64_t i) const {
    if (null_bitmap_data_ == NULLPTR) {
      // No bitmap pointer cached: every slot is valid unless null_count
      // equals the array length.
      return data_->null_count != data_->length;
    }
    return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
  }

  /// \brief Return a Scalar holding the value stored at index i
  Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;

  /// Number of elements held by this array.
  int64_t length() const { return data_->length; }

  /// Relative position into another array's data; this is what makes
  /// zero-copy slicing possible. Defaults to zero.
  int64_t offset() const { return data_->offset; }

  /// Number of null entries. When the count was unknown at construction time
  /// (a negative value was supplied), it is computed and cached by the first
  /// call to this function.
  int64_t null_count() const;

  /// Logical type of the array (the `type` member of the ArrayData).
  std::shared_ptr<DataType> type() const { return data_->type; }

  /// Type id reported by the array's logical type.
  Type::type type_id() const { return data_->type->id(); }

  /// Validity (null) bitmap buffer, if there is one. Union types never carry
  /// a null bitmap.
  ///
  /// The result is null for `null_count == 0` and for the null type.
  /// The buffer does not account for any slice offset.
  const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }

  /// Raw pointer to the validity bitmap.
  ///
  /// The result is null for `null_count == 0` and for the null type.
  /// The pointer does not account for any slice offset.
  const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }

  /// Equality comparison against another array
  bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
  bool Equals(const std::shared_ptr<Array>& arr,
              const EqualOptions& = EqualOptions::Defaults()) const;

  /// \brief Return the formatted unified diff of arrow::Diff between this
  /// Array and another Array
  std::string Diff(const Array& other) const;

  /// Approximate equality comparison against another array
  ///
  /// epsilon is only used if this is FloatArray or DoubleArray
  bool ApproxEquals(const std::shared_ptr<Array>& arr,
                    const EqualOptions& = EqualOptions::Defaults()) const;
  bool ApproxEquals(const Array& arr,
                    const EqualOptions& = EqualOptions::Defaults()) const;

  /// Compare a range of slots of this array with a range of slots of the
  /// given array. end_idx is exclusive. These methods do not bounds check.
  bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
                   const Array& other,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
                   const std::shared_ptr<Array>& other,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
                   int64_t other_start_idx,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
                   int64_t end_idx, int64_t other_start_idx,
                   const EqualOptions& = EqualOptions::Defaults()) const;

  /// \brief Invoke the ArrayVisitor::Visit() overload matching the array type
  Status Accept(ArrayVisitor* visitor) const;

  /// Build a zero-copy view of this array using the given type.
  ///
  /// Layout compatibility of the two types is checked. Nested types are
  /// traversed in depth-first order. Data buffers must have the same item
  /// sizes, even though the logical types may differ.
  /// An error is returned when the types are not layout-compatible.
  Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;

  /// Build a zero-copy slice of the array with the indicated offset and
  /// length
  ///
  /// \param[in] offset the position of the first element in the constructed
  /// slice
  /// \param[in] length the length of the slice. If there are not enough
  /// elements in the array, the length will be adjusted accordingly
  ///
  /// \return a new object wrapped in std::shared_ptr<Array>
  std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;

  /// Slice from offset up to the end of the array
  std::shared_ptr<Array> Slice(int64_t offset) const;

  /// Input-checking variant of Array::Slice
  Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
  /// Input-checking variant of Array::Slice
  Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;

  /// The ArrayData instance backing this array.
  const std::shared_ptr<ArrayData>& data() const { return data_; }

  /// Number of entries in the ArrayData's child_data, narrowed to int.
  int num_fields() const { return static_cast<int>(data_->child_data.size()); }

  /// \return PrettyPrint representation of array suitable for debugging
  std::string ToString() const;

  /// \brief Run cheap validation checks that detect obvious inconsistencies
  /// within the array's internal data.
  ///
  /// This is O(k) where k is the number of descendents.
  ///
  /// \return Status
  Status Validate() const;

  /// \brief Run extensive validation checks that detect inconsistencies
  /// within the array's internal data.
  ///
  /// This is potentially O(k*n) where k is the number of descendents and n
  /// is the array length.
  ///
  /// \return Status
  Status ValidateFull() const;

 protected:
  Array() = default;
  ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);

  std::shared_ptr<ArrayData> data_;
  const uint8_t* null_bitmap_data_ = NULLPTR;

  /// Protected helper used by constructors: caches the pointer obtained from
  /// ArrayData::GetValuesSafe for buffer 0 (or NULLPTR when the ArrayData has
  /// no buffers at all) and then stores the ArrayData itself.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    null_bitmap_data_ = data->buffers.empty()
                            ? NULLPTR
                            : data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
    data_ = data;
  }

 private:
  ARROW_DISALLOW_COPY_AND_ASSIGN(Array);

  ARROW_EXPORT friend void PrintTo(const Array& x, std::ostream* os);
};
/// \brief Write the text produced by Array::ToString() for x to os
///
/// \return os, so that insertions can be chained
static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
  return os << x.ToString();
}
/// Base class for non-nested arrays
///
/// Declares no data members or methods of its own; it only serves as a
/// common ancestor between Array and the concrete non-nested array classes.
class ARROW_EXPORT FlatArray : public Array {
protected:
// Inherit the constructors declared by Array
using Array::Array;
};
/// Base class for arrays of fixed-size logical types
///
/// In addition to what Array caches, this class keeps a raw byte pointer
/// obtained from buffer 1 of the ArrayData.
class ARROW_EXPORT PrimitiveArray : public FlatArray {
 public:
  /// \brief Construct from a type, a length and individual buffers
  ///
  /// Defined out of line. null_bitmap defaults to NULLPTR, null_count to
  /// kUnknownNullCount and offset to 0.
  PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
                 const std::shared_ptr<Buffer>& data,
                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return buffer 1 of the underlying ArrayData
  ///
  /// The buffer does not account for any slice offset.
  std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }

 protected:
  /// Leaves the array without data; raw_values_ starts out as NULLPTR.
  PrimitiveArray() : raw_values_(NULLPTR) {}

  /// Adopt the given ArrayData through SetData().
  explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

  /// Let Array record the ArrayData and validity bitmap pointer, then cache
  /// the pointer returned by ArrayData::GetValuesSafe for buffer 1.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    Array::SetData(data);
    raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
  }

  /// Cached byte pointer for buffer 1 (requested with offset 0).
  const uint8_t* raw_values_;
};
/// Degenerate null type Array
///
/// Never caches a validity bitmap pointer; the null count of the adopted
/// ArrayData is forced to equal its length.
class ARROW_EXPORT NullArray : public FlatArray {
 public:
  using TypeClass = NullType;

  /// \brief Construct from an existing ArrayData instance
  ///
  /// Note that the null_count of `data` is overwritten (see SetData).
  explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

  /// \brief Construct from a length only (defined out of line)
  explicit NullArray(int64_t length);

 private:
  /// Adopt `data`: set its null_count to its length, store it, and make sure
  /// no validity bitmap pointer is cached.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    data->null_count = data->length;
    data_ = data;
    null_bitmap_data_ = NULLPTR;
  }
};
} // namespace arrow

View File

@ -0,0 +1,269 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor classes for Binary, LargeBinary, String, LargeString,
// FixedSizeBinary
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h" // IWYU pragma: export
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup binary-arrays
///
/// @{
// ----------------------------------------------------------------------
// Binary and String
/// Base class for variable-sized binary arrays, regardless of offset size
/// and logical interpretation.
template <typename TYPE>
class BaseBinaryArray : public FlatArray {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
/// Return the pointer to the given elements bytes
// XXX should GetValue(int64_t i) return a string_view?
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
// Account for base offset
i += data_->offset;
const offset_type pos = raw_value_offsets_[i];
*out_length = raw_value_offsets_[i + 1] - pos;
return raw_data_ + pos;
}
/// \brief Get binary value as a string_view
///
/// \param i the value index
/// \return the view over the selected value
util::string_view GetView(int64_t i) const {
// Account for base offset
i += data_->offset;
const offset_type pos = raw_value_offsets_[i];
return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
raw_value_offsets_[i + 1] - pos);
}
util::optional<util::string_view> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
/// \brief Get binary value as a string_view
/// Provided for consistency with other arrays.
///
/// \param i the value index
/// \return the view over the selected value
util::string_view Value(int64_t i) const { return GetView(i); }
/// \brief Get binary value as a std::string
///
/// \param i the value index
/// \return the value copied into a std::string
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
const offset_type* raw_value_offsets() const {
return raw_value_offsets_ + data_->offset;
}
const uint8_t* raw_data() const { return raw_data_; }
/// \brief Return the data buffer absolute offset of the data for the value
/// at the passed index.
///
/// Does not perform boundschecking
offset_type value_offset(int64_t i) const {
return raw_value_offsets_[i + data_->offset];
}
/// \brief Return the length of the data for the value at the passed index.
///
/// Does not perform boundschecking
offset_type value_length(int64_t i) const {
i += data_->offset;
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
/// \brief Return the total length of the memory in the data buffer
/// referenced by this array. If the array has been sliced then this may be
/// less than the size of the data buffer (data_->buffers[2]).
offset_type total_values_length() const {
if (data_->length > 0) {
return raw_value_offsets_[data_->length + data_->offset] -
raw_value_offsets_[data_->offset];
} else {
return 0;
}
}
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
// For subclasses
BaseBinaryArray() = default;
// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
}
const offset_type* raw_value_offsets_ = NULLPTR;
const uint8_t* raw_data_ = NULLPTR;
};
/// Concrete Array class for variable-size binary data
///
/// Instantiates BaseBinaryArray with BinaryType. Both public constructors are
/// only declared here; their definitions live outside this header.
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
public:
/// \brief Construct from an existing ArrayData instance
explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct from a length, an offsets buffer and a data buffer
///
/// null_bitmap defaults to NULLPTR, null_count to kUnknownNullCount and
/// offset to 0.
BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
protected:
// For subclasses such as StringArray
// (does not call SetData; the subclass is expected to do so -- NOTE(review):
// confirm against the out-of-line subclass constructors)
BinaryArray() : BaseBinaryArray() {}
};
/// Concrete Array class for variable-size string (utf-8) data
///
/// Derives from BinaryArray and redefines TypeClass as StringType.
class ARROW_EXPORT StringArray : public BinaryArray {
public:
using TypeClass = StringType;
/// \brief Construct from an existing ArrayData instance (defined out of line)
explicit StringArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct from a length, an offsets buffer and a data buffer
/// (defined out of line)
///
/// null_bitmap defaults to NULLPTR, null_count to kUnknownNullCount and
/// offset to 0.
StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
///
/// \return Status (presumably non-OK when an entry is not valid UTF8;
/// the definition is not part of this header)
Status ValidateUTF8() const;
};
/// Concrete Array class for large variable-size binary data
///
/// Instantiates BaseBinaryArray with LargeBinaryType. Both public
/// constructors are only declared here; their definitions live outside this
/// header.
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
public:
/// \brief Construct from an existing ArrayData instance
explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct from a length, an offsets buffer and a data buffer
///
/// null_bitmap defaults to NULLPTR, null_count to kUnknownNullCount and
/// offset to 0.
LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
protected:
// For subclasses such as LargeStringArray
// (does not call SetData; the subclass is expected to do so -- NOTE(review):
// confirm against the out-of-line subclass constructors)
LargeBinaryArray() : BaseBinaryArray() {}
};
/// Concrete Array class for large variable-size string (utf-8) data
///
/// Derives from LargeBinaryArray and redefines TypeClass as LargeStringType.
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
public:
using TypeClass = LargeStringType;
/// \brief Construct from an existing ArrayData instance (defined out of line)
explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct from a length, an offsets buffer and a data buffer
/// (defined out of line)
///
/// null_bitmap defaults to NULLPTR, null_count to kUnknownNullCount and
/// offset to 0.
LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
///
/// \return Status (presumably non-OK when an entry is not valid UTF8;
/// the definition is not part of this header)
Status ValidateUTF8() const;
};
// ----------------------------------------------------------------------
// Fixed width binary
/// Concrete Array class for fixed-size binary data
///
/// Every element occupies byte_width() bytes; the width is read from the
/// array's FixedSizeBinaryType when the ArrayData is adopted in SetData().
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
 public:
  using TypeClass = FixedSizeBinaryType;
  using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;

  /// \brief Construct from an existing ArrayData instance (defined out of line)
  explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct from a type, a length and individual buffers (defined
  /// out of line)
  ///
  /// null_bitmap defaults to NULLPTR, null_count to kUnknownNullCount and
  /// offset to 0.
  FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return a pointer to the bytes of the element at index i (defined
  /// out of line)
  const uint8_t* GetValue(int64_t i) const;

  /// Same as GetValue(i)
  const uint8_t* Value(int64_t i) const { return GetValue(i); }

  /// \brief View of byte_width() bytes starting at GetValue(i)
  util::string_view GetView(int64_t i) const {
    const char* bytes = reinterpret_cast<const char*>(GetValue(i));
    return util::string_view(bytes, byte_width());
  }

  /// \brief Return what dereferencing an IteratorType positioned at i yields
  util::optional<util::string_view> operator[](int64_t i) const {
    IteratorType it(*this, i);
    return *it;
  }

  /// \brief Copy of the bytes covered by GetView(i)
  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  /// Width in bytes of each element, as cached by SetData()
  int32_t byte_width() const { return byte_width_; }

  /// Pointer to the value bytes, advanced by the slice offset (expressed in
  /// elements, hence multiplied by the byte width)
  const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  /// Let PrimitiveArray record the ArrayData, then cache the byte width
  /// reported by the array's type (which is cast to FixedSizeBinaryType).
  void SetData(const std::shared_ptr<ArrayData>& data) {
    PrimitiveArray::SetData(data);
    const auto& fixed_type =
        internal::checked_cast<const FixedSizeBinaryType&>(*data_->type);
    byte_width_ = fixed_type.byte_width();
  }

  int32_t byte_width_;
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include "arrow/array/array_binary.h"
#include "arrow/array/data.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-arrays
///
/// @{
// ----------------------------------------------------------------------
// Decimal128Array
/// Concrete Array class for 128-bit decimal data
///
/// Derives from FixedSizeBinaryArray and redefines TypeClass as
/// Decimal128Type.
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
public:
using TypeClass = Decimal128Type;
// Inherit the constructors declared by FixedSizeBinaryArray
using FixedSizeBinaryArray::FixedSizeBinaryArray;
/// \brief Construct Decimal128Array from ArrayData instance
explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);
/// \brief Return the value at index i rendered as a std::string
///
/// Defined out of line. NOTE(review): the exact textual format (presumably
/// decimal notation honoring the type's scale) is not visible here -- confirm
/// against the definition.
std::string FormatValue(int64_t i) const;
};
// Backward compatibility: DecimalArray is an alias of Decimal128Array
using DecimalArray = Decimal128Array;
// ----------------------------------------------------------------------
// Decimal256Array
/// Concrete Array class for 256-bit decimal data
///
/// Derives from FixedSizeBinaryArray and redefines TypeClass as
/// Decimal256Type.
class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
public:
using TypeClass = Decimal256Type;
// Inherit the constructors declared by FixedSizeBinaryArray
using FixedSizeBinaryArray::FixedSizeBinaryArray;
/// \brief Construct Decimal256Array from ArrayData instance
explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
/// \brief Return the value at index i rendered as a std::string
///
/// Defined out of line. NOTE(review): the exact textual format (presumably
/// decimal notation honoring the type's scale) is not visible here -- confirm
/// against the definition.
std::string FormatValue(int64_t i) const;
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,180 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
// ----------------------------------------------------------------------
// DictionaryArray
/// \brief Array type for dictionary-encoded data with a
/// data-dependent dictionary
///
/// A dictionary array is made of an array of non-negative integers (the
/// "dictionary indices") and of a data type carrying a "dictionary" that
/// holds the distinct values represented in the data.
///
/// For example, the array
///
///   ["foo", "bar", "foo", "bar", "foo", "bar"]
///
/// with dictionary ["bar", "foo"], would have dictionary array representation
///
///   indices: [1, 0, 1, 0, 1, 0]
///   dictionary: ["bar", "foo"]
///
/// The indices in principle may be any integer type.
class ARROW_EXPORT DictionaryArray : public Array {
 public:
  using TypeClass = DictionaryType;

  /// \brief Construct from an existing ArrayData instance (defined out of line)
  explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct from a type, an indices array and a dictionary array
  /// (defined out of line)
  DictionaryArray(const std::shared_ptr<DataType>& type,
                  const std::shared_ptr<Array>& indices,
                  const std::shared_ptr<Array>& dictionary);

  /// \brief Construct DictionaryArray from dictionary and indices
  /// array and validate
  ///
  /// This function does the validation of the indices and input type. It checks if
  /// all indices are non-negative and smaller than the size of the dictionary.
  ///
  /// \param[in] type a dictionary type
  /// \param[in] indices an array of non-negative integers smaller than the
  /// size of the dictionary
  /// \param[in] dictionary the dictionary with same value type as the
  /// type object
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
      const std::shared_ptr<Array>& dictionary);

  /// \brief Variant of FromArrays() that derives the dictionary type itself
  ///
  /// The type is built by ::arrow::dictionary() from the type of `indices`
  /// and the type of `dictionary`; the call is then forwarded to the
  /// three-argument overload.
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
    const auto inferred_type = ::arrow::dictionary(indices->type(), dictionary->type());
    return FromArrays(inferred_type, indices, dictionary);
  }

  /// \brief Transpose this DictionaryArray
  ///
  /// Builds a new dictionary array with the given dictionary type, mapping
  /// the indices through the transpose map. The type and the transpose map
  /// are typically computed using DictionaryUnifier.
  ///
  /// \param[in] type the new type object
  /// \param[in] dictionary the new dictionary
  /// \param[in] transpose_map transposition array of this array's indices
  /// into the target array's indices
  /// \param[in] pool a pool to allocate the array data from
  Result<std::shared_ptr<Array>> Transpose(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
      const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;

  /// \brief Determine whether dictionary arrays may be compared without unification
  bool CanCompareIndices(const DictionaryArray& other) const;

  /// \brief Return the dictionary for this array, which is stored as
  /// a member of the ArrayData internal structure
  std::shared_ptr<Array> dictionary() const;

  /// \brief Return the indices array (defined out of line)
  std::shared_ptr<Array> indices() const;

  /// \brief Return the ith value of indices, cast to int64_t. Not recommended
  /// for use in performance-sensitive code. Does not validate whether the
  /// value is null or out-of-bounds.
  int64_t GetValueIndex(int64_t i) const;

  /// \brief Return the cached DictionaryType pointer
  const DictionaryType* dict_type() const { return dict_type_; }

 private:
  void SetData(const std::shared_ptr<ArrayData>& data);

  const DictionaryType* dict_type_;
  std::shared_ptr<Array> indices_;

  // Lazily initialized when invoking dictionary()
  mutable std::shared_ptr<Array> dictionary_;
};
/// \brief Helper class for incremental dictionary unification
///
/// Abstract interface: instances are obtained through Make(), dictionaries
/// are fed in with Unify(), and the unified dictionary is retrieved with
/// GetResult() or GetResultWithIndexType().
class ARROW_EXPORT DictionaryUnifier {
public:
virtual ~DictionaryUnifier() = default;
/// \brief Construct a DictionaryUnifier
/// \param[in] value_type the data type of the dictionaries
/// \param[in] pool MemoryPool to use for memory allocations
static Result<std::unique_ptr<DictionaryUnifier>> Make(
std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
/// \brief Unify dictionaries across array chunks
///
/// The dictionaries in the array chunks will be unified, their indices
/// accordingly transposed.
///
/// Only dictionaries with a primitive value type are currently supported.
/// However, dictionaries nested inside a more complex type are correctly unified.
static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
const std::shared_ptr<ChunkedArray>& array,
MemoryPool* pool = default_memory_pool());
/// \brief Unify dictionaries across the chunks of each table column
///
/// The dictionaries in each table column will be unified, their indices
/// accordingly transposed.
///
/// Only dictionaries with a primitive value type are currently supported.
/// However, dictionaries nested inside a more complex type are correctly unified.
static Result<std::shared_ptr<Table>> UnifyTable(
const Table& table, MemoryPool* pool = default_memory_pool());
/// \brief Append dictionary to the internal memo
virtual Status Unify(const Array& dictionary) = 0;
/// \brief Append dictionary and compute transpose indices
/// \param[in] dictionary the dictionary values to unify
/// \param[out] out_transpose a Buffer containing computed transpose indices
/// as int32_t values equal in length to the passed dictionary. The value in
/// each slot corresponds to the new index value for each original index
/// for a DictionaryArray with the old dictionary
virtual Status Unify(const Array& dictionary,
std::shared_ptr<Buffer>* out_transpose) = 0;
/// \brief Return a result DictionaryType with the smallest possible index
/// type to accommodate the unified dictionary. The unifier cannot be used
/// after this is called
/// \param[out] out_type receives the resulting type
/// \param[out] out_dict receives the unified dictionary
virtual Status GetResult(std::shared_ptr<DataType>* out_type,
std::shared_ptr<Array>* out_dict) = 0;
/// \brief Return a unified dictionary with the given index type. If
/// the index type is not large enough then an invalid status will be returned.
/// The unifier cannot be used after this is called
/// \param[in] index_type the index type the caller wants to use
/// \param[out] out_dict receives the unified dictionary
virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
std::shared_ptr<Array>* out_dict) = 0;
};
} // namespace arrow

View File

@ -0,0 +1,569 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and
// Union
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-arrays
///
/// @{
// ----------------------------------------------------------------------
// ListArray
// Forward declaration, so that internal::SetListData below can name
// BaseListArray before the class template is defined.
template <typename TYPE>
class BaseListArray;
namespace internal {
// Private helper for ListArray::SetData.
// Unfortunately, trying to define BaseListArray::SetData outside of this header
// doesn't play well with MSVC.
//
// Only declared here. `expected_type_id` defaults to TYPE::type_id; how it is
// used is determined by the definition, which is not part of this section.
template <typename TYPE>
void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
Type::type expected_type_id = TYPE::type_id);
} // namespace internal
/// Base class for variable-sized list arrays, regardless of offset size.
///
/// The protected members are not assigned anywhere in this class;
/// internal::SetListData is declared a friend so that it can access them.
template <typename TYPE>
class BaseListArray : public Array {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  /// \brief Return the cached pointer to the list type
  const TypeClass* list_type() const { return list_type_; }

  /// \brief Return array object containing the list's values
  std::shared_ptr<Array> values() const { return values_; }

  /// Offsets buffer (buffer 1); it does not account for any slice offset
  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }

  /// \brief Return the value type reported by the list type
  std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }

  /// Return pointer to raw value offsets accounting for any slice offset
  const offset_type* raw_value_offsets() const {
    return raw_value_offsets_ + data_->offset;
  }

  // None of the accessors below performs bounds checking

  /// \brief Return the offset stored for the list at index i
  offset_type value_offset(int64_t i) const {
    const int64_t slot = i + data_->offset;
    return raw_value_offsets_[slot];
  }

  /// \brief Return the difference between the offsets stored at i + 1 and i
  offset_type value_length(int64_t i) const {
    const int64_t slot = i + data_->offset;
    return raw_value_offsets_[slot + 1] - raw_value_offsets_[slot];
  }

  /// \brief Return the slice of values() spanned by the list at index i
  std::shared_ptr<Array> value_slice(int64_t i) const {
    const offset_type start = value_offset(i);
    const offset_type count = value_length(i);
    return values_->Slice(start, count);
  }

 protected:
  friend void internal::SetListData<TYPE>(BaseListArray<TYPE>* self,
                                          const std::shared_ptr<ArrayData>& data,
                                          Type::type expected_type_id);

  const TypeClass* list_type_ = NULLPTR;
  std::shared_ptr<Array> values_;
  const offset_type* raw_value_offsets_ = NULLPTR;
};
/// Concrete Array class for list data
///
/// Instantiates BaseListArray with ListType. All member functions are only
/// declared here; their definitions live outside this header.
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
public:
/// \brief Construct from an existing ArrayData instance
explicit ListArray(std::shared_ptr<ArrayData> data);
/// \brief Construct from a type, a length, an offsets buffer and a values
/// array
///
/// null_bitmap defaults to NULLPTR, null_count to kUnknownNullCount and
/// offset to 0.
ListArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct ListArray from array of offsets and child value array
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int32 type
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
static Result<std::shared_ptr<ListArray>> FromArrays(
const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());
/// \brief Overload of FromArrays() that additionally takes a type
///
/// NOTE(review): presumably `type` is used as the type of the resulting
/// array instead of one derived from `values` -- confirm against the
/// definition.
static Result<std::shared_ptr<ListArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());
/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that it's different from `values()` in that it takes into
/// consideration of this array's offsets as well as null elements backed
/// by non-empty lists (they are skipped, thus copying may be needed).
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Return list offsets as an Int32Array
///
/// The returned array will not have a validity bitmap, so you cannot expect
/// to pass it to ListArray::FromArrays() and get back the same list array
/// if the original one has nulls.
std::shared_ptr<Array> offsets() const;
protected:
// This constructor defers SetData to a derived array class
ListArray() = default;
// Declared here, defined out of line; shadows Array::SetData for this class.
void SetData(const std::shared_ptr<ArrayData>& data);
};
/// Concrete Array class for large list data (with 64-bit offsets)
///
/// Specialises BaseListArray for LargeListType.
class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
public:
/// \brief Construct a LargeListArray that wraps existing ArrayData
explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct a LargeListArray from its individual components
///
/// \param[in] type the data type of the array
/// \param[in] length the logical length of the array
/// \param[in] value_offsets buffer holding the list offsets
/// \param[in] values child array holding the list values
/// \param[in] null_bitmap optional validity bitmap (NULLPTR by default)
/// \param[in] null_count the null count, or kUnknownNullCount (the default)
/// if it is not known
/// \param[in] offset the logical offset into the array (0 by default)
LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct LargeListArray from array of offsets and child value array
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int64 type
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
static Result<std::shared_ptr<LargeListArray>> FromArrays(
const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());
/// \brief Overload of FromArrays() that also takes the data type to use
///
/// NOTE(review): offsets, values and pool presumably behave as documented for
/// the overload above -- confirm against the implementation.
///
/// \param[in] type the data type of the resulting array
/// \param[in] offsets Array containing the list offsets
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool to use for allocations
static Result<std::shared_ptr<LargeListArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());
/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that it's different from `values()` in that it takes into
/// consideration this array's offsets as well as null elements backed
/// by non-empty lists (they are skipped, thus copying may be needed).
///
/// \param[in] memory_pool MemoryPool to use for allocations
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Return list offsets as an Int64Array
///
/// NOTE(review): ListArray::offsets() documents that its result carries no
/// validity bitmap; presumably the same holds here -- confirm.
std::shared_ptr<Array> offsets() const;
protected:
// (Re)initialise this array from `data`; defined out of line.
void SetData(const std::shared_ptr<ArrayData>& data);
};
// ----------------------------------------------------------------------
// MapArray
/// Concrete Array class for map data
///
/// NB: "value" in this context refers to a pair of a key and the corresponding item
class ARROW_EXPORT MapArray : public ListArray {
 public:
  using TypeClass = MapType;

  /// \brief Construct a MapArray that wraps existing ArrayData
  explicit MapArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a MapArray from separate key and item child arrays
  ///
  /// \param[in] type the data type of the array
  /// \param[in] length the logical length of the array
  /// \param[in] value_offsets buffer holding the offsets
  /// \param[in] keys Array containing key values
  /// \param[in] items Array containing item values
  /// \param[in] null_bitmap optional validity bitmap (NULLPTR by default)
  /// \param[in] null_count the null count, or kUnknownNullCount (the default)
  /// if it is not known
  /// \param[in] offset the logical offset into the array (0 by default)
  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
           const std::shared_ptr<Buffer>& value_offsets,
           const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
           int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct a MapArray from a single child array of values
  ///
  /// As noted on the class, a "value" is a key/item pair.
  ///
  /// \param[in] type the data type of the array
  /// \param[in] length the logical length of the array
  /// \param[in] value_offsets buffer holding the offsets
  /// \param[in] values child array holding the key/item pairs
  /// \param[in] null_bitmap optional validity bitmap (NULLPTR by default)
  /// \param[in] null_count the null count, or kUnknownNullCount (the default)
  /// if it is not known
  /// \param[in] offset the logical offset into the array (0 by default)
  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
           const std::shared_ptr<Buffer>& value_offsets,
           const std::shared_ptr<Array>& values,
           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
           int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct MapArray from array of offsets and child key, item arrays
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int32 type
  /// \param[in] keys Array containing key values
  /// \param[in] items Array containing item values
  /// \param[in] pool MemoryPool in case new offsets array needs to be
  /// allocated because of null values
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
      const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool());

  /// \brief Overload of FromArrays() that also takes the data type to use
  ///
  /// NOTE(review): the remaining arguments presumably behave as documented for
  /// the overload above -- confirm against the implementation.
  ///
  /// \param[in] type the data type of the resulting array
  /// \param[in] offsets Array containing the offsets
  /// \param[in] keys Array containing key values
  /// \param[in] items Array containing item values
  /// \param[in] pool MemoryPool to use for allocations
  static Result<std::shared_ptr<Array>> FromArrays(
      std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
      const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
      MemoryPool* pool = default_memory_pool());

  /// \brief Return the cached MapType pointer (NULLPTR until it has been set)
  const MapType* map_type() const { return map_type_; }

  /// \brief Return array object containing all map keys
  std::shared_ptr<Array> keys() const { return keys_; }

  /// \brief Return array object containing all mapped items
  std::shared_ptr<Array> items() const { return items_; }

  /// Validate child data before constructing the actual MapArray.
  static Status ValidateChildData(
      const std::vector<std::shared_ptr<ArrayData>>& child_data);

 protected:
  // (Re)initialise this array from `data`; defined out of line.
  void SetData(const std::shared_ptr<ArrayData>& data);

  // Shared implementation hook for the public FromArrays() overloads.
  // NOTE(review): presumably both overloads forward here -- confirm.
  static Result<std::shared_ptr<Array>> FromArraysInternal(
      std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
      const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
      MemoryPool* pool);

 private:
  // Defaults to NULLPTR, like the cached pointers of BaseListArray, so that
  // map_type() never yields an indeterminate pointer before the type is set.
  const MapType* map_type_ = NULLPTR;
  std::shared_ptr<Array> keys_, items_;
};
// ----------------------------------------------------------------------
// FixedSizeListArray
/// Concrete Array class for fixed size list data
///
/// Every slot has the same number of child values (list_size_), so offsets are
/// computed arithmetically instead of being read from an offsets buffer.
class ARROW_EXPORT FixedSizeListArray : public Array {
 public:
  using TypeClass = FixedSizeListType;
  using offset_type = TypeClass::offset_type;

  /// \brief Construct a FixedSizeListArray that wraps existing ArrayData
  explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a FixedSizeListArray from its individual components
  ///
  /// \param[in] type the data type of the array
  /// \param[in] length the logical length of the array
  /// \param[in] values child array holding the list values
  /// \param[in] null_bitmap optional validity bitmap (NULLPTR by default)
  /// \param[in] null_count the null count, or kUnknownNullCount (the default)
  /// if it is not known
  /// \param[in] offset the logical offset into the array (0 by default)
  FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
                     const std::shared_ptr<Array>& values,
                     const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                     int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  const FixedSizeListType* list_type() const;

  /// \brief Return array object containing the list's values
  std::shared_ptr<Array> values() const;

  std::shared_ptr<DataType> value_type() const;

  // The following functions will not perform bounds checking

  /// \brief Return the position in the child array where slot i starts
  ///
  /// The slice offset of this array is added to i before scaling by the list
  /// size; the multiplication is carried out in 64 bits.
  int64_t value_offset(int64_t i) const {
    i += data_->offset;
    return list_size_ * i;
  }

  /// \brief Return the number of child values per slot
  ///
  /// The argument is ignored because every slot has the same length.
  int32_t value_length(int64_t i = 0) const {
    ARROW_UNUSED(i);
    return list_size_;
  }

  /// \brief Return the part of the child values array that belongs to slot i
  std::shared_ptr<Array> value_slice(int64_t i) const {
    return values_->Slice(value_offset(i), value_length(i));
  }

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration null elements (they are skipped, thus copying may be needed).
  ///
  /// \param[in] memory_pool MemoryPool to use for allocations
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Construct FixedSizeListArray from child value array and list size
  ///
  /// \param[in] values Array containing list values
  /// \param[in] list_size The fixed length of each list
  /// \return Will have length equal to values.length() / list_size
  static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
                                                   int32_t list_size);

  /// \brief Construct FixedSizeListArray from child value array and type
  ///
  /// \param[in] values Array containing list values
  /// \param[in] type The fixed sized list type
  /// \return Will have length equal to values.length() / type.list_size()
  static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
                                                   std::shared_ptr<DataType> type);

 protected:
  // (Re)initialise this array from `data`; defined out of line.
  void SetData(const std::shared_ptr<ArrayData>& data);

  // Number of child values per slot. Zero-initialised so that value_offset()
  // and value_length() never read an indeterminate value.
  int32_t list_size_ = 0;

 private:
  std::shared_ptr<Array> values_;
};
// ----------------------------------------------------------------------
// Struct
/// Concrete Array class for struct data
class ARROW_EXPORT StructArray : public Array {
public:
using TypeClass = StructType;
/// \brief Construct a StructArray that wraps existing ArrayData
explicit StructArray(const std::shared_ptr<ArrayData>& data);
/// \brief Construct a StructArray from its individual components
///
/// \param[in] type the data type of the array
/// \param[in] length the logical length of the array
/// \param[in] children the child arrays, one per field
/// \param[in] null_bitmap optional validity bitmap (NULLPTR by default)
/// \param[in] null_count the null count, or kUnknownNullCount (the default)
/// if it is not known
/// \param[in] offset the logical offset into the array (0 by default)
StructArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Array>>& children,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Return a StructArray from child arrays and field names.
///
/// The length and data type are automatically inferred from the arguments.
/// There should be at least one child array.
///
/// \param[in] children the child arrays
/// \param[in] field_names the name of each field
/// \param[in] null_bitmap optional validity bitmap (NULLPTR by default)
/// \param[in] null_count the null count, or kUnknownNullCount (the default)
/// if it is not known
/// \param[in] offset the logical offset into the array (0 by default)
static Result<std::shared_ptr<StructArray>> Make(
const ArrayVector& children, const std::vector<std::string>& field_names,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Return a StructArray from child arrays and fields.
///
/// The length is automatically inferred from the arguments.
/// There should be at least one child array. This method does not
/// check that field types and child array types are consistent.
///
/// \param[in] children the child arrays
/// \param[in] fields the field of each child
/// \param[in] null_bitmap optional validity bitmap (NULLPTR by default)
/// \param[in] null_count the null count, or kUnknownNullCount (the default)
/// if it is not known
/// \param[in] offset the logical offset into the array (0 by default)
static Result<std::shared_ptr<StructArray>> Make(
const ArrayVector& children, const FieldVector& fields,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Return this array's type as a StructType pointer
const StructType* struct_type() const;
// Return a shared pointer in case the requestor desires to share ownership
// with this array. The returned array has its offset, length and null
// count adjusted.
std::shared_ptr<Array> field(int pos) const;
/// \brief Return all child arrays
///
/// NOTE(review): presumably these are the same boxed children that field()
/// returns one at a time -- confirm against the implementation.
const ArrayVector& fields() const;
/// Returns null if name not found
std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
/// \brief Flatten this array as a vector of arrays, one for each field
///
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;
/// \brief Get one of the child arrays, combining its null bitmap
/// with the parent struct array's bitmap.
///
/// \param[in] index Which child array to get
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
Result<std::shared_ptr<Array>> GetFlattenedField(
int index, MemoryPool* pool = default_memory_pool()) const;
private:
// For caching boxed child data
// XXX This is not handled in a thread-safe manner.
// Declared mutable so that const accessors can fill the cache.
mutable ArrayVector boxed_fields_;
};
// ----------------------------------------------------------------------
// Union
/// Base class for SparseUnionArray and DenseUnionArray
///
/// Holds the state shared by both union layouts: the raw type-code pointer,
/// the union type and a cache of boxed child arrays.
class ARROW_EXPORT UnionArray : public Array {
 public:
  using type_code_t = int8_t;

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }

  /// \brief Return a pointer to the type codes, advanced by the slice offset
  const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }

  /// The logical type code of the value at index.
  type_code_t type_code(int64_t i) const { return raw_type_codes_[i + data_->offset]; }

  /// The physical child id containing value at index.
  ///
  /// The type code read for slot i is used directly as an index into
  /// child_ids(); neither i nor the code is range-checked here.
  int child_id(int64_t i) const {
    return union_type_->child_ids()[raw_type_codes_[i + data_->offset]];
  }

  const UnionType* union_type() const { return union_type_; }

  UnionMode::type mode() const { return union_type_->mode(); }

  /// \brief Return the given field as an individual array.
  ///
  /// For sparse unions, the returned array has its offset, length and null
  /// count adjusted.
  std::shared_ptr<Array> field(int pos) const;

 protected:
  // (Re)initialise this array from `data`; defined out of line.
  void SetData(std::shared_ptr<ArrayData> data);

  // Both cached pointers default to NULLPTR (as BaseListArray does for its
  // cached pointers) so that they are never indeterminate before being set.
  const type_code_t* raw_type_codes_ = NULLPTR;
  const UnionType* union_type_ = NULLPTR;

  // For caching boxed child data
  // NOTE(review): StructArray documents its equivalent cache as not
  // thread-safe; presumably the same applies here -- confirm.
  mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
};
/// Concrete Array class for sparse union data
class ARROW_EXPORT SparseUnionArray : public UnionArray {
 public:
  using TypeClass = SparseUnionType;

  /// \brief Construct a SparseUnionArray that wraps existing ArrayData
  explicit SparseUnionArray(std::shared_ptr<ArrayData> data);

  /// \brief Construct a SparseUnionArray from its individual components
  ///
  /// \param[in] type the data type of the array
  /// \param[in] length the logical length of the array
  /// \param[in] children the child arrays
  /// \param[in] type_ids buffer holding the type ids
  /// \param[in] offset the logical offset into the array (0 by default)
  SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
                   std::shared_ptr<Buffer> type_ids, int64_t offset = 0);

  /// \brief Construct SparseUnionArray from type_ids and children
  ///
  /// This function does the bare minimum of validation of the input types.
  /// It forwards to the overload below with an empty list of field names.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
                                             std::vector<type_code_t> type_codes) {
    // type_ids is a const reference, so it is passed through as is: wrapping
    // it in std::move would be a no-op cast that only suggests a transfer.
    return Make(type_ids, std::move(children), std::vector<std::string>{},
                std::move(type_codes));
  }

  /// \brief Construct SparseUnionArray with custom field names from type_ids and children
  ///
  /// This function does the bare minimum of validation of the input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] field_names Vector of strings containing the name of each field.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
                                             std::vector<std::string> field_names = {},
                                             std::vector<type_code_t> type_codes = {});

  /// \brief Return the union type, downcast to SparseUnionType
  const SparseUnionType* union_type() const {
    return internal::checked_cast<const SparseUnionType*>(union_type_);
  }

  /// \brief Get one of the child arrays, adjusting its null bitmap
  /// where the union array type code does not match.
  ///
  /// \param[in] index Which child array to get (i.e. the physical index, not the type
  /// code)
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  Result<std::shared_ptr<Array>> GetFlattenedField(
      int index, MemoryPool* pool = default_memory_pool()) const;

 protected:
  // (Re)initialise this array from `data`; defined out of line.
  void SetData(std::shared_ptr<ArrayData> data);
};
/// \brief Concrete Array class for dense union data
///
/// Note that union types do not have a validity bitmap
class ARROW_EXPORT DenseUnionArray : public UnionArray {
 public:
  using TypeClass = DenseUnionType;

  /// \brief Construct a DenseUnionArray that wraps existing ArrayData
  explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a DenseUnionArray from its individual components
  ///
  /// \param[in] type the data type of the array
  /// \param[in] length the logical length of the array
  /// \param[in] children the child arrays
  /// \param[in] type_ids buffer holding the type ids
  /// \param[in] value_offsets buffer holding the value offsets (NULLPTR by default)
  /// \param[in] offset the logical offset into the array (0 by default)
  DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
                  std::shared_ptr<Buffer> type_ids,
                  std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);

  /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types. It forwards to the overload below with an empty list of
  /// field names.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] value_offsets An array of signed int32 values indicating the
  /// relative offset into the respective child array for the type in a given slot.
  /// The respective offsets for each child value array must be in order / increasing.
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
                                             const Array& value_offsets,
                                             ArrayVector children,
                                             std::vector<type_code_t> type_codes) {
    return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
                std::move(type_codes));
  }

  /// \brief Construct DenseUnionArray with custom field names from type_ids,
  /// value_offsets, and children
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] value_offsets An array of signed int32 values indicating the
  /// relative offset into the respective child array for the type in a given slot.
  /// The respective offsets for each child value array must be in order / increasing.
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] field_names Vector of strings containing the name of each field.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
                                             const Array& value_offsets,
                                             ArrayVector children,
                                             std::vector<std::string> field_names = {},
                                             std::vector<type_code_t> type_codes = {});

  /// \brief Return the union type, downcast to DenseUnionType
  const DenseUnionType* union_type() const {
    return internal::checked_cast<const DenseUnionType*>(union_type_);
  }

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }

  /// \brief Return the value offset stored for slot i (no bounds check)
  int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }

  /// \brief Return a pointer to the value offsets, advanced by the slice offset
  const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }

 protected:
  // Defaults to NULLPTR (as BaseListArray does for its cached offsets pointer)
  // so that it is never indeterminate before being set.
  const int32_t* raw_value_offsets_ = NULLPTR;

  // (Re)initialise this array from `data`; defined out of line.
  void SetData(const std::shared_ptr<ArrayData>& data);
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,202 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor types for primitive/C-type-based arrays, such as numbers,
// boolean, and temporal types.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h" // IWYU pragma: export
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// Concrete Array class for boolean data
///
/// Values are read bit by bit from the values buffer (see Value()).
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
 public:
  using TypeClass = BooleanType;
  using IteratorType = stl::ArrayIterator<BooleanArray>;

  /// \brief Construct a BooleanArray that wraps existing ArrayData
  explicit BooleanArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a BooleanArray from a values buffer and optional
  /// validity bitmap
  BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return the bit stored for slot i, honouring the slice offset
  bool Value(int64_t i) const {
    const auto* bitmap = reinterpret_cast<const uint8_t*>(raw_values_);
    return bit_util::GetBit(bitmap, i + data_->offset);
  }

  /// \brief Same as Value()
  bool GetView(int64_t i) const { return Value(i); }

  /// \brief Return what an iterator positioned at slot i dereferences to
  util::optional<bool> operator[](int64_t i) const { return *IteratorType(*this, i); }

  /// \brief Return the number of false (0) values among the valid
  /// values. Result is not cached.
  int64_t false_count() const;

  /// \brief Return the number of true (1) values among the valid
  /// values. Result is not cached.
  int64_t true_count() const;

  /// \brief Iterator to the first slot
  IteratorType begin() const { return IteratorType(*this); }

  /// \brief Iterator positioned at length()
  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  using PrimitiveArray::PrimitiveArray;
};
/// \addtogroup numeric-arrays
///
/// @{
/// \brief Concrete Array class for numeric data with a corresponding C type
///
/// This class is templated on the corresponding DataType subclass for the
/// given data, for example NumericArray<Int8Type> or NumericArray<Date32Type>.
///
/// Note that convenience aliases are available for all accepted types
/// (for example Int8Array for NumericArray<Int8Type>).
template <typename TYPE>
class NumericArray : public PrimitiveArray {
 public:
  using TypeClass = TYPE;
  using value_type = typename TypeClass::c_type;
  using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;

  /// \brief Construct a NumericArray that wraps existing ArrayData
  explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}

  // Only enable this constructor without a type argument for types without additional
  // metadata
  template <typename T1 = TYPE>
  NumericArray(enable_if_parameter_free<T1, int64_t> length,
               const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
                       null_count, offset) {}

  /// \brief Return the values as a typed pointer, advanced by the slice offset
  const value_type* raw_values() const {
    const auto* typed_base = reinterpret_cast<const value_type*>(raw_values_);
    return typed_base + data_->offset;
  }

  /// \brief Return the value at slot i (no bounds check)
  value_type Value(int64_t i) const { return *(raw_values() + i); }

  // For API compatibility with BinaryArray etc.
  value_type GetView(int64_t i) const { return Value(i); }

  /// \brief Return what an iterator positioned at slot i dereferences to
  util::optional<value_type> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// \brief Iterator to the first slot
  IteratorType begin() const { return IteratorType(*this); }

  /// \brief Iterator positioned at length()
  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  using PrimitiveArray::PrimitiveArray;
};
/// \brief Array of Day and Millisecond values.
///
/// Each slot is read as a TypeClass::DayMilliseconds.
class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
 public:
  using TypeClass = DayTimeIntervalType;
  using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;

  /// \brief Construct a DayTimeIntervalArray that wraps existing ArrayData
  explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct from an explicit data type, values buffer and optional
  /// validity bitmap
  DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct from a values buffer and optional validity bitmap,
  /// without an explicit data type
  DayTimeIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return the value at slot i; defined out of line
  TypeClass::DayMilliseconds GetValue(int64_t i) const;

  /// \brief Same as GetValue()
  TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); }

  // For compatibility with Take kernel.
  TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }

  /// \brief Return what an iterator positioned at slot i dereferences to
  util::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// \brief Iterator to the first slot
  IteratorType begin() const { return IteratorType(*this); }

  /// \brief Iterator positioned at length()
  IteratorType end() const { return IteratorType(*this, length()); }

  /// \brief Size in bytes of one DayMilliseconds value
  int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); }

  /// \brief Return the raw bytes, advanced by the slice offset in whole values
  const uint8_t* raw_values() const {
    const auto byte_offset = data_->offset * byte_width();
    return raw_values_ + byte_offset;
  }
};
/// \brief Array of Month, Day and nanosecond values.
///
/// Each slot is read as a TypeClass::MonthDayNanos.
class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
 public:
  using TypeClass = MonthDayNanoIntervalType;
  using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;

  /// \brief Construct a MonthDayNanoIntervalArray that wraps existing ArrayData
  explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct from an explicit data type, values buffer and optional
  /// validity bitmap
  MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
                            const std::shared_ptr<Buffer>& data,
                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct from a values buffer and optional validity bitmap,
  /// without an explicit data type
  MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return the value at slot i; defined out of line
  TypeClass::MonthDayNanos GetValue(int64_t i) const;

  /// \brief Same as GetValue()
  TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); }

  // For compatibility with Take kernel.
  TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }

  /// \brief Return what an iterator positioned at slot i dereferences to
  util::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// \brief Iterator to the first slot
  IteratorType begin() const { return IteratorType(*this); }

  /// \brief Iterator positioned at length()
  IteratorType end() const { return IteratorType(*this, length()); }

  /// \brief Size in bytes of one MonthDayNanos value
  int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); }

  /// \brief Return the raw bytes, advanced by the slice offset in whole values
  const uint8_t* raw_values() const {
    const auto byte_offset = data_->offset * byte_width();
    return raw_values_ + byte_offset;
  }
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,213 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>
#include "arrow/array/builder_base.h"
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-builders
///
/// @{
namespace internal {
/// \brief Common base of the adaptive integer builders
///
/// Single appended values are first staged in a fixed-size pending area of
/// pending_size_ entries; when that area is full, CommitPendingData() (which
/// derived classes implement) is called. Bulk operations commit the pending
/// area first and then write directly into data_.
class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
 public:
  AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool);

  /// \brief Construct with a starting integer size of sizeof(uint8_t)
  explicit AdaptiveIntBuilderBase(MemoryPool* pool)
      : AdaptiveIntBuilderBase(sizeof(uint8_t), pool) {}

  /// \brief Append multiple nulls
  /// \param[in] length the number of nulls to append
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (ARROW_PREDICT_FALSE(length <= 0)) {
      // Nothing to append beyond flushing the pending values.
      return Status::OK();
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    // Zero the value slots that back the new nulls.
    uint8_t* first_new_slot = data_->mutable_data() + length_ * int_size_;
    memset(first_new_slot, 0, int_size_ * length);
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Stage a single null in the pending area
  Status AppendNull() final {
    const int32_t slot = pending_pos_++;
    pending_data_[slot] = 0;
    pending_valid_[slot] = 0;
    pending_has_nulls_ = true;
    ++length_;
    ++null_count_;
    if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
      return Status::OK();
    }
    return CommitPendingData();
  }

  /// \brief Append multiple zero-valued, non-null entries
  /// \param[in] length the number of entries to append
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (ARROW_PREDICT_FALSE(length <= 0)) {
      // Nothing to append beyond flushing the pending values.
      return Status::OK();
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    uint8_t* first_new_slot = data_->mutable_data() + length_ * int_size_;
    memset(first_new_slot, 0, int_size_ * length);
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Stage a single zero-valued, non-null entry in the pending area
  Status AppendEmptyValue() final {
    const int32_t slot = pending_pos_++;
    pending_data_[slot] = 0;
    pending_valid_[slot] = 1;
    ++length_;
    if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
      return Status::OK();
    }
    return CommitPendingData();
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;

 protected:
  /// \brief Stage a single valid value in the pending area
  Status AppendInternal(const uint64_t val) {
    const int32_t slot = pending_pos_++;
    pending_data_[slot] = val;
    pending_valid_[slot] = 1;
    ++length_;
    if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
      return Status::OK();
    }
    return CommitPendingData();
  }

  /// \brief Hand the staged values over to the builder; implemented by
  /// derived classes
  virtual Status CommitPendingData() = 0;

  // Overload selected when old_type is at least as wide as new_type.
  template <typename new_type, typename old_type>
  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
  ExpandIntSizeInternal();

  // Overload selected when old_type is narrower than new_type.
  template <typename new_type, typename old_type>
  typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
  ExpandIntSizeInternal();

  // Value storage; entries are int_size_ bytes wide (see AppendNulls()).
  std::shared_ptr<ResizableBuffer> data_;
  uint8_t* raw_data_ = NULLPTR;

  const uint8_t start_int_size_;
  uint8_t int_size_;

  // Staging area for single-value appends.
  static constexpr int32_t pending_size_ = 1024;
  uint8_t pending_valid_[pending_size_];
  uint64_t pending_data_[pending_size_];
  int32_t pending_pos_ = 0;
  bool pending_has_nulls_ = false;
};
} // namespace internal
/// \brief Adaptive builder for unsigned integer values
///
/// Values are accepted as uint64_t; scalar appends go through
/// AdaptiveIntBuilderBase::AppendInternal().
class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
public:
/// \brief Construct with an explicit starting integer size
/// \param[in] start_int_size the initial integer size
/// \param[in] pool the MemoryPool to use
explicit AdaptiveUIntBuilder(uint8_t start_int_size,
MemoryPool* pool = default_memory_pool());
/// \brief Construct with a starting integer size of sizeof(uint8_t)
explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
: AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}
using ArrayBuilder::Advance;
using internal::AdaptiveIntBuilderBase::Reset;
/// Scalar append
Status Append(const uint64_t val) { return AppendInternal(val); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const uint64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
/// \brief Write the built array data to `out`; defined out of line
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \brief Return the data type of this builder
///
/// NOTE(review): presumably this depends on the current integer size --
/// confirm against the implementation.
std::shared_ptr<DataType> type() const override;
protected:
// Implements the hook declared by AdaptiveIntBuilderBase.
Status CommitPendingData() override;
// NOTE(review): presumably switches storage to new_int_size -- confirm.
Status ExpandIntSize(uint8_t new_int_size);
// Out-of-line worker taking the same arguments as AppendValues().
Status AppendValuesInternal(const uint64_t* values, int64_t length,
const uint8_t* valid_bytes);
// Typed counterpart of ExpandIntSize(); new_type selects the target type.
template <typename new_type>
Status ExpandIntSizeN();
};
/// \brief Adaptive builder for signed integer values
///
/// Values are accepted as int64_t; scalar appends are cast to uint64_t and go
/// through AdaptiveIntBuilderBase::AppendInternal().
class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
public:
/// \brief Construct with an explicit starting integer size
/// \param[in] start_int_size the initial integer size
/// \param[in] pool the MemoryPool to use
explicit AdaptiveIntBuilder(uint8_t start_int_size,
MemoryPool* pool = default_memory_pool());
/// \brief Construct with a starting integer size of sizeof(uint8_t)
explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool())
: AdaptiveIntBuilder(sizeof(uint8_t), pool) {}
using ArrayBuilder::Advance;
using internal::AdaptiveIntBuilderBase::Reset;
/// Scalar append
Status Append(const int64_t val) { return AppendInternal(static_cast<uint64_t>(val)); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const int64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
/// \brief Write the built array data to `out`; defined out of line
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \brief Return the data type of this builder
///
/// NOTE(review): presumably this depends on the current integer size --
/// confirm against the implementation.
std::shared_ptr<DataType> type() const override;
protected:
// Implements the hook declared by AdaptiveIntBuilderBase.
Status CommitPendingData() override;
// NOTE(review): presumably switches storage to new_int_size -- confirm.
Status ExpandIntSize(uint8_t new_int_size);
// Out-of-line worker taking the same arguments as AppendValues().
Status AppendValuesInternal(const int64_t* values, int64_t length,
const uint8_t* valid_bytes);
// Typed counterpart of ExpandIntSize(); new_type selects the target type.
template <typename new_type>
Status ExpandIntSizeN();
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,350 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm> // IWYU pragma: keep
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/array_primitive.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \defgroup numeric-builders Concrete builder subclasses for numeric types
/// @{
/// @}
/// \defgroup temporal-builders Concrete builder subclasses for temporal types
/// @{
/// @}
/// \defgroup binary-builders Concrete builder subclasses for binary types
/// @{
/// @}
/// \defgroup nested-builders Concrete builder subclasses for nested types
/// @{
/// @}
/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
/// @{
/// @}
/// 32 (1 << 5).
/// NOTE(review): presumably the smallest capacity a builder allocates --
/// confirm against its uses.
constexpr int64_t kMinBuilderCapacity = 1 << 5;
/// One less than the largest int32_t value.
/// NOTE(review): presumably the upper bound on the number of elements of a
/// list with 32-bit offsets -- confirm against its uses.
constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
/// Base class for all data array builders.
///
/// This class provides facilities for incrementally building the null bitmap
/// (see Append methods) and as a side effect the current number of slots and
/// the null count.
///
/// \note Users are expected to use builders as one of the concrete types below.
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
class ARROW_EXPORT ArrayBuilder {
public:
explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}
ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
virtual ~ArrayBuilder() = default;
/// Child access for nested types. The children are owned by this builder, so
/// a plain (non-owning) pointer is handed out instead of a shared pointer.
ArrayBuilder* child(int i) {
  const std::shared_ptr<ArrayBuilder>& owned = children_[i];
  return owned.get();
}
/// Same child as child(i), but exposed as the owning shared pointer.
const std::shared_ptr<ArrayBuilder>& child_builder(int i) const {
  return children_[i];
}
/// Number of child builders, narrowed to int.
int num_children() const {
  const auto count = children_.size();
  return static_cast<int>(count);
}
/// \brief Array length so far (value of length_); virtual so subclasses may
/// override it.
virtual int64_t length() const { return length_; }
/// \brief Number of nulls recorded so far (value of null_count_).
int64_t null_count() const { return null_count_; }
/// \brief Current value of capacity_.
int64_t capacity() const { return capacity_; }
/// \brief Ensure that enough memory has been allocated to fit the indicated
/// number of total elements in the builder, including any that have already
/// been appended. Does not account for reallocations that may be due to
/// variable size data, like binary values. To make space for incremental
/// appends, use Reserve instead.
///
/// \param[in] capacity the minimum number of total array values to
/// accommodate. Must be greater than the current capacity.
/// \return Status
virtual Status Resize(int64_t capacity);
/// \brief Ensure that there is enough space allocated to append the indicated
/// number of elements without any further reallocation. Overallocation is
/// used in order to minimize the impact of incremental Reserve() calls.
/// Note that additional_capacity is relative to the current number of elements
/// rather than to the current capacity, so calls to Reserve() which are not
/// interspersed with addition of new elements may not increase the capacity.
///
/// \param[in] additional_capacity the number of additional array values
/// \return Status
Status Reserve(int64_t additional_capacity) {
  const auto available = capacity();
  const auto required = length() + additional_capacity;
  if (required > available) {
    // The growth factor is left to BufferBuilder.
    return Resize(BufferBuilder::GrowByFactor(available, required));
  }
  // Enough room already: nothing to reallocate.
  return Status::OK();
}
/// Reset the builder.
virtual void Reset();
/// \brief Append a null value to builder
virtual Status AppendNull() = 0;
/// \brief Append a number of null values to builder
virtual Status AppendNulls(int64_t length) = 0;
/// \brief Append a non-null value to builder
///
/// The appended value is an implementation detail, but the corresponding
/// memory slot is guaranteed to be initialized.
/// This method is useful when appending a null value to a parent nested type.
virtual Status AppendEmptyValue() = 0;
/// \brief Append a number of non-null values to builder
///
/// The appended values are an implementation detail, but the corresponding
/// memory slot is guaranteed to be initialized.
/// This method is useful when appending null values to a parent nested type.
virtual Status AppendEmptyValues(int64_t length) = 0;
/// \brief Append a value from a scalar
///
/// Forwards to the two-argument overload with n_repeats == 1.
Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
/// NOTE(review): defined out of line; presumably appends `scalar` n_repeats
/// times -- confirm against the implementation.
virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
/// NOTE(review): defined out of line; presumably appends every scalar in
/// `scalars` in order -- confirm against the implementation.
virtual Status AppendScalars(const ScalarVector& scalars);
/// \brief Append a range of values from an array.
///
/// The given array must be the same type as the builder.
virtual Status AppendArraySlice(const ArrayData& array, int64_t offset,
                                int64_t length) {
  // Base implementation: no arguments are used, the call always reports
  // NotImplemented and names this builder's type in the message.
  const std::shared_ptr<DataType> builder_type = type();
  return Status::NotImplemented("AppendArraySlice for builder for ", *builder_type);
}
/// For cases where raw data was memcpy'd into the internal buffers, allows us
/// to advance the length of the builder. It is your responsibility to use
/// this function responsibly.
ARROW_DEPRECATED(
"Deprecated in 6.0.0. ArrayBuilder::Advance is poorly supported and mostly "
"untested.\nFor low-level control over buffer construction, use BufferBuilder "
"or TypedBufferBuilder directly.")
Status Advance(int64_t elements);
/// \brief Return result of builder as an internal generic ArrayData
/// object. Resets builder except for dictionary builder
///
/// \param[out] out the finalized ArrayData object
/// \return Status
virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
/// \brief Return result of builder as an Array object.
///
/// The builder is reset except for DictionaryBuilder.
///
/// \param[out] out the finalized Array object
/// \return Status
Status Finish(std::shared_ptr<Array>* out);
/// \brief Return result of builder as an Array object.
///
/// The builder is reset except for DictionaryBuilder.
///
/// \return The finalized Array object
Result<std::shared_ptr<Array>> Finish();
/// \brief Return the type of the built Array
virtual std::shared_ptr<DataType> type() const = 0;
protected:
/// Append to null bitmap
Status AppendToBitmap(bool is_valid);
/// Vector append. Treat each zero byte as a null. If valid_bytes is null
/// assume all of length bits are valid.
Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
/// Uniform append. Append N times the same validity bit.
Status AppendToBitmap(int64_t num_bits, bool value);
/// Set the next length bits to not null (i.e. valid).
Status SetNotNull(int64_t length);
// Unsafe operations: none of these check capacity or resize.
void UnsafeAppendNull() { UnsafeAppendToBitmap(/*is_valid=*/false); }
// Append one validity bit; bumps the length and, for a null, the null count.
void UnsafeAppendToBitmap(bool is_valid) {
  null_bitmap_builder_.UnsafeAppend(is_valid);
  length_ += 1;
  if (is_valid) return;
  null_count_ += 1;
}
// Vector append. Each zero byte is treated as a null. If valid_bytes is null,
// all `length` slots are marked valid.
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
  if (valid_bytes != NULLPTR) {
    null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
    length_ += length;
    // The null count is re-read from the bitmap builder's false count.
    null_count_ = null_bitmap_builder_.false_count();
    return;
  }
  UnsafeSetNotNull(length);
}
// Vector append. Copies `length` bits from `bitmap` starting at bit `offset`.
// If bitmap is null, all `length` slots are marked valid.
void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
  if (bitmap != NULLPTR) {
    null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
    length_ += length;
    null_count_ = null_bitmap_builder_.false_count();
    return;
  }
  UnsafeSetNotNull(length);
}
// Append the same validity value a given number of times.
void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
  if (!value) {
    UnsafeSetNull(num_bits);
    return;
  }
  UnsafeSetNotNull(num_bits);
}
// NOTE(review): defined out of line; presumably appends one validity bit per
// element of is_valid -- confirm against the implementation.
void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
// Set the next validity bits to not null (i.e. valid).
void UnsafeSetNotNull(int64_t length);
// Set the next validity bits to null (i.e. invalid).
void UnsafeSetNull(int64_t length);
// NOTE(review): defined out of line; presumably shrinks `buffer` down to
// bytes_filled bytes -- confirm against the implementation.
static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
/// \brief Finish the builder and hand the result back as ArrayType
///
/// The array is produced through the generic Finish() and then cast with
/// static_pointer_cast, so no runtime type check is made here.
template <typename ArrayType>
Status FinishTyped(std::shared_ptr<ArrayType>* out) {
  std::shared_ptr<Array> generic;
  ARROW_RETURN_NOT_OK(Finish(&generic));
  *out = std::static_pointer_cast<ArrayType>(std::move(generic));
  return Status::OK();
}
// Validate a requested capacity: it must not be negative and must not be
// smaller than the current length.
Status CheckCapacity(int64_t new_capacity) {
  const bool is_negative = new_capacity < 0;
  if (ARROW_PREDICT_FALSE(is_negative)) {
    return Status::Invalid(
        "Resize capacity must be positive (requested: ", new_capacity, ")");
  }
  const bool would_downsize = new_capacity < length_;
  if (ARROW_PREDICT_FALSE(would_downsize)) {
    return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
                           ", current length: ", length_, ")");
  }
  return Status::OK();
}
// Check for array type
// NOTE(review): both overloads are defined out of line; presumably they
// return an error built from `message` when `array` does not have the
// expected type -- confirm against the implementation.
Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
const Array& array, const char* message);
Status CheckArrayType(Type::type expected_type, const Array& array,
const char* message);
// Memory pool supplied at construction.
MemoryPool* pool_;
// Validity bitmap under construction.
TypedBufferBuilder<bool> null_bitmap_builder_;
// Number of null slots appended so far.
int64_t null_count_ = 0;
// Array length, so far. Also, the index of the next element to be added
int64_t length_ = 0;
// Value returned by capacity() and compared against in Reserve().
int64_t capacity_ = 0;
// Child value array builders. These are owned by this class
std::vector<std::shared_ptr<ArrayBuilder>> children_;
private:
ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};
/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the data type to create the builder for
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out);
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  // Result-returning wrapper over the out-parameter overload declared above.
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &builder));
  return std::move(builder);
}
/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type, where any top-level or nested dictionary builders return the
/// exact index type specified by the type.
ARROW_EXPORT
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out);
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  // Result-returning wrapper over the out-parameter overload declared above.
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &builder));
  return std::move(builder);
}
/// \brief Construct an empty DictionaryBuilder initialized optionally
/// with a pre-existing dictionary
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the dictionary type to create the builder for
/// \param[in] dictionary the initial dictionary, if any. May be nullptr
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
const std::shared_ptr<Array>& dictionary,
std::unique_ptr<ArrayBuilder>* out);
inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
    MemoryPool* pool = default_memory_pool()) {
  // Result-returning wrapper over the out-parameter overload declared above.
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &builder));
  return std::move(builder);
}
} // namespace arrow

View File

@ -0,0 +1,703 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h" // IWYU pragma: export
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup binary-builders
///
/// @{
// ----------------------------------------------------------------------
// Binary and String
template <typename TYPE>
class BaseBinaryBuilder : public ArrayBuilder {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
/// \brief Create an empty builder; `pool` is passed to the base class and to
/// both the offsets builder and the value data builder.
explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), offsets_builder_(pool), value_data_builder_(pool) {}
/// \brief Constructor accepting a data type. `type` is not used: construction
/// is delegated to the pool-only constructor.
BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
: BaseBinaryBuilder(pool) {}
/// \brief Append one non-null value made of `length` bytes starting at `value`
Status Append(const uint8_t* value, offset_type length) {
  ARROW_RETURN_NOT_OK(Reserve(1));
  ARROW_RETURN_NOT_OK(AppendNextOffset());
  // Safety check for UBSAN: the data buffer is only touched when there are
  // bytes to copy.
  const bool has_bytes = length > 0;
  if (ARROW_PREDICT_TRUE(has_bytes)) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(length));
    ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
  }
  UnsafeAppendToBitmap(true);
  return Status::OK();
}
/// \brief Same as above for char data
Status Append(const char* value, offset_type length) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(value);
  return Append(bytes, length);
}
/// \brief Append the bytes of a string view; its size is cast to offset_type
Status Append(util::string_view value) {
  const auto num_bytes = static_cast<offset_type>(value.size());
  return Append(value.data(), num_bytes);
}
/// Add more bytes to the end of the most recently appended value
///
/// In contrast to Append, no new offset is written.
Status ExtendCurrent(const uint8_t* value, offset_type length) {
  // Safety check for UBSAN: nothing is appended for a non-positive length.
  const bool has_bytes = length > 0;
  if (ARROW_PREDICT_TRUE(has_bytes)) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(length));
    ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
  }
  return Status::OK();
}
Status ExtendCurrent(util::string_view value) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(value.data());
  const auto num_bytes = static_cast<offset_type>(value.size());
  return ExtendCurrent(bytes, num_bytes);
}
Status AppendNulls(int64_t length) final {
  // Every appended slot is empty, so they all share the current data length
  // as their offset.
  const int64_t num_bytes = value_data_builder_.length();
  ARROW_RETURN_NOT_OK(Reserve(length));
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
  }
  UnsafeAppendToBitmap(length, /*value=*/false);
  return Status::OK();
}
/// \brief Append a single null slot (no value bytes are added).
///
/// Capacity is reserved before the offset is written, matching Append(): if
/// the reservation fails, the builder is left untouched instead of holding an
/// extra offset with no matching slot.
Status AppendNull() final {
  ARROW_RETURN_NOT_OK(Reserve(1));
  ARROW_RETURN_NOT_OK(AppendNextOffset());
  UnsafeAppendToBitmap(false);
  return Status::OK();
}
/// \brief Append a single valid slot holding an empty value.
///
/// Capacity is reserved before the offset is written, for the same reason as
/// in AppendNull().
Status AppendEmptyValue() final {
  ARROW_RETURN_NOT_OK(Reserve(1));
  ARROW_RETURN_NOT_OK(AppendNextOffset());
  UnsafeAppendToBitmap(true);
  return Status::OK();
}
Status AppendEmptyValues(int64_t length) final {
  // Every appended slot is empty, so they all share the current data length
  // as their offset.
  const int64_t num_bytes = value_data_builder_.length();
  ARROW_RETURN_NOT_OK(Reserve(length));
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
  }
  UnsafeAppendToBitmap(length, /*value=*/true);
  return Status::OK();
}
/// \brief Append a value with no capacity checks
///
/// The caller is expected to have presized the offsets with Reserve() and
/// the value data with ReserveData().
void UnsafeAppend(const uint8_t* value, offset_type length) {
  // The offset must be recorded before the bytes are added, since it is read
  // from the current data length.
  UnsafeAppendNextOffset();
  value_data_builder_.UnsafeAppend(value, length);
  UnsafeAppendToBitmap(/*is_valid=*/true);
}
void UnsafeAppend(const char* value, offset_type length) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(value);
  UnsafeAppend(bytes, length);
}
void UnsafeAppend(const std::string& value) {
  const auto num_bytes = static_cast<offset_type>(value.size());
  UnsafeAppend(value.c_str(), num_bytes);
}
void UnsafeAppend(util::string_view value) {
  const auto num_bytes = static_cast<offset_type>(value.size());
  UnsafeAppend(value.data(), num_bytes);
}
/// Counterpart of ExtendCurrent that performs no capacity check
void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
  value_data_builder_.UnsafeAppend(value, length);
}
void UnsafeExtendCurrent(util::string_view value) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(value.data());
  const auto num_bytes = static_cast<offset_type>(value.size());
  UnsafeExtendCurrent(bytes, num_bytes);
}
/// Append a null slot without capacity checks; only an offset and a validity
/// bit are written.
void UnsafeAppendNull() {
  const int64_t data_length = value_data_builder_.length();
  offsets_builder_.UnsafeAppend(static_cast<offset_type>(data_length));
  UnsafeAppendToBitmap(/*is_valid=*/false);
}
/// Append a valid, empty slot without capacity checks.
void UnsafeAppendEmptyValue() {
  const int64_t data_length = value_data_builder_.length();
  offsets_builder_.UnsafeAppend(static_cast<offset_type>(data_length));
  UnsafeAppendToBitmap(/*is_valid=*/true);
}
/// \brief Append a sequence of strings in one shot.
///
/// \param[in] values a vector of strings
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value. Entries of `values` whose byte is zero
/// are appended as nulls and their contents are not copied.
/// \return Status; an error from ReserveData() (CapacityError) when the bytes
/// to copy would push the value data past memory_limit()
Status AppendValues(const std::vector<std::string>& values,
                    const uint8_t* valid_bytes = NULLPTR) {
  const std::size_t num_values = values.size();
  // Only entries that are actually copied count towards the data size, so
  // strings sitting in null slots can neither inflate the reservation nor
  // trigger a spurious capacity error.
  std::size_t total_length = 0;
  for (std::size_t i = 0; i < num_values; ++i) {
    if (valid_bytes == NULLPTR || valid_bytes[i]) {
      total_length += values[i].size();
    }
  }
  ARROW_RETURN_NOT_OK(Reserve(num_values));
  // ReserveData() validates against memory_limit() before reserving; without
  // that check the offsets written below could overflow offset_type.
  ARROW_RETURN_NOT_OK(ReserveData(static_cast<int64_t>(total_length)));
  ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(num_values));
  for (std::size_t i = 0; i < num_values; ++i) {
    UnsafeAppendNextOffset();
    if (valid_bytes == NULLPTR || valid_bytes[i]) {
      value_data_builder_.UnsafeAppend(
          reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
    }
  }
  UnsafeAppendToBitmap(valid_bytes, num_values);
  return Status::OK();
}
/// \brief Append a sequence of nul-terminated strings in one shot.
/// If one of the values is NULL, it is processed as a null
/// value even if the corresponding valid_bytes entry is 1.
///
/// \param[in] values a contiguous C array of nul-terminated char *
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const char** values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
// First pass: measure every non-NULL string once and remember whether any
// NULL pointer was seen. Note that the total counts every non-NULL pointer,
// whatever valid_bytes says, so the reservation below may exceed what is
// actually copied.
std::size_t total_length = 0;
std::vector<std::size_t> value_lengths(length);
bool have_null_value = false;
for (int64_t i = 0; i < length; ++i) {
if (values[i] != NULLPTR) {
auto value_length = strlen(values[i]);
value_lengths[i] = value_length;
total_length += value_length;
} else {
have_null_value = true;
}
}
// Reserve slots and data up front so the Unsafe* calls below are in bounds.
ARROW_RETURN_NOT_OK(Reserve(length));
ARROW_RETURN_NOT_OK(ReserveData(total_length));
if (valid_bytes) {
// valid_bytes_offset is the first index whose validity has not been
// written to the bitmap yet.
int64_t valid_bytes_offset = 0;
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
if (valid_bytes[i]) {
if (values[i]) {
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
} else {
// Marked valid but the pointer is NULL: flush the pending validity bytes
// up to (not including) i, then force a null for slot i.
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
i - valid_bytes_offset);
UnsafeAppendToBitmap(false);
valid_bytes_offset = i + 1;
}
}
}
// Flush whatever validity bytes remain after the last forced null.
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
} else {
if (have_null_value) {
// No validity input but some NULL pointers: build the validity bytes
// locally, 1 for each non-NULL string.
std::vector<uint8_t> valid_vector(length, 0);
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
if (values[i]) {
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
valid_vector[i] = 1;
}
}
UnsafeAppendToBitmap(valid_vector.data(), length);
} else {
// All pointers are non-NULL: copy everything and mark all slots valid.
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
}
UnsafeAppendToBitmap(NULLPTR, length);
}
}
return Status::OK();
}
/// \brief Append `length` slots of `array`, starting at slot `offset`, one
/// value at a time
Status AppendArraySlice(const ArrayData& array, int64_t offset,
                        int64_t length) override {
  auto validity = array.GetValues<uint8_t>(0, 0);
  auto value_offsets = array.GetValues<offset_type>(1);
  auto value_bytes = array.GetValues<uint8_t>(2, 0);
  for (int64_t i = 0; i < length; i++) {
    // A missing validity bitmap means every slot is valid. The bitmap is
    // indexed with array.offset added explicitly.
    const bool is_null =
        validity && !bit_util::GetBit(validity, array.offset + offset + i);
    if (is_null) {
      ARROW_RETURN_NOT_OK(AppendNull());
      continue;
    }
    const offset_type begin = value_offsets[offset + i];
    const offset_type end = value_offsets[offset + i + 1];
    ARROW_RETURN_NOT_OK(Append(value_bytes + begin, end - begin));
  }
  return Status::OK();
}
/// \brief Reset the base builder state, then the offsets builder and the
/// value data builder.
void Reset() override {
ArrayBuilder::Reset();
offsets_builder_.Reset();
value_data_builder_.Reset();
}
/// \brief Check that adding `new_bytes` to the value data stays within
/// memory_limit()
///
/// \return OK when it fits, CapacityError otherwise
Status ValidateOverflow(int64_t new_bytes) {
  auto projected_size = value_data_builder_.length() + new_bytes;
  if (ARROW_PREDICT_TRUE(projected_size <= memory_limit())) {
    return Status::OK();
  }
  return Status::CapacityError("array cannot contain more than ", memory_limit(),
                               " bytes, have ", projected_size);
}
/// \brief Resize the offsets builder and the base builder to hold `capacity`
/// slots; the requested capacity is validated with CheckCapacity() first.
Status Resize(int64_t capacity) override {
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
// One more than requested for offsets
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
return ArrayBuilder::Resize(capacity);
}
/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
///
/// \param[in] elements number of bytes (despite the name) to reserve; it is
/// first checked with ValidateOverflow()
Status ReserveData(int64_t elements) {
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
return value_data_builder_.Reserve(elements);
}
/// \brief Finish the three buffers into an ArrayData and reset the builder
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  // The closing offset (total value length) is only written here.
  ARROW_RETURN_NOT_OK(AppendNextOffset());
  // These buffers' padding zeroed by BufferBuilder
  std::shared_ptr<Buffer> offsets;
  ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
  std::shared_ptr<Buffer> values;
  ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&values));
  std::shared_ptr<Buffer> validity;
  ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&validity));
  // Buffer order: validity bitmap, offsets, value data.
  *out = ArrayData::Make(type(), length_, {validity, offsets, values},
                         null_count_, 0);
  Reset();
  return Status::OK();
}
/// \return data pointer of the value data builder
const uint8_t* value_data() const { return value_data_builder_.data(); }
/// \return size of values buffer so far
int64_t value_data_length() const { return value_data_builder_.length(); }
/// \return capacity of values buffer
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
/// \return data pointer of the offsets builder
const offset_type* offsets_data() const { return offsets_builder_.data(); }
/// Temporary access to a value.
///
/// This pointer becomes invalid on the next modifying operation.
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
const offset_type* offsets = offsets_builder_.data();
const auto offset = offsets[i];
if (i == (length_ - 1)) {
*out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
} else {
*out_length = offsets[i + 1] - offset;
}
return value_data_builder_.data() + offset;
}
offset_type offset(int64_t i) const { return offsets_data()[i]; }
/// Temporary access to a value.
///
/// This view becomes invalid on the next modifying operation.
util::string_view GetView(int64_t i) const {
offset_type value_length;
const uint8_t* value_data = GetValue(i, &value_length);
return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
}
// Cannot make this a static attribute because of linking issues
/// \return the byte limit checked by ValidateOverflow(): one less than the
/// largest value representable by offset_type
static constexpr int64_t memory_limit() {
return std::numeric_limits<offset_type>::max() - 1;
}
protected:
// Offsets into the value data, one per appended slot.
TypedBufferBuilder<offset_type> offsets_builder_;
// Concatenated bytes of all appended values.
TypedBufferBuilder<uint8_t> value_data_builder_;
// Record the current value data length as the next offset (checked append).
Status AppendNextOffset() {
  const auto next_offset = static_cast<offset_type>(value_data_builder_.length());
  return offsets_builder_.Append(next_offset);
}
// Same as AppendNextOffset, but without any capacity check.
void UnsafeAppendNextOffset() {
  const auto next_offset = static_cast<offset_type>(value_data_builder_.length());
  offsets_builder_.UnsafeAppend(next_offset);
}
};
/// \class BinaryBuilder
/// \brief Builder class for variable-length binary data
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
public:
// Inherit the BaseBinaryBuilder constructors.
using BaseBinaryBuilder::BaseBinaryBuilder;
/// \cond FALSE
// Keep the ArrayBuilder::Finish overloads visible alongside the typed one.
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish into a BinaryArray; forwards to FinishTyped().
Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
/// \return the type produced by binary()
std::shared_ptr<DataType> type() const override { return binary(); }
};
/// \class StringBuilder
/// \brief Builder class for UTF8 strings
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
public:
// Inherit the BinaryBuilder constructors.
using BinaryBuilder::BinaryBuilder;
/// \cond FALSE
// Keep the ArrayBuilder::Finish overloads visible alongside the typed one.
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish into a StringArray; forwards to FinishTyped().
Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
/// \return the type produced by utf8()
std::shared_ptr<DataType> type() const override { return utf8(); }
};
/// \class LargeBinaryBuilder
/// \brief Builder class for large variable-length binary data
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
public:
// Inherit the BaseBinaryBuilder constructors.
using BaseBinaryBuilder::BaseBinaryBuilder;
/// \cond FALSE
// Keep the ArrayBuilder::Finish overloads visible alongside the typed one.
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish into a LargeBinaryArray; forwards to FinishTyped().
Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
/// \return the type produced by large_binary()
std::shared_ptr<DataType> type() const override { return large_binary(); }
};
/// \class LargeStringBuilder
/// \brief Builder class for large UTF8 strings
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
public:
// Inherit the LargeBinaryBuilder constructors.
using LargeBinaryBuilder::LargeBinaryBuilder;
/// \cond FALSE
// Keep the ArrayBuilder::Finish overloads visible alongside the typed one.
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish into a LargeStringArray; forwards to FinishTyped().
Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
/// \return the type produced by large_utf8()
std::shared_ptr<DataType> type() const override { return large_utf8(); }
};
// ----------------------------------------------------------------------
// FixedSizeBinaryBuilder
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
public:
// Arrow data type class associated with this builder.
using TypeClass = FixedSizeBinaryType;
/// \brief Construct a builder for the given type (defined out of line).
/// NOTE(review): presumably byte_width_ is taken from `type` -- confirm
/// against the implementation.
explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
/// \brief Append one value read from `value`
///
/// Every overload below reserves room for one slot and then forwards to the
/// matching UnsafeAppend().
Status Append(const uint8_t* value) {
  ARROW_RETURN_NOT_OK(Reserve(1));
  UnsafeAppend(value);
  return Status::OK();
}
Status Append(const char* value) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(value);
  return Append(bytes);
}
Status Append(const util::string_view& view) {
  ARROW_RETURN_NOT_OK(Reserve(1));
  UnsafeAppend(view);
  return Status::OK();
}
Status Append(const std::string& s) {
  ARROW_RETURN_NOT_OK(Reserve(1));
  UnsafeAppend(s);
  return Status::OK();
}
Status Append(const Buffer& s) {
  ARROW_RETURN_NOT_OK(Reserve(1));
  const util::string_view contents(s);
  UnsafeAppend(contents);
  return Status::OK();
}
Status Append(const std::shared_ptr<Buffer>& s) {
  const Buffer& buffer = *s;
  return Append(buffer);
}
template <size_t NBYTES>
Status Append(const std::array<uint8_t, NBYTES>& value) {
  ARROW_RETURN_NOT_OK(Reserve(1));
  const auto* chars = reinterpret_cast<const char*>(value.data());
  UnsafeAppend(util::string_view(chars, value.size()));
  return Status::OK();
}
// Bulk appends, defined out of line.
// NOTE(review): presumably `data` holds `length` contiguous values of
// byte_width_ bytes each and valid_bytes has one byte per value -- confirm
// against the implementation.
Status AppendValues(const uint8_t* data, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
// Variant taking a validity bitmap together with a starting bit offset; this
// is the overload AppendArraySlice() forwards to.
Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
int64_t bitmap_offset);
// Overrides of the ArrayBuilder null / empty-value appends, defined out of line.
Status AppendNull() final;
Status AppendNulls(int64_t length) final;
Status AppendEmptyValue() final;
Status AppendEmptyValues(int64_t length) final;
/// \brief Append `length` slots of `array` starting at slot `offset`
Status AppendArraySlice(const ArrayData& array, int64_t offset,
                        int64_t length) override {
  // First slot to copy, expressed relative to the start of the buffers.
  const int64_t first_slot = array.offset + offset;
  auto values = array.GetValues<uint8_t>(1, 0) + (first_slot * byte_width_);
  auto validity = array.GetValues<uint8_t>(0, 0);
  return AppendValues(values, length, validity, first_slot);
}
void UnsafeAppend(const uint8_t* value) {
UnsafeAppendToBitmap(true);
if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
byte_builder_.UnsafeAppend(value, byte_width_);
}
}
void UnsafeAppend(const char* value) {
UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
}
void UnsafeAppend(util::string_view value) {
#ifndef NDEBUG
CheckValueSize(static_cast<size_t>(value.size()));
#endif
UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
}
void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }
void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
void UnsafeAppendNull() {
UnsafeAppendToBitmap(false);
byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
}
/// \brief Check that adding `new_bytes` to the value bytes stays within
/// memory_limit()
///
/// \return OK when it fits, CapacityError otherwise
Status ValidateOverflow(int64_t new_bytes) const {
  auto projected_size = byte_builder_.length() + new_bytes;
  if (ARROW_PREDICT_TRUE(projected_size <= memory_limit())) {
    return Status::OK();
  }
  return Status::CapacityError("array cannot contain more than ", memory_limit(),
                               " bytes, have ", projected_size);
}
/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
///
/// \param[in] elements number of bytes (despite the name) to reserve; it is
/// first checked with ValidateOverflow()
Status ReserveData(int64_t elements) {
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
return byte_builder_.Reserve(elements);
}
// ArrayBuilder overrides defined out of line.
void Reset() override;
Status Resize(int64_t capacity) override;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish into a FixedSizeBinaryArray; forwards to FinishTyped().
Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }
/// \return size of values buffer so far
int64_t value_data_length() const { return byte_builder_.length(); }
/// \return the value of byte_width_, the number of bytes copied per value
int32_t byte_width() const { return byte_width_; }
/// Temporary access to a value.
///
/// This pointer becomes invalid on the next modifying operation.
const uint8_t* GetValue(int64_t i) const;
/// Temporary access to a value.
///
/// This view becomes invalid on the next modifying operation.
util::string_view GetView(int64_t i) const;
/// \return the byte limit checked by ValidateOverflow(): one less than the
/// largest int64_t value
static constexpr int64_t memory_limit() {
return std::numeric_limits<int64_t>::max() - 1;
}
/// \return a fixed_size_binary type of width byte_width_
std::shared_ptr<DataType> type() const override {
return fixed_size_binary(byte_width_);
}
protected:
// Number of bytes copied for each appended value.
int32_t byte_width_;
// Concatenated bytes of all appended values.
BufferBuilder byte_builder_;
/// Short-lived mutable access to the bytes of slot `i`.
///
/// The returned pointer is invalidated by the next modifying operation.
uint8_t* GetMutableValue(int64_t i) {
  uint8_t* base = byte_builder_.mutable_data();
  const int64_t byte_offset = i * byte_width_;
  return base + byte_offset;
}
// Defined out of line; called from UnsafeAppend(util::string_view) in builds
// without NDEBUG.
void CheckValueSize(int64_t size);
};
/// @}
// ----------------------------------------------------------------------
// Chunked builders: build a sequence of BinaryArray or StringArray that are
// limited to a particular size (to the upper limit of 2GB)
namespace internal {
class ARROW_EXPORT ChunkedBinaryBuilder {
 public:
  explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
                                MemoryPool* pool = default_memory_pool());
  ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
                       MemoryPool* pool = default_memory_pool());
  virtual ~ChunkedBinaryBuilder() = default;

  /// \brief Append a value, moving on to a new chunk whenever the current one
  /// cannot take it
  Status Append(const uint8_t* value, int32_t length) {
    const int64_t pending_bytes = builder_->value_data_length();
    if (ARROW_PREDICT_FALSE(length + pending_bytes > max_chunk_value_length_)) {
      if (pending_bytes != 0) {
        // The item does not fit next to the data already in this chunk: close
        // the chunk and retry against the fresh one.
        ARROW_RETURN_NOT_OK(NextChunk());
        return Append(value, length);
      }
      // The chunk is empty, so the item alone exceeds
      // max_chunk_value_length_: this chunk will be oversize and hold *only*
      // this item.
      ARROW_RETURN_NOT_OK(builder_->Append(value, length));
      return NextChunk();
    }
    const bool chunk_is_full = builder_->length() == max_chunk_length_;
    if (ARROW_PREDICT_FALSE(chunk_is_full)) {
      // One more item would exceed max_chunk_length_: start a new chunk and
      // append there.
      ARROW_RETURN_NOT_OK(NextChunk());
    }
    return builder_->Append(value, length);
  }

  /// \brief Append the bytes of a string view; its size is cast to int32_t
  Status Append(const util::string_view& value) {
    const auto* bytes = reinterpret_cast<const uint8_t*>(value.data());
    const auto num_bytes = static_cast<int32_t>(value.size());
    return Append(bytes, num_bytes);
  }

  /// \brief Append a null, starting a new chunk first if the current one
  /// already holds max_chunk_length_ items
  Status AppendNull() {
    const bool chunk_is_full = builder_->length() == max_chunk_length_;
    if (ARROW_PREDICT_FALSE(chunk_is_full)) {
      ARROW_RETURN_NOT_OK(NextChunk());
    }
    return builder_->AppendNull();
  }

  Status Reserve(int64_t values);
  virtual Status Finish(ArrayVector* out);

 protected:
  Status NextChunk();
  // maximum total character data size per chunk
  int64_t max_chunk_value_length_;
  // maximum elements allowed per chunk
  int64_t max_chunk_length_ = kListMaximumElements;
  // when Reserve() would cause builder_ to exceed its max_chunk_length_,
  // add to extra_capacity_ instead and wait to reserve until the next chunk
  int64_t extra_capacity_ = 0;
  // builder for the chunk currently being filled
  std::unique_ptr<BinaryBuilder> builder_;
  // NOTE(review): presumably the chunks already completed by NextChunk() --
  // confirm against the implementation.
  std::vector<std::shared_ptr<Array>> chunks_;
};
/// Variant of ChunkedBinaryBuilder that overrides Finish().
/// NOTE(review): presumably the override yields StringArray chunks instead of
/// BinaryArray chunks -- confirm against the out-of-line definition.
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
public:
// Inherit the ChunkedBinaryBuilder constructors.
using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
Status Finish(ArrayVector* out) override;
};
} // namespace internal
} // namespace arrow

View File

@ -0,0 +1,100 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/array/array_decimal.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/data.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-builders
///
/// @{
/// \brief Builder for Decimal128Array, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal128Type;
using ValueType = Decimal128;
/// \param[in] type data type of the array being built
/// \param[in] pool memory pool; defaults to default_memory_pool()
// NOTE(review): `type` is presumably required to be a Decimal128Type since
// it ends up in decimal_type_ -- confirm in the out-of-line definition.
explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
// Keep the FixedSizeBinaryBuilder members of these names visible alongside
// the overloads declared in this class.
using FixedSizeBinaryBuilder::Append;
using FixedSizeBinaryBuilder::AppendValues;
using FixedSizeBinaryBuilder::Reset;
/// \brief Append one decimal value
Status Append(Decimal128 val);
/// \brief Append one decimal value without returning a Status
// NOTE(review): presumably requires capacity to have been reserved first,
// as with other Unsafe* builder methods -- confirm.
void UnsafeAppend(Decimal128 val);
/// \brief Append one value given as a string_view, without returning a Status
void UnsafeAppend(util::string_view val);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the builder into a Decimal128Array (forwards to FinishTyped)
Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }
/// \brief The decimal type held in decimal_type_
std::shared_ptr<DataType> type() const override { return decimal_type_; }
protected:
// Concrete decimal type returned by type()
std::shared_ptr<Decimal128Type> decimal_type_;
};
/// \brief Builder for Decimal256Array, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal256Type;
using ValueType = Decimal256;
/// \param[in] type data type of the array being built
/// \param[in] pool memory pool; defaults to default_memory_pool()
// NOTE(review): `type` is presumably required to be a Decimal256Type since
// it ends up in decimal_type_ -- confirm in the out-of-line definition.
explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
// Keep the FixedSizeBinaryBuilder members of these names visible alongside
// the overloads declared in this class.
using FixedSizeBinaryBuilder::Append;
using FixedSizeBinaryBuilder::AppendValues;
using FixedSizeBinaryBuilder::Reset;
/// \brief Append one decimal value (taken by const reference, unlike the
/// by-value Decimal128Builder::Append)
Status Append(const Decimal256& val);
/// \brief Append one decimal value without returning a Status
// NOTE(review): presumably requires capacity to have been reserved first,
// as with other Unsafe* builder methods -- confirm.
void UnsafeAppend(const Decimal256& val);
/// \brief Append one value given as a string_view, without returning a Status
void UnsafeAppend(util::string_view val);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the builder into a Decimal256Array (forwards to FinishTyped)
Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
/// \brief The decimal type held in decimal_type_
std::shared_ptr<DataType> type() const override { return decimal_type_; }
protected:
// Concrete decimal type returned by type()
std::shared_ptr<Decimal256Type> decimal_type_;
};
/// \brief Unsuffixed name for the 128-bit decimal builder
using DecimalBuilder = Decimal128Builder;
/// @}
} // namespace arrow

View File

@ -0,0 +1,722 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>
#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
#include "arrow/array/builder_base.h" // IWYU pragma: export
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
// ----------------------------------------------------------------------
// Dictionary builder
namespace internal {
/// \brief Trait describing how values of Arrow type T are handed to
/// DictionaryMemoTable
///
/// `type` is the C++ type of a single value; `PhysicalType` is the Arrow type
/// used as the tag that selects a DictionaryMemoTable::GetOrInsert overload.
/// The primary template uses T's own c_type and T itself.
template <typename T, typename Enable = void>
struct DictionaryValue {
using type = typename T::c_type;
using PhysicalType = T;
};
/// \brief Base-binary types are passed as string_view and memoized as
/// BinaryType when offsets are 32-bit, LargeBinaryType otherwise
template <typename T>
struct DictionaryValue<T, enable_if_base_binary<T>> {
using type = util::string_view;
using PhysicalType =
typename std::conditional<std::is_same<typename T::offset_type, int32_t>::value,
BinaryType, LargeBinaryType>::type;
};
/// \brief Fixed-size binary types are passed as string_view and memoized as
/// BinaryType
template <typename T>
struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
using type = util::string_view;
using PhysicalType = BinaryType;
};
/// \brief Type-dispatching front end to the dictionary memo table
///
/// The real table lives in DictionaryMemoTableImpl, which is only forward
/// declared here; this class exposes construction, bulk insertion, export of
/// the accumulated values, and a per-value GetOrInsert entry point.
class ARROW_EXPORT DictionaryMemoTable {
 public:
  DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
  DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
  ~DictionaryMemoTable();

  /// \brief Export memo contents as ArrayData
  // NOTE(review): start_offset is presumably the first memo entry to include
  // (callers pass 0 or the delta offset) -- confirm in the implementation.
  Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);

  /// \brief Insert new memo values
  Status InsertValues(const Array& values);

  int32_t size() const;

  /// \brief Entry point for a single value of Arrow type T
  ///
  /// \param[in] value the value, in the C++ form given by DictionaryValue<T>
  /// \param[out] out receives the int32 result of the selected overload
  template <typename T>
  Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
    // The implementation is kept private, and extern template classes are
    // avoided because of compiler issues (MinGW?). Dispatch therefore goes
    // through one explicit overload per supported physical type, picked by
    // the static type of a null tag pointer.
    using Physical = typename DictionaryValue<T>::PhysicalType;
    return GetOrInsert(static_cast<const Physical*>(NULLPTR), value, out);
  }

 private:
  // One overload per supported physical type; the pointer argument is only a
  // type tag.
  Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
  Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
  Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
  Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
  Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
  Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
  Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
  Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
  Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
  Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
  Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
  Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
  Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
  Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
  Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
  Status GetOrInsert(const MonthDayNanoIntervalType*,
                     MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
  Status GetOrInsert(const DayTimeIntervalType*,
                     DayTimeIntervalType::DayMilliseconds value, int32_t* out);
  Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
  Status GetOrInsert(const FloatType*, float value, int32_t* out);
  Status GetOrInsert(const DoubleType*, double value, int32_t* out);
  Status GetOrInsert(const BinaryType*, util::string_view value, int32_t* out);
  Status GetOrInsert(const LargeBinaryType*, util::string_view value, int32_t* out);

  class DictionaryMemoTableImpl;
  std::unique_ptr<DictionaryMemoTableImpl> impl_;
};
} // namespace internal
/// \addtogroup dictionary-builders
///
/// @{
namespace internal {
/// \brief Array builder for creating an encoded DictionaryArray from a
/// dense array
///
/// Unlike other builders, dictionary builder does not completely
/// reset the state on Finish calls.
template <typename BuilderType, typename T>
class DictionaryBuilderBase : public ArrayBuilder {
public:
using TypeClass = DictionaryType;
using Value = typename DictionaryValue<T>::type;
// WARNING: the type given below is the value type, not the DictionaryType.
// The DictionaryType is instantiated on the Finish() call.
/// \brief Construct from a value type with an explicit starting index width
///
/// Participates in overload resolution only when BuilderType derives from
/// AdaptiveIntBuilderBase and T is not a fixed-size binary type.
///
/// \param[in] start_int_size forwarded to the indices builder
/// \param[in] value_type type of the dictionary values; also handed to the
/// memo table
/// \param[in] pool memory pool passed to the base builder, the memo table and
/// the indices builder
template <typename B = BuilderType, typename T1 = T>
DictionaryBuilderBase(uint8_t start_int_size,
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
!is_fixed_size_binary_type<T1>::value,
const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
// -1: the byte width is only used for fixed-size binary value types
byte_width_(-1),
indices_builder_(start_int_size, pool),
value_type_(value_type) {}
/// \brief Construct from a value type, with a default-constructed indices
/// builder
///
/// Participates in overload resolution only when T is not a fixed-size binary
/// type.
///
/// \param[in] value_type type of the dictionary values; also handed to the
/// memo table
/// \param[in] pool memory pool passed to the base builder, the memo table and
/// the indices builder
template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
// -1: the byte width is only used for fixed-size binary value types
byte_width_(-1),
indices_builder_(pool),
value_type_(value_type) {}
/// \brief Construct from an index type and a value type
///
/// Participates in overload resolution only when T is not a fixed-size binary
/// type.
///
/// \param[in] index_type forwarded to the indices builder
/// \param[in] value_type type of the dictionary values; also handed to the
/// memo table
/// \param[in] pool memory pool passed to the base builder, the memo table and
/// the indices builder
template <typename T1 = T>
explicit DictionaryBuilderBase(
const std::shared_ptr<DataType>& index_type,
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
// -1: the byte width is only used for fixed-size binary value types
byte_width_(-1),
indices_builder_(index_type, pool),
value_type_(value_type) {}
/// \brief Fixed-size binary counterpart of the (start_int_size, value_type)
/// constructor
///
/// Participates in overload resolution only when BuilderType derives from
/// AdaptiveIntBuilderBase and T is a fixed-size binary type.
///
/// \param[in] start_int_size forwarded to the indices builder
/// \param[in] value_type type of the dictionary values; statically cast to T1
/// to read its byte width, so it must really be a T1
/// \param[in] pool memory pool passed to the base builder, the memo table and
/// the indices builder
template <typename B = BuilderType, typename T1 = T>
DictionaryBuilderBase(uint8_t start_int_size,
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
is_fixed_size_binary_type<T1>::value,
const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(start_int_size, pool),
value_type_(value_type) {}
/// \brief Fixed-size binary counterpart of the (value_type) constructor
///
/// Participates in overload resolution only when T is a fixed-size binary
/// type.
///
/// \param[in] value_type type of the dictionary values; statically cast to T1
/// to read its byte width, so it must really be a T1
/// \param[in] pool memory pool passed to the base builder, the memo table and
/// the indices builder
template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(pool),
value_type_(value_type) {}
/// \brief Fixed-size binary counterpart of the (index_type, value_type)
/// constructor
///
/// Participates in overload resolution only when T is a fixed-size binary
/// type.
///
/// \param[in] index_type forwarded to the indices builder
/// \param[in] value_type type of the dictionary values; statically cast to T1
/// to read its byte width, so it must really be a T1
/// \param[in] pool memory pool passed to the base builder, the memo table and
/// the indices builder
template <typename T1 = T>
explicit DictionaryBuilderBase(
const std::shared_ptr<DataType>& index_type,
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(index_type, pool),
value_type_(value_type) {}
/// \brief Construct without naming the value type
///
/// Available only for parameter-free value types; delegates to the
/// (value_type, pool) constructor with TypeTraits<T1>::type_singleton().
///
/// \param[in] pool memory pool forwarded to the delegated constructor
template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
: DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
// This constructor doesn't check for errors. Use InsertMemoValues instead.
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool),
value_type_(dictionary->type()) {}
// Defaulted: all members (memo table, indices builder, value type) clean up
// through their own destructors.
~DictionaryBuilderBase() override = default;
/// \brief Number of entries currently held by the memo table
///
/// The memo table reports an int32 size; it is widened to int64 here.
int64_t dictionary_length() const {
  return static_cast<int64_t>(memo_table_->size());
}
/// \brief Width in bytes of one value
///
/// Only declared when T is a fixed-size binary type; returns the stored
/// byte_width_ member.
template <typename T1 = T>
enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
  return byte_width_;
}
/// \brief Append one value
///
/// Reserves room for one slot, obtains the value's index from the memo table
/// through GetOrInsert, appends that index to the indices builder and bumps
/// the length.
///
/// \param[in] value the value to append
/// \return the first non-OK Status from any of those steps, else OK
Status Append(Value value) {
  ARROW_RETURN_NOT_OK(Reserve(1));

  // Filled in by the memo table
  int32_t dict_index;
  ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &dict_index));
  ARROW_RETURN_NOT_OK(indices_builder_.Append(dict_index));

  ++length_;
  return Status::OK();
}
/// \brief Append a fixed-width value given as raw bytes (FixedSizeBinaryType
/// only)
///
/// The value is viewed as byte_width_ bytes starting at `value`.
template <typename T1 = T>
enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
  const char* bytes = reinterpret_cast<const char*>(value);
  return Append(util::string_view(bytes, byte_width_));
}
/// \brief Append a fixed-width value given as chars (FixedSizeBinaryType
/// only)
///
/// The value is viewed as byte_width_ chars starting at `value`.
template <typename T1 = T>
enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
  const util::string_view view(value, byte_width_);
  return Append(view);
}
/// \brief Append `length` raw bytes (binary-like types only)
///
/// Reinterprets the bytes as chars and forwards to the
/// (const char*, int32_t) overload.
template <typename T1 = T>
enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
  const char* chars = reinterpret_cast<const char*>(value);
  return Append(chars, length);
}
/// \brief Append `length` chars starting at `value` (binary-like types only)
template <typename T1 = T>
enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
  const util::string_view view(value, length);
  return Append(view);
}
/// \brief Append `length` chars starting at `value` (string-like types only)
template <typename T1 = T>
enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
  const util::string_view view(value, length);
  return Append(view);
}
/// \brief Append a decimal (only for Decimal128Type)
///
/// The value is written to a 16-byte scratch buffer with ToBytes() and
/// appended through the (bytes, length) overload.
template <typename T1 = T>
enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
  constexpr int32_t kNumBytes = 16;
  uint8_t bytes[kNumBytes];
  value.ToBytes(bytes);
  return Append(bytes, kNumBytes);
}
/// \brief Append a decimal (only for Decimal256Type)
///
/// The value is written to a 32-byte scratch buffer with ToBytes() and
/// appended through the (bytes, length) overload.
template <typename T1 = T>
enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
  constexpr int32_t kNumBytes = 32;
  uint8_t bytes[kNumBytes];
  value.ToBytes(bytes);
  return Append(bytes, kNumBytes);
}
/// \brief Append one null slot
///
/// The length and null count are bumped before the indices builder is asked
/// to append its null, so they are updated even if that call fails.
Status AppendNull() final {
  ++length_;
  ++null_count_;
  return indices_builder_.AppendNull();
}
/// \brief Append `length` null slots
///
/// Both counters grow by `length` before the request is forwarded to the
/// indices builder.
Status AppendNulls(int64_t length) final {
  null_count_ += length;
  length_ += length;
  return indices_builder_.AppendNulls(length);
}
/// \brief Append one empty (non-null) slot
///
/// Only the indices builder is touched; nothing is added to the memo table.
Status AppendEmptyValue() final {
  ++length_;
  return indices_builder_.AppendEmptyValue();
}
/// \brief Append `length` empty (non-null) slots
///
/// Only the indices builder is touched; nothing is added to the memo table.
Status AppendEmptyValues(int64_t length) final {
  length_ += length;
  return indices_builder_.AppendEmptyValues(length);
}
/// \brief Append a dictionary scalar `n_repeats` times
///
/// An invalid scalar becomes `n_repeats` nulls. Otherwise the scalar is
/// treated as a DictionaryScalar whose dictionary has this builder's array
/// type, and the work is dispatched on the scalar's index type.
///
/// \return TypeError if the index type is not an 8/16/32/64-bit integer type
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
  if (!scalar.is_valid) {
    return AppendNulls(n_repeats);
  }

  using DictArrayType = typename TypeTraits<T>::ArrayType;
  const auto& dict_type = internal::checked_cast<const DictionaryType&>(*scalar.type);
  const auto& dict_scalar = internal::checked_cast<const DictionaryScalar&>(scalar);
  const auto& dict =
      internal::checked_cast<const DictArrayType&>(*dict_scalar.value.dictionary);
  ARROW_RETURN_NOT_OK(Reserve(n_repeats));

  // Holds the index scalar; dereferenced only in the branch that uses it
  const auto& payload = dict_scalar.value;
  switch (dict_type.index_type()->id()) {
    case Type::UINT8:
      return AppendScalarImpl<UInt8Type>(dict, *payload.index, n_repeats);
    case Type::INT8:
      return AppendScalarImpl<Int8Type>(dict, *payload.index, n_repeats);
    case Type::UINT16:
      return AppendScalarImpl<UInt16Type>(dict, *payload.index, n_repeats);
    case Type::INT16:
      return AppendScalarImpl<Int16Type>(dict, *payload.index, n_repeats);
    case Type::UINT32:
      return AppendScalarImpl<UInt32Type>(dict, *payload.index, n_repeats);
    case Type::INT32:
      return AppendScalarImpl<Int32Type>(dict, *payload.index, n_repeats);
    case Type::UINT64:
      return AppendScalarImpl<UInt64Type>(dict, *payload.index, n_repeats);
    case Type::INT64:
      return AppendScalarImpl<Int64Type>(dict, *payload.index, n_repeats);
    default:
      return Status::TypeError("Invalid index type: ", dict_type);
  }
  return Status::OK();
}
/// \brief Append each scalar in `scalars` once, in order
///
/// Stops at, and returns, the first non-OK Status.
Status AppendScalars(const ScalarVector& scalars) override {
  for (auto it = scalars.begin(); it != scalars.end(); ++it) {
    ARROW_RETURN_NOT_OK(AppendScalar(**it, /*n_repeats=*/1));
  }
  return Status::OK();
}
/// \brief Append a slice of dictionary-encoded data by re-inserting the
/// values its indices point to
///
/// \param[in] array dictionary-typed array data; its dictionary is wrapped in
/// this builder's array type
/// \param[in] offset first slot of the slice
/// \param[in] length number of slots in the slice
/// \return TypeError if the index type is not an 8/16/32/64-bit integer type
Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
  using DictArrayType = typename TypeTraits<T>::ArrayType;

  // Visit the indices and insert the unpacked values.
  const auto& dict_type = internal::checked_cast<const DictionaryType&>(*array.type);
  const DictArrayType dict(array.dictionary);
  ARROW_RETURN_NOT_OK(Reserve(length));

  switch (dict_type.index_type()->id()) {
    case Type::UINT8:
      return AppendArraySliceImpl<uint8_t>(dict, array, offset, length);
    case Type::INT8:
      return AppendArraySliceImpl<int8_t>(dict, array, offset, length);
    case Type::UINT16:
      return AppendArraySliceImpl<uint16_t>(dict, array, offset, length);
    case Type::INT16:
      return AppendArraySliceImpl<int16_t>(dict, array, offset, length);
    case Type::UINT32:
      return AppendArraySliceImpl<uint32_t>(dict, array, offset, length);
    case Type::INT32:
      return AppendArraySliceImpl<int32_t>(dict, array, offset, length);
    case Type::UINT64:
      return AppendArraySliceImpl<uint64_t>(dict, array, offset, length);
    case Type::INT64:
      return AppendArraySliceImpl<int64_t>(dict, array, offset, length);
    default:
      return Status::TypeError("Invalid index type: ", dict_type);
  }
  return Status::OK();
}
/// \brief Add values to the dictionary's memo without appending any indices
///
/// Can be used to seed a new builder with known dictionary values.
///
/// \param[in] values dictionary values to add to the memo. Type must match
/// builder type
/// \return the Status reported by the memo table
Status InsertMemoValues(const Array& values) {
  return memo_table_->InsertValues(values);
}
/// \brief Append every slot of a dense array (value types other than
/// fixed-size binary)
///
/// Null slots become nulls; all others are appended by view. The array's type
/// is checked against the builder's value type in debug builds only.
template <typename T1 = T>
enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
    const Array& array) {
  using ArrayType = typename TypeTraits<T>::ArrayType;

#ifndef NDEBUG
  ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
      value_type_, array, "Wrong value type of array to be appended"));
#endif

  const auto& typed = static_cast<const ArrayType&>(array);
  for (int64_t slot = 0; slot < array.length(); ++slot) {
    if (!array.IsNull(slot)) {
      ARROW_RETURN_NOT_OK(Append(typed.GetView(slot)));
      continue;
    }
    ARROW_RETURN_NOT_OK(AppendNull());
  }
  return Status::OK();
}
/// \brief Append every slot of a dense array (fixed-size binary value types)
///
/// Null slots become nulls; all others are appended from the pointer returned
/// by GetValue(). The array's type is checked against the builder's value
/// type in debug builds only.
template <typename T1 = T>
enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
#ifndef NDEBUG
  ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
      value_type_, array, "Wrong value type of array to be appended"));
#endif

  const auto& typed = static_cast<const FixedSizeBinaryArray&>(array);
  for (int64_t slot = 0; slot < array.length(); ++slot) {
    if (!array.IsNull(slot)) {
      ARROW_RETURN_NOT_OK(Append(typed.GetValue(slot)));
      continue;
    }
    ARROW_RETURN_NOT_OK(AppendNull());
  }
  return Status::OK();
}
/// \brief Partial reset: base builder state and indices only
///
/// The accumulated dictionary values are kept; call ResetFull() to drop
/// those as well.
void Reset() override {
  ArrayBuilder::Reset();
  indices_builder_.Reset();
}
/// \brief Reset and also clear accumulated dictionary values in memo table
void ResetFull() {
Reset();
memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_));
}
/// \brief Resize the indices builder and mirror its capacity
///
/// The requested capacity is validated with CheckCapacity() and raised to at
/// least kMinBuilderCapacity before being applied.
Status Resize(int64_t capacity) override {
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
  const int64_t target = std::max(capacity, kMinBuilderCapacity);
  ARROW_RETURN_NOT_OK(indices_builder_.Resize(target));
  capacity_ = indices_builder_.capacity();
  return Status::OK();
}
/// \brief Finish the indices together with a delta dictionary
///
/// The delta starts at the memo size recorded by the previous Finish or
/// FinishDelta. Builder state is reset afterwards, except for the memo table.
///
/// \param[out] out_indices the finished indices array
/// \param[out] out_delta the delta dictionary array
Status FinishDelta(std::shared_ptr<Array>* out_indices,
                   std::shared_ptr<Array>* out_delta) {
  std::shared_ptr<ArrayData> indices;
  std::shared_ptr<ArrayData> delta;
  ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices, &delta));

  *out_indices = MakeArray(indices);
  *out_delta = MakeArray(delta);
  return Status::OK();
}
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond

/// \brief Finish the builder into a DictionaryArray
///
/// Forwards to FinishTyped(); the base-class Finish overloads remain visible
/// through the using-declaration above.
Status Finish(std::shared_ptr<DictionaryArray>* out) {
  return FinishTyped(out);
}
/// \brief Dictionary type built from the indices builder's current type and
/// the value type
std::shared_ptr<DataType> type() const override {
  return ::arrow::dictionary(indices_builder_.type(), value_type_);
}
protected:
/// \brief Append a slice of dictionary-encoded data whose indices have C
/// type `c_type`
///
/// Each non-null slot's index is looked up in `dict`: a valid dictionary
/// entry is appended by view, an invalid one becomes a null. Null slots
/// become nulls.
template <typename c_type>
Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
                            const ArrayData& array, int64_t offset, int64_t length) {
  const c_type* indices = array.GetValues<c_type>(1) + offset;

  auto on_valid = [&](const int64_t position) {
    const int64_t dict_index = static_cast<int64_t>(indices[position]);
    if (!dict.IsValid(dict_index)) {
      return AppendNull();
    }
    return Append(dict.GetView(dict_index));
  };
  auto on_null = [&]() { return AppendNull(); };

  return VisitBitBlocks(array.buffers[0], array.offset + offset, length, on_valid,
                        on_null);
}
/// \brief Append the dictionary entry selected by `index_scalar`, repeated
/// `n_repeats` times
///
/// If the index scalar is invalid, or points at an invalid dictionary entry,
/// `n_repeats` nulls are appended instead.
template <typename IndexType>
Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
                        const Scalar& index_scalar, int64_t n_repeats) {
  using ScalarType = typename TypeTraits<IndexType>::ScalarType;
  const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;

  if (!index_scalar.is_valid || !dict.IsValid(index)) {
    return AppendNulls(n_repeats);
  }

  const auto& value = dict.GetView(index);
  for (int64_t remaining = n_repeats; remaining > 0; --remaining) {
    ARROW_RETURN_NOT_OK(Append(value));
  }
  return Status::OK();
}
/// \brief Finish into ArrayData carrying the whole dictionary
///
/// The indices are finished with a dictionary offset of 0, then the result
/// is given the dictionary type from type() and the dictionary data.
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  std::shared_ptr<ArrayData> dict_data;
  ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dict_data));

  // Set type of array data to the right dictionary type
  ArrayData& result = **out;
  result.type = type();
  result.dictionary = dict_data;
  return Status::OK();
}
/// \brief Finish the indices and export the dictionary from `dict_offset`
///
/// Also records the current memo size in delta_offset_ and resets the base
/// builder state, so the builder can be reused.
///
/// \param[in] dict_offset passed to the memo table's GetArrayData()
/// \param[out] out_indices finished indices data
/// \param[out] out_dictionary dictionary data produced by the memo table
Status FinishWithDictOffset(int64_t dict_offset,
                            std::shared_ptr<ArrayData>* out_indices,
                            std::shared_ptr<ArrayData>* out_dictionary) {
  // Finalize indices array
  ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));

  // Generate dictionary array from hash table contents
  ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
  delta_offset_ = memo_table_->size();

  // Update internals for further uses of this DictionaryBuilder
  ArrayBuilder::Reset();
  return Status::OK();
}
std::unique_ptr<DictionaryMemoTable> memo_table_;
// The size of the dictionary memo at last invocation of Finish, to use in
// FinishDelta for computing dictionary deltas
int32_t delta_offset_;
// Only used for FixedSizeBinaryType
int32_t byte_width_;
BuilderType indices_builder_;
std::shared_ptr<DataType> value_type_;
};
/// \brief Specialization for dictionaries whose value type is NullType
///
/// No memo table is kept: every slot goes straight to the indices builder,
/// and the finished data gets an empty NullArray as its dictionary. The
/// value_type / dictionary constructor arguments are accepted for signature
/// parity with the primary template and are not used.
template <typename BuilderType>
class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
 public:
  /// \brief Construct with an explicit starting index width (only when
  /// BuilderType derives from AdaptiveIntBuilderBase)
  template <typename B = BuilderType>
  DictionaryBuilderBase(
      enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
          start_int_size,
      const std::shared_ptr<DataType>& value_type,
      MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
                                 const std::shared_ptr<DataType>& value_type,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(index_type, pool) {}

  /// \brief Construct with an explicit starting index width and no value
  /// type (only when BuilderType derives from AdaptiveIntBuilderBase)
  template <typename B = BuilderType>
  explicit DictionaryBuilderBase(
      enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
          start_int_size,
      MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}

  explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  /// \brief Append a scalar null value
  Status AppendNull() final {
    length_ += 1;
    null_count_ += 1;
    return indices_builder_.AppendNull();
  }

  /// \brief Append `length` null slots
  Status AppendNulls(int64_t length) final {
    length_ += length;
    null_count_ += length;
    return indices_builder_.AppendNulls(length);
  }

  /// \brief Append one empty (non-null) slot
  Status AppendEmptyValue() final {
    length_ += 1;
    return indices_builder_.AppendEmptyValue();
  }

  /// \brief Append `length` empty (non-null) slots
  Status AppendEmptyValues(int64_t length) final {
    length_ += length;
    return indices_builder_.AppendEmptyValues(length);
  }

  /// \brief Append a whole dense array to the builder
  ///
  /// One null is appended per slot of `array`; the array's type is checked to
  /// be Type::NA in debug builds only.
  Status AppendArray(const Array& array) {
#ifndef NDEBUG
    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
        Type::NA, array, "Wrong value type of array to be appended"));
#endif
    for (int64_t i = 0; i < array.length(); i++) {
      ARROW_RETURN_NOT_OK(AppendNull());
    }
    return Status::OK();
  }

  /// \brief Reset the base builder state and the indices builder
  ///
  /// Without this override a Reset() would zero length_ and null_count_ while
  /// leaving previously appended indices in indices_builder_.
  void Reset() override {
    ArrayBuilder::Reset();
    indices_builder_.Reset();
  }

  /// \brief Resize the indices builder and mirror its capacity
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
    capacity_ = indices_builder_.capacity();
    return Status::OK();
  }

  /// \brief Finish into dictionary-typed ArrayData with an empty null
  /// dictionary
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
    (*out)->type = dictionary((*out)->type, null());
    (*out)->dictionary = NullArray(0).data();
    // length_ and null_count_ are tracked on this object as well as inside
    // the indices builder; clear them so the builder can be reused, as the
    // primary template does after finishing.
    ArrayBuilder::Reset();
    return Status::OK();
  }

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder into a DictionaryArray (forwards to
  /// FinishTyped)
  Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

  /// \brief Dictionary type built from the indices builder's current type
  /// and null()
  std::shared_ptr<DataType> type() const override {
    return ::arrow::dictionary(indices_builder_.type(), null());
  }

 protected:
  BuilderType indices_builder_;
};
} // namespace internal
/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
/// smallest index size that can accommodate the dictionary indices
template <typename T>
class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
 public:
  using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
  using BASE::BASE;

  /// \brief Append dictionary indices directly without modifying memo
  ///
  /// NOTE: Experimental API
  ///
  /// \param[in] values indices to append
  /// \param[in] length number of indices
  /// \param[in] valid_bytes optional per-index validity, forwarded to the
  /// indices builder's AppendValues()
  Status AppendIndices(const int64_t* values, int64_t length,
                       const uint8_t* valid_bytes = NULLPTR) {
    auto& indices = this->indices_builder_;
    const int64_t nulls_before = indices.null_count();
    ARROW_RETURN_NOT_OK(indices.AppendValues(values, length, valid_bytes));

    // Mirror what the indices builder now reports in this builder's counters
    this->capacity_ = indices.capacity();
    this->length_ += length;
    this->null_count_ += indices.null_count() - nulls_before;
    return Status::OK();
  }
};
/// \brief A DictionaryArray builder that always returns int32 dictionary
/// indices so that data cast to dictionary form will have a consistent index
/// type, e.g. for creating a ChunkedArray
template <typename T>
class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
 public:
  using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
  using BASE::BASE;

  /// \brief Append dictionary indices directly without modifying memo
  ///
  /// NOTE: Experimental API
  ///
  /// \param[in] values indices to append
  /// \param[in] length number of indices
  /// \param[in] valid_bytes optional per-index validity, forwarded to the
  /// indices builder's AppendValues()
  Status AppendIndices(const int32_t* values, int64_t length,
                       const uint8_t* valid_bytes = NULLPTR) {
    auto& indices = this->indices_builder_;
    const int64_t nulls_before = indices.null_count();
    ARROW_RETURN_NOT_OK(indices.AppendValues(values, length, valid_bytes));

    // Mirror what the indices builder now reports in this builder's counters
    this->capacity_ = indices.capacity();
    this->length_ += length;
    this->null_count_ += indices.null_count() - nulls_before;
    return Status::OK();
  }
};
// ----------------------------------------------------------------------
// Binary / Unicode builders
// (compatibility aliases; those used to be derived classes with additional
// Append() overloads, but they have been folded into DictionaryBuilderBase)
// Adaptive-index builders for binary and string values
using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
// Int32-index builders for binary and string values
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
using StringDictionary32Builder = Dictionary32Builder<StringType>;
/// @}
} // namespace arrow

View File

@ -0,0 +1,561 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-builders
///
/// @{
// ----------------------------------------------------------------------
// List builder
template <typename TYPE>
class BaseListBuilder : public ArrayBuilder {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
/// Use this constructor to incrementally build the value array along with offsets and
/// null bitmap.
///
/// \param[in] pool memory pool for the builder and its offsets buffer
/// \param[in] value_builder builder for the child values; shared with the
/// caller
/// \param[in] type list type; only its first field is kept, with the field's
/// type cleared (type() re-attaches the value builder's type)
BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
const std::shared_ptr<DataType>& type)
: ArrayBuilder(pool),
offsets_builder_(pool),
value_builder_(value_builder),
value_field_(type->field(0)->WithType(NULLPTR)) {}
/// \brief Construct without an explicit list type
///
/// Delegates to the three-argument constructor with list(value_builder->type()).
BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder)
: BaseListBuilder(pool, value_builder, list(value_builder->type())) {}
/// \brief Resize the builder to hold `capacity` list slots
///
/// \return CapacityError if `capacity` exceeds maximum_elements(); otherwise
/// the first non-OK Status from the capacity check or the resizes
Status Resize(int64_t capacity) override {
  const int64_t limit = maximum_elements();
  if (capacity > limit) {
    return Status::CapacityError("List array cannot reserve space for more than ",
                                 limit, " got ", capacity);
  }
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

  // One more than requested for offsets
  const int64_t num_offsets = capacity + 1;
  ARROW_RETURN_NOT_OK(offsets_builder_.Resize(num_offsets));
  return ArrayBuilder::Resize(capacity);
}
/// \brief Reset the base builder state, the offsets and the value builder
void Reset() override {
  ArrayBuilder::Reset();
  offsets_builder_.Reset();
  value_builder_->Reset();
}
/// \brief Vector append
///
/// Appends `length` offsets in one call. The offsets are copied as given,
/// without being checked against the value builder.
///
/// \param[in] offsets offsets to append
/// \param[in] length number of offsets (and of list slots)
/// \param[in] valid_bytes if passed, is of equal length to `offsets`, and any
/// zero byte will be considered as a null for that slot
Status AppendValues(const offset_type* offsets, int64_t length,
                    const uint8_t* valid_bytes = NULLPTR) {
  ARROW_RETURN_NOT_OK(Reserve(length));

  UnsafeAppendToBitmap(valid_bytes, length);
  offsets_builder_.UnsafeAppend(offsets, length);
  return Status::OK();
}
/// \brief Start a new variable-length list slot
///
/// This function should be called before beginning to append elements to the
/// value builder. The slot's start offset is the value builder's current
/// length.
///
/// \param[in] is_valid false marks the slot as null
Status Append(bool is_valid = true) {
  ARROW_RETURN_NOT_OK(Reserve(1));

  UnsafeAppendToBitmap(is_valid);
  return AppendNextOffset();
}
/// \brief Append one null list slot (same as Append(false))
Status AppendNull() final {
  return Append(false);
}
/// \brief Append `length` null list slots
///
/// Every new slot gets the value builder's current length as its offset, so
/// none of them covers any child values.
Status AppendNulls(int64_t length) final {
  ARROW_RETURN_NOT_OK(Reserve(length));
  ARROW_RETURN_NOT_OK(ValidateOverflow(0));

  UnsafeAppendToBitmap(length, false);
  const auto end_offset = static_cast<offset_type>(value_builder_->length());
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(end_offset);
  }
  return Status::OK();
}
/// \brief Append one valid list slot (same as Append(true))
Status AppendEmptyValue() final {
  return Append(true);
}
/// \brief Append `length` valid list slots
///
/// Every new slot gets the value builder's current length as its offset, so
/// none of them covers any child values.
Status AppendEmptyValues(int64_t length) final {
  ARROW_RETURN_NOT_OK(Reserve(length));
  ARROW_RETURN_NOT_OK(ValidateOverflow(0));

  UnsafeAppendToBitmap(length, true);
  const auto end_offset = static_cast<offset_type>(value_builder_->length());
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(end_offset);
  }
  return Status::OK();
}
/// \brief Append rows [offset, offset + length) of list array data
///
/// Valid rows open a new slot and copy their child values through the value
/// builder's AppendArraySlice(); null rows are appended as nulls.
Status AppendArraySlice(const ArrayData& array, int64_t offset,
                        int64_t length) override {
  const offset_type* offsets = array.GetValues<offset_type>(1);
  const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;

  const int64_t end_row = offset + length;
  for (int64_t row = offset; row < end_row; ++row) {
    // No bitmap means every row is valid
    const bool row_is_valid =
        !validity || bit_util::GetBit(validity, array.offset + row);
    if (!row_is_valid) {
      ARROW_RETURN_NOT_OK(AppendNull());
      continue;
    }
    ARROW_RETURN_NOT_OK(Append());
    const int64_t slot_length = offsets[row + 1] - offsets[row];
    ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0],
                                                         offsets[row], slot_length));
  }
  return Status::OK();
}
/// \brief Finish the list array data and reset the builder
///
/// Appends the closing offset, finishes the offsets and validity buffers and
/// the child values, and assembles them into `out`.
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  ARROW_RETURN_NOT_OK(AppendNextOffset());

  // Offset padding zeroed by BufferBuilder
  std::shared_ptr<Buffer> offsets_buffer;
  std::shared_ptr<Buffer> validity_buffer;
  ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets_buffer));
  ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&validity_buffer));

  if (value_builder_->length() == 0) {
    // Try to make sure we get a non-null values buffer (ARROW-2744)
    ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
  }

  std::shared_ptr<ArrayData> child_data;
  ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&child_data));

  *out = ArrayData::Make(type(), length_, {validity_buffer, offsets_buffer},
                         {std::move(child_data)}, null_count_);
  Reset();
  return Status::OK();
}
/// \brief Check that the child values can grow by `new_elements`
///
/// \param[in] new_elements number of child values about to be added
/// \return CapacityError if the value builder's length plus `new_elements`
/// would exceed maximum_elements(), OK otherwise
Status ValidateOverflow(int64_t new_elements) const {
  const int64_t new_length = value_builder_->length() + new_elements;
  if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
    // Report the total that was compared against the limit. The increment
    // alone is 0 at every call site in this class, which made the message
    // read "have 0".
    return Status::CapacityError("List array cannot contain more than ",
                                 maximum_elements(), " elements, have ", new_length);
  }
  return Status::OK();
}
/// \brief Non-owning pointer to the builder for the child values
ArrayBuilder* value_builder() const {
  return value_builder_.get();
}
/// \brief Largest number of elements the builder accepts: one less than the
/// maximum value of offset_type
// Cannot make this a static attribute because of linking issues
static constexpr int64_t maximum_elements() {
  return std::numeric_limits<offset_type>::max() - 1;
}
/// \brief Type of the array being built.
///
/// Rebuilt on every call from the stored value field combined with the child
/// builder's current type, so a change in the child builder's type is reflected.
std::shared_ptr<DataType> type() const override {
return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
}
protected:
TypedBufferBuilder<offset_type> offsets_builder_;
std::shared_ptr<ArrayBuilder> value_builder_;
std::shared_ptr<Field> value_field_;
/// Record the child builder's current length as the next list offset, after
/// checking that this length does not exceed maximum_elements().
Status AppendNextOffset() {
  ARROW_RETURN_NOT_OK(ValidateOverflow(0));
  const int64_t child_length = value_builder_->length();
  const auto next_offset = static_cast<offset_type>(child_length);
  return offsets_builder_.Append(next_offset);
}
};
/// \class ListBuilder
/// \brief Builder class for variable-length list array value types
///
/// To use this class, you must append values to the child array builder and use
/// the Append function to delimit each distinct list value (once the values
/// have been appended to the child array) or use the bulk API to append
/// a sequence of offsets and null values.
///
/// A note on types. Per arrow/type.h all types in the C++ implementation are
/// logical, so even though this class always builds a list array, it can
/// represent multiple different logical types. If no logical type is provided
/// at construction time, the class defaults to List<T> where T is taken from the
/// value_builder/values that the object is constructed with.
class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
public:
// Inherit all BaseListBuilder constructors unchanged.
using BaseListBuilder::BaseListBuilder;
// Keep the ArrayBuilder::Finish overloads visible next to the typed one below.
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the builder and return the result as a ListArray.
Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
};
/// \class LargeListBuilder
/// \brief Builder class for large variable-length list array value types
///
/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
public:
// Inherit all BaseListBuilder constructors unchanged.
using BaseListBuilder::BaseListBuilder;
// Keep the ArrayBuilder::Finish overloads visible next to the typed one below.
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the builder and return the result as a LargeListArray.
Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
};
// ----------------------------------------------------------------------
// Map builder
/// \class MapBuilder
/// \brief Builder class for arrays of variable-size maps
///
/// To use this class, you must append values to the key and item array builders
/// and use the Append function to delimit each distinct map (once the keys and items
/// have been appended) or use the bulk API to append a sequence of offsets and null
/// maps.
///
/// Key uniqueness and ordering are not validated.
class ARROW_EXPORT MapBuilder : public ArrayBuilder {
 public:
  /// Use this constructor to define the built array's type explicitly. If key_builder
  /// or item_builder has indeterminate type, this builder will also.
  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
             const std::shared_ptr<ArrayBuilder>& item_builder,
             const std::shared_ptr<DataType>& type);

  /// Use this constructor to infer the built array's type. If key_builder or
  /// item_builder has indeterminate type, this builder will also.
  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
             const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);

  // NOTE(review): single-builder overload with an explicit type; presumably
  // item_builder is expected to build the map entries -- confirm against the
  // definition in the .cc file.
  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
             const std::shared_ptr<DataType>& type);

  Status Resize(int64_t capacity) override;
  void Reset() override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and return the result as a MapArray.
  Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }

  /// \brief Vector append
  ///
  /// If passed, valid_bytes is of equal length to values, and any zero byte
  /// will be considered as a null for that slot
  Status AppendValues(const int32_t* offsets, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  /// \brief Start a new variable-length map slot
  ///
  /// This function should be called before beginning to append elements to the
  /// key and item builders
  Status Append();

  Status AppendNull() final;

  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;

  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append the maps in rows [offset, offset + length) of `array`.
  ///
  /// Null rows are appended as nulls. For every valid row a new map slot is
  /// started and the row's keys and items are copied from the children of the
  /// entries struct (`array.child_data[0]`).
  Status AppendArraySlice(const ArrayData& array, int64_t offset,
                          int64_t length) override {
    const int32_t* offsets = array.GetValues<int32_t>(1);
    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
    const ArrayData& entries = *array.child_data[0];
    for (int64_t row = offset; row < offset + length; row++) {
      if (!validity || bit_util::GetBit(validity, array.offset + row)) {
        ARROW_RETURN_NOT_OK(Append());
        const int64_t slot_length = offsets[row + 1] - offsets[row];
        // The map offsets index into the entries struct, which may itself be a
        // slice: its own offset has to be added before addressing the key and
        // item children (each child builder applies its child's offset itself).
        const int64_t entries_offset = entries.offset + offsets[row];
        ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(*entries.child_data[0],
                                                           entries_offset, slot_length));
        ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(*entries.child_data[1],
                                                            entries_offset, slot_length));
      } else {
        ARROW_RETURN_NOT_OK(AppendNull());
      }
    }
    return Status::OK();
  }

  /// \brief Get builder to append keys.
  ///
  /// Append a key with this builder should be followed by appending
  /// an item or null value with item_builder().
  ArrayBuilder* key_builder() const { return key_builder_.get(); }

  /// \brief Get builder to append items
  ///
  /// Appending an item with this builder should have been preceded
  /// by appending a key with key_builder().
  ArrayBuilder* item_builder() const { return item_builder_.get(); }

  /// \brief Get builder to add Map entries as struct values.
  ///
  /// This is used instead of key_builder()/item_builder() and allows
  /// the Map to be built as a list of struct values.
  ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }

  std::shared_ptr<DataType> type() const override {
    // Key and Item builder may update types, but they don't contain the field names,
    // so we need to reconstruct the type. (See ARROW-13735.)
    return std::make_shared<MapType>(
        field(entries_name_,
              struct_({field(key_name_, key_builder_->type(), false),
                       field(item_name_, item_builder_->type(), item_nullable_)}),
              false),
        keys_sorted_);
  }

  /// \brief Forward the overflow check to the underlying list builder.
  Status ValidateOverflow(int64_t new_elements) {
    return list_builder_->ValidateOverflow(new_elements);
  }

 protected:
  inline Status AdjustStructBuilderLength();

 protected:
  // Flags and field names used by type() to rebuild the MapType.
  bool keys_sorted_ = false;
  bool item_nullable_ = false;
  std::string entries_name_;
  std::string key_name_;
  std::string item_name_;
  // List builder that value_builder() and ValidateOverflow() delegate to.
  std::shared_ptr<ListBuilder> list_builder_;
  std::shared_ptr<ArrayBuilder> key_builder_;
  std::shared_ptr<ArrayBuilder> item_builder_;
};
// ----------------------------------------------------------------------
// FixedSizeList builder
/// \class FixedSizeListBuilder
/// \brief Builder class for fixed-length list array value types
class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
public:
/// Use this constructor to infer the built array's type from value_builder and
/// list_size. If value_builder has indeterminate type, this builder will also.
FixedSizeListBuilder(MemoryPool* pool,
std::shared_ptr<ArrayBuilder> const& value_builder,
int32_t list_size);
/// Use this constructor to define the built array's type explicitly. If
/// value_builder has indeterminate type, this builder will also.
FixedSizeListBuilder(MemoryPool* pool,
std::shared_ptr<ArrayBuilder> const& value_builder,
const std::shared_ptr<DataType>& type);
Status Resize(int64_t capacity) override;
void Reset() override;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the builder and return the result as a FixedSizeListArray.
Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }
/// \brief Append a valid fixed length list.
///
/// This function affects only the validity bitmap; the child values must be appended
/// using the child array builder.
Status Append();
/// \brief Vector append
///
/// If passed, valid_bytes will be read and any zero byte
/// will cause the corresponding slot to be null
///
/// This function affects only the validity bitmap; the child values must be appended
/// using the child array builder. This includes appending nulls for null lists.
/// XXX this restriction is confusing, should this method be omitted?
Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);
/// \brief Append a null fixed length list.
///
/// The child array builder will have the appropriate number of nulls appended
/// automatically.
Status AppendNull() final;
/// \brief Append length null fixed length lists.
///
/// The child array builder will have the appropriate number of nulls appended
/// automatically.
Status AppendNulls(int64_t length) final;
Status ValidateOverflow(int64_t new_elements);
Status AppendEmptyValue() final;
Status AppendEmptyValues(int64_t length) final;
/// \brief Append the lists in rows [offset, offset + length) of `array`.
///
/// For a valid row, list_size_ child values starting at
/// list_size_ * (array.offset + row) are copied first and the slot is then
/// marked valid with Append(). Null rows go through AppendNull().
Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
*array.child_data[0], list_size_ * (array.offset + row), list_size_));
ARROW_RETURN_NOT_OK(Append());
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}
/// \brief Non-owning pointer to the child builder that receives the list values.
ArrayBuilder* value_builder() const { return value_builder_.get(); }
/// \brief Type of the array being built, rebuilt from the value field, the child
/// builder's current type and list_size_.
std::shared_ptr<DataType> type() const override {
return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
}
/// \brief One less than the maximum value of FixedSizeListType::offset_type.
// Cannot make this a static attribute because of linking issues
static constexpr int64_t maximum_elements() {
return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
}
protected:
std::shared_ptr<Field> value_field_;
// Number of child values in every list slot.
const int32_t list_size_;
std::shared_ptr<ArrayBuilder> value_builder_;
};
// ----------------------------------------------------------------------
// Struct
// ---------------------------------------------------------------------------------
// StructArray builder
/// Builder for struct arrays.
///
/// Append, Resize and Reserve act on the StructBuilder itself. Unless a method
/// below says otherwise, the corresponding methods of every child builder must
/// be called consistently by the caller to keep the data structure coherent.
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
 public:
  /// If any of field_builders has indeterminate type, this builder will also
  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
                std::vector<std::shared_ptr<ArrayBuilder>> field_builders);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and return the result as a StructArray.
  Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }

  /// \brief Append `length` struct slots whose validity is read from valid_bytes.
  ///
  /// Only this builder's validity bitmap is touched: a zero byte marks the slot
  /// null. The data must be inserted separately through the append or advance
  /// methods of the child builders.
  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(valid_bytes, length);
    return Status::OK();
  }

  /// \brief Append one struct slot with the given validity.
  ///
  /// The Append method of every child builder must be called independently to
  /// keep the data structure consistent.
  Status Append(bool is_valid = true) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(is_valid);
    return Status::OK();
  }

  /// \brief Append a null value. Automatically appends an empty value to each child
  /// builder.
  Status AppendNull() final {
    for (size_t i = 0; i < children_.size(); ++i) {
      ARROW_RETURN_NOT_OK(children_[i]->AppendEmptyValue());
    }
    return Append(false);
  }

  /// \brief Append multiple null values. Automatically appends empty values to each
  /// child builder.
  Status AppendNulls(int64_t length) final {
    for (size_t i = 0; i < children_.size(); ++i) {
      ARROW_RETURN_NOT_OK(children_[i]->AppendEmptyValues(length));
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(length, false);
    return Status::OK();
  }

  /// \brief Append a valid slot, with an empty value appended to each child builder.
  Status AppendEmptyValue() final {
    for (size_t i = 0; i < children_.size(); ++i) {
      ARROW_RETURN_NOT_OK(children_[i]->AppendEmptyValue());
    }
    return Append(true);
  }

  /// \brief Append `length` valid slots, with empty values appended to each child
  /// builder.
  Status AppendEmptyValues(int64_t length) final {
    for (size_t i = 0; i < children_.size(); ++i) {
      ARROW_RETURN_NOT_OK(children_[i]->AppendEmptyValues(length));
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(length, true);
    return Status::OK();
  }

  /// \brief Append rows [offset, offset + length) of `array`.
  ///
  /// Every child builder copies the same row range from its matching child
  /// array (shifted by the struct's own offset); the validity bits of that
  /// range are then appended to this builder.
  Status AppendArraySlice(const ArrayData& array, int64_t offset,
                          int64_t length) override {
    const int64_t first_row = array.offset + offset;
    for (size_t i = 0; i < children_.size(); ++i) {
      ARROW_RETURN_NOT_OK(
          children_[i]->AppendArraySlice(*array.child_data[i], first_row, length));
    }
    const uint8_t* valid_bits =
        array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(valid_bits, first_row, length);
    return Status::OK();
  }

  void Reset() override;

  /// \brief Non-owning pointer to the i-th child builder (no bounds check).
  ArrayBuilder* field_builder(int i) const { return children_[i].get(); }

  /// \brief Number of child builders.
  int num_fields() const { return static_cast<int>(children_.size()); }

  std::shared_ptr<DataType> type() const override;

 private:
  std::shared_ptr<DataType> type_;
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,539 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <vector>
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
namespace arrow {
/// Builder for arrays of the null type: every appended element is a null, so
/// only the length and the null count are tracked.
class ARROW_EXPORT NullBuilder : public ArrayBuilder {
 public:
  explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {}

  /// The `type` argument is ignored; type() always reports null().
  explicit NullBuilder(const std::shared_ptr<DataType>& type,
                       MemoryPool* pool = default_memory_pool())
      : NullBuilder(pool) {}

  /// \brief Append the specified number of null elements
  ///
  /// \return Invalid if `length` is negative, otherwise OK
  Status AppendNulls(int64_t length) final {
    if (length < 0) {
      return Status::Invalid("length must be positive");
    }
    length_ += length;
    null_count_ += length;
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final { return AppendNulls(1); }

  /// \brief Same as AppendNulls(): a null array has no non-null "empty" value
  Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }

  /// \brief Same as AppendNull()
  Status AppendEmptyValue() final { return AppendEmptyValues(1); }

  /// \brief Append a single null element
  Status Append(std::nullptr_t) { return AppendNull(); }

  /// \brief Append `length` nulls; the source array and offset are not inspected
  Status AppendArraySlice(const ArrayData&, int64_t, int64_t length) override {
    return AppendNulls(length);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  std::shared_ptr<DataType> type() const override { return null(); }

  /// \brief Finish the builder and return the result as a NullArray.
  Status Finish(std::shared_ptr<NullArray>* out) { return FinishTyped(out); }
};
/// \addtogroup numeric-builders
///
/// @{
/// Base class for all Builders that emit an Array of a scalar numerical type.
///
/// Values are accumulated in a typed data buffer (`data_builder_`); validity,
/// length and null count are tracked by the ArrayBuilder base class.
template <typename T>
class NumericBuilder : public ArrayBuilder {
 public:
  using TypeClass = T;
  using value_type = typename T::c_type;
  using ArrayType = typename TypeTraits<T>::ArrayType;

  /// Constructor for parameter-free types: the type is taken from
  /// TypeTraits<T>::type_singleton().
  template <typename T1 = T>
  explicit NumericBuilder(
      enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
      : ArrayBuilder(pool), type_(TypeTraits<T>::type_singleton()), data_builder_(pool) {}

  /// Constructor taking the type of the built array explicitly.
  NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
      : ArrayBuilder(pool), type_(type), data_builder_(pool) {}

  /// Append a single scalar and increase the size if necessary.
  Status Append(const value_type val) {
    ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
    UnsafeAppend(val);
    return Status::OK();
  }

  /// \brief Append `length` null elements
  ///
  /// The memory at the corresponding data slots is set to 0 to prevent
  /// uninitialized memory access
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, value_type{});  // zero
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(value_type{});  // zero
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append an empty (zero-valued, non-null) element
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(value_type{});  // zero
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append several empty elements
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, value_type{});  // zero
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Value stored at `index` in the data buffer (no bounds check)
  value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }

  /// \brief Discard everything appended so far
  void Reset() override {
    data_builder_.Reset();
    // The state owned by ArrayBuilder (length, null count, validity bitmap) has
    // to be reset as well; clearing only the data buffer would leave a reused
    // builder reporting the old length over an empty data buffer.
    ArrayBuilder::Reset();
  }

  /// \brief Resize the data buffer and the base-class buffers to `capacity`,
  /// which is raised to kMinBuilderCapacity if smaller
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
    return ArrayBuilder::Resize(capacity);
  }

  value_type operator[](int64_t index) const { return GetValue(index); }

  /// \brief Mutable access to the value stored at `index` (no bounds check)
  value_type& operator[](int64_t index) {
    return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index];
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const value_type* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this call
    ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] bitmap a validity bitmap to copy (may be null)
  /// \param[in] bitmap_offset an offset into the validity bitmap
  /// \return Status
  Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap,
                      int64_t bitmap_offset) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this call
    ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const value_type* values, int64_t length,
                      const std::vector<bool>& is_valid) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this call
    ArrayBuilder::UnsafeAppendToBitmap(is_valid);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of values
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<value_type>& values,
                      const std::vector<bool>& is_valid) {
    return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of values
  /// \return Status
  Status AppendValues(const std::vector<value_type>& values) {
    return AppendValues(values.data(), static_cast<int64_t>(values.size()));
  }

  /// \brief Move the accumulated validity bitmap and data into `out` and zero
  /// this builder's capacity, length and null count
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
                          null_bitmap_builder_.FinishWithLength(length_));
    ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
    *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
    capacity_ = length_ = null_count_ = 0;
    return Status::OK();
  }

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and return the result as the typed array class.
  Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \return Status
  template <typename ValuesIter>
  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    // this updates the length_
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot, with a specified nullmap
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \param[in] valid_begin InputIterator with elements indicating valid(1)
  /// or null(0) values.
  /// \return Status
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    static_assert(!internal::is_null_pointer<ValidIter>::value,
                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
                  "version instead");
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    null_bitmap_builder_.UnsafeAppend<true>(
        length, [&valid_begin]() -> bool { return *valid_begin++; });
    // The bitmap was appended to directly, so resynchronize the counters from it.
    length_ = null_bitmap_builder_.length();
    null_count_ = null_bitmap_builder_.false_count();
    return Status::OK();
  }

  // Same as above, with a pointer type ValidIter
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    // this updates the length_
    if (valid_begin == NULLPTR) {
      UnsafeSetNotNull(length);
    } else {
      null_bitmap_builder_.UnsafeAppend<true>(
          length, [&valid_begin]() -> bool { return *valid_begin++; });
      length_ = null_bitmap_builder_.length();
      null_count_ = null_bitmap_builder_.false_count();
    }
    return Status::OK();
  }

  /// \brief Append rows [offset, offset + length) of `array`, copying both the
  /// values and the matching range of the validity bitmap
  Status AppendArraySlice(const ArrayData& array, int64_t offset,
                          int64_t length) override {
    return AppendValues(array.GetValues<value_type>(1) + offset, length,
                        array.GetValues<uint8_t>(0, 0), array.offset + offset);
  }

  /// Append a single scalar under the assumption that the underlying Buffer is
  /// large enough.
  ///
  /// This method does not capacity-check; make sure to call Reserve
  /// beforehand.
  void UnsafeAppend(const value_type val) {
    ArrayBuilder::UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(val);
  }

  /// Append a single null without capacity-checking; make sure to call Reserve
  /// beforehand.
  void UnsafeAppendNull() {
    ArrayBuilder::UnsafeAppendToBitmap(false);
    data_builder_.UnsafeAppend(value_type{});  // zero
  }

  std::shared_ptr<DataType> type() const override { return type_; }

 protected:
  std::shared_ptr<DataType> type_;
  TypedBufferBuilder<value_type> data_builder_;
};
// Builders
// Aliases naming the NumericBuilder instantiation for each integer and
// floating-point type.
using UInt8Builder = NumericBuilder<UInt8Type>;
using UInt16Builder = NumericBuilder<UInt16Type>;
using UInt32Builder = NumericBuilder<UInt32Type>;
using UInt64Builder = NumericBuilder<UInt64Type>;
using Int8Builder = NumericBuilder<Int8Type>;
using Int16Builder = NumericBuilder<Int16Type>;
using Int32Builder = NumericBuilder<Int32Type>;
using Int64Builder = NumericBuilder<Int64Type>;
using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
using FloatBuilder = NumericBuilder<FloatType>;
using DoubleBuilder = NumericBuilder<DoubleType>;
/// @}
/// \addtogroup temporal-builders
///
/// @{
// Aliases naming the NumericBuilder instantiation for each temporal type.
using Date32Builder = NumericBuilder<Date32Type>;
using Date64Builder = NumericBuilder<Date64Type>;
using Time32Builder = NumericBuilder<Time32Type>;
using Time64Builder = NumericBuilder<Time64Type>;
using TimestampBuilder = NumericBuilder<TimestampType>;
using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
using DurationBuilder = NumericBuilder<DurationType>;
/// @}
/// Builder for boolean arrays. Values are accumulated in a
/// TypedBufferBuilder<bool>; validity, length and null count are tracked by the
/// ArrayBuilder base class.
class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
public:
using TypeClass = BooleanType;
using value_type = bool;
explicit BooleanBuilder(MemoryPool* pool = default_memory_pool());
BooleanBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
/// \brief Append `length` null elements; the matching data slots are set to false
Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, false);
UnsafeSetNull(length);
return Status::OK();
}
/// \brief Append a single null element
Status AppendNull() final {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppendNull();
return Status::OK();
}
/// \brief Append a single non-null element whose value is false
Status AppendEmptyValue() final {
ARROW_RETURN_NOT_OK(Reserve(1));
data_builder_.UnsafeAppend(false);
UnsafeSetNotNull(1);
return Status::OK();
}
/// \brief Append `length` non-null elements whose value is false
Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, false);
UnsafeSetNotNull(length);
return Status::OK();
}
/// Scalar append
Status Append(const bool val) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(val);
return Status::OK();
}
/// Scalar append from a byte: any non-zero value is appended as true
Status Append(const uint8_t val) { return Append(val != 0); }
/// Scalar append, without checking for capacity
void UnsafeAppend(const bool val) {
data_builder_.UnsafeAppend(val);
UnsafeAppendToBitmap(true);
}
/// Null append, without checking for capacity
void UnsafeAppendNull() {
data_builder_.UnsafeAppend(false);
UnsafeAppendToBitmap(false);
}
/// Scalar append from a byte (non-zero is true), without checking for capacity
void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous array of bytes (non-zero is 1)
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const uint8_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
/// \brief Append a sequence of elements in one shot
/// \param[in] values a bitmap of values
/// \param[in] length the number of values to append
/// \param[in] validity a validity bitmap to copy (may be null)
/// \param[in] offset an offset into the values and validity bitmaps
/// \return Status
Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity,
int64_t offset);
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const uint8_t* values, int64_t length,
const std::vector<bool>& is_valid);
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of bytes
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const std::vector<uint8_t>& values,
const std::vector<bool>& is_valid);
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of bytes
/// \return Status
Status AppendValues(const std::vector<uint8_t>& values);
/// \brief Append a sequence of elements in one shot
/// \param[in] values an std::vector<bool> indicating true (1) or false
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
/// \brief Append a sequence of elements in one shot
/// \param[in] values an std::vector<bool> indicating true (1) or false
/// \return Status
Status AppendValues(const std::vector<bool>& values);
/// \brief Append a sequence of elements in one shot
/// \param[in] values_begin InputIterator to the beginning of the values
/// \param[in] values_end InputIterator pointing to the end of the values
/// \return Status
template <typename ValuesIter>
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend<false>(
length, [&values_begin]() -> bool { return *values_begin++; });
// this updates length_
UnsafeSetNotNull(length);
return Status::OK();
}
/// \brief Append a sequence of elements in one shot, with a specified nullmap
/// \param[in] values_begin InputIterator to the beginning of the values
/// \param[in] values_end InputIterator pointing to the end of the values
/// \param[in] valid_begin InputIterator with elements indicating valid(1)
/// or null(0) values
/// \return Status
template <typename ValuesIter, typename ValidIter>
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
static_assert(!internal::is_null_pointer<ValidIter>::value,
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
"version instead");
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend<false>(
length, [&values_begin]() -> bool { return *values_begin++; });
null_bitmap_builder_.UnsafeAppend<true>(
length, [&valid_begin]() -> bool { return *valid_begin++; });
// The bitmap was appended to directly, so resynchronize the counters from it.
length_ = null_bitmap_builder_.length();
null_count_ = null_bitmap_builder_.false_count();
return Status::OK();
}
// Same as above, for a pointer type ValidIter
template <typename ValuesIter, typename ValidIter>
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend<false>(
length, [&values_begin]() -> bool { return *values_begin++; });
// A null valid_begin means every appended value is valid.
if (valid_begin == NULLPTR) {
UnsafeSetNotNull(length);
} else {
null_bitmap_builder_.UnsafeAppend<true>(
length, [&valid_begin]() -> bool { return *valid_begin++; });
}
length_ = null_bitmap_builder_.length();
null_count_ = null_bitmap_builder_.false_count();
return Status::OK();
}
Status AppendValues(int64_t length, bool value);
/// \brief Append rows [offset, offset + length) of `array`.
///
/// Both buffers are fetched without the array offset applied (absolute offset 0)
/// and array.offset + offset is passed along as the offset into them.
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
return AppendValues(array.GetValues<uint8_t>(1, 0), length,
array.GetValues<uint8_t>(0, 0), array.offset + offset);
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Finish the builder and return the result as a BooleanArray.
Status Finish(std::shared_ptr<BooleanArray>* out) { return FinishTyped(out); }
void Reset() override;
Status Resize(int64_t capacity) override;
std::shared_ptr<DataType> type() const override { return boolean(); }
protected:
TypedBufferBuilder<bool> data_builder_;
};
} // namespace arrow

View File

@ -0,0 +1,62 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Contains declarations of time related Arrow builder types.
#pragma once
#include <memory>
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_primitive.h"
namespace arrow {
/// \addtogroup temporal-builders
///
/// @{
// TODO(ARROW-7938): this class is untested
/// \brief Builder for DayTimeIntervalType values
///
/// This class adds only constructors and a type alias on top of
/// NumericBuilder<DayTimeIntervalType>; all other members are inherited.
class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
public:
/// Alias for DayTimeIntervalType::DayMilliseconds.
using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;
/// \brief Construct a builder whose type is day_time_interval()
/// \param[in] pool memory pool handed to the type-taking constructor
explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool())
: DayTimeIntervalBuilder(day_time_interval(), pool) {}
/// \brief Construct a builder with an explicitly supplied type
/// \param[in] type data type forwarded to the NumericBuilder base
/// \param[in] pool memory pool forwarded to the NumericBuilder base
explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
MemoryPool* pool = default_memory_pool())
: NumericBuilder<DayTimeIntervalType>(type, pool) {}
};
/// \brief Builder for MonthDayNanoIntervalType values
///
/// This class adds only constructors and a type alias on top of
/// NumericBuilder<MonthDayNanoIntervalType>; all other members are inherited.
class ARROW_EXPORT MonthDayNanoIntervalBuilder
: public NumericBuilder<MonthDayNanoIntervalType> {
public:
/// Alias for MonthDayNanoIntervalType::MonthDayNanos.
using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos;
/// \brief Construct a builder whose type is month_day_nano_interval()
/// \param[in] pool memory pool handed to the type-taking constructor
explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool())
: MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool) {}
/// \brief Construct a builder with an explicitly supplied type
/// \param[in] type data type forwarded to the NumericBuilder base
/// \param[in] pool memory pool forwarded to the NumericBuilder base
explicit MonthDayNanoIntervalBuilder(std::shared_ptr<DataType> type,
MemoryPool* pool = default_memory_pool())
: NumericBuilder<MonthDayNanoIntervalType>(type, pool) {}
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,248 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer_builder.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-builders
///
/// @{
/// \brief Base class for union array builders.
///
/// Note that while we subclass ArrayBuilder, as union types do not have a
/// validity bitmap, the bitmap builder member of ArrayBuilder is not used.
class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
public:
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
/// \brief Typed overload of Finish producing a UnionArray
///
/// Simply forwards `out` to FinishTyped.
Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
/// \brief Make a new child builder available to the UnionArray
///
/// \param[in] new_child the child builder
/// \param[in] field_name the name of the field in the union array type
/// if type inference is used
/// \return child index, which is the "type" argument that needs
/// to be passed to the "Append" method to add a new element to
/// the union array.
int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
const std::string& field_name = "");
std::shared_ptr<DataType> type() const override;
/// \brief Length of the union being built, read from the type-ids buffer builder
int64_t length() const override { return types_builder_.length(); }
protected:
/// Only derived builders construct this class; the definition is not in this header.
BasicUnionBuilder(MemoryPool* pool,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type);
// NOTE(review): presumably returns the next unused type id — confirm in the .cc file
int8_t NextTypeId();
std::vector<std::shared_ptr<Field>> child_fields_;
// Type codes of the children; the derived builders in this header use
// type_codes_[0] as the code of the first child.
std::vector<int8_t> type_codes_;
UnionMode::type mode_;
// Indexed by type id; the derived builders look up the child builder to append to here.
std::vector<ArrayBuilder*> type_id_to_children_;
std::vector<int> type_id_to_child_id_;
// for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
int8_t dense_type_id_ = 0;
// Receives one int8_t type id per appended union element.
TypedBufferBuilder<int8_t> types_builder_;
};
/// \class DenseUnionBuilder
///
/// This API is EXPERIMENTAL.
///
/// For every appended element this builder records a type code (in the
/// inherited types_builder_) and an int32 offset equal to the length of the
/// selected child builder at the time of the append.
class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
 public:
  /// Use this constructor to initialize the UnionBuilder with no child builders,
  /// allowing type to be inferred. You will need to call AppendChild for each of the
  /// children builders you want to use.
  explicit DenseUnionBuilder(MemoryPool* pool)
      : BasicUnionBuilder(pool, {}, dense_union(FieldVector{})), offsets_builder_(pool) {}

  /// Use this constructor to specify the type explicitly.
  /// You can still add child builders to the union after using this constructor
  DenseUnionBuilder(MemoryPool* pool,
                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                    const std::shared_ptr<DataType>& type)
      : BasicUnionBuilder(pool, children, type), offsets_builder_(pool) {}

  /// \brief Append a null value.
  ///
  /// The element is tagged with the first type code and a null is appended to
  /// the corresponding child builder.
  Status AppendNull() final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
    // Append a null arbitrarily to the first child
    return child_builder->AppendNull();
  }

  /// \brief Append multiple null values.
  ///
  /// All `length` elements share the same offset, so only one null is added
  /// to the first child.
  Status AppendNulls(int64_t length) final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
    // Append just a single null to the first child
    return child_builder->AppendNull();
  }

  /// \brief Append an empty value.
  ///
  /// The element is tagged with the first type code and an empty value is
  /// appended to the corresponding child builder.
  Status AppendEmptyValue() final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
    // Append an empty value arbitrarily to the first child
    return child_builder->AppendEmptyValue();
  }

  /// \brief Append multiple empty values.
  ///
  /// All `length` elements share the same offset, so only one empty value is
  /// added to the first child.
  Status AppendEmptyValues(int64_t length) final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
    // Append just a single empty value to the first child
    return child_builder->AppendEmptyValue();
  }

  /// \brief Append an element to the UnionArray. This must be followed
  ///        by an append to the appropriate child builder.
  ///
  /// \param[in] next_type type_id of the child to which the next value will be appended.
  /// \return CapacityError if the selected child already holds
  /// kListMaximumElements values, otherwise the status of the buffer appends.
  ///
  /// The corresponding child builder must be appended to independently after this method
  /// is called.
  Status Append(int8_t next_type) {
    const int64_t child_length = type_id_to_children_[next_type]->length();
    // Check capacity before touching any buffer: on CapacityError the builder
    // must be left unchanged, otherwise the types buffer would hold one more
    // entry than the offsets buffer.
    if (child_length == kListMaximumElements) {
      return Status::CapacityError(
          "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
          "child");
    }
    ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
    return offsets_builder_.Append(static_cast<int32_t>(child_length));
  }

  Status AppendArraySlice(const ArrayData& array, int64_t offset,
                          int64_t length) override;

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

 private:
  // One int32 offset into the selected child builder per union element.
  TypedBufferBuilder<int32_t> offsets_builder_;
};
/// \class SparseUnionBuilder
///
/// This API is EXPERIMENTAL.
///
/// Every append touches all child builders, so that each child receives one
/// slot per union element.
class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
 public:
  /// Construct a builder without children; the type is inferred. AppendChild
  /// has to be called for each child builder that is going to be used.
  explicit SparseUnionBuilder(MemoryPool* pool)
      : BasicUnionBuilder(pool, {}, sparse_union(FieldVector{})) {}

  /// Construct a builder with an explicit type.
  /// Further child builders may still be added afterwards.
  SparseUnionBuilder(MemoryPool* pool,
                     const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                     const std::shared_ptr<DataType>& type)
      : BasicUnionBuilder(pool, children, type) {}

  /// \brief Append a null value.
  ///
  /// The first child receives a null, every other child an empty value.
  Status AppendNull() final {
    const int8_t null_code = type_codes_[0];
    ARROW_RETURN_NOT_OK(types_builder_.Append(null_code));
    ARROW_RETURN_NOT_OK(type_id_to_children_[null_code]->AppendNull());
    // Pad the remaining children so that they stay aligned with the first one.
    for (auto it = type_codes_.begin() + 1; it != type_codes_.end(); ++it) {
      ArrayBuilder* sibling = type_id_to_children_[*it];
      ARROW_RETURN_NOT_OK(sibling->AppendEmptyValue());
    }
    return Status::OK();
  }

  /// \brief Append multiple null values.
  ///
  /// The first child receives `length` nulls, every other child `length`
  /// empty values.
  Status AppendNulls(int64_t length) final {
    const int8_t null_code = type_codes_[0];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, null_code));
    ARROW_RETURN_NOT_OK(type_id_to_children_[null_code]->AppendNulls(length));
    for (auto it = type_codes_.begin() + 1; it != type_codes_.end(); ++it) {
      ArrayBuilder* sibling = type_id_to_children_[*it];
      ARROW_RETURN_NOT_OK(sibling->AppendEmptyValues(length));
    }
    return Status::OK();
  }

  /// \brief Append an empty value, tagged with the first type code.
  ///
  /// Every child, including the first, receives an empty value.
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
    for (auto it = type_codes_.begin(); it != type_codes_.end(); ++it) {
      ARROW_RETURN_NOT_OK(type_id_to_children_[*it]->AppendEmptyValue());
    }
    return Status::OK();
  }

  /// \brief Append `length` empty values, tagged with the first type code.
  ///
  /// Every child, including the first, receives `length` empty values.
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
    for (auto it = type_codes_.begin(); it != type_codes_.end(); ++it) {
      ARROW_RETURN_NOT_OK(type_id_to_children_[*it]->AppendEmptyValues(length));
    }
    return Status::OK();
  }

  /// \brief Record the type of the next element of the UnionArray.
  ///
  /// \param[in] next_type type_id of the child to which the next value will be appended.
  ///
  /// Only the type code is stored here. The caller must then append the value
  /// to the matching child builder and a null or empty value to all other
  /// child builders.
  Status Append(int8_t next_type) { return types_builder_.Append(next_type); }

  Status AppendArraySlice(const ArrayData& array, int64_t offset,
                          int64_t length) override;
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,37 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \brief Concatenate arrays
///
/// Failures are reported through the returned Result rather than by throwing.
/// NOTE(review): presumably all input arrays must share one type — confirm
/// against the implementation.
///
/// \param[in] arrays a vector of arrays to be concatenated
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// \return the concatenated array
ARROW_EXPORT
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
MemoryPool* pool = default_memory_pool());
} // namespace arrow

View File

@ -0,0 +1,258 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <atomic> // IWYU pragma: export
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
// When slicing, we do not know the null count of the sliced range without
// doing some computation. To avoid doing this eagerly, we set the null count
// to -1 (any negative number will do). When Array::null_count is called the
// first time, the null count will be computed. See ARROW-33
constexpr int64_t kUnknownNullCount = -1;
// ----------------------------------------------------------------------
// Generic array data container
/// \class ArrayData
/// \brief Mutable container for generic Arrow array data
///
/// This data structure is a self-contained representation of the memory and
/// metadata inside an Arrow array data structure (called vectors in Java). The
/// classes arrow::Array and its subclasses provide strongly-typed accessors
/// with support for the visitor pattern and other affordances.
///
/// This class is designed for easy internal data manipulation, analytical data
/// processing, and data transport to and from IPC messages. For example, we
/// could cast from int64 to float64 like so:
///
/// Int64Array arr = GetMyData();
/// auto new_data = arr.data()->Copy();
/// new_data->type = arrow::float64();
/// DoubleArray double_arr(new_data);
///
/// This object is also useful in an analytics setting where memory may be
/// reused. For example, if we had a group of operations all returning doubles,
/// say:
///
/// Log(Sqrt(Expr(arr)))
///
/// Then the low-level implementations of each of these functions could have
/// the signatures
///
/// void Log(const ArrayData& values, ArrayData* out);
///
/// As another example a function may consume one or more memory buffers in an
/// input array and replace them with newly-allocated data, changing the output
/// data type as well.
struct ARROW_EXPORT ArrayData {
  ArrayData() = default;

  /// \brief Construct from metadata only; `buffers` and `child_data` start out empty
  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}

  /// \brief Construct from metadata and a list of buffers
  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            std::vector<std::shared_ptr<Buffer>> buffers,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : ArrayData(std::move(type), length, null_count, offset) {
    this->buffers = std::move(buffers);
  }

  /// \brief Construct from metadata, a list of buffers and a list of children
  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            std::vector<std::shared_ptr<Buffer>> buffers,
            std::vector<std::shared_ptr<ArrayData>> child_data,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : ArrayData(std::move(type), length, null_count, offset) {
    this->buffers = std::move(buffers);
    this->child_data = std::move(child_data);
  }

  // Factory functions returning a shared_ptr<ArrayData>. Their parameters
  // mirror the constructors above, plus one overload taking a dictionary.
  static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
                                         std::vector<std::shared_ptr<Buffer>> buffers,
                                         int64_t null_count = kUnknownNullCount,
                                         int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(
      std::shared_ptr<DataType> type, int64_t length,
      std::vector<std::shared_ptr<Buffer>> buffers,
      std::vector<std::shared_ptr<ArrayData>> child_data,
      int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(
      std::shared_ptr<DataType> type, int64_t length,
      std::vector<std::shared_ptr<Buffer>> buffers,
      std::vector<std::shared_ptr<ArrayData>> child_data,
      std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
      int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
                                         int64_t null_count = kUnknownNullCount,
                                         int64_t offset = 0);

  // Move constructor
  //
  // The special members are spelled out by hand because std::atomic is neither
  // copyable nor movable; null_count is transferred through SetNullCount().
  ArrayData(ArrayData&& other) noexcept
      : type(std::move(other.type)),
        length(other.length),
        offset(other.offset),
        buffers(std::move(other.buffers)),
        child_data(std::move(other.child_data)),
        dictionary(std::move(other.dictionary)) {
    SetNullCount(other.null_count);
  }

  // Copy constructor
  //
  // Copies the shared_ptr vectors, not the memory they point to.
  ArrayData(const ArrayData& other) noexcept
      : type(other.type),
        length(other.length),
        offset(other.offset),
        buffers(other.buffers),
        child_data(other.child_data),
        dictionary(other.dictionary) {
    SetNullCount(other.null_count);
  }

  // Move assignment
  //
  // noexcept like the move constructor: every member move used here is
  // non-throwing, and containers can only move (rather than copy) elements on
  // reallocation when the move operations are declared noexcept.
  ArrayData& operator=(ArrayData&& other) noexcept {
    type = std::move(other.type);
    length = other.length;
    SetNullCount(other.null_count);
    offset = other.offset;
    buffers = std::move(other.buffers);
    child_data = std::move(other.child_data);
    dictionary = std::move(other.dictionary);
    return *this;
  }

  // Copy assignment
  ArrayData& operator=(const ArrayData& other) {
    type = other.type;
    length = other.length;
    SetNullCount(other.null_count);
    offset = other.offset;
    buffers = other.buffers;
    child_data = other.child_data;
    dictionary = other.dictionary;
    return *this;
  }

  /// \brief Return a copy of this object allocated with std::make_shared
  std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }

  // Access a buffer's data as a typed C pointer
  //
  // `absolute_offset` is counted in elements of T from the start of buffer `i`.
  // Returns NULLPTR if buffers[i] is null.
  template <typename T>
  inline const T* GetValues(int i, int64_t absolute_offset) const {
    if (buffers[i]) {
      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  // Same as above, using this ArrayData's own `offset`
  template <typename T>
  inline const T* GetValues(int i) const {
    return GetValues<T>(i, offset);
  }

  // Like GetValues, but returns NULLPTR instead of aborting if the underlying
  // buffer is not a CPU buffer.
  template <typename T>
  inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
    if (buffers[i] && buffers[i]->is_cpu()) {
      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  // Same as above, using this ArrayData's own `offset`
  template <typename T>
  inline const T* GetValuesSafe(int i) const {
    return GetValuesSafe<T>(i, offset);
  }

  // Access a buffer's data as a typed, mutable C pointer
  //
  // Returns NULLPTR if buffers[i] is null.
  template <typename T>
  inline T* GetMutableValues(int i, int64_t absolute_offset) {
    if (buffers[i]) {
      return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  // Same as above, using this ArrayData's own `offset`
  template <typename T>
  inline T* GetMutableValues(int i) {
    return GetMutableValues<T>(i, offset);
  }

  /// \brief Construct a zero-copy slice of the data with the given offset and length
  std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;

  /// \brief Input-checking variant of Slice
  ///
  /// An Invalid Status is returned if the requested slice falls out of bounds.
  /// Note that unlike Slice, `length` isn't clamped to the available buffer size.
  Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;

  /// \brief Store a new null count (kUnknownNullCount marks it as not yet computed)
  void SetNullCount(int64_t v) { null_count.store(v); }

  /// \brief Return null count, or compute and set it if it's not known
  int64_t GetNullCount() const;

  /// \brief Return false if this data certainly contains no nulls
  ///
  /// This is a cheap check: it does not compute the null count.
  bool MayHaveNulls() const {
    // If an ArrayData is slightly malformed it may have kUnknownNullCount set
    // but no buffer. The emptiness test covers instances that carry no buffers
    // at all (e.g. built through the metadata-only constructor), for which
    // indexing buffers[0] would be out of bounds.
    return null_count.load() != 0 && !buffers.empty() && buffers[0] != NULLPTR;
  }

  std::shared_ptr<DataType> type;
  int64_t length = 0;
  // Atomic and mutable because GetNullCount() is const yet may set it.
  mutable std::atomic<int64_t> null_count{0};
  // The logical start point into the physical buffers (in values, not bytes).
  // Note that, for child data, this must be *added* to the child data's own offset.
  int64_t offset = 0;
  std::vector<std::shared_ptr<Buffer>> buffers;
  std::vector<std::shared_ptr<ArrayData>> child_data;

  // The dictionary for this Array, if any. Only used for dictionary type
  std::shared_ptr<ArrayData> dictionary;
};
namespace internal {
/// \brief Construct a zero-copy view of `data` with the given type.
///
/// This function checks if the types are layout-compatible.
/// Nested types are traversed in depth-first order. Data buffers must have
/// the same item sizes, even though the logical types may be different.
/// An error is returned if the types are not layout-compatible.
///
/// \param[in] data the array data to view
/// \param[in] type the type the data should be viewed as
/// \return the viewed ArrayData, or an error if the layouts are incompatible
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
const std::shared_ptr<DataType>& type);
} // namespace internal
} // namespace arrow

View File

@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <functional>
#include <iosfwd>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/array_nested.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \brief Compare two arrays, returning an edit script which expresses the difference
/// between them
///
/// An edit script is an array of struct(insert: bool, run_length: int64_t).
/// Each element of "insert" determines whether an element was inserted into (true)
/// or deleted from (false) base. Each insertion or deletion is followed by a run of
/// elements which are unchanged from base to target; the length of this run is stored
/// in "run_length". (Note that the edit script begins and ends with a run of shared
/// elements but both fields of the struct must have the same length. To accommodate this
/// the first element of "insert" should be ignored.)
///
/// For example for base "hlloo" and target "hello", the edit script would be
/// [
/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
/// {"insert": false, "run_length": 0} // delete("o") then an empty run
/// ]
///
/// Diffing arrays containing nulls is not currently supported.
///
/// \param[in] base baseline for comparison
/// \param[in] target an array of identical type to base whose elements differ from base's
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// \return an edit script array which can be applied to base to produce target
ARROW_EXPORT
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
MemoryPool* pool = default_memory_pool());
/// \brief visitor interface for easy traversal of an edit script
///
/// visitor will be called for each hunk of insertions and deletions.
///
/// \param[in] edits the edit script to traverse (presumably one returned by
/// Diff — confirm against the implementation)
/// \param[in] visitor callback receiving the deletion bounds (delete_begin,
/// delete_end) and the insertion bounds (insert_begin, insert_end) of a hunk
/// \return a Status; NOTE(review): presumably a non-OK Status from the visitor
/// is propagated — confirm
ARROW_EXPORT Status VisitEditScript(
const Array& edits,
const std::function<Status(int64_t delete_begin, int64_t delete_end,
int64_t insert_begin, int64_t insert_end)>& visitor);
/// \brief return a function which will format an edit script in unified
/// diff format to os, given base and target arrays of the given type
///
/// \param[in] type the data type of the base and target arrays
/// \param[in] os the stream the returned function writes to
/// \return a formatting function taking (edits, base, target), or an error
ARROW_EXPORT Result<
std::function<Status(const Array& edits, const Array& base, const Array& target)>>
MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
} // namespace arrow

View File

@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <vector>
#include "arrow/array/data.h"
#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \brief Create a strongly-typed Array instance from generic ArrayData
/// \param[in] data the array contents
/// \return the resulting Array instance
ARROW_EXPORT
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
/// \brief Create a strongly-typed Array instance with all elements null
/// \param[in] type the array type
/// \param[in] length the array length
/// \param[in] pool the memory pool to allocate memory from
/// \return the resulting Array, or an error
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
int64_t length,
MemoryPool* pool = default_memory_pool());
/// \brief Create an Array instance whose slots are the given scalar
/// \param[in] scalar the value with which to fill the array
/// \param[in] length the array length
/// \param[in] pool the memory pool to allocate memory from
/// \return the resulting Array, or an error
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeArrayFromScalar(
const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
/// \brief Create an empty Array of a given type
///
/// The output Array will be of the given type.
///
/// \param[in] type the data type of the empty Array
/// \param[in] pool the memory pool to allocate memory from
/// \return the resulting Array
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
MemoryPool* pool = default_memory_pool());
namespace internal {
/// \brief Swap endian of each element in a generic ArrayData
///
/// As dictionaries are often shared between different arrays, dictionaries
/// are not swapped by this function and should be handled separately.
///
/// \param[in] data the array contents
/// \return the resulting ArrayData whose elements were swapped
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
const std::shared_ptr<ArrayData>& data);
/// \brief Rechunk several chunked arrays so that they share the same chunking
///
/// Given a number of ArrayVectors, treat each ArrayVector as the
/// chunks of a chunked array. Then rechunk each ArrayVector such that
/// all ArrayVectors are chunked identically. It is mandatory that
/// all ArrayVectors contain the same total number of elements.
///
/// \return the rechunked ArrayVectors
ARROW_EXPORT
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);
} // namespace internal
} // namespace arrow

View File

@ -0,0 +1,56 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
// Internal functions implementing Array::Validate() and friends.
// O(1) array metadata validation
ARROW_EXPORT
Status ValidateArray(const Array& array);
ARROW_EXPORT
Status ValidateArray(const ArrayData& data);
// O(N) array data validation.
// Note that, starting from 7.0.0, "full" routines also validate metadata.
// Before, ValidateArray() needed to be called before ValidateArrayFull()
// to ensure metadata correctness, otherwise invalid memory accesses
// may occur.
ARROW_EXPORT
Status ValidateArrayFull(const Array& array);
ARROW_EXPORT
Status ValidateArrayFull(const ArrayData& data);
// NOTE(review): presumably checks that the string data of the array is valid
// UTF-8 and returns a non-OK Status otherwise — confirm against the
// implementation, which is not part of this header.
ARROW_EXPORT
Status ValidateUTF8(const Array& array);
ARROW_EXPORT
Status ValidateUTF8(const ArrayData& data);
} // namespace internal
} // namespace arrow