mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-07-01 22:13:01 +00:00
first commit
This commit is contained in:
264
.venv/Lib/site-packages/pyarrow/include/arrow/array/array_base.h
Normal file
264
.venv/Lib/site-packages/pyarrow/include/arrow/array/array_base.h
Normal file
@ -0,0 +1,264 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
#include "arrow/visitor.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// User array accessor types
|
||||
|
||||
/// \brief Array base type
|
||||
/// Immutable data array with some logical type and some length.
|
||||
///
|
||||
/// Any memory is owned by the respective Buffer instance (or its parents).
|
||||
///
|
||||
/// The base class is only required to have a null bitmap buffer if the null
|
||||
/// count is greater than 0
|
||||
///
|
||||
/// If known, the null count can be provided in the base Array constructor. If
|
||||
/// the null count is not known, pass -1 to indicate that the null count is to
|
||||
/// be computed on the first call to null_count()
|
||||
class ARROW_EXPORT Array {
|
||||
public:
|
||||
virtual ~Array() = default;
|
||||
|
||||
/// \brief Return true if value at index is null. Does not boundscheck
|
||||
bool IsNull(int64_t i) const {
|
||||
return null_bitmap_data_ != NULLPTR
|
||||
? !bit_util::GetBit(null_bitmap_data_, i + data_->offset)
|
||||
: data_->null_count == data_->length;
|
||||
}
|
||||
|
||||
/// \brief Return true if value at index is valid (not null). Does not
|
||||
/// boundscheck
|
||||
bool IsValid(int64_t i) const {
|
||||
return null_bitmap_data_ != NULLPTR
|
||||
? bit_util::GetBit(null_bitmap_data_, i + data_->offset)
|
||||
: data_->null_count != data_->length;
|
||||
}
|
||||
|
||||
/// \brief Return a Scalar containing the value of this array at i
|
||||
Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;
|
||||
|
||||
/// Size in the number of elements this array contains.
|
||||
int64_t length() const { return data_->length; }
|
||||
|
||||
/// A relative position into another array's data, to enable zero-copy
|
||||
/// slicing. This value defaults to zero
|
||||
int64_t offset() const { return data_->offset; }
|
||||
|
||||
/// The number of null entries in the array. If the null count was not known
|
||||
/// at time of construction (and set to a negative value), then the null
|
||||
/// count will be computed and cached on the first invocation of this
|
||||
/// function
|
||||
int64_t null_count() const;
|
||||
|
||||
std::shared_ptr<DataType> type() const { return data_->type; }
|
||||
Type::type type_id() const { return data_->type->id(); }
|
||||
|
||||
/// Buffer for the validity (null) bitmap, if any. Note that Union types
|
||||
/// never have a null bitmap.
|
||||
///
|
||||
/// Note that for `null_count == 0` or for null type, this will be null.
|
||||
/// This buffer does not account for any slice offset
|
||||
const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
|
||||
|
||||
/// Raw pointer to the null bitmap.
|
||||
///
|
||||
/// Note that for `null_count == 0` or for null type, this will be null.
|
||||
/// This buffer does not account for any slice offset
|
||||
const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
|
||||
|
||||
/// Equality comparison with another array
|
||||
bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool Equals(const std::shared_ptr<Array>& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// \brief Return the formatted unified diff of arrow::Diff between this
|
||||
/// Array and another Array
|
||||
std::string Diff(const Array& other) const;
|
||||
|
||||
/// Approximate equality comparison with another array
|
||||
///
|
||||
/// epsilon is only used if this is FloatArray or DoubleArray
|
||||
bool ApproxEquals(const std::shared_ptr<Array>& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool ApproxEquals(const Array& arr,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// Compare if the range of slots specified are equal for the given array and
|
||||
/// this array. end_idx exclusive. This methods does not bounds check.
|
||||
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
|
||||
const Array& other,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
|
||||
const std::shared_ptr<Array>& other,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
|
||||
int64_t other_start_idx,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
|
||||
int64_t end_idx, int64_t other_start_idx,
|
||||
const EqualOptions& = EqualOptions::Defaults()) const;
|
||||
|
||||
/// \brief Apply the ArrayVisitor::Visit() method specialized to the array type
|
||||
Status Accept(ArrayVisitor* visitor) const;
|
||||
|
||||
/// Construct a zero-copy view of this array with the given type.
|
||||
///
|
||||
/// This method checks if the types are layout-compatible.
|
||||
/// Nested types are traversed in depth-first order. Data buffers must have
|
||||
/// the same item sizes, even though the logical types may be different.
|
||||
/// An error is returned if the types are not layout-compatible.
|
||||
Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;
|
||||
|
||||
/// Construct a zero-copy slice of the array with the indicated offset and
|
||||
/// length
|
||||
///
|
||||
/// \param[in] offset the position of the first element in the constructed
|
||||
/// slice
|
||||
/// \param[in] length the length of the slice. If there are not enough
|
||||
/// elements in the array, the length will be adjusted accordingly
|
||||
///
|
||||
/// \return a new object wrapped in std::shared_ptr<Array>
|
||||
std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
/// Slice from offset until end of the array
|
||||
std::shared_ptr<Array> Slice(int64_t offset) const;
|
||||
|
||||
/// Input-checking variant of Array::Slice
|
||||
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
|
||||
/// Input-checking variant of Array::Slice
|
||||
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
|
||||
|
||||
const std::shared_ptr<ArrayData>& data() const { return data_; }
|
||||
|
||||
int num_fields() const { return static_cast<int>(data_->child_data.size()); }
|
||||
|
||||
/// \return PrettyPrint representation of array suitable for debugging
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Perform cheap validation checks to determine obvious inconsistencies
|
||||
/// within the array's internal data.
|
||||
///
|
||||
/// This is O(k) where k is the number of descendents.
|
||||
///
|
||||
/// \return Status
|
||||
Status Validate() const;
|
||||
|
||||
/// \brief Perform extensive validation checks to determine inconsistencies
|
||||
/// within the array's internal data.
|
||||
///
|
||||
/// This is potentially O(k*n) where k is the number of descendents and n
|
||||
/// is the array length.
|
||||
///
|
||||
/// \return Status
|
||||
Status ValidateFull() const;
|
||||
|
||||
protected:
|
||||
Array() = default;
|
||||
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
|
||||
|
||||
std::shared_ptr<ArrayData> data_;
|
||||
const uint8_t* null_bitmap_data_ = NULLPTR;
|
||||
|
||||
/// Protected method for constructors
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
if (data->buffers.size() > 0) {
|
||||
null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
|
||||
} else {
|
||||
null_bitmap_data_ = NULLPTR;
|
||||
}
|
||||
data_ = data;
|
||||
}
|
||||
|
||||
private:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
|
||||
|
||||
ARROW_EXPORT friend void PrintTo(const Array& x, std::ostream* os);
|
||||
};
|
||||
|
||||
/// Stream the array's ToString() representation into `os` and return `os`.
static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
  return os << x.ToString();
}
|
||||
|
||||
/// Base class for non-nested arrays
|
||||
class ARROW_EXPORT FlatArray : public Array {
 protected:
  // Inherit the constructors of Array.
  using Array::Array;
};
|
||||
|
||||
/// Base class for arrays of fixed-size logical types
|
||||
class ARROW_EXPORT PrimitiveArray : public FlatArray {
 public:
  PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
                 const std::shared_ptr<Buffer>& data,
                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// The values buffer (buffers[1]). Does not account for any slice offset.
  std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }

 protected:
  PrimitiveArray() : raw_values_(NULLPTR) {}

  /// Store the ArrayData through the base class, then cache the raw pointer
  /// to the values buffer (buffers[1]).
  void SetData(const std::shared_ptr<ArrayData>& data) {
    Array::SetData(data);
    raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
  }

  explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

  // Cached raw pointer to buffers[1]; not adjusted for the slice offset.
  const uint8_t* raw_values_;
};
|
||||
|
||||
/// Degenerate null type Array
|
||||
class ARROW_EXPORT NullArray : public FlatArray {
 public:
  using TypeClass = NullType;

  explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
  explicit NullArray(int64_t length);

 private:
  /// Store the ArrayData, forcing its null count to equal its length and
  /// leaving the array without a validity bitmap pointer.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    data->null_count = data->length;
    null_bitmap_data_ = NULLPTR;
    data_ = data;
  }
};
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,269 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor classes for Binary, LargeBinary, String, LargeString,
|
||||
// FixedSizeBinary
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/stl_iterator.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/string_view.h" // IWYU pragma: export
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup binary-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary and String
|
||||
|
||||
/// Base class for variable-sized binary arrays, regardless of offset size
|
||||
/// and logical interpretation.
|
||||
template <typename TYPE>
|
||||
class BaseBinaryArray : public FlatArray {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
|
||||
|
||||
/// Return the pointer to the given elements bytes
|
||||
// XXX should GetValue(int64_t i) return a string_view?
|
||||
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
|
||||
// Account for base offset
|
||||
i += data_->offset;
|
||||
const offset_type pos = raw_value_offsets_[i];
|
||||
*out_length = raw_value_offsets_[i + 1] - pos;
|
||||
return raw_data_ + pos;
|
||||
}
|
||||
|
||||
/// \brief Get binary value as a string_view
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the view over the selected value
|
||||
util::string_view GetView(int64_t i) const {
|
||||
// Account for base offset
|
||||
i += data_->offset;
|
||||
const offset_type pos = raw_value_offsets_[i];
|
||||
return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
|
||||
raw_value_offsets_[i + 1] - pos);
|
||||
}
|
||||
|
||||
util::optional<util::string_view> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
/// \brief Get binary value as a string_view
|
||||
/// Provided for consistency with other arrays.
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the view over the selected value
|
||||
util::string_view Value(int64_t i) const { return GetView(i); }
|
||||
|
||||
/// \brief Get binary value as a std::string
|
||||
///
|
||||
/// \param i the value index
|
||||
/// \return the value copied into a std::string
|
||||
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
|
||||
|
||||
const offset_type* raw_value_offsets() const {
|
||||
return raw_value_offsets_ + data_->offset;
|
||||
}
|
||||
|
||||
const uint8_t* raw_data() const { return raw_data_; }
|
||||
|
||||
/// \brief Return the data buffer absolute offset of the data for the value
|
||||
/// at the passed index.
|
||||
///
|
||||
/// Does not perform boundschecking
|
||||
offset_type value_offset(int64_t i) const {
|
||||
return raw_value_offsets_[i + data_->offset];
|
||||
}
|
||||
|
||||
/// \brief Return the length of the data for the value at the passed index.
|
||||
///
|
||||
/// Does not perform boundschecking
|
||||
offset_type value_length(int64_t i) const {
|
||||
i += data_->offset;
|
||||
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
|
||||
}
|
||||
|
||||
/// \brief Return the total length of the memory in the data buffer
|
||||
/// referenced by this array. If the array has been sliced then this may be
|
||||
/// less than the size of the data buffer (data_->buffers[2]).
|
||||
offset_type total_values_length() const {
|
||||
if (data_->length > 0) {
|
||||
return raw_value_offsets_[data_->length + data_->offset] -
|
||||
raw_value_offsets_[data_->offset];
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
// For subclasses
|
||||
BaseBinaryArray() = default;
|
||||
|
||||
// Protected method for constructors
|
||||
void SetData(const std::shared_ptr<ArrayData>& data) {
|
||||
this->Array::SetData(data);
|
||||
raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
|
||||
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
|
||||
}
|
||||
|
||||
const offset_type* raw_value_offsets_ = NULLPTR;
|
||||
const uint8_t* raw_data_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// Concrete Array class for variable-size binary data
|
||||
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
 public:
  /// Construct from an existing ArrayData instance.
  explicit BinaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from an offsets buffer and a data buffer, with an optional
  /// validity bitmap, null count and slice offset.
  BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
              const std::shared_ptr<Buffer>& data,
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
              int64_t null_count = kUnknownNullCount, int64_t offset = 0);

 protected:
  // For subclasses such as StringArray
  BinaryArray() : BaseBinaryArray() {}
};
|
||||
|
||||
/// Concrete Array class for variable-size string (utf-8) data
|
||||
class ARROW_EXPORT StringArray : public BinaryArray {
 public:
  using TypeClass = StringType;

  /// Construct from an existing ArrayData instance.
  explicit StringArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from an offsets buffer and a data buffer, with an optional
  /// validity bitmap, null count and slice offset.
  StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
              const std::shared_ptr<Buffer>& data,
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
              int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Validate that this array contains only valid UTF8 entries
  ///
  /// This check is also implied by ValidateFull()
  Status ValidateUTF8() const;
};
|
||||
|
||||
/// Concrete Array class for large variable-size binary data
|
||||
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
 public:
  /// Construct from an existing ArrayData instance.
  explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from an offsets buffer and a data buffer, with an optional
  /// validity bitmap, null count and slice offset.
  LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
                   const std::shared_ptr<Buffer>& data,
                   const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);

 protected:
  // For subclasses such as LargeStringArray
  LargeBinaryArray() : BaseBinaryArray() {}
};
|
||||
|
||||
/// Concrete Array class for large variable-size string (utf-8) data
|
||||
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
 public:
  using TypeClass = LargeStringType;

  /// Construct from an existing ArrayData instance.
  explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from an offsets buffer and a data buffer, with an optional
  /// validity bitmap, null count and slice offset.
  LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
                   const std::shared_ptr<Buffer>& data,
                   const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Validate that this array contains only valid UTF8 entries
  ///
  /// This check is also implied by ValidateFull()
  Status ValidateUTF8() const;
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Fixed width binary
|
||||
|
||||
/// Concrete Array class for fixed-size binary data
|
||||
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
 public:
  using TypeClass = FixedSizeBinaryType;
  using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;

  /// Construct from an existing ArrayData instance.
  explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from a type and a data buffer, with an optional validity
  /// bitmap, null count and slice offset.
  FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// Pointer to the bytes of the value at index i.
  const uint8_t* GetValue(int64_t i) const;
  /// Same as GetValue(i).
  const uint8_t* Value(int64_t i) const { return GetValue(i); }

  /// View over the byte_width() bytes of the value at index i.
  util::string_view GetView(int64_t i) const {
    const char* bytes = reinterpret_cast<const char*>(GetValue(i));
    return util::string_view(bytes, byte_width());
  }

  /// Dereference an iterator positioned at index i.
  util::optional<util::string_view> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// The value at index i copied into a std::string.
  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  /// Width in bytes of each value, as reported by the array's type.
  int32_t byte_width() const { return byte_width_; }

  /// Raw pointer to the values, advanced by offset * byte_width bytes.
  const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  /// Store the ArrayData through PrimitiveArray, then cache the byte width
  /// taken from the array's FixedSizeBinaryType.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    PrimitiveArray::SetData(data);
    const std::shared_ptr<DataType> held_type = type();
    byte_width_ =
        internal::checked_cast<const FixedSizeBinaryType&>(*held_type).byte_width();
  }

  // Assigned by SetData().
  int32_t byte_width_;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,72 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal128Array
|
||||
|
||||
/// Concrete Array class for 128-bit decimal data
|
||||
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal128Type;

  // Inherit the constructors of FixedSizeBinaryArray.
  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal128Array from ArrayData instance
  explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);

  /// Format the value at index i as a string.
  std::string FormatValue(int64_t i) const;
};
|
||||
|
||||
// Backward compatibility
|
||||
// DecimalArray is kept as an alias of Decimal128Array.
using DecimalArray = Decimal128Array;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Decimal256Array
|
||||
|
||||
/// Concrete Array class for 256-bit decimal data
|
||||
class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal256Type;

  // Inherit the constructors of FixedSizeBinaryArray.
  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal256Array from ArrayData instance
  explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);

  /// Format the value at index i as a string.
  std::string FormatValue(int64_t i) const;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
180
.venv/Lib/site-packages/pyarrow/include/arrow/array/array_dict.h
Normal file
180
.venv/Lib/site-packages/pyarrow/include/arrow/array/array_dict.h
Normal file
@ -0,0 +1,180 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// DictionaryArray
|
||||
|
||||
/// \brief Array type for dictionary-encoded data with a
|
||||
/// data-dependent dictionary
|
||||
///
|
||||
/// A dictionary array contains an array of non-negative integers (the
|
||||
/// "dictionary indices") along with a data type containing a "dictionary"
|
||||
/// corresponding to the distinct values represented in the data.
|
||||
///
|
||||
/// For example, the array
|
||||
///
|
||||
/// ["foo", "bar", "foo", "bar", "foo", "bar"]
|
||||
///
|
||||
/// with dictionary ["bar", "foo"], would have dictionary array representation
|
||||
///
|
||||
/// indices: [1, 0, 1, 0, 1, 0]
|
||||
/// dictionary: ["bar", "foo"]
|
||||
///
|
||||
/// The indices in principle may be any integer type.
|
||||
class ARROW_EXPORT DictionaryArray : public Array {
 public:
  using TypeClass = DictionaryType;

  /// Construct from an existing ArrayData instance.
  explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);

  /// Construct from a dictionary type, an indices array and a dictionary.
  DictionaryArray(const std::shared_ptr<DataType>& type,
                  const std::shared_ptr<Array>& indices,
                  const std::shared_ptr<Array>& dictionary);

  /// \brief Construct DictionaryArray from dictionary and indices
  /// array and validate
  ///
  /// This function validates the indices and the input type. It checks that
  /// all indices are non-negative and smaller than the size of the dictionary.
  ///
  /// \param[in] type a dictionary type
  /// \param[in] dictionary the dictionary with same value type as the
  /// type object
  /// \param[in] indices an array of non-negative integers smaller than the
  /// size of the dictionary
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
      const std::shared_ptr<Array>& dictionary);

  /// Overload that builds the dictionary type from the types of `indices`
  /// and `dictionary`, then forwards to the three-argument FromArrays.
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
    const std::shared_ptr<DataType> inferred_type =
        ::arrow::dictionary(indices->type(), dictionary->type());
    return FromArrays(inferred_type, indices, dictionary);
  }

  /// \brief Transpose this DictionaryArray
  ///
  /// This method constructs a new dictionary array with the given dictionary
  /// type, transposing indices using the transpose map. The type and the
  /// transpose map are typically computed using DictionaryUnifier.
  ///
  /// \param[in] type the new type object
  /// \param[in] dictionary the new dictionary
  /// \param[in] transpose_map transposition array of this array's indices
  /// into the target array's indices
  /// \param[in] pool a pool to allocate the array data from
  Result<std::shared_ptr<Array>> Transpose(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
      const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;

  /// \brief Determine whether dictionary arrays may be compared without unification
  bool CanCompareIndices(const DictionaryArray& other) const;

  /// \brief Return the dictionary for this array, which is stored as
  /// a member of the ArrayData internal structure
  std::shared_ptr<Array> dictionary() const;
  /// Return the indices array.
  std::shared_ptr<Array> indices() const;

  /// \brief Return the ith value of indices, cast to int64_t. Not recommended
  /// for use in performance-sensitive code. Does not validate whether the
  /// value is null or out-of-bounds.
  int64_t GetValueIndex(int64_t i) const;

  /// The cached DictionaryType pointer.
  const DictionaryType* dict_type() const { return dict_type_; }

 private:
  void SetData(const std::shared_ptr<ArrayData>& data);
  const DictionaryType* dict_type_;
  std::shared_ptr<Array> indices_;

  // Lazily initialized when invoking dictionary()
  mutable std::shared_ptr<Array> dictionary_;
};
|
||||
|
||||
/// \brief Helper class for incremental dictionary unification
|
||||
class ARROW_EXPORT DictionaryUnifier {
|
||||
public:
|
||||
virtual ~DictionaryUnifier() = default;
|
||||
|
||||
/// \brief Construct a DictionaryUnifier
|
||||
/// \param[in] value_type the data type of the dictionaries
|
||||
/// \param[in] pool MemoryPool to use for memory allocations
|
||||
static Result<std::unique_ptr<DictionaryUnifier>> Make(
|
||||
std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Unify dictionaries accross array chunks
|
||||
///
|
||||
/// The dictionaries in the array chunks will be unified, their indices
|
||||
/// accordingly transposed.
|
||||
///
|
||||
/// Only dictionaries with a primitive value type are currently supported.
|
||||
/// However, dictionaries nested inside a more complex type are correctly unified.
|
||||
static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
|
||||
const std::shared_ptr<ChunkedArray>& array,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Unify dictionaries accross the chunks of each table column
|
||||
///
|
||||
/// The dictionaries in each table column will be unified, their indices
|
||||
/// accordingly transposed.
|
||||
///
|
||||
/// Only dictionaries with a primitive value type are currently supported.
|
||||
/// However, dictionaries nested inside a more complex type are correctly unified.
|
||||
static Result<std::shared_ptr<Table>> UnifyTable(
|
||||
const Table& table, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Append dictionary to the internal memo
|
||||
virtual Status Unify(const Array& dictionary) = 0;
|
||||
|
||||
/// \brief Append dictionary and compute transpose indices
|
||||
/// \param[in] dictionary the dictionary values to unify
|
||||
/// \param[out] out_transpose a Buffer containing computed transpose indices
|
||||
/// as int32_t values equal in length to the passed dictionary. The value in
|
||||
/// each slot corresponds to the new index value for each original index
|
||||
/// for a DictionaryArray with the old dictionary
|
||||
virtual Status Unify(const Array& dictionary,
|
||||
std::shared_ptr<Buffer>* out_transpose) = 0;
|
||||
|
||||
/// \brief Return a result DictionaryType with the smallest possible index
|
||||
/// type to accommodate the unified dictionary. The unifier cannot be used
|
||||
/// after this is called
|
||||
virtual Status GetResult(std::shared_ptr<DataType>* out_type,
|
||||
std::shared_ptr<Array>* out_dict) = 0;
|
||||
|
||||
/// \brief Return a unified dictionary with the given index type. If
|
||||
/// the index type is not large enough then an invalid status will be returned.
|
||||
/// The unifier cannot be used after this is called
|
||||
virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
|
||||
std::shared_ptr<Array>* out_dict) = 0;
|
||||
};
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,569 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and
|
||||
// Union
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ListArray
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListArray;
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Private helper for ListArray::SetData.
|
||||
// Unfortunately, trying to define BaseListArray::SetData outside of this header
|
||||
// doesn't play well with MSVC.
|
||||
template <typename TYPE>
|
||||
void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
|
||||
Type::type expected_type_id = TYPE::type_id);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// Base class for variable-sized list arrays, regardless of offset size.
|
||||
template <typename TYPE>
|
||||
class BaseListArray : public Array {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
const TypeClass* list_type() const { return list_type_; }
|
||||
|
||||
/// \brief Return array object containing the list's values
|
||||
std::shared_ptr<Array> values() const { return values_; }
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
|
||||
|
||||
std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
|
||||
|
||||
/// Return pointer to raw value offsets accounting for any slice offset
|
||||
const offset_type* raw_value_offsets() const {
|
||||
return raw_value_offsets_ + data_->offset;
|
||||
}
|
||||
|
||||
// The following functions will not perform boundschecking
|
||||
offset_type value_offset(int64_t i) const {
|
||||
return raw_value_offsets_[i + data_->offset];
|
||||
}
|
||||
offset_type value_length(int64_t i) const {
|
||||
i += data_->offset;
|
||||
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
|
||||
}
|
||||
std::shared_ptr<Array> value_slice(int64_t i) const {
|
||||
return values_->Slice(value_offset(i), value_length(i));
|
||||
}
|
||||
|
||||
protected:
|
||||
friend void internal::SetListData<TYPE>(BaseListArray<TYPE>* self,
|
||||
const std::shared_ptr<ArrayData>& data,
|
||||
Type::type expected_type_id);
|
||||
|
||||
const TypeClass* list_type_ = NULLPTR;
|
||||
std::shared_ptr<Array> values_;
|
||||
const offset_type* raw_value_offsets_ = NULLPTR;
|
||||
};
|
||||
|
||||
/// Concrete Array class for list data
|
||||
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
|
||||
public:
|
||||
explicit ListArray(std::shared_ptr<ArrayData> data);
|
||||
|
||||
ListArray(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct ListArray from array of offsets and child value array
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types, and will allocate a new offsets array if necessary (i.e. if
|
||||
/// the offsets contain any nulls). If the offsets do not have nulls, they
|
||||
/// are assumed to be well-formed
|
||||
///
|
||||
/// \param[in] offsets Array containing n + 1 offsets encoding length and
|
||||
/// size. Must be of int32 type
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] pool MemoryPool in case new offsets array needs to be
|
||||
/// allocated because of null values
|
||||
static Result<std::shared_ptr<ListArray>> FromArrays(
|
||||
const Array& offsets, const Array& values,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
static Result<std::shared_ptr<ListArray>> FromArrays(
|
||||
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Return an Array that is a concatenation of the lists in this array.
|
||||
///
|
||||
/// Note that it's different from `values()` in that it takes into
|
||||
/// consideration of this array's offsets as well as null elements backed
|
||||
/// by non-empty lists (they are skipped, thus copying may be needed).
|
||||
Result<std::shared_ptr<Array>> Flatten(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Return list offsets as an Int32Array
|
||||
///
|
||||
/// The returned array will not have a validity bitmap, so you cannot expect
|
||||
/// to pass it to ListArray::FromArrays() and get back the same list array
|
||||
/// if the original one has nulls.
|
||||
std::shared_ptr<Array> offsets() const;
|
||||
|
||||
protected:
|
||||
// This constructor defers SetData to a derived array class
|
||||
ListArray() = default;
|
||||
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
};
|
||||
|
||||
/// Concrete Array class for large list data (with 64-bit offsets)
|
||||
class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
|
||||
public:
|
||||
explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Array>& values,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct LargeListArray from array of offsets and child value array
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types, and will allocate a new offsets array if necessary (i.e. if
|
||||
/// the offsets contain any nulls). If the offsets do not have nulls, they
|
||||
/// are assumed to be well-formed
|
||||
///
|
||||
/// \param[in] offsets Array containing n + 1 offsets encoding length and
|
||||
/// size. Must be of int64 type
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] pool MemoryPool in case new offsets array needs to be
|
||||
/// allocated because of null values
|
||||
static Result<std::shared_ptr<LargeListArray>> FromArrays(
|
||||
const Array& offsets, const Array& values,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
static Result<std::shared_ptr<LargeListArray>> FromArrays(
|
||||
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Return an Array that is a concatenation of the lists in this array.
|
||||
///
|
||||
/// Note that it's different from `values()` in that it takes into
|
||||
/// consideration of this array's offsets as well as null elements backed
|
||||
/// by non-empty lists (they are skipped, thus copying may be needed).
|
||||
Result<std::shared_ptr<Array>> Flatten(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Return list offsets as an Int64Array
|
||||
std::shared_ptr<Array> offsets() const;
|
||||
|
||||
protected:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// MapArray
|
||||
|
||||
/// Concrete Array class for map data
|
||||
///
|
||||
/// NB: "value" in this context refers to a pair of a key and the corresponding item
|
||||
class ARROW_EXPORT MapArray : public ListArray {
|
||||
public:
|
||||
using TypeClass = MapType;
|
||||
|
||||
explicit MapArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
MapArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
MapArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& value_offsets,
|
||||
const std::shared_ptr<Array>& values,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct MapArray from array of offsets and child key, item arrays
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types, and will allocate a new offsets array if necessary (i.e. if
|
||||
/// the offsets contain any nulls). If the offsets do not have nulls, they
|
||||
/// are assumed to be well-formed
|
||||
///
|
||||
/// \param[in] offsets Array containing n + 1 offsets encoding length and
|
||||
/// size. Must be of int32 type
|
||||
/// \param[in] keys Array containing key values
|
||||
/// \param[in] items Array containing item values
|
||||
/// \param[in] pool MemoryPool in case new offsets array needs to be
|
||||
/// allocated because of null values
|
||||
static Result<std::shared_ptr<Array>> FromArrays(
|
||||
const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
|
||||
const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
static Result<std::shared_ptr<Array>> FromArrays(
|
||||
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
|
||||
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
const MapType* map_type() const { return map_type_; }
|
||||
|
||||
/// \brief Return array object containing all map keys
|
||||
std::shared_ptr<Array> keys() const { return keys_; }
|
||||
|
||||
/// \brief Return array object containing all mapped items
|
||||
std::shared_ptr<Array> items() const { return items_; }
|
||||
|
||||
/// Validate child data before constructing the actual MapArray.
|
||||
static Status ValidateChildData(
|
||||
const std::vector<std::shared_ptr<ArrayData>>& child_data);
|
||||
|
||||
protected:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
static Result<std::shared_ptr<Array>> FromArraysInternal(
|
||||
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
|
||||
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
|
||||
MemoryPool* pool);
|
||||
|
||||
private:
|
||||
const MapType* map_type_;
|
||||
std::shared_ptr<Array> keys_, items_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeListArray
|
||||
|
||||
/// Concrete Array class for fixed size list data
|
||||
class ARROW_EXPORT FixedSizeListArray : public Array {
|
||||
public:
|
||||
using TypeClass = FixedSizeListType;
|
||||
using offset_type = TypeClass::offset_type;
|
||||
|
||||
explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Array>& values,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
const FixedSizeListType* list_type() const;
|
||||
|
||||
/// \brief Return array object containing the list's values
|
||||
std::shared_ptr<Array> values() const;
|
||||
|
||||
std::shared_ptr<DataType> value_type() const;
|
||||
|
||||
// The following functions will not perform boundschecking
|
||||
int64_t value_offset(int64_t i) const {
|
||||
i += data_->offset;
|
||||
return list_size_ * i;
|
||||
}
|
||||
int32_t value_length(int64_t i = 0) const {
|
||||
ARROW_UNUSED(i);
|
||||
return list_size_;
|
||||
}
|
||||
std::shared_ptr<Array> value_slice(int64_t i) const {
|
||||
return values_->Slice(value_offset(i), value_length(i));
|
||||
}
|
||||
|
||||
/// \brief Return an Array that is a concatenation of the lists in this array.
|
||||
///
|
||||
/// Note that it's different from `values()` in that it takes into
|
||||
/// consideration null elements (they are skipped, thus copying may be needed).
|
||||
Result<std::shared_ptr<Array>> Flatten(
|
||||
MemoryPool* memory_pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Construct FixedSizeListArray from child value array and value_length
|
||||
///
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] list_size The fixed length of each list
|
||||
/// \return Will have length equal to values.length() / list_size
|
||||
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
|
||||
int32_t list_size);
|
||||
|
||||
/// \brief Construct FixedSizeListArray from child value array and type
|
||||
///
|
||||
/// \param[in] values Array containing list values
|
||||
/// \param[in] type The fixed sized list type
|
||||
/// \return Will have length equal to values.length() / type.list_size()
|
||||
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
|
||||
std::shared_ptr<DataType> type);
|
||||
|
||||
protected:
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
int32_t list_size_;
|
||||
|
||||
private:
|
||||
std::shared_ptr<Array> values_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Struct
|
||||
|
||||
/// Concrete Array class for struct data
|
||||
class ARROW_EXPORT StructArray : public Array {
|
||||
public:
|
||||
using TypeClass = StructType;
|
||||
|
||||
explicit StructArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
StructArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::vector<std::shared_ptr<Array>>& children,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Return a StructArray from child arrays and field names.
|
||||
///
|
||||
/// The length and data type are automatically inferred from the arguments.
|
||||
/// There should be at least one child array.
|
||||
static Result<std::shared_ptr<StructArray>> Make(
|
||||
const ArrayVector& children, const std::vector<std::string>& field_names,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
/// \brief Return a StructArray from child arrays and fields.
|
||||
///
|
||||
/// The length is automatically inferred from the arguments.
|
||||
/// There should be at least one child array. This method does not
|
||||
/// check that field types and child array types are consistent.
|
||||
static Result<std::shared_ptr<StructArray>> Make(
|
||||
const ArrayVector& children, const FieldVector& fields,
|
||||
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
const StructType* struct_type() const;
|
||||
|
||||
// Return a shared pointer in case the requestor desires to share ownership
|
||||
// with this array. The returned array has its offset, length and null
|
||||
// count adjusted.
|
||||
std::shared_ptr<Array> field(int pos) const;
|
||||
|
||||
const ArrayVector& fields() const;
|
||||
|
||||
/// Returns null if name not found
|
||||
std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
|
||||
|
||||
/// \brief Flatten this array as a vector of arrays, one for each field
|
||||
///
|
||||
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
|
||||
Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
/// \brief Get one of the child arrays, combining its null bitmap
|
||||
/// with the parent struct array's bitmap.
|
||||
///
|
||||
/// \param[in] index Which child array to get
|
||||
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
|
||||
Result<std::shared_ptr<Array>> GetFlattenedField(
|
||||
int index, MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
private:
|
||||
// For caching boxed child data
|
||||
// XXX This is not handled in a thread-safe manner.
|
||||
mutable ArrayVector boxed_fields_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Union
|
||||
|
||||
/// Base class for SparseUnionArray and DenseUnionArray
|
||||
class ARROW_EXPORT UnionArray : public Array {
|
||||
public:
|
||||
using type_code_t = int8_t;
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }
|
||||
|
||||
const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }
|
||||
|
||||
/// The logical type code of the value at index.
|
||||
type_code_t type_code(int64_t i) const { return raw_type_codes_[i + data_->offset]; }
|
||||
|
||||
/// The physical child id containing value at index.
|
||||
int child_id(int64_t i) const {
|
||||
return union_type_->child_ids()[raw_type_codes_[i + data_->offset]];
|
||||
}
|
||||
|
||||
const UnionType* union_type() const { return union_type_; }
|
||||
|
||||
UnionMode::type mode() const { return union_type_->mode(); }
|
||||
|
||||
/// \brief Return the given field as an individual array.
|
||||
///
|
||||
/// For sparse unions, the returned array has its offset, length and null
|
||||
/// count adjusted.
|
||||
std::shared_ptr<Array> field(int pos) const;
|
||||
|
||||
protected:
|
||||
void SetData(std::shared_ptr<ArrayData> data);
|
||||
|
||||
const type_code_t* raw_type_codes_;
|
||||
const UnionType* union_type_;
|
||||
|
||||
// For caching boxed child data
|
||||
mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
|
||||
};
|
||||
|
||||
/// Concrete Array class for sparse union data
|
||||
class ARROW_EXPORT SparseUnionArray : public UnionArray {
|
||||
public:
|
||||
using TypeClass = SparseUnionType;
|
||||
|
||||
explicit SparseUnionArray(std::shared_ptr<ArrayData> data);
|
||||
|
||||
SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
|
||||
std::shared_ptr<Buffer> type_ids, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct SparseUnionArray from type_ids and children
|
||||
///
|
||||
/// This function does the bare minimum of validation of the input types.
|
||||
///
|
||||
/// \param[in] type_ids An array of logical type ids for the union type
|
||||
/// \param[in] children Vector of children Arrays containing the data for each type.
|
||||
/// \param[in] type_codes Vector of type codes.
|
||||
static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
|
||||
std::vector<type_code_t> type_codes) {
|
||||
return Make(std::move(type_ids), std::move(children), std::vector<std::string>{},
|
||||
std::move(type_codes));
|
||||
}
|
||||
|
||||
/// \brief Construct SparseUnionArray with custom field names from type_ids and children
|
||||
///
|
||||
/// This function does the bare minimum of validation of the input types.
|
||||
///
|
||||
/// \param[in] type_ids An array of logical type ids for the union type
|
||||
/// \param[in] children Vector of children Arrays containing the data for each type.
|
||||
/// \param[in] field_names Vector of strings containing the name of each field.
|
||||
/// \param[in] type_codes Vector of type codes.
|
||||
static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
|
||||
std::vector<std::string> field_names = {},
|
||||
std::vector<type_code_t> type_codes = {});
|
||||
|
||||
const SparseUnionType* union_type() const {
|
||||
return internal::checked_cast<const SparseUnionType*>(union_type_);
|
||||
}
|
||||
|
||||
/// \brief Get one of the child arrays, adjusting its null bitmap
|
||||
/// where the union array type code does not match.
|
||||
///
|
||||
/// \param[in] index Which child array to get (i.e. the physical index, not the type
|
||||
/// code) \param[in] pool The pool to allocate null bitmaps from, if necessary
|
||||
Result<std::shared_ptr<Array>> GetFlattenedField(
|
||||
int index, MemoryPool* pool = default_memory_pool()) const;
|
||||
|
||||
protected:
|
||||
void SetData(std::shared_ptr<ArrayData> data);
|
||||
};
|
||||
|
||||
/// \brief Concrete Array class for dense union data
|
||||
///
|
||||
/// Note that union types do not have a validity bitmap
|
||||
class ARROW_EXPORT DenseUnionArray : public UnionArray {
|
||||
public:
|
||||
using TypeClass = DenseUnionType;
|
||||
|
||||
explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
|
||||
std::shared_ptr<Buffer> type_ids,
|
||||
std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);
|
||||
|
||||
/// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types.
|
||||
///
|
||||
/// \param[in] type_ids An array of logical type ids for the union type
|
||||
/// \param[in] value_offsets An array of signed int32 values indicating the
|
||||
/// relative offset into the respective child array for the type in a given slot.
|
||||
/// The respective offsets for each child value array must be in order / increasing.
|
||||
/// \param[in] children Vector of children Arrays containing the data for each type.
|
||||
/// \param[in] type_codes Vector of type codes.
|
||||
static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
|
||||
const Array& value_offsets,
|
||||
ArrayVector children,
|
||||
std::vector<type_code_t> type_codes) {
|
||||
return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
|
||||
std::move(type_codes));
|
||||
}
|
||||
|
||||
/// \brief Construct DenseUnionArray with custom field names from type_ids,
|
||||
/// value_offsets, and children
|
||||
///
|
||||
/// This function does the bare minimum of validation of the offsets and
|
||||
/// input types.
|
||||
///
|
||||
/// \param[in] type_ids An array of logical type ids for the union type
|
||||
/// \param[in] value_offsets An array of signed int32 values indicating the
|
||||
/// relative offset into the respective child array for the type in a given slot.
|
||||
/// The respective offsets for each child value array must be in order / increasing.
|
||||
/// \param[in] children Vector of children Arrays containing the data for each type.
|
||||
/// \param[in] field_names Vector of strings containing the name of each field.
|
||||
/// \param[in] type_codes Vector of type codes.
|
||||
static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
|
||||
const Array& value_offsets,
|
||||
ArrayVector children,
|
||||
std::vector<std::string> field_names = {},
|
||||
std::vector<type_code_t> type_codes = {});
|
||||
|
||||
const DenseUnionType* union_type() const {
|
||||
return internal::checked_cast<const DenseUnionType*>(union_type_);
|
||||
}
|
||||
|
||||
/// Note that this buffer does not account for any slice offset
|
||||
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
|
||||
|
||||
int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
|
||||
|
||||
const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
|
||||
|
||||
protected:
|
||||
const int32_t* raw_value_offsets_;
|
||||
|
||||
void SetData(const std::shared_ptr<ArrayData>& data);
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,202 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Array accessor types for primitive/C-type-based arrays, such as numbers,
|
||||
// boolean, and temporal types.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/stl_iterator.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h" // IWYU pragma: export
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// Concrete Array class for boolean data
|
||||
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = BooleanType;
|
||||
using IteratorType = stl::ArrayIterator<BooleanArray>;
|
||||
|
||||
explicit BooleanArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
bool Value(int64_t i) const {
|
||||
return bit_util::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
|
||||
i + data_->offset);
|
||||
}
|
||||
|
||||
bool GetView(int64_t i) const { return Value(i); }
|
||||
|
||||
util::optional<bool> operator[](int64_t i) const { return *IteratorType(*this, i); }
|
||||
|
||||
/// \brief Return the number of false (0) values among the valid
|
||||
/// values. Result is not cached.
|
||||
int64_t false_count() const;
|
||||
|
||||
/// \brief Return the number of true (1) values among the valid
|
||||
/// values. Result is not cached.
|
||||
int64_t true_count() const;
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
using PrimitiveArray::PrimitiveArray;
|
||||
};
|
||||
|
||||
/// \addtogroup numeric-arrays
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Concrete Array class for numeric data with a corresponding C type
|
||||
///
|
||||
/// This class is templated on the corresponding DataType subclass for the
|
||||
/// given data, for example NumericArray<Int8Type> or NumericArray<Date32Type>.
|
||||
///
|
||||
/// Note that convenience aliases are available for all accepted types
|
||||
/// (for example Int8Array for NumericArray<Int8Type>).
|
||||
template <typename TYPE>
|
||||
class NumericArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using value_type = typename TypeClass::c_type;
|
||||
using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
|
||||
|
||||
explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}
|
||||
|
||||
// Only enable this constructor without a type argument for types without additional
|
||||
// metadata
|
||||
template <typename T1 = TYPE>
|
||||
NumericArray(enable_if_parameter_free<T1, int64_t> length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
|
||||
null_count, offset) {}
|
||||
|
||||
const value_type* raw_values() const {
|
||||
return reinterpret_cast<const value_type*>(raw_values_) + data_->offset;
|
||||
}
|
||||
|
||||
value_type Value(int64_t i) const { return raw_values()[i]; }
|
||||
|
||||
// For API compatibility with BinaryArray etc.
|
||||
value_type GetView(int64_t i) const { return Value(i); }
|
||||
|
||||
util::optional<value_type> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
protected:
|
||||
using PrimitiveArray::PrimitiveArray;
|
||||
};
|
||||
|
||||
/// DayTimeArray
|
||||
/// ---------------------
|
||||
/// \brief Array of Day and Millisecond values.
|
||||
class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = DayTimeIntervalType;
|
||||
using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;
|
||||
|
||||
explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
DayTimeIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
TypeClass::DayMilliseconds GetValue(int64_t i) const;
|
||||
TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); }
|
||||
|
||||
// For compatibility with Take kernel.
|
||||
TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
util::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); }
|
||||
|
||||
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
|
||||
};
|
||||
|
||||
/// \brief Array of Month, Day and nanosecond values.
|
||||
class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
|
||||
public:
|
||||
using TypeClass = MonthDayNanoIntervalType;
|
||||
using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;
|
||||
|
||||
explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
|
||||
const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
|
||||
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
TypeClass::MonthDayNanos GetValue(int64_t i) const;
|
||||
TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); }
|
||||
|
||||
// For compatibility with Take kernel.
|
||||
TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }
|
||||
|
||||
IteratorType begin() const { return IteratorType(*this); }
|
||||
|
||||
IteratorType end() const { return IteratorType(*this, length()); }
|
||||
|
||||
util::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
|
||||
return *IteratorType(*this, i);
|
||||
}
|
||||
|
||||
int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); }
|
||||
|
||||
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,213 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// Shared implementation of the adaptive integer builders.
///
/// Single-value appends are staged in a fixed-size pending buffer
/// (pending_data_ / pending_valid_) and handed to CommitPendingData() once
/// the buffer is full; bulk appends commit the pending data first.
class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
 public:
  AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool);

  /// Delegates to the two-argument constructor with
  /// start_int_size = sizeof(uint8_t).
  explicit AdaptiveIntBuilderBase(MemoryPool* pool)
      : AdaptiveIntBuilderBase(sizeof(uint8_t), pool) {}

  /// \brief Append multiple nulls
  /// \param[in] length the number of nulls to append
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (ARROW_PREDICT_FALSE(length <= 0)) {
      return Status::OK();
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    // Zero the value slots that back the new null entries.
    memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Stage a single null in the pending buffer
  Status AppendNull() final {
    const int32_t slot = pending_pos_++;
    pending_data_[slot] = 0;
    pending_valid_[slot] = 0;
    pending_has_nulls_ = true;
    ++length_;
    ++null_count_;

    if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
      return Status::OK();
    }
    return CommitPendingData();
  }

  /// \brief Append multiple zero-initialized, non-null values
  /// \param[in] length the number of values to append
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (ARROW_PREDICT_FALSE(length <= 0)) {
      return Status::OK();
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Stage a single non-null zero in the pending buffer
  Status AppendEmptyValue() final {
    // Exactly the steps AppendInternal() performs for the value 0.
    return AppendInternal(0);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;

 protected:
  /// Stage `val` as a valid entry in the pending buffer; commits the pending
  /// data once the buffer is full.
  Status AppendInternal(const uint64_t val) {
    const int32_t slot = pending_pos_++;
    pending_data_[slot] = val;
    pending_valid_[slot] = 1;
    ++length_;

    if (ARROW_PREDICT_TRUE(pending_pos_ < pending_size_)) {
      return Status::OK();
    }
    return CommitPendingData();
  }

  virtual Status CommitPendingData() = 0;

  template <typename new_type, typename old_type>
  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
  ExpandIntSizeInternal();
  template <typename new_type, typename old_type>
  typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
  ExpandIntSizeInternal();

  std::shared_ptr<ResizableBuffer> data_;
  uint8_t* raw_data_ = NULLPTR;

  const uint8_t start_int_size_;
  uint8_t int_size_;

  // Capacity of the pending buffer, in entries.
  static constexpr int32_t pending_size_ = 1024;
  uint8_t pending_valid_[pending_size_];
  uint64_t pending_data_[pending_size_];
  // Index of the next free pending slot.
  int32_t pending_pos_ = 0;
  bool pending_has_nulls_ = false;
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// Adaptive builder taking unsigned 64-bit input values.
class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
 public:
  explicit AdaptiveUIntBuilder(uint8_t start_int_size,
                               MemoryPool* pool = default_memory_pool());

  /// Delegates to the two-argument constructor with
  /// start_int_size = sizeof(uint8_t).
  explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
      : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}

  using ArrayBuilder::Advance;
  using internal::AdaptiveIntBuilderBase::Reset;

  /// Scalar append; forwards to AppendInternal().
  Status Append(const uint64_t val) {
    return AppendInternal(val);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const uint64_t* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override;

 protected:
  Status CommitPendingData() override;
  Status ExpandIntSize(uint8_t new_int_size);

  Status AppendValuesInternal(const uint64_t* values, int64_t length,
                              const uint8_t* valid_bytes);

  template <typename new_type>
  Status ExpandIntSizeN();
};
|
||||
|
||||
/// Adaptive builder taking signed 64-bit input values.
class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
 public:
  explicit AdaptiveIntBuilder(uint8_t start_int_size,
                              MemoryPool* pool = default_memory_pool());

  /// Delegates to the two-argument constructor with
  /// start_int_size = sizeof(uint8_t).
  explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool())
      : AdaptiveIntBuilder(sizeof(uint8_t), pool) {}

  using ArrayBuilder::Advance;
  using internal::AdaptiveIntBuilderBase::Reset;

  /// Scalar append; the signed value is cast to uint64_t before being
  /// forwarded to AppendInternal().
  Status Append(const int64_t val) {
    const auto as_unsigned = static_cast<uint64_t>(val);
    return AppendInternal(as_unsigned);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const int64_t* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override;

 protected:
  Status CommitPendingData() override;
  Status ExpandIntSize(uint8_t new_int_size);

  Status AppendValuesInternal(const int64_t* values, int64_t length,
                              const uint8_t* valid_bytes);

  template <typename new_type>
  Status ExpandIntSizeN();
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,350 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm> // IWYU pragma: keep
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_primitive.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \defgroup numeric-builders Concrete builder subclasses for numeric types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup temporal-builders Concrete builder subclasses for temporal types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup binary-builders Concrete builder subclasses for binary types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup nested-builders Concrete builder subclasses for nested types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
|
||||
/// @{
|
||||
/// @}
|
||||
|
||||
constexpr int64_t kMinBuilderCapacity = 1 << 5;
|
||||
constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
|
||||
|
||||
/// Base class for all data array builders.
|
||||
///
|
||||
/// This class provides a facilities for incrementally building the null bitmap
|
||||
/// (see Append methods) and as a side effect the current number of slots and
|
||||
/// the null count.
|
||||
///
|
||||
/// \note Users are expected to use builders as one of the concrete types below.
|
||||
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
|
||||
class ARROW_EXPORT ArrayBuilder {
|
||||
public:
|
||||
explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}
|
||||
|
||||
ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
|
||||
|
||||
virtual ~ArrayBuilder() = default;
|
||||
|
||||
/// For nested types. Since the objects are owned by this class instance, we
|
||||
/// skip shared pointers and just return a raw pointer
|
||||
ArrayBuilder* child(int i) { return children_[i].get(); }
|
||||
|
||||
const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
|
||||
|
||||
int num_children() const { return static_cast<int>(children_.size()); }
|
||||
|
||||
virtual int64_t length() const { return length_; }
|
||||
int64_t null_count() const { return null_count_; }
|
||||
int64_t capacity() const { return capacity_; }
|
||||
|
||||
/// \brief Ensure that enough memory has been allocated to fit the indicated
|
||||
/// number of total elements in the builder, including any that have already
|
||||
/// been appended. Does not account for reallocations that may be due to
|
||||
/// variable size data, like binary values. To make space for incremental
|
||||
/// appends, use Reserve instead.
|
||||
///
|
||||
/// \param[in] capacity the minimum number of total array values to
|
||||
/// accommodate. Must be greater than the current capacity.
|
||||
/// \return Status
|
||||
virtual Status Resize(int64_t capacity);
|
||||
|
||||
/// \brief Ensure that there is enough space allocated to append the indicated
|
||||
/// number of elements without any further reallocation. Overallocation is
|
||||
/// used in order to minimize the impact of incremental Reserve() calls.
|
||||
/// Note that additional_capacity is relative to the current number of elements
|
||||
/// rather than to the current capacity, so calls to Reserve() which are not
|
||||
/// interspersed with addition of new elements may not increase the capacity.
|
||||
///
|
||||
/// \param[in] additional_capacity the number of additional array values
|
||||
/// \return Status
|
||||
Status Reserve(int64_t additional_capacity) {
|
||||
auto current_capacity = capacity();
|
||||
auto min_capacity = length() + additional_capacity;
|
||||
if (min_capacity <= current_capacity) return Status::OK();
|
||||
|
||||
// leave growth factor up to BufferBuilder
|
||||
auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
|
||||
return Resize(new_capacity);
|
||||
}
|
||||
|
||||
/// Reset the builder.
|
||||
virtual void Reset();
|
||||
|
||||
/// \brief Append a null value to builder
|
||||
virtual Status AppendNull() = 0;
|
||||
/// \brief Append a number of null values to builder
|
||||
virtual Status AppendNulls(int64_t length) = 0;
|
||||
|
||||
/// \brief Append a non-null value to builder
|
||||
///
|
||||
/// The appended value is an implementation detail, but the corresponding
|
||||
/// memory slot is guaranteed to be initialized.
|
||||
/// This method is useful when appending a null value to a parent nested type.
|
||||
virtual Status AppendEmptyValue() = 0;
|
||||
|
||||
/// \brief Append a number of non-null values to builder
|
||||
///
|
||||
/// The appended values are an implementation detail, but the corresponding
|
||||
/// memory slot is guaranteed to be initialized.
|
||||
/// This method is useful when appending null values to a parent nested type.
|
||||
virtual Status AppendEmptyValues(int64_t length) = 0;
|
||||
|
||||
/// \brief Append a value from a scalar
|
||||
Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
|
||||
virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
|
||||
virtual Status AppendScalars(const ScalarVector& scalars);
|
||||
|
||||
/// \brief Append a range of values from an array.
|
||||
///
|
||||
/// The given array must be the same type as the builder.
|
||||
virtual Status AppendArraySlice(const ArrayData& array, int64_t offset,
|
||||
int64_t length) {
|
||||
return Status::NotImplemented("AppendArraySlice for builder for ", *type());
|
||||
}
|
||||
|
||||
/// For cases where raw data was memcpy'd into the internal buffers, allows us
|
||||
/// to advance the length of the builder. It is your responsibility to use
|
||||
/// this function responsibly.
|
||||
ARROW_DEPRECATED(
|
||||
"Deprecated in 6.0.0. ArrayBuilder::Advance is poorly supported and mostly "
|
||||
"untested.\nFor low-level control over buffer construction, use BufferBuilder "
|
||||
"or TypedBufferBuilder directly.")
|
||||
Status Advance(int64_t elements);
|
||||
|
||||
/// \brief Return result of builder as an internal generic ArrayData
|
||||
/// object. Resets builder except for dictionary builder
|
||||
///
|
||||
/// \param[out] out the finalized ArrayData object
|
||||
/// \return Status
|
||||
virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
|
||||
|
||||
/// \brief Return result of builder as an Array object.
|
||||
///
|
||||
/// The builder is reset except for DictionaryBuilder.
|
||||
///
|
||||
/// \param[out] out the finalized Array object
|
||||
/// \return Status
|
||||
Status Finish(std::shared_ptr<Array>* out);
|
||||
|
||||
/// \brief Return result of builder as an Array object.
|
||||
///
|
||||
/// The builder is reset except for DictionaryBuilder.
|
||||
///
|
||||
/// \return The finalized Array object
|
||||
Result<std::shared_ptr<Array>> Finish();
|
||||
|
||||
/// \brief Return the type of the built Array
|
||||
virtual std::shared_ptr<DataType> type() const = 0;
|
||||
|
||||
protected:
|
||||
/// Append to null bitmap
|
||||
Status AppendToBitmap(bool is_valid);
|
||||
|
||||
/// Vector append. Treat each zero byte as a null. If valid_bytes is null
|
||||
/// assume all of length bits are valid.
|
||||
Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
|
||||
|
||||
/// Uniform append. Append N times the same validity bit.
|
||||
Status AppendToBitmap(int64_t num_bits, bool value);
|
||||
|
||||
/// Set the next length bits to not null (i.e. valid).
|
||||
Status SetNotNull(int64_t length);
|
||||
|
||||
// Unsafe operations (don't check capacity/don't resize)
|
||||
|
||||
void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
|
||||
|
||||
// Append to null bitmap, update the length
|
||||
void UnsafeAppendToBitmap(bool is_valid) {
|
||||
null_bitmap_builder_.UnsafeAppend(is_valid);
|
||||
++length_;
|
||||
if (!is_valid) ++null_count_;
|
||||
}
|
||||
|
||||
// Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
|
||||
// assume all of length bits are valid.
|
||||
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
|
||||
if (valid_bytes == NULLPTR) {
|
||||
return UnsafeSetNotNull(length);
|
||||
}
|
||||
null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
|
||||
length_ += length;
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
// Vector append. Copy from a given bitmap. If bitmap is null assume
|
||||
// all of length bits are valid.
|
||||
void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
|
||||
if (bitmap == NULLPTR) {
|
||||
return UnsafeSetNotNull(length);
|
||||
}
|
||||
null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
|
||||
length_ += length;
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
// Append the same validity value a given number of times.
|
||||
void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
|
||||
if (value) {
|
||||
UnsafeSetNotNull(num_bits);
|
||||
} else {
|
||||
UnsafeSetNull(num_bits);
|
||||
}
|
||||
}
|
||||
|
||||
void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
|
||||
|
||||
// Set the next validity bits to not null (i.e. valid).
|
||||
void UnsafeSetNotNull(int64_t length);
|
||||
|
||||
// Set the next validity bits to null (i.e. invalid).
|
||||
void UnsafeSetNull(int64_t length);
|
||||
|
||||
static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
|
||||
|
||||
/// \brief Finish to an array of the specified ArrayType
|
||||
template <typename ArrayType>
|
||||
Status FinishTyped(std::shared_ptr<ArrayType>* out) {
|
||||
std::shared_ptr<Array> out_untyped;
|
||||
ARROW_RETURN_NOT_OK(Finish(&out_untyped));
|
||||
*out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Check the requested capacity for validity
|
||||
Status CheckCapacity(int64_t new_capacity) {
|
||||
if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
|
||||
return Status::Invalid(
|
||||
"Resize capacity must be positive (requested: ", new_capacity, ")");
|
||||
}
|
||||
|
||||
if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
|
||||
return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
|
||||
", current length: ", length_, ")");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Check for array type
|
||||
Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
|
||||
const Array& array, const char* message);
|
||||
Status CheckArrayType(Type::type expected_type, const Array& array,
|
||||
const char* message);
|
||||
|
||||
MemoryPool* pool_;
|
||||
|
||||
TypedBufferBuilder<bool> null_bitmap_builder_;
|
||||
int64_t null_count_ = 0;
|
||||
|
||||
// Array length, so far. Also, the index of the next element to be added
|
||||
int64_t length_ = 0;
|
||||
int64_t capacity_ = 0;
|
||||
|
||||
// Child value array builders. These are owned by this class
|
||||
std::vector<std::shared_ptr<ArrayBuilder>> children_;
|
||||
|
||||
private:
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
|
||||
};
|
||||
|
||||
/// \brief Construct an empty ArrayBuilder corresponding to the data
|
||||
/// type
|
||||
/// \param[in] pool the MemoryPool to use for allocations
|
||||
/// \param[in] type the data type to create the builder for
|
||||
/// \param[out] out the created ArrayBuilder
|
||||
ARROW_EXPORT
|
||||
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
// Result-returning convenience overload: forwards to the Status-returning
// MakeBuilder(pool, type, &out) and returns the created builder on success.
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &builder));
  return std::move(builder);
}
|
||||
|
||||
/// \brief Construct an empty ArrayBuilder corresponding to the data
|
||||
/// type, where any top-level or nested dictionary builders return the
|
||||
/// exact index type specified by the type.
|
||||
ARROW_EXPORT
|
||||
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
// Result-returning convenience overload: forwards to the Status-returning
// MakeBuilderExactIndex(pool, type, &out) and returns the created builder on
// success.
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &builder));
  return std::move(builder);
}
|
||||
|
||||
/// \brief Construct an empty DictionaryBuilder initialized optionally
|
||||
/// with a pre-existing dictionary
|
||||
/// \param[in] pool the MemoryPool to use for allocations
|
||||
/// \param[in] type the dictionary type to create the builder for
|
||||
/// \param[in] dictionary the initial dictionary, if any. May be nullptr
|
||||
/// \param[out] out the created ArrayBuilder
|
||||
ARROW_EXPORT
|
||||
Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
|
||||
const std::shared_ptr<Array>& dictionary,
|
||||
std::unique_ptr<ArrayBuilder>* out);
|
||||
|
||||
// Result-returning convenience overload: forwards to the Status-returning
// MakeDictionaryBuilder(pool, type, dictionary, &out) and returns the created
// builder on success.
inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
    MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &builder));
  return std::move(builder);
}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,703 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/string_view.h" // IWYU pragma: export
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup binary-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary and String
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseBinaryBuilder : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
// The same pool is handed to the ArrayBuilder base, the offsets builder and
// the value data builder.
explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool())
    : ArrayBuilder(pool),
      offsets_builder_(pool),
      value_data_builder_(pool) {}
|
||||
|
||||
BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
|
||||
: BaseBinaryBuilder(pool) {}
|
||||
|
||||
// Append one value of `length` bytes: reserve a slot, record the next offset,
// copy the bytes (if any) and mark the slot valid.
Status Append(const uint8_t* value, offset_type length) {
  ARROW_RETURN_NOT_OK(Reserve(1));
  ARROW_RETURN_NOT_OK(AppendNextOffset());

  // Safety check for UBSAN.
  const bool has_bytes = length > 0;
  if (ARROW_PREDICT_TRUE(has_bytes)) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(length));
    ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
  }

  UnsafeAppendToBitmap(true);
  return Status::OK();
}
|
||||
|
||||
// char* convenience overload: reinterprets the pointer as bytes and forwards
// to the uint8_t* overload.
Status Append(const char* value, offset_type length) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(value);
  return Append(bytes, length);
}
|
||||
|
||||
// string_view convenience overload: forwards data() and size() (cast to
// offset_type) to the char* overload.
Status Append(util::string_view value) {
  const char* chars = value.data();
  const auto num_chars = static_cast<offset_type>(value.size());
  return Append(chars, num_chars);
}
|
||||
|
||||
/// Extend the last appended value by appending more data at the end
|
||||
///
|
||||
/// Unlike Append, this does not create a new offset.
|
||||
Status ExtendCurrent(const uint8_t* value, offset_type length) {
|
||||
// Safety check for UBSAN.
|
||||
if (ARROW_PREDICT_TRUE(length > 0)) {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(length));
|
||||
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// string_view convenience overload: forwards the view's bytes and size (cast
// to offset_type) to the uint8_t* overload.
Status ExtendCurrent(util::string_view value) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(value.data());
  const auto num_bytes = static_cast<offset_type>(value.size());
  return ExtendCurrent(bytes, num_bytes);
}
|
||||
|
||||
// Append `length` nulls. Every new slot gets the current data length as its
// offset, so no value bytes are associated with it.
Status AppendNulls(int64_t length) final {
  const auto null_offset = static_cast<offset_type>(value_data_builder_.length());
  ARROW_RETURN_NOT_OK(Reserve(length));
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(null_offset);
  }
  UnsafeAppendToBitmap(length, false);
  return Status::OK();
}
|
||||
|
||||
// Append a single null.
//
// Capacity is reserved before the offset is recorded, the same order Append()
// uses. If Reserve(1) fails, the builder is left without an extra offset that
// has no matching validity bit or length increment.
Status AppendNull() final {
  ARROW_RETURN_NOT_OK(Reserve(1));
  ARROW_RETURN_NOT_OK(AppendNextOffset());
  UnsafeAppendToBitmap(false);
  return Status::OK();
}
|
||||
|
||||
// Append a single valid value with no bytes.
//
// Capacity is reserved before the offset is recorded, the same order Append()
// uses. If Reserve(1) fails, the builder is left without an extra offset that
// has no matching validity bit or length increment.
Status AppendEmptyValue() final {
  ARROW_RETURN_NOT_OK(Reserve(1));
  ARROW_RETURN_NOT_OK(AppendNextOffset());
  UnsafeAppendToBitmap(true);
  return Status::OK();
}
|
||||
|
||||
// Append `length` valid values with no bytes. Every new slot gets the current
// data length as its offset.
Status AppendEmptyValues(int64_t length) final {
  const auto empty_offset = static_cast<offset_type>(value_data_builder_.length());
  ARROW_RETURN_NOT_OK(Reserve(length));
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(empty_offset);
  }
  UnsafeAppendToBitmap(length, true);
  return Status::OK();
}
|
||||
|
||||
/// \brief Append without checking capacity
|
||||
///
|
||||
/// Offsets and data should have been presized using Reserve() and
|
||||
/// ReserveData(), respectively.
|
||||
void UnsafeAppend(const uint8_t* value, offset_type length) {
|
||||
UnsafeAppendNextOffset();
|
||||
value_data_builder_.UnsafeAppend(value, length);
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const char* value, offset_type length) {
|
||||
UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
|
||||
}
|
||||
|
||||
// std::string convenience overload: forwards c_str() and size() (cast to
// offset_type) to the char* overload.
void UnsafeAppend(const std::string& value) {
  const char* chars = value.c_str();
  const auto num_chars = static_cast<offset_type>(value.size());
  UnsafeAppend(chars, num_chars);
}
|
||||
|
||||
// string_view convenience overload: forwards data() and size() (cast to
// offset_type) to the char* overload.
void UnsafeAppend(util::string_view value) {
  const char* chars = value.data();
  const auto num_chars = static_cast<offset_type>(value.size());
  UnsafeAppend(chars, num_chars);
}
|
||||
|
||||
/// Like ExtendCurrent, but do not check capacity
|
||||
void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
|
||||
value_data_builder_.UnsafeAppend(value, length);
|
||||
}
|
||||
|
||||
// string_view convenience overload: forwards the view's bytes and size (cast
// to offset_type) to the uint8_t* overload.
void UnsafeExtendCurrent(util::string_view value) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(value.data());
  const auto num_bytes = static_cast<offset_type>(value.size());
  UnsafeExtendCurrent(bytes, num_bytes);
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
UnsafeAppendToBitmap(false);
|
||||
}
|
||||
|
||||
void UnsafeAppendEmptyValue() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of strings in one shot.
|
||||
///
|
||||
/// \param[in] values a vector of strings
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<std::string>& values,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
std::size_t total_length = std::accumulate(
|
||||
values.begin(), values.end(), 0ULL,
|
||||
[](uint64_t sum, const std::string& str) { return sum + str.size(); });
|
||||
ARROW_RETURN_NOT_OK(Reserve(values.size()));
|
||||
ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));
|
||||
|
||||
if (valid_bytes != NULLPTR) {
|
||||
for (std::size_t i = 0; i < values.size(); ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
if (valid_bytes[i]) {
|
||||
value_data_builder_.UnsafeAppend(
|
||||
reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (std::size_t i = 0; i < values.size(); ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
value_data_builder_.UnsafeAppend(
|
||||
reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
|
||||
}
|
||||
}
|
||||
|
||||
UnsafeAppendToBitmap(valid_bytes, values.size());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of nul-terminated strings in one shot.
|
||||
/// If one of the values is NULL, it is processed as a null
|
||||
/// value even if the corresponding valid_bytes entry is 1.
|
||||
///
|
||||
/// \param[in] values a contiguous C array of nul-terminated char *
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const char** values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
std::size_t total_length = 0;
|
||||
std::vector<std::size_t> value_lengths(length);
|
||||
bool have_null_value = false;
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
if (values[i] != NULLPTR) {
|
||||
auto value_length = strlen(values[i]);
|
||||
value_lengths[i] = value_length;
|
||||
total_length += value_length;
|
||||
} else {
|
||||
have_null_value = true;
|
||||
}
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
ARROW_RETURN_NOT_OK(ReserveData(total_length));
|
||||
|
||||
if (valid_bytes) {
|
||||
int64_t valid_bytes_offset = 0;
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
if (valid_bytes[i]) {
|
||||
if (values[i]) {
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
} else {
|
||||
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
|
||||
i - valid_bytes_offset);
|
||||
UnsafeAppendToBitmap(false);
|
||||
valid_bytes_offset = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
|
||||
} else {
|
||||
if (have_null_value) {
|
||||
std::vector<uint8_t> valid_vector(length, 0);
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
if (values[i]) {
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
valid_vector[i] = 1;
|
||||
}
|
||||
}
|
||||
UnsafeAppendToBitmap(valid_vector.data(), length);
|
||||
} else {
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
}
|
||||
UnsafeAppendToBitmap(NULLPTR, length);
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append `length` slots of `array`, starting at slot `offset`
///
/// Slots are copied one at a time through Append() / AppendNull(), so no
/// capacity needs to be reserved by the caller.
Status AppendArraySlice(const ArrayData& array, int64_t offset,
                        int64_t length) override {
  // The validity bitmap and the data are fetched with an explicit offset of 0;
  // the validity bit is therefore looked up at array.offset + offset + i.
  const auto validity = array.GetValues<uint8_t>(0, 0);
  const auto offsets = array.GetValues<offset_type>(1);
  const auto data = array.GetValues<uint8_t>(2, 0);
  for (int64_t i = 0; i < length; i++) {
    const int64_t pos = offset + i;
    const bool is_null = validity && !bit_util::GetBit(validity, array.offset + pos);
    if (is_null) {
      ARROW_RETURN_NOT_OK(AppendNull());
      continue;
    }
    const offset_type start = offsets[pos];
    const offset_type num_bytes = offsets[pos + 1] - start;
    ARROW_RETURN_NOT_OK(Append(data + start, num_bytes));
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Reset the base builder state together with the offsets and value
/// data builders
void Reset() override {
  ArrayBuilder::Reset();
  value_data_builder_.Reset();
  offsets_builder_.Reset();
}
|
||||
|
||||
/// \brief Check that appending `new_bytes` bytes of value data stays within
/// memory_limit()
///
/// \return Status::OK() if it fits, a CapacityError otherwise
Status ValidateOverflow(int64_t new_bytes) {
  const auto new_size = value_data_builder_.length() + new_bytes;
  if (ARROW_PREDICT_TRUE(new_size <= memory_limit())) {
    return Status::OK();
  }
  return Status::CapacityError("array cannot contain more than ", memory_limit(),
                               " bytes, have ", new_size);
}
|
||||
|
||||
/// \brief Resize the builder to hold `capacity` slots
///
/// The offsets buffer is sized for one entry more than the requested capacity.
Status Resize(int64_t capacity) override {
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
  const int64_t num_offsets = capacity + 1;
  ARROW_RETURN_NOT_OK(offsets_builder_.Resize(num_offsets));
  return ArrayBuilder::Resize(capacity);
}
|
||||
|
||||
/// \brief Ensures there is enough allocated capacity to append the indicated
|
||||
/// number of bytes to the value data buffer without additional allocations
|
||||
Status ReserveData(int64_t elements) {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
|
||||
return value_data_builder_.Reserve(elements);
|
||||
}
|
||||
|
||||
/// \brief Finish the accumulated buffers into an ArrayData and reset the builder
///
/// \param[out] out receives the ArrayData built from the validity bitmap,
/// offsets and value data buffers (in that order)
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  // The closing offset (total length of the value data) is still missing.
  ARROW_RETURN_NOT_OK(AppendNextOffset());

  // These buffers' padding zeroed by BufferBuilder
  std::shared_ptr<Buffer> null_bitmap;
  std::shared_ptr<Buffer> offsets;
  std::shared_ptr<Buffer> value_data;
  ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
  ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
  ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

  *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
                         null_count_, /*offset=*/0);
  Reset();
  return Status::OK();
}
|
||||
|
||||
/// \return data pointer of the value date builder
|
||||
const uint8_t* value_data() const { return value_data_builder_.data(); }
|
||||
/// \return size of values buffer so far
|
||||
int64_t value_data_length() const { return value_data_builder_.length(); }
|
||||
/// \return capacity of values buffer
|
||||
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
|
||||
|
||||
/// \return data pointer of the value date builder
|
||||
const offset_type* offsets_data() const { return offsets_builder_.data(); }
|
||||
|
||||
/// Temporary access to a value.
|
||||
///
|
||||
/// This pointer becomes invalid on the next modifying operation.
|
||||
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
|
||||
const offset_type* offsets = offsets_builder_.data();
|
||||
const auto offset = offsets[i];
|
||||
if (i == (length_ - 1)) {
|
||||
*out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
|
||||
} else {
|
||||
*out_length = offsets[i + 1] - offset;
|
||||
}
|
||||
return value_data_builder_.data() + offset;
|
||||
}
|
||||
|
||||
/// \return the i-th entry of the offsets written so far
offset_type offset(int64_t i) const {
  const offset_type* offsets = offsets_data();
  return offsets[i];
}
|
||||
|
||||
/// Temporary access to a value.
|
||||
///
|
||||
/// This view becomes invalid on the next modifying operation.
|
||||
util::string_view GetView(int64_t i) const {
|
||||
offset_type value_length;
|
||||
const uint8_t* value_data = GetValue(i, &value_length);
|
||||
return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
|
||||
}
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t memory_limit() {
|
||||
return std::numeric_limits<offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
protected:
|
||||
TypedBufferBuilder<offset_type> offsets_builder_;
|
||||
TypedBufferBuilder<uint8_t> value_data_builder_;
|
||||
|
||||
/// Append the current value data length as the next offset (capacity-checked).
Status AppendNextOffset() {
  const auto next_offset = static_cast<offset_type>(value_data_builder_.length());
  return offsets_builder_.Append(next_offset);
}
|
||||
|
||||
void UnsafeAppendNextOffset() {
|
||||
const int64_t num_bytes = value_data_builder_.length();
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
|
||||
}
|
||||
};
|
||||
|
||||
/// \class BinaryBuilder
|
||||
/// \brief Builder class for variable-length binary data
|
||||
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
|
||||
public:
|
||||
using BaseBinaryBuilder::BaseBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return binary(); }
|
||||
};
|
||||
|
||||
/// \class StringBuilder
|
||||
/// \brief Builder class for UTF8 strings
|
||||
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
|
||||
public:
|
||||
using BinaryBuilder::BinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return utf8(); }
|
||||
};
|
||||
|
||||
/// \class LargeBinaryBuilder
|
||||
/// \brief Builder class for large variable-length binary data
|
||||
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
|
||||
public:
|
||||
using BaseBinaryBuilder::BaseBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return large_binary(); }
|
||||
};
|
||||
|
||||
/// \class LargeStringBuilder
|
||||
/// \brief Builder class for large UTF8 strings
|
||||
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
|
||||
public:
|
||||
using LargeBinaryBuilder::LargeBinaryBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return large_utf8(); }
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeBinaryBuilder
|
||||
|
||||
/// \brief Builder class for fixed-width binary values
///
/// Each UnsafeAppend() copies byte_width() bytes from the given pointer into
/// the data buffer; a null slot is filled with byte_width() zero bytes.
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool());

  /// \brief Append one value, reading byte_width() bytes from `value`
  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  /// \brief Append one value, reading byte_width() bytes from `value`
  Status Append(const char* value) {
    const auto* bytes = reinterpret_cast<const uint8_t*>(value);
    return Append(bytes);
  }

  /// \brief Append one value from a string_view
  ///
  /// The size of the view is only verified (via CheckValueSize) in builds
  /// where NDEBUG is not defined.
  Status Append(const util::string_view& view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  /// \brief Append one value from a std::string
  Status Append(const std::string& s) { return Append(util::string_view(s)); }

  /// \brief Append one value from the contents of a Buffer
  Status Append(const Buffer& s) { return Append(util::string_view(s)); }

  /// \brief Append one value from the contents of a shared Buffer
  Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }

  /// \brief Append one value from a fixed-size byte array
  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    const auto* chars = reinterpret_cast<const char*>(value.data());
    return Append(util::string_view(chars, value.size()));
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
                      int64_t bitmap_offset);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append `length` slots of `array`, starting at slot `offset`
  Status AppendArraySlice(const ArrayData& array, int64_t offset,
                          int64_t length) override {
    // Both buffers are fetched with an explicit offset of 0, so the slice
    // start has to be applied by hand: in bytes for the values, and as a bit
    // offset for the validity bitmap.
    const int64_t first_slot = array.offset + offset;
    const uint8_t* values = array.GetValues<uint8_t>(1, 0) + (first_slot * byte_width_);
    const uint8_t* validity = array.GetValues<uint8_t>(0, 0);
    return AppendValues(values, length, validity, first_slot);
  }

  /// \brief Append one valid value without checking capacity
  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    if (ARROW_PREDICT_FALSE(byte_width_ <= 0)) {
      // Nothing to copy for a non-positive width.
      return;
    }
    byte_builder_.UnsafeAppend(value, byte_width_);
  }

  /// \brief Append one valid value without checking capacity
  void UnsafeAppend(const char* value) {
    const auto* bytes = reinterpret_cast<const uint8_t*>(value);
    UnsafeAppend(bytes);
  }

  /// \brief Append one valid value without checking capacity
  ///
  /// The size of `value` is passed to CheckValueSize only when NDEBUG is not
  /// defined.
  void UnsafeAppend(util::string_view value) {
#ifndef NDEBUG
    CheckValueSize(static_cast<size_t>(value.size()));
#endif
    const auto* bytes = reinterpret_cast<const uint8_t*>(value.data());
    UnsafeAppend(bytes);
  }

  void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }

  void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }

  /// \brief Append a null slot (byte_width() zero bytes) without checking capacity
  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
  }

  /// \brief Check that appending `new_bytes` bytes stays within memory_limit()
  ///
  /// \return Status::OK() if it fits, a CapacityError otherwise
  Status ValidateOverflow(int64_t new_bytes) const {
    const auto new_size = byte_builder_.length() + new_bytes;
    if (ARROW_PREDICT_TRUE(new_size <= memory_limit())) {
      return Status::OK();
    }
    return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                 " bytes, have ", new_size);
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return byte_builder_.Reserve(elements);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and return the result as a FixedSizeBinaryArray
  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }

  /// \return size of values buffer so far
  int64_t value_data_length() const { return byte_builder_.length(); }

  /// \return the width in bytes of each value
  int32_t byte_width() const { return byte_width_; }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i) const;

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  util::string_view GetView(int64_t i) const;

  /// \return the largest data size accepted by ValidateOverflow()
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<int64_t>::max() - 1;
  }

  /// \return a fixed_size_binary type of the builder's byte width
  std::shared_ptr<DataType> type() const override {
    return fixed_size_binary(byte_width_);
  }

 protected:
  int32_t byte_width_;
  BufferBuilder byte_builder_;

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  uint8_t* GetMutableValue(int64_t i) {
    return byte_builder_.mutable_data() + i * byte_width_;
  }

  void CheckValueSize(int64_t size);
};
|
||||
|
||||
/// @}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Chunked builders: build a sequence of BinaryArray or StringArray that are
|
||||
// limited to a particular size (to the upper limit of 2GB)
|
||||
|
||||
namespace internal {
|
||||
|
||||
class ARROW_EXPORT ChunkedBinaryBuilder {
|
||||
public:
|
||||
explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
virtual ~ChunkedBinaryBuilder() = default;
|
||||
|
||||
Status Append(const uint8_t* value, int32_t length) {
|
||||
if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
|
||||
max_chunk_value_length_)) {
|
||||
if (builder_->value_data_length() == 0) {
|
||||
// The current item is larger than max_chunk_size_;
|
||||
// this chunk will be oversize and hold *only* this item
|
||||
ARROW_RETURN_NOT_OK(builder_->Append(value, length));
|
||||
return NextChunk();
|
||||
}
|
||||
// The current item would cause builder_->value_data_length() to exceed
|
||||
// max_chunk_size_, so finish this chunk and append the current item to the next
|
||||
// chunk
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
return Append(value, length);
|
||||
}
|
||||
|
||||
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
|
||||
// The current item would cause builder_->length() to exceed max_chunk_length_, so
|
||||
// finish this chunk and append the current item to the next chunk
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
}
|
||||
|
||||
return builder_->Append(value, length);
|
||||
}
|
||||
|
||||
Status Append(const util::string_view& value) {
|
||||
return Append(reinterpret_cast<const uint8_t*>(value.data()),
|
||||
static_cast<int32_t>(value.size()));
|
||||
}
|
||||
|
||||
Status AppendNull() {
|
||||
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
|
||||
ARROW_RETURN_NOT_OK(NextChunk());
|
||||
}
|
||||
return builder_->AppendNull();
|
||||
}
|
||||
|
||||
Status Reserve(int64_t values);
|
||||
|
||||
virtual Status Finish(ArrayVector* out);
|
||||
|
||||
protected:
|
||||
Status NextChunk();
|
||||
|
||||
// maximum total character data size per chunk
|
||||
int64_t max_chunk_value_length_;
|
||||
|
||||
// maximum elements allowed per chunk
|
||||
int64_t max_chunk_length_ = kListMaximumElements;
|
||||
|
||||
// when Reserve() would cause builder_ to exceed its max_chunk_length_,
|
||||
// add to extra_capacity_ instead and wait to reserve until the next chunk
|
||||
int64_t extra_capacity_ = 0;
|
||||
|
||||
std::unique_ptr<BinaryBuilder> builder_;
|
||||
std::vector<std::shared_ptr<Array>> chunks_;
|
||||
};
|
||||
|
||||
/// \brief ChunkedBinaryBuilder variant with its own Finish() implementation
///
/// Constructors are inherited unchanged from ChunkedBinaryBuilder.
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
 public:
  using ChunkedBinaryBuilder::ChunkedBinaryBuilder;

  Status Finish(ArrayVector* out) override;
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,100 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_decimal.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/builder_binary.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Builder for Decimal128 values, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal128Type;
  using ValueType = Decimal128;

  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool());

  // Keep the base-class overloads visible next to the Decimal128 ones below.
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  Status Append(Decimal128 val);
  void UnsafeAppend(Decimal128 val);
  void UnsafeAppend(util::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and return the result as a Decimal128Array
  Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }

  /// \return the decimal type held by this builder
  std::shared_ptr<DataType> type() const override { return decimal_type_; }

 protected:
  std::shared_ptr<Decimal128Type> decimal_type_;
};
|
||||
|
||||
/// \brief Builder for Decimal256 values, layered on FixedSizeBinaryBuilder
class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal256Type;
  using ValueType = Decimal256;

  explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool());

  // Keep the base-class overloads visible next to the Decimal256 ones below.
  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  Status Append(const Decimal256& val);
  void UnsafeAppend(const Decimal256& val);
  void UnsafeAppend(util::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and return the result as a Decimal256Array
  Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }

  /// \return the decimal type held by this builder
  std::shared_ptr<DataType> type() const override { return decimal_type_; }

 protected:
  std::shared_ptr<Decimal256Type> decimal_type_;
};
|
||||
|
||||
using DecimalBuilder = Decimal128Builder;
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,722 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
|
||||
#include "arrow/array/builder_base.h" // IWYU pragma: export
|
||||
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/array/util.h"
|
||||
#include "arrow/scalar.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/bit_block_counter.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/decimal.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Dictionary builder
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Value and physical type used for dictionary entries of type T
///
/// Primary template: the value is T's c_type and the physical type is T itself.
template <typename T, typename Enable = void>
struct DictionaryValue {
  using PhysicalType = T;
  using type = typename T::c_type;
};
|
||||
|
||||
template <typename T>
|
||||
struct DictionaryValue<T, enable_if_base_binary<T>> {
|
||||
using type = util::string_view;
|
||||
using PhysicalType =
|
||||
typename std::conditional<std::is_same<typename T::offset_type, int32_t>::value,
|
||||
BinaryType, LargeBinaryType>::type;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
|
||||
using type = util::string_view;
|
||||
using PhysicalType = BinaryType;
|
||||
};
|
||||
|
||||
class ARROW_EXPORT DictionaryMemoTable {
|
||||
public:
|
||||
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
|
||||
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
|
||||
~DictionaryMemoTable();
|
||||
|
||||
Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);
|
||||
|
||||
/// \brief Insert new memo values
|
||||
Status InsertValues(const Array& values);
|
||||
|
||||
int32_t size() const;
|
||||
|
||||
template <typename T>
|
||||
Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
|
||||
// We want to keep the DictionaryMemoTable implementation private, also we can't
|
||||
// use extern template classes because of compiler issues (MinGW?). Instead,
|
||||
// we expose explicit function overrides for each supported physical type.
|
||||
const typename DictionaryValue<T>::PhysicalType* physical_type = NULLPTR;
|
||||
return GetOrInsert(physical_type, value, out);
|
||||
}
|
||||
|
||||
private:
|
||||
Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
|
||||
Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
|
||||
Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
|
||||
Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
|
||||
Status GetOrInsert(const MonthDayNanoIntervalType*,
|
||||
MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
|
||||
Status GetOrInsert(const DayTimeIntervalType*,
|
||||
DayTimeIntervalType::DayMilliseconds value, int32_t* out);
|
||||
Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
|
||||
Status GetOrInsert(const FloatType*, float value, int32_t* out);
|
||||
Status GetOrInsert(const DoubleType*, double value, int32_t* out);
|
||||
|
||||
Status GetOrInsert(const BinaryType*, util::string_view value, int32_t* out);
|
||||
Status GetOrInsert(const LargeBinaryType*, util::string_view value, int32_t* out);
|
||||
|
||||
class DictionaryMemoTableImpl;
|
||||
std::unique_ptr<DictionaryMemoTableImpl> impl_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \addtogroup dictionary-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Array builder for creating an encoded DictionaryArray from a
|
||||
/// dense array
|
||||
///
|
||||
/// Unlike other builders, dictionary builder does not completely
|
||||
/// reset the state on Finish calls.
|
||||
template <typename BuilderType, typename T>
|
||||
class DictionaryBuilderBase : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = DictionaryType;
|
||||
using Value = typename DictionaryValue<T>::type;
|
||||
|
||||
// WARNING: the type given below is the value type, not the DictionaryType.
|
||||
// The DictionaryType is instantiated on the Finish() call.
|
||||
template <typename B = BuilderType, typename T1 = T>
|
||||
DictionaryBuilderBase(uint8_t start_int_size,
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
|
||||
!is_fixed_size_binary_type<T1>::value,
|
||||
const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(start_int_size, pool),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(pool),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
const std::shared_ptr<DataType>& index_type,
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(index_type, pool),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename B = BuilderType, typename T1 = T>
|
||||
DictionaryBuilderBase(uint8_t start_int_size,
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
|
||||
is_fixed_size_binary_type<T1>::value,
|
||||
const std::shared_ptr<DataType>&>
|
||||
value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(start_int_size, pool),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(pool),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
const std::shared_ptr<DataType>& index_type,
|
||||
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
|
||||
delta_offset_(0),
|
||||
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
|
||||
indices_builder_(index_type, pool),
|
||||
value_type_(value_type) {}
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
|
||||
: DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
|
||||
|
||||
// This constructor doesn't check for errors. Use InsertMemoValues instead.
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool),
|
||||
memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
|
||||
delta_offset_(0),
|
||||
byte_width_(-1),
|
||||
indices_builder_(pool),
|
||||
value_type_(dictionary->type()) {}
|
||||
|
||||
~DictionaryBuilderBase() override = default;
|
||||
|
||||
/// \brief The current number of entries in the dictionary
|
||||
int64_t dictionary_length() const { return memo_table_->size(); }
|
||||
|
||||
/// \brief The value byte width (for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
|
||||
return byte_width_;
|
||||
}
|
||||
|
||||
/// \brief Append a scalar value
|
||||
Status Append(Value value) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
|
||||
int32_t memo_index;
|
||||
ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &memo_index));
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index));
|
||||
length_ += 1;
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a fixed-width string (only for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
|
||||
return Append(util::string_view(reinterpret_cast<const char*>(value), byte_width_));
|
||||
}
|
||||
|
||||
/// \brief Append a fixed-width string (only for FixedSizeBinaryType)
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
|
||||
return Append(util::string_view(value, byte_width_));
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for binary types)
|
||||
template <typename T1 = T>
|
||||
enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
|
||||
return Append(reinterpret_cast<const char*>(value), length);
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for binary types)
|
||||
template <typename T1 = T>
|
||||
enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
|
||||
return Append(util::string_view(value, length));
|
||||
}
|
||||
|
||||
/// \brief Append a string (only for string types)
|
||||
template <typename T1 = T>
|
||||
enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
|
||||
return Append(util::string_view(value, length));
|
||||
}
|
||||
|
||||
/// \brief Append a decimal (only for Decimal128Type)
|
||||
template <typename T1 = T>
|
||||
enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
|
||||
uint8_t data[16];
|
||||
value.ToBytes(data);
|
||||
return Append(data, 16);
|
||||
}
|
||||
|
||||
/// \brief Append a decimal (only for Decimal256Type)
|
||||
template <typename T1 = T>
|
||||
enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
|
||||
uint8_t data[32];
|
||||
value.ToBytes(data);
|
||||
return Append(data, 32);
|
||||
}
|
||||
|
||||
/// \brief Append a scalar null value
|
||||
Status AppendNull() final {
|
||||
length_ += 1;
|
||||
null_count_ += 1;
|
||||
|
||||
return indices_builder_.AppendNull();
|
||||
}
|
||||
|
||||
/// \brief Append `length` null values
///
/// The nulls are appended to the indices builder first; length_ and
/// null_count_ are only increased once that append succeeded, so a failed
/// append does not leave this builder's counters ahead of its indices.
Status AppendNulls(int64_t length) final {
  ARROW_RETURN_NOT_OK(indices_builder_.AppendNulls(length));
  length_ += length;
  null_count_ += length;
  return Status::OK();
}
|
||||
|
||||
/// \brief Append one empty (non-null) slot
///
/// Delegates to the indices builder; length_ is only bumped once that append
/// succeeded, so a failed append does not leave length_ ahead of the indices.
Status AppendEmptyValue() final {
  ARROW_RETURN_NOT_OK(indices_builder_.AppendEmptyValue());
  length_ += 1;
  return Status::OK();
}
|
||||
|
||||
/// \brief Append `length` empty (non-null) slots
///
/// Delegates to the indices builder; length_ is only increased once that
/// append succeeded, so a failed append does not leave length_ ahead of the
/// indices.
Status AppendEmptyValues(int64_t length) final {
  ARROW_RETURN_NOT_OK(indices_builder_.AppendEmptyValues(length));
  length_ += length;
  return Status::OK();
}
|
||||
|
||||
/// \brief Append a dictionary scalar `n_repeats` times
///
/// An invalid scalar is appended as `n_repeats` nulls. Otherwise the scalar
/// is cast to a DictionaryScalar and the append is dispatched on the id of
/// its index type. Index types other than the eight integer types listed
/// below produce a TypeError.
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
  if (!scalar.is_valid) {
    return AppendNulls(n_repeats);
  }

  using DictArrayType = typename TypeTraits<T>::ArrayType;
  const auto& scalar_type = internal::checked_cast<const DictionaryType&>(*scalar.type);
  const auto& typed_scalar = internal::checked_cast<const DictionaryScalar&>(scalar);
  const auto& dict_values =
      internal::checked_cast<const DictArrayType&>(*typed_scalar.value.dictionary);
  ARROW_RETURN_NOT_OK(Reserve(n_repeats));

  // Only dereferenced inside the matching case, exactly where it is needed.
  const auto& index_ptr = typed_scalar.value.index;
  switch (scalar_type.index_type()->id()) {
    case Type::UINT8:
      return AppendScalarImpl<UInt8Type>(dict_values, *index_ptr, n_repeats);
    case Type::INT8:
      return AppendScalarImpl<Int8Type>(dict_values, *index_ptr, n_repeats);
    case Type::UINT16:
      return AppendScalarImpl<UInt16Type>(dict_values, *index_ptr, n_repeats);
    case Type::INT16:
      return AppendScalarImpl<Int16Type>(dict_values, *index_ptr, n_repeats);
    case Type::UINT32:
      return AppendScalarImpl<UInt32Type>(dict_values, *index_ptr, n_repeats);
    case Type::INT32:
      return AppendScalarImpl<Int32Type>(dict_values, *index_ptr, n_repeats);
    case Type::UINT64:
      return AppendScalarImpl<UInt64Type>(dict_values, *index_ptr, n_repeats);
    case Type::INT64:
      return AppendScalarImpl<Int64Type>(dict_values, *index_ptr, n_repeats);
    default:
      return Status::TypeError("Invalid index type: ", scalar_type);
  }
  return Status::OK();
}
|
||||
|
||||
Status AppendScalars(const ScalarVector& scalars) override {
|
||||
for (const auto& scalar : scalars) {
|
||||
ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a slice of dictionary-encoded array data
///
/// The slice's indices are visited and the values they refer to in the
/// dictionary carried by `array` are appended one by one (see
/// AppendArraySliceImpl). The work is dispatched on the id of the index type;
/// index types other than the eight integer types listed below produce a
/// TypeError.
Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
  using DictArrayType = typename TypeTraits<T>::ArrayType;
  const auto& slice_type = internal::checked_cast<const DictionaryType&>(*array.type);
  const DictArrayType dict_values(array.dictionary);
  ARROW_RETURN_NOT_OK(Reserve(length));

  switch (slice_type.index_type()->id()) {
    case Type::UINT8:
      return AppendArraySliceImpl<uint8_t>(dict_values, array, offset, length);
    case Type::INT8:
      return AppendArraySliceImpl<int8_t>(dict_values, array, offset, length);
    case Type::UINT16:
      return AppendArraySliceImpl<uint16_t>(dict_values, array, offset, length);
    case Type::INT16:
      return AppendArraySliceImpl<int16_t>(dict_values, array, offset, length);
    case Type::UINT32:
      return AppendArraySliceImpl<uint32_t>(dict_values, array, offset, length);
    case Type::INT32:
      return AppendArraySliceImpl<int32_t>(dict_values, array, offset, length);
    case Type::UINT64:
      return AppendArraySliceImpl<uint64_t>(dict_values, array, offset, length);
    case Type::INT64:
      return AppendArraySliceImpl<int64_t>(dict_values, array, offset, length);
    default:
      return Status::TypeError("Invalid index type: ", slice_type);
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Insert values into the dictionary's memo, but do not append any
|
||||
/// indices. Can be used to initialize a new builder with known dictionary
|
||||
/// values
|
||||
/// \param[in] values dictionary values to add to memo. Type must match
|
||||
/// builder type
|
||||
Status InsertMemoValues(const Array& values) {
|
||||
return memo_table_->InsertValues(values);
|
||||
}
|
||||
|
||||
/// \brief Append a whole dense array to the builder
|
||||
template <typename T1 = T>
|
||||
enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
|
||||
const Array& array) {
|
||||
using ArrayType = typename TypeTraits<T>::ArrayType;
|
||||
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
value_type_, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
|
||||
const auto& concrete_array = static_cast<const ArrayType&>(array);
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
if (array.IsNull(i)) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T1 = T>
|
||||
enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
value_type_, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
|
||||
const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
if (array.IsNull(i)) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
// Perform a partial reset. Call ResetFull to also reset the accumulated
|
||||
// dictionary values
|
||||
ArrayBuilder::Reset();
|
||||
indices_builder_.Reset();
|
||||
}
|
||||
|
||||
/// \brief Reset and also clear accumulated dictionary values in memo table
|
||||
void ResetFull() {
|
||||
Reset();
|
||||
memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_));
|
||||
}
|
||||
|
||||
/// \brief Resize the indices builder to hold at least `capacity` slots
///
/// The requested capacity is validated with CheckCapacity(), raised to
/// kMinBuilderCapacity if smaller, and forwarded to the indices builder,
/// whose resulting capacity becomes this builder's capacity.
Status Resize(int64_t capacity) override {
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

  const int64_t target_capacity =
      capacity < kMinBuilderCapacity ? kMinBuilderCapacity : capacity;
  ARROW_RETURN_NOT_OK(indices_builder_.Resize(target_capacity));

  capacity_ = indices_builder_.capacity();
  return Status::OK();
}
|
||||
|
||||
/// \brief Return dictionary indices and a delta dictionary since the last
|
||||
/// time that Finish or FinishDelta were called, and reset state of builder
|
||||
/// (except the memo table)
|
||||
Status FinishDelta(std::shared_ptr<Array>* out_indices,
|
||||
std::shared_ptr<Array>* out_delta) {
|
||||
std::shared_ptr<ArrayData> indices_data;
|
||||
std::shared_ptr<ArrayData> delta_data;
|
||||
ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data));
|
||||
*out_indices = MakeArray(indices_data);
|
||||
*out_delta = MakeArray(delta_data);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return ::arrow::dictionary(indices_builder_.type(), value_type_);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename c_type>
|
||||
Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
|
||||
const ArrayData& array, int64_t offset, int64_t length) {
|
||||
const c_type* values = array.GetValues<c_type>(1) + offset;
|
||||
return VisitBitBlocks(
|
||||
array.buffers[0], array.offset + offset, length,
|
||||
[&](const int64_t position) {
|
||||
const int64_t index = static_cast<int64_t>(values[position]);
|
||||
if (dict.IsValid(index)) {
|
||||
return Append(dict.GetView(index));
|
||||
}
|
||||
return AppendNull();
|
||||
},
|
||||
[&]() { return AppendNull(); });
|
||||
}
|
||||
|
||||
template <typename IndexType>
|
||||
Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
|
||||
const Scalar& index_scalar, int64_t n_repeats) {
|
||||
using ScalarType = typename TypeTraits<IndexType>::ScalarType;
|
||||
const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
|
||||
if (index_scalar.is_valid && dict.IsValid(index)) {
|
||||
const auto& value = dict.GetView(index);
|
||||
for (int64_t i = 0; i < n_repeats; i++) {
|
||||
ARROW_RETURN_NOT_OK(Append(value));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
return AppendNulls(n_repeats);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
std::shared_ptr<ArrayData> dictionary;
|
||||
ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary));
|
||||
|
||||
// Set type of array data to the right dictionary type
|
||||
(*out)->type = type();
|
||||
(*out)->dictionary = dictionary;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Finish the indices and emit dictionary data from the memo table
///
/// \param[in] dict_offset offset forwarded to the memo table's GetArrayData();
/// presumably the first memo entry to include (FinishInternal passes 0,
/// FinishDelta passes delta_offset_) -- confirm against DictionaryMemoTable
/// \param[out] out_indices receives the finished indices array data
/// \param[out] out_dictionary receives the dictionary array data
///
/// On success delta_offset_ is set to the current memo size and
/// ArrayBuilder::Reset() is called; the memo table itself is left untouched.
Status FinishWithDictOffset(int64_t dict_offset,
|
||||
std::shared_ptr<ArrayData>* out_indices,
|
||||
std::shared_ptr<ArrayData>* out_dictionary) {
|
||||
// Finalize indices array
|
||||
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));
|
||||
|
||||
// Generate dictionary array from hash table contents
|
||||
ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
|
||||
// Remember the memo size so that a later FinishDelta() passes it as dict_offset
delta_offset_ = memo_table_->size();
|
||||
|
||||
// Update internals for further uses of this DictionaryBuilder
|
||||
ArrayBuilder::Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::unique_ptr<DictionaryMemoTable> memo_table_;
|
||||
|
||||
// The size of the dictionary memo at last invocation of Finish, to use in
|
||||
// FinishDelta for computing dictionary deltas
|
||||
int32_t delta_offset_;
|
||||
|
||||
// Only used for FixedSizeBinaryType
|
||||
int32_t byte_width_;
|
||||
|
||||
BuilderType indices_builder_;
|
||||
std::shared_ptr<DataType> value_type_;
|
||||
};
|
||||
|
||||
template <typename BuilderType>
|
||||
class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
|
||||
public:
|
||||
template <typename B = BuilderType>
|
||||
DictionaryBuilderBase(
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
|
||||
start_int_size,
|
||||
const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
|
||||
const std::shared_ptr<DataType>& value_type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(index_type, pool) {}
|
||||
|
||||
template <typename B = BuilderType>
|
||||
explicit DictionaryBuilderBase(
|
||||
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
|
||||
start_int_size,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), indices_builder_(pool) {}
|
||||
|
||||
/// \brief Append a scalar null value
|
||||
Status AppendNull() final {
|
||||
length_ += 1;
|
||||
null_count_ += 1;
|
||||
|
||||
return indices_builder_.AppendNull();
|
||||
}
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
length_ += length;
|
||||
null_count_ += length;
|
||||
|
||||
return indices_builder_.AppendNulls(length);
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
length_ += 1;
|
||||
|
||||
return indices_builder_.AppendEmptyValue();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
length_ += length;
|
||||
|
||||
return indices_builder_.AppendEmptyValues(length);
|
||||
}
|
||||
|
||||
/// \brief Append a whole dense array to the builder
|
||||
Status AppendArray(const Array& array) {
|
||||
#ifndef NDEBUG
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
|
||||
Type::NA, array, "Wrong value type of array to be appended"));
|
||||
#endif
|
||||
for (int64_t i = 0; i < array.length(); i++) {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Resize the indices builder to hold at least `capacity` slots
///
/// The requested capacity is validated with CheckCapacity(), raised to
/// kMinBuilderCapacity if smaller, and forwarded to the indices builder,
/// whose resulting capacity becomes this builder's capacity.
Status Resize(int64_t capacity) override {
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

  const int64_t target_capacity =
      capacity < kMinBuilderCapacity ? kMinBuilderCapacity : capacity;
  ARROW_RETURN_NOT_OK(indices_builder_.Resize(target_capacity));

  capacity_ = indices_builder_.capacity();
  return Status::OK();
}
|
||||
|
||||
/// \brief Finish the indices and wrap them as dictionary data with null values
///
/// The indices builder is finished into `out`, whose type is then replaced
/// by a dictionary type over the null type and whose dictionary is set to
/// the data of an empty NullArray.
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
  (*out)->type = dictionary((*out)->type, null());
  (*out)->dictionary = NullArray(0).data();

  // Reset this builder's own base-class state for further use, as the generic
  // DictionaryBuilderBase does after finishing. Without this the counters
  // updated by AppendNull()/AppendNulls() would carry over to the next array.
  ArrayBuilder::Reset();
  return Status::OK();
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return ::arrow::dictionary(indices_builder_.type(), null());
|
||||
}
|
||||
|
||||
protected:
|
||||
BuilderType indices_builder_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
|
||||
/// smallest index size that can accommodate the dictionary indices
|
||||
template <typename T>
|
||||
class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
|
||||
public:
|
||||
using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
|
||||
using BASE::BASE;
|
||||
|
||||
/// \brief Append dictionary indices directly without modifying memo
|
||||
///
|
||||
/// NOTE: Experimental API
|
||||
Status AppendIndices(const int64_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
int64_t null_count_before = this->indices_builder_.null_count();
|
||||
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
|
||||
this->capacity_ = this->indices_builder_.capacity();
|
||||
this->length_ += length;
|
||||
this->null_count_ += this->indices_builder_.null_count() - null_count_before;
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief A DictionaryArray builder that always returns int32 dictionary
|
||||
/// indices so that data cast to dictionary form will have a consistent index
|
||||
/// type, e.g. for creating a ChunkedArray
|
||||
template <typename T>
|
||||
class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
|
||||
public:
|
||||
using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
|
||||
using BASE::BASE;
|
||||
|
||||
/// \brief Append dictionary indices directly without modifying memo
|
||||
///
|
||||
/// NOTE: Experimental API
|
||||
Status AppendIndices(const int32_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
int64_t null_count_before = this->indices_builder_.null_count();
|
||||
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
|
||||
this->capacity_ = this->indices_builder_.capacity();
|
||||
this->length_ += length;
|
||||
this->null_count_ += this->indices_builder_.null_count() - null_count_before;
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Binary / Unicode builders
|
||||
// (compatibility aliases; those used to be derived classes with additional
|
||||
// Append() overloads, but they have been folded into DictionaryBuilderBase)
|
||||
|
||||
using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
|
||||
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
|
||||
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
|
||||
using StringDictionary32Builder = Dictionary32Builder<StringType>;
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,561 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// List builder
|
||||
|
||||
template <typename TYPE>
|
||||
class BaseListBuilder : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = TYPE;
|
||||
using offset_type = typename TypeClass::offset_type;
|
||||
|
||||
/// Use this constructor to incrementally build the value array along with offsets and
|
||||
/// null bitmap.
|
||||
BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
|
||||
const std::shared_ptr<DataType>& type)
|
||||
: ArrayBuilder(pool),
|
||||
offsets_builder_(pool),
|
||||
value_builder_(value_builder),
|
||||
value_field_(type->field(0)->WithType(NULLPTR)) {}
|
||||
|
||||
BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder)
|
||||
: BaseListBuilder(pool, value_builder, list(value_builder->type())) {}
|
||||
|
||||
/// \brief Resize the builder to hold `capacity` list slots
///
/// Fails with a CapacityError when `capacity` exceeds maximum_elements().
/// The offsets builder is resized to `capacity + 1` entries before the base
/// class resize is performed.
Status Resize(int64_t capacity) override {
  const int64_t max_elements = maximum_elements();
  if (capacity > max_elements) {
    return Status::CapacityError("List array cannot reserve space for more than ",
                                 max_elements, " got ", capacity);
  }
  ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

  // The offsets buffer needs one more entry than there are list slots.
  const int64_t offsets_capacity = capacity + 1;
  ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity));
  return ArrayBuilder::Resize(capacity);
}
|
||||
|
||||
void Reset() override {
|
||||
ArrayBuilder::Reset();
|
||||
offsets_builder_.Reset();
|
||||
value_builder_->Reset();
|
||||
}
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes is of equal length to values, and any zero byte
|
||||
/// will be considered as a null for that slot
|
||||
Status AppendValues(const offset_type* offsets, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(valid_bytes, length);
|
||||
offsets_builder_.UnsafeAppend(offsets, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Start a new variable-length list slot
|
||||
///
|
||||
/// This function should be called before beginning to append elements to the
|
||||
/// value builder
|
||||
Status Append(bool is_valid = true) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendToBitmap(is_valid);
|
||||
return AppendNextOffset();
|
||||
}
|
||||
|
||||
Status AppendNull() final { return Append(false); }
|
||||
|
||||
/// \brief Append `length` null list slots
///
/// Each new slot gets the value builder's current length as its offset, and
/// its validity bit is cleared.
Status AppendNulls(int64_t length) final {
  ARROW_RETURN_NOT_OK(Reserve(length));
  ARROW_RETURN_NOT_OK(ValidateOverflow(0));
  UnsafeAppendToBitmap(length, false);

  // All new slots share the same offset: the current end of the values.
  const auto shared_offset = static_cast<offset_type>(value_builder_->length());
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(shared_offset);
  }
  return Status::OK();
}
|
||||
|
||||
Status AppendEmptyValue() final { return Append(true); }
|
||||
|
||||
/// \brief Append `length` valid list slots that add no values
///
/// Each new slot gets the value builder's current length as its offset, and
/// its validity bit is set.
Status AppendEmptyValues(int64_t length) final {
  ARROW_RETURN_NOT_OK(Reserve(length));
  ARROW_RETURN_NOT_OK(ValidateOverflow(0));
  UnsafeAppendToBitmap(length, true);

  // All new slots share the same offset: the current end of the values.
  const auto shared_offset = static_cast<offset_type>(value_builder_->length());
  for (int64_t remaining = length; remaining > 0; --remaining) {
    offsets_builder_.UnsafeAppend(shared_offset);
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Append rows [offset, offset + length) of list array data
///
/// For every valid row a new list slot is started and the row's range of
/// child values (taken from the offsets in buffer 1) is appended to the
/// value builder; rows whose validity bit is unset are appended as nulls.
Status AppendArraySlice(const ArrayData& array, int64_t offset,
                        int64_t length) override {
  const offset_type* offsets = array.GetValues<offset_type>(1);
  const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;

  const int64_t end_row = offset + length;
  for (int64_t row = offset; row < end_row; ++row) {
    const bool row_is_null =
        validity != NULLPTR && !bit_util::GetBit(validity, array.offset + row);
    if (row_is_null) {
      ARROW_RETURN_NOT_OK(AppendNull());
      continue;
    }

    ARROW_RETURN_NOT_OK(Append());
    int64_t slot_length = offsets[row + 1] - offsets[row];
    ARROW_RETURN_NOT_OK(
        value_builder_->AppendArraySlice(*array.child_data[0], offsets[row], slot_length));
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Finish the list array data and reset the builder
///
/// A final offset is appended, the offsets and validity buffers are
/// finished, the value builder is finished into the single child, and the
/// result is assembled into `out` before Reset() is called.
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  ARROW_RETURN_NOT_OK(AppendNextOffset());

  // Offset padding zeroed by BufferBuilder
  std::shared_ptr<Buffer> offsets;
  ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
  std::shared_ptr<Buffer> null_bitmap;
  ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

  const bool no_values = value_builder_->length() == 0;
  if (no_values) {
    // Try to make sure we get a non-null values buffer (ARROW-2744)
    ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
  }

  std::shared_ptr<ArrayData> child_items;
  ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&child_items));

  *out = ArrayData::Make(type(), length_, {null_bitmap, offsets},
                         {std::move(child_items)}, null_count_);
  Reset();
  return Status::OK();
}
|
||||
|
||||
/// \brief Check that adding `new_elements` child values stays within limits
///
/// \param[in] new_elements number of values about to be added to the value
/// builder (0 to validate the current state)
/// \return a CapacityError if the value builder's length plus `new_elements`
/// exceeds maximum_elements(), OK otherwise
Status ValidateOverflow(int64_t new_elements) const {
  const int64_t new_length = value_builder_->length() + new_elements;
  if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
    // Report the total that overflowed, not just the increment (which is 0
    // when only the current state is being validated).
    return Status::CapacityError("List array cannot contain more than ",
                                 maximum_elements(), " elements, have ", new_length);
  }
  return Status::OK();
}
|
||||
|
||||
ArrayBuilder* value_builder() const { return value_builder_.get(); }
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t maximum_elements() {
|
||||
return std::numeric_limits<offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
|
||||
}
|
||||
|
||||
protected:
|
||||
TypedBufferBuilder<offset_type> offsets_builder_;
|
||||
std::shared_ptr<ArrayBuilder> value_builder_;
|
||||
std::shared_ptr<Field> value_field_;
|
||||
|
||||
Status AppendNextOffset() {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
|
||||
const int64_t num_values = value_builder_->length();
|
||||
return offsets_builder_.Append(static_cast<offset_type>(num_values));
|
||||
}
|
||||
};
|
||||
|
||||
/// \class ListBuilder
|
||||
/// \brief Builder class for variable-length list array value types
|
||||
///
|
||||
/// To use this class, you must append values to the child array builder and use
|
||||
/// the Append function to delimit each distinct list value (once the values
|
||||
/// have been appended to the child array) or use the bulk API to append
|
||||
/// a sequence of offsets and null values.
|
||||
///
|
||||
/// A note on types. Per arrow/type.h all types in the c++ implementation are
|
||||
/// logical so even though this class always builds list array, this can
|
||||
/// represent multiple different logical types. If no logical type is provided
|
||||
/// at construction time, the class defaults to List<T> where T is taken from the
|
||||
/// value_builder/values that the object is constructed with.
|
||||
class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
|
||||
public:
|
||||
using BaseListBuilder::BaseListBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
|
||||
};
|
||||
|
||||
/// \class LargeListBuilder
|
||||
/// \brief Builder class for large variable-length list array value types
|
||||
///
|
||||
/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
|
||||
class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
|
||||
public:
|
||||
using BaseListBuilder::BaseListBuilder;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Map builder
|
||||
|
||||
/// \class MapBuilder
|
||||
/// \brief Builder class for arrays of variable-size maps
|
||||
///
|
||||
/// To use this class, you must append values to the key and item array builders
|
||||
/// and use the Append function to delimit each distinct map (once the keys and items
|
||||
/// have been appended) or use the bulk API to append a sequence of offsets and null
|
||||
/// maps.
|
||||
///
|
||||
/// Key uniqueness and ordering are not validated.
|
||||
class ARROW_EXPORT MapBuilder : public ArrayBuilder {
|
||||
public:
|
||||
/// Use this constructor to define the built array's type explicitly. If key_builder
|
||||
/// or item_builder has indeterminate type, this builder will also.
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
|
||||
const std::shared_ptr<ArrayBuilder>& item_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
/// Use this constructor to infer the built array's type. If key_builder or
|
||||
/// item_builder has indeterminate type, this builder will also.
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
|
||||
const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);
|
||||
|
||||
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
Status Resize(int64_t capacity) override;
|
||||
void Reset() override;
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes is of equal length to values, and any zero byte
|
||||
/// will be considered as a null for that slot
|
||||
Status AppendValues(const int32_t* offsets, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Start a new variable-length map slot
|
||||
///
|
||||
/// This function should be called before beginning to append elements to the
|
||||
/// key and item builders
|
||||
Status Append();
|
||||
|
||||
Status AppendNull() final;
|
||||
|
||||
Status AppendNulls(int64_t length) final;
|
||||
|
||||
Status AppendEmptyValue() final;
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final;
|
||||
|
||||
/// \brief Append rows [offset, offset + length) of map array data
///
/// For every valid row a new map slot is started and the row's range of
/// entries (taken from the int32 offsets in buffer 1) is appended to the key
/// and item builders from the first and second child of the entries child;
/// rows whose validity bit is unset are appended as nulls.
Status AppendArraySlice(const ArrayData& array, int64_t offset,
                        int64_t length) override {
  const int32_t* offsets = array.GetValues<int32_t>(1);
  const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;

  const int64_t end_row = offset + length;
  for (int64_t row = offset; row < end_row; ++row) {
    const bool row_is_null =
        validity != NULLPTR && !bit_util::GetBit(validity, array.offset + row);
    if (row_is_null) {
      ARROW_RETURN_NOT_OK(AppendNull());
      continue;
    }

    ARROW_RETURN_NOT_OK(Append());
    const int64_t slot_length = offsets[row + 1] - offsets[row];
    const auto& entries = array.child_data[0];
    ARROW_RETURN_NOT_OK(
        key_builder_->AppendArraySlice(*entries->child_data[0], offsets[row], slot_length));
    ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(*entries->child_data[1],
                                                        offsets[row], slot_length));
  }
  return Status::OK();
}
|
||||
|
||||
/// \brief Get builder to append keys.
|
||||
///
|
||||
/// Append a key with this builder should be followed by appending
|
||||
/// an item or null value with item_builder().
|
||||
ArrayBuilder* key_builder() const { return key_builder_.get(); }
|
||||
|
||||
/// \brief Get builder to append items
|
||||
///
|
||||
/// Appending an item with this builder should have been preceded
|
||||
/// by appending a key with key_builder().
|
||||
ArrayBuilder* item_builder() const { return item_builder_.get(); }
|
||||
|
||||
/// \brief Get builder to add Map entries as struct values.
|
||||
///
|
||||
/// This is used instead of key_builder()/item_builder() and allows
|
||||
/// the Map to be built as a list of struct values.
|
||||
ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
// Key and Item builder may update types, but they don't contain the field names,
|
||||
// so we need to reconstruct the type. (See ARROW-13735.)
|
||||
return std::make_shared<MapType>(
|
||||
field(entries_name_,
|
||||
struct_({field(key_name_, key_builder_->type(), false),
|
||||
field(item_name_, item_builder_->type(), item_nullable_)}),
|
||||
false),
|
||||
keys_sorted_);
|
||||
}
|
||||
|
||||
/// \brief Check whether `new_elements` more entries can be appended.
///
/// The check is delegated to the underlying list builder.
Status ValidateOverflow(int64_t new_elements) {
  Status overflow_check = list_builder_->ValidateOverflow(new_elements);
  return overflow_check;
}
|
||||
|
||||
protected:
|
||||
inline Status AdjustStructBuilderLength();
|
||||
|
||||
protected:
|
||||
bool keys_sorted_ = false;
|
||||
bool item_nullable_ = false;
|
||||
std::string entries_name_;
|
||||
std::string key_name_;
|
||||
std::string item_name_;
|
||||
std::shared_ptr<ListBuilder> list_builder_;
|
||||
std::shared_ptr<ArrayBuilder> key_builder_;
|
||||
std::shared_ptr<ArrayBuilder> item_builder_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// FixedSizeList builder
|
||||
|
||||
/// \class FixedSizeListBuilder
|
||||
/// \brief Builder class for fixed-length list array value types
|
||||
class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
|
||||
public:
|
||||
/// Use this constructor to define the built array's type explicitly. If value_builder
|
||||
/// has indeterminate type, this builder will also.
|
||||
FixedSizeListBuilder(MemoryPool* pool,
|
||||
std::shared_ptr<ArrayBuilder> const& value_builder,
|
||||
int32_t list_size);
|
||||
|
||||
/// Use this constructor to infer the built array's type. If value_builder has
|
||||
/// indeterminate type, this builder will also.
|
||||
FixedSizeListBuilder(MemoryPool* pool,
|
||||
std::shared_ptr<ArrayBuilder> const& value_builder,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
Status Resize(int64_t capacity) override;
|
||||
void Reset() override;
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Append a valid fixed length list.
|
||||
///
|
||||
/// This function affects only the validity bitmap; the child values must be appended
|
||||
/// using the child array builder.
|
||||
Status Append();
|
||||
|
||||
/// \brief Vector append
|
||||
///
|
||||
/// If passed, valid_bytes wil be read and any zero byte
|
||||
/// will cause the corresponding slot to be null
|
||||
///
|
||||
/// This function affects only the validity bitmap; the child values must be appended
|
||||
/// using the child array builder. This includes appending nulls for null lists.
|
||||
/// XXX this restriction is confusing, should this method be omitted?
|
||||
Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Append a null fixed length list.
|
||||
///
|
||||
/// The child array builder will have the appropriate number of nulls appended
|
||||
/// automatically.
|
||||
Status AppendNull() final;
|
||||
|
||||
/// \brief Append length null fixed length lists.
|
||||
///
|
||||
/// The child array builder will have the appropriate number of nulls appended
|
||||
/// automatically.
|
||||
Status AppendNulls(int64_t length) final;
|
||||
|
||||
Status ValidateOverflow(int64_t new_elements);
|
||||
|
||||
Status AppendEmptyValue() final;
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final;
|
||||
|
||||
Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
|
||||
for (int64_t row = offset; row < offset + length; row++) {
|
||||
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
|
||||
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
|
||||
*array.child_data[0], list_size_ * (array.offset + row), list_size_));
|
||||
ARROW_RETURN_NOT_OK(Append());
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ArrayBuilder* value_builder() const { return value_builder_.get(); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
|
||||
}
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t maximum_elements() {
|
||||
return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
protected:
|
||||
std::shared_ptr<Field> value_field_;
|
||||
const int32_t list_size_;
|
||||
std::shared_ptr<ArrayBuilder> value_builder_;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Struct
|
||||
|
||||
// ---------------------------------------------------------------------------------
|
||||
// StructArray builder
|
||||
/// Append, Resize and Reserve methods are acting on StructBuilder.
|
||||
/// Please make sure all these methods of all child-builders' are consistently
|
||||
/// called to maintain data-structure consistency.
|
||||
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
|
||||
public:
|
||||
/// If any of field_builders has indeterminate type, this builder will also
|
||||
StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
|
||||
std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }
|
||||
|
||||
/// Null bitmap is of equal length to every child field, and any zero byte
|
||||
/// will be considered as a null for that field, but users must using app-
|
||||
/// end methods or advance methods of the child builders' independently to
|
||||
/// insert data.
|
||||
Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(valid_bytes, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Append an element to the Struct. All child-builders' Append method must
|
||||
/// be called independently to maintain data-structure consistency.
|
||||
Status Append(bool is_valid = true) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendToBitmap(is_valid);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a null value. Automatically appends an empty value to each child
|
||||
/// builder.
|
||||
Status AppendNull() final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
|
||||
}
|
||||
return Append(false);
|
||||
}
|
||||
|
||||
/// \brief Append multiple null values. Automatically appends empty values to each
|
||||
/// child builder.
|
||||
Status AppendNulls(int64_t length) final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
|
||||
}
|
||||
return Append(true);
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
for (const auto& field : children_) {
|
||||
ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArrayData& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) {
|
||||
ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i],
|
||||
array.offset + offset, length));
|
||||
}
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
UnsafeAppendToBitmap(validity, array.offset + offset, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() override;
|
||||
|
||||
ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
|
||||
|
||||
int num_fields() const { return static_cast<int>(children_.size()); }
|
||||
|
||||
std::shared_ptr<DataType> type() const override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<DataType> type_;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,539 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_traits.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Builder for arrays of the null type.
///
/// Only element counts are tracked: every appended element is a null.
class ARROW_EXPORT NullBuilder : public ArrayBuilder {
 public:
  explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {}

  /// The `type` argument is accepted but not used.
  explicit NullBuilder(const std::shared_ptr<DataType>& type,
                       MemoryPool* pool = default_memory_pool())
      : NullBuilder(pool) {}

  /// \brief Append `length` null elements; a negative length is rejected.
  Status AppendNulls(int64_t length) final {
    if (length >= 0) {
      length_ += length;
      null_count_ += length;
      return Status::OK();
    }
    return Status::Invalid("length must be positive");
  }

  /// \brief Append one null element
  Status AppendNull() final { return AppendNulls(1); }

  /// \brief Append `length` empty values, which for this builder are nulls
  Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }

  /// \brief Append one empty value, which for this builder is a null
  Status AppendEmptyValue() final { return AppendNulls(1); }

  /// \brief Append one null element given a literal nullptr
  Status Append(std::nullptr_t) { return AppendNull(); }

  /// \brief Append `length` nulls; the source array and offset are ignored.
  Status AppendArraySlice(const ArrayData&, int64_t, int64_t length) override {
    return AppendNulls(length);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  std::shared_ptr<DataType> type() const override { return null(); }

  Status Finish(std::shared_ptr<NullArray>* out) { return FinishTyped(out); }
};
|
||||
|
||||
/// \addtogroup numeric-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// Base class for all Builders that emit an Array of a scalar numerical type.
|
||||
template <typename T>
|
||||
class NumericBuilder : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = T;
|
||||
using value_type = typename T::c_type;
|
||||
using ArrayType = typename TypeTraits<T>::ArrayType;
|
||||
|
||||
template <typename T1 = T>
|
||||
explicit NumericBuilder(
|
||||
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
|
||||
: ArrayBuilder(pool), type_(TypeTraits<T>::type_singleton()), data_builder_(pool) {}
|
||||
|
||||
NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
|
||||
: ArrayBuilder(pool), type_(type), data_builder_(pool) {}
|
||||
|
||||
/// Append a single scalar and increase the size if necessary.
|
||||
Status Append(const value_type val) {
|
||||
ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
|
||||
UnsafeAppend(val);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
|
||||
/// The memory at the corresponding data slot is set to 0 to prevent
|
||||
/// uninitialized memory access
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, value_type{}); // zero
|
||||
UnsafeSetNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a single null element
|
||||
Status AppendNull() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
UnsafeAppendToBitmap(false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a empty element
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
UnsafeAppendToBitmap(true);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append several empty elements
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, value_type{}); // zero
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }
|
||||
|
||||
void Reset() override { data_builder_.Reset(); }
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
capacity = std::max(capacity, kMinBuilderCapacity);
|
||||
ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
|
||||
return ArrayBuilder::Resize(capacity);
|
||||
}
|
||||
|
||||
value_type operator[](int64_t index) const { return GetValue(index); }
|
||||
|
||||
value_type& operator[](int64_t index) {
|
||||
return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index];
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] bitmap a validity bitmap to copy (may be null)
|
||||
/// \param[in] bitmap_offset an offset into the validity bitmap
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap,
|
||||
int64_t bitmap_offset) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const value_type* values, int64_t length,
|
||||
const std::vector<bool>& is_valid) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values, length);
|
||||
// length_ is update by these
|
||||
ArrayBuilder::UnsafeAppendToBitmap(is_valid);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of values
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<value_type>& values,
|
||||
const std::vector<bool>& is_valid) {
|
||||
return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<value_type>& values) {
|
||||
return AppendValues(values.data(), static_cast<int64_t>(values.size()));
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
|
||||
null_bitmap_builder_.FinishWithLength(length_));
|
||||
ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
|
||||
*out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
|
||||
capacity_ = length_ = null_count_ = 0;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \return Status
|
||||
template <typename ValuesIter>
|
||||
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
// this updates the length_
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot, with a specified nullmap
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \param[in] valid_begin InputIterator with elements indication valid(1)
|
||||
/// or null(0) values.
|
||||
/// \return Status
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
static_assert(!internal::is_null_pointer<ValidIter>::value,
|
||||
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
|
||||
"version instead");
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Same as above, with a pointer type ValidIter
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(values_begin, values_end);
|
||||
// this updates the length_
|
||||
if (valid_begin == NULLPTR) {
|
||||
UnsafeSetNotNull(length);
|
||||
} else {
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArrayData& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
return AppendValues(array.GetValues<value_type>(1) + offset, length,
|
||||
array.GetValues<uint8_t>(0, 0), array.offset + offset);
|
||||
}
|
||||
|
||||
/// Append a single scalar under the assumption that the underlying Buffer is
|
||||
/// large enough.
|
||||
///
|
||||
/// This method does not capacity-check; make sure to call Reserve
|
||||
/// beforehand.
|
||||
void UnsafeAppend(const value_type val) {
|
||||
ArrayBuilder::UnsafeAppendToBitmap(true);
|
||||
data_builder_.UnsafeAppend(val);
|
||||
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
ArrayBuilder::UnsafeAppendToBitmap(false);
|
||||
data_builder_.UnsafeAppend(value_type{}); // zero
|
||||
}
|
||||
|
||||
std::shared_ptr<DataType> type() const override { return type_; }
|
||||
|
||||
protected:
|
||||
std::shared_ptr<DataType> type_;
|
||||
TypedBufferBuilder<value_type> data_builder_;
|
||||
};
|
||||
|
||||
// Builders
|
||||
|
||||
using UInt8Builder = NumericBuilder<UInt8Type>;
|
||||
using UInt16Builder = NumericBuilder<UInt16Type>;
|
||||
using UInt32Builder = NumericBuilder<UInt32Type>;
|
||||
using UInt64Builder = NumericBuilder<UInt64Type>;
|
||||
|
||||
using Int8Builder = NumericBuilder<Int8Type>;
|
||||
using Int16Builder = NumericBuilder<Int16Type>;
|
||||
using Int32Builder = NumericBuilder<Int32Type>;
|
||||
using Int64Builder = NumericBuilder<Int64Type>;
|
||||
|
||||
using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
|
||||
using FloatBuilder = NumericBuilder<FloatType>;
|
||||
using DoubleBuilder = NumericBuilder<DoubleType>;
|
||||
|
||||
/// @}
|
||||
|
||||
/// \addtogroup temporal-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
using Date32Builder = NumericBuilder<Date32Type>;
|
||||
using Date64Builder = NumericBuilder<Date64Type>;
|
||||
using Time32Builder = NumericBuilder<Time32Type>;
|
||||
using Time64Builder = NumericBuilder<Time64Type>;
|
||||
using TimestampBuilder = NumericBuilder<TimestampType>;
|
||||
using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
|
||||
using DurationBuilder = NumericBuilder<DurationType>;
|
||||
|
||||
/// @}
|
||||
|
||||
class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
|
||||
public:
|
||||
using TypeClass = BooleanType;
|
||||
using value_type = bool;
|
||||
|
||||
explicit BooleanBuilder(MemoryPool* pool = default_memory_pool());
|
||||
|
||||
BooleanBuilder(const std::shared_ptr<DataType>& type,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, false);
|
||||
UnsafeSetNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendNull() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppendNull();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
data_builder_.UnsafeAppend(false);
|
||||
UnsafeSetNotNull(1);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend(length, false);
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// Scalar append
|
||||
Status Append(const bool val) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(1));
|
||||
UnsafeAppend(val);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Append(const uint8_t val) { return Append(val != 0); }
|
||||
|
||||
/// Scalar append, without checking for capacity
|
||||
void UnsafeAppend(const bool val) {
|
||||
data_builder_.UnsafeAppend(val);
|
||||
UnsafeAppendToBitmap(true);
|
||||
}
|
||||
|
||||
void UnsafeAppendNull() {
|
||||
data_builder_.UnsafeAppend(false);
|
||||
UnsafeAppendToBitmap(false);
|
||||
}
|
||||
|
||||
void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous array of bytes (non-zero is 1)
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a bitmap of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] validity a validity bitmap to copy (may be null)
|
||||
/// \param[in] offset an offset into the values and validity bitmaps
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity,
|
||||
int64_t offset);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a contiguous C array of values
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const uint8_t* values, int64_t length,
|
||||
const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of bytes
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<uint8_t>& values,
|
||||
const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values a std::vector of bytes
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<uint8_t>& values);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values an std::vector<bool> indicating true (1) or false
|
||||
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
|
||||
/// (0). Equal in length to values
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values an std::vector<bool> indicating true (1) or false
|
||||
/// \return Status
|
||||
Status AppendValues(const std::vector<bool>& values);
|
||||
|
||||
/// \brief Append a sequence of elements in one shot
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// or null(0) values
|
||||
/// \return Status
|
||||
template <typename ValuesIter>
|
||||
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
// this updates length_
|
||||
UnsafeSetNotNull(length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append a sequence of elements in one shot, with a specified nullmap
|
||||
/// \param[in] values_begin InputIterator to the beginning of the values
|
||||
/// \param[in] values_end InputIterator pointing to the end of the values
|
||||
/// \param[in] valid_begin InputIterator with elements indication valid(1)
|
||||
/// or null(0) values
|
||||
/// \return Status
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
static_assert(!internal::is_null_pointer<ValidIter>::value,
|
||||
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
|
||||
"version instead");
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Same as above, for a pointer type ValidIter
|
||||
template <typename ValuesIter, typename ValidIter>
|
||||
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
|
||||
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
|
||||
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
data_builder_.UnsafeAppend<false>(
|
||||
length, [&values_begin]() -> bool { return *values_begin++; });
|
||||
|
||||
if (valid_begin == NULLPTR) {
|
||||
UnsafeSetNotNull(length);
|
||||
} else {
|
||||
null_bitmap_builder_.UnsafeAppend<true>(
|
||||
length, [&valid_begin]() -> bool { return *valid_begin++; });
|
||||
}
|
||||
length_ = null_bitmap_builder_.length();
|
||||
null_count_ = null_bitmap_builder_.false_count();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendValues(int64_t length, bool value);
|
||||
|
||||
Status AppendArraySlice(const ArrayData& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
return AppendValues(array.GetValues<uint8_t>(1, 0), length,
|
||||
array.GetValues<uint8_t>(0, 0), array.offset + offset);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
|
||||
|
||||
/// \cond FALSE
|
||||
using ArrayBuilder::Finish;
|
||||
/// \endcond
|
||||
|
||||
// Typed convenience overload: forwards to FinishTyped with a BooleanArray output.
Status Finish(std::shared_ptr<BooleanArray>* out) {
  return FinishTyped(out);
}
|
||||
|
||||
// Out-of-line override of the base-class Reset().
void Reset() override;
|
||||
// Out-of-line override of the base-class Resize().
Status Resize(int64_t capacity) override;
|
||||
|
||||
// The logical type produced by this builder is always boolean().
std::shared_ptr<DataType> type() const override {
  return boolean();
}
|
||||
|
||||
protected:
|
||||
// Receives the appended boolean values; validity bits go to null_bitmap_builder_.
TypedBufferBuilder<bool> data_builder_;
|
||||
};
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,62 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Contains declarations of time related Arrow builder types.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/builder_primitive.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup temporal-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
// TODO(ARROW-7938): this class is untested
|
||||
|
||||
class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
 public:
  using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;

  /// Construct a builder using the type returned by day_time_interval().
  explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool())
      : DayTimeIntervalBuilder(day_time_interval(), pool) {}

  /// Construct a builder for an explicitly supplied type.
  explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
                                  MemoryPool* pool = default_memory_pool())
      // The injected class name refers to NumericBuilder<DayTimeIntervalType>.
      : NumericBuilder(type, pool) {}
};
|
||||
|
||||
class ARROW_EXPORT MonthDayNanoIntervalBuilder
|
||||
: public NumericBuilder<MonthDayNanoIntervalType> {
|
||||
public:
|
||||
using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos;
|
||||
|
||||
explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool())
|
||||
: MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool) {}
|
||||
|
||||
explicit MonthDayNanoIntervalBuilder(std::shared_ptr<DataType> type,
|
||||
MemoryPool* pool = default_memory_pool())
|
||||
: NumericBuilder<MonthDayNanoIntervalType>(type, pool) {}
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,248 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/buffer_builder.h"
|
||||
#include "arrow/memory_pool.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \addtogroup nested-builders
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Base class for union array builders.
|
||||
///
|
||||
/// Note that while we subclass ArrayBuilder, as union types do not have a
|
||||
/// validity bitmap, the bitmap builder member of ArrayBuilder is not used.
|
||||
class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
 public:
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Typed convenience overload that forwards to FinishTyped
  Status Finish(std::shared_ptr<UnionArray>* out) {
    return FinishTyped(out);
  }

  /// \brief Register an additional child builder with the union
  ///
  /// \param[in] new_child the child builder
  /// \param[in] field_name the name of the field in the union array type
  /// if type inference is used
  /// \return child index, i.e. the "type" argument to pass to the "Append"
  /// method when adding a new element to the union array.
  int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
                     const std::string& field_name = "");

  std::shared_ptr<DataType> type() const override;

  /// \brief Number of union slots appended so far, as tracked by the type-id buffer
  int64_t length() const override {
    return types_builder_.length();
  }

 protected:
  BasicUnionBuilder(MemoryPool* pool,
                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                    const std::shared_ptr<DataType>& type);

  int8_t NextTypeId();

  std::vector<std::shared_ptr<Field>> child_fields_;
  std::vector<int8_t> type_codes_;
  UnionMode::type mode_;

  // Child builder lookup, indexed by type id.
  std::vector<ArrayBuilder*> type_id_to_children_;
  std::vector<int> type_id_to_child_id_;
  // Invariant: type_id_to_children_[type_id] != nullptr for every
  // type_id < dense_type_id_.
  int8_t dense_type_id_ = 0;
  // One type id per appended union slot.
  TypedBufferBuilder<int8_t> types_builder_;
};
|
||||
|
||||
/// \class DenseUnionBuilder
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
 public:
  /// Use this constructor to initialize the UnionBuilder with no child builders,
  /// allowing type to be inferred. You will need to call AppendChild for each of the
  /// children builders you want to use.
  explicit DenseUnionBuilder(MemoryPool* pool)
      : BasicUnionBuilder(pool, {}, dense_union(FieldVector{})), offsets_builder_(pool) {}

  /// Use this constructor to specify the type explicitly.
  /// You can still add child builders to the union after using this constructor
  DenseUnionBuilder(MemoryPool* pool,
                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                    const std::shared_ptr<DataType>& type)
      : BasicUnionBuilder(pool, children, type), offsets_builder_(pool) {}

  /// \brief Append a null slot.
  ///
  /// The slot is recorded with the first child's type code and a null is
  /// appended to that child.
  Status AppendNull() final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
    // The offset is the child's length *before* its own append, i.e. the index
    // of the entry appended just below.
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
    // Append a null arbitrarily to the first child
    return child_builder->AppendNull();
  }

  /// \brief Append `length` null slots.
  ///
  /// All slots are given the same offset into the first child, so only one
  /// null is appended to that child.
  Status AppendNulls(int64_t length) final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
    // Append just a single null to the first child
    return child_builder->AppendNull();
  }

  /// \brief Append an empty (non-null placeholder) slot.
  ///
  /// The slot is recorded with the first child's type code and an empty value
  /// is appended to that child.
  Status AppendEmptyValue() final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
    // Append an empty value arbitrarily to the first child
    return child_builder->AppendEmptyValue();
  }

  /// \brief Append `length` empty slots.
  ///
  /// All slots are given the same offset into the first child, so only one
  /// empty value is appended to that child.
  Status AppendEmptyValues(int64_t length) final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
    // Append just a single empty value to the first child
    return child_builder->AppendEmptyValue();
  }

  /// \brief Append an element to the UnionArray. This must be followed
  ///        by an append to the appropriate child builder.
  ///
  /// \param[in] next_type type_id of the child to which the next value will be appended.
  /// \return CapacityError if the selected child already holds
  /// kListMaximumElements entries; otherwise the status of the buffer appends.
  ///
  /// The corresponding child builder must be appended to independently after this method
  /// is called.
  Status Append(int8_t next_type) {
    ArrayBuilder* child_builder = type_id_to_children_[next_type];
    // Check capacity before touching any buffer: appending the type id first
    // would leave types_builder_ one entry longer than offsets_builder_ when
    // the CapacityError is returned.
    if (child_builder->length() == kListMaximumElements) {
      return Status::CapacityError(
          "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
          "child");
    }
    ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
    // Offset of the value the caller is about to append to the child.
    auto offset = static_cast<int32_t>(child_builder->length());
    return offsets_builder_.Append(offset);
  }

  Status AppendArraySlice(const ArrayData& array, int64_t offset,
                          int64_t length) override;

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

 private:
  // One int32 offset into the selected child per appended union slot.
  TypedBufferBuilder<int32_t> offsets_builder_;
};
|
||||
|
||||
/// \class SparseUnionBuilder
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
 public:
  /// Construct a builder without any child builders so that the type can be
  /// inferred. AppendChild must then be called for every child builder that is
  /// going to be used.
  explicit SparseUnionBuilder(MemoryPool* pool)
      : BasicUnionBuilder(pool, {}, sparse_union(FieldVector{})) {}

  /// Construct a builder for an explicitly given type.
  /// Further child builders may still be added afterwards.
  SparseUnionBuilder(MemoryPool* pool,
                     const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                     const std::shared_ptr<DataType>& type)
      : BasicUnionBuilder(pool, children, type) {}

  /// \brief Append a null value.
  ///
  /// The first child receives a null; every other child receives an empty value.
  Status AppendNull() final {
    const int8_t head_code = type_codes_[0];
    ARROW_RETURN_NOT_OK(types_builder_.Append(head_code));
    ARROW_RETURN_NOT_OK(type_id_to_children_[head_code]->AppendNull());
    // Remaining children (everything after the first type code).
    for (auto it = type_codes_.begin() + 1; it != type_codes_.end(); ++it) {
      ARROW_RETURN_NOT_OK(type_id_to_children_[*it]->AppendEmptyValue());
    }
    return Status::OK();
  }

  /// \brief Append multiple null values.
  ///
  /// The first child receives the nulls; every other child receives empty values.
  Status AppendNulls(int64_t length) final {
    const int8_t head_code = type_codes_[0];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, head_code));
    ARROW_RETURN_NOT_OK(type_id_to_children_[head_code]->AppendNulls(length));
    for (auto it = type_codes_.begin() + 1; it != type_codes_.end(); ++it) {
      ARROW_RETURN_NOT_OK(type_id_to_children_[*it]->AppendEmptyValues(length));
    }
    return Status::OK();
  }

  /// \brief Append an empty value: the slot is tagged with the first type code
  /// and every child receives one empty value.
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
    for (const int8_t child_code : type_codes_) {
      ARROW_RETURN_NOT_OK(type_id_to_children_[child_code]->AppendEmptyValue());
    }
    return Status::OK();
  }

  /// \brief Append `length` empty values: the slots are tagged with the first
  /// type code and every child receives `length` empty values.
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
    for (const int8_t child_code : type_codes_) {
      ARROW_RETURN_NOT_OK(type_id_to_children_[child_code]->AppendEmptyValues(length));
    }
    return Status::OK();
  }

  /// \brief Append an element to the UnionArray. This must be followed
  ///        by an append to the appropriate child builder.
  ///
  /// \param[in] next_type type_id of the child to which the next value will be appended.
  ///
  /// Only the type id is recorded here. The caller must then append the value
  /// to the corresponding child builder, and a null or empty value to all
  /// other child builders.
  Status Append(int8_t next_type) {
    return types_builder_.Append(next_type);
  }

  Status AppendArraySlice(const ArrayData& array, int64_t offset,
                          int64_t length) override;
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
@ -0,0 +1,37 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Concatenate arrays
|
||||
///
|
||||
/// \param[in] arrays a vector of arrays to be concatenated
|
||||
/// \param[in] pool memory to store the result will be allocated from this memory pool
|
||||
/// \return the concatenated array
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
} // namespace arrow
|
258
.venv/Lib/site-packages/pyarrow/include/arrow/array/data.h
Normal file
258
.venv/Lib/site-packages/pyarrow/include/arrow/array/data.h
Normal file
@ -0,0 +1,258 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic> // IWYU pragma: export
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// When slicing, we do not know the null count of the sliced range without
|
||||
// doing some computation. To avoid doing this eagerly, we set the null count
|
||||
// to -1 (any negative number will do). When Array::null_count is called the
|
||||
// first time, the null count will be computed. See ARROW-33
|
||||
constexpr int64_t kUnknownNullCount = -1;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Generic array data container
|
||||
|
||||
/// \class ArrayData
|
||||
/// \brief Mutable container for generic Arrow array data
|
||||
///
|
||||
/// This data structure is a self-contained representation of the memory and
|
||||
/// metadata inside an Arrow array data structure (called vectors in Java). The
|
||||
/// classes arrow::Array and its subclasses provide strongly-typed accessors
|
||||
/// with support for the visitor pattern and other affordances.
|
||||
///
|
||||
/// This class is designed for easy internal data manipulation, analytical data
|
||||
/// processing, and data transport to and from IPC messages. For example, we
|
||||
/// could cast from int64 to float64 like so:
|
||||
///
|
||||
/// Int64Array arr = GetMyData();
|
||||
/// auto new_data = arr.data()->Copy();
|
||||
/// new_data->type = arrow::float64();
|
||||
/// DoubleArray double_arr(new_data);
|
||||
///
|
||||
/// This object is also useful in an analytics setting where memory may be
|
||||
/// reused. For example, if we had a group of operations all returning doubles,
|
||||
/// say:
|
||||
///
|
||||
/// Log(Sqrt(Expr(arr)))
|
||||
///
|
||||
/// Then the low-level implementations of each of these functions could have
|
||||
/// the signatures
|
||||
///
|
||||
/// void Log(const ArrayData& values, ArrayData* out);
|
||||
///
|
||||
/// As another example a function may consume one or more memory buffers in an
|
||||
/// input array and replace them with newly-allocated data, changing the output
|
||||
/// data type as well.
|
||||
struct ARROW_EXPORT ArrayData {
|
||||
ArrayData() = default;
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: ArrayData(std::move(type), length, null_count, offset) {
|
||||
this->buffers = std::move(buffers);
|
||||
}
|
||||
|
||||
ArrayData(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
|
||||
: ArrayData(std::move(type), length, null_count, offset) {
|
||||
this->buffers = std::move(buffers);
|
||||
this->child_data = std::move(child_data);
|
||||
}
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(
|
||||
std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(
|
||||
std::shared_ptr<DataType> type, int64_t length,
|
||||
std::vector<std::shared_ptr<Buffer>> buffers,
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data,
|
||||
std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
|
||||
int64_t null_count = kUnknownNullCount,
|
||||
int64_t offset = 0);
|
||||
|
||||
// Move constructor
|
||||
ArrayData(ArrayData&& other) noexcept
|
||||
: type(std::move(other.type)),
|
||||
length(other.length),
|
||||
offset(other.offset),
|
||||
buffers(std::move(other.buffers)),
|
||||
child_data(std::move(other.child_data)),
|
||||
dictionary(std::move(other.dictionary)) {
|
||||
SetNullCount(other.null_count);
|
||||
}
|
||||
|
||||
// Copy constructor
|
||||
ArrayData(const ArrayData& other) noexcept
|
||||
: type(other.type),
|
||||
length(other.length),
|
||||
offset(other.offset),
|
||||
buffers(other.buffers),
|
||||
child_data(other.child_data),
|
||||
dictionary(other.dictionary) {
|
||||
SetNullCount(other.null_count);
|
||||
}
|
||||
|
||||
// Move assignment
|
||||
ArrayData& operator=(ArrayData&& other) {
|
||||
type = std::move(other.type);
|
||||
length = other.length;
|
||||
SetNullCount(other.null_count);
|
||||
offset = other.offset;
|
||||
buffers = std::move(other.buffers);
|
||||
child_data = std::move(other.child_data);
|
||||
dictionary = std::move(other.dictionary);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Copy assignment
|
||||
ArrayData& operator=(const ArrayData& other) {
|
||||
type = other.type;
|
||||
length = other.length;
|
||||
SetNullCount(other.null_count);
|
||||
offset = other.offset;
|
||||
buffers = other.buffers;
|
||||
child_data = other.child_data;
|
||||
dictionary = other.dictionary;
|
||||
return *this;
|
||||
}
|
||||
|
||||
std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
|
||||
|
||||
// Access a buffer's data as a typed C pointer
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i, int64_t absolute_offset) const {
|
||||
if (buffers[i]) {
|
||||
return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const T* GetValues(int i) const {
|
||||
return GetValues<T>(i, offset);
|
||||
}
|
||||
|
||||
// Like GetValues, but returns NULLPTR instead of aborting if the underlying
|
||||
// buffer is not a CPU buffer.
|
||||
template <typename T>
|
||||
inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
|
||||
if (buffers[i] && buffers[i]->is_cpu()) {
|
||||
return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const T* GetValuesSafe(int i) const {
|
||||
return GetValuesSafe<T>(i, offset);
|
||||
}
|
||||
|
||||
// Access a buffer's data as a typed C pointer
|
||||
template <typename T>
|
||||
inline T* GetMutableValues(int i, int64_t absolute_offset) {
|
||||
if (buffers[i]) {
|
||||
return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
|
||||
} else {
|
||||
return NULLPTR;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline T* GetMutableValues(int i) {
|
||||
return GetMutableValues<T>(i, offset);
|
||||
}
|
||||
|
||||
/// \brief Construct a zero-copy slice of the data with the given offset and length
|
||||
std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
|
||||
|
||||
/// \brief Input-checking variant of Slice
|
||||
///
|
||||
/// An Invalid Status is returned if the requested slice falls out of bounds.
|
||||
/// Note that unlike Slice, `length` isn't clamped to the available buffer size.
|
||||
Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;
|
||||
|
||||
void SetNullCount(int64_t v) { null_count.store(v); }
|
||||
|
||||
/// \brief Return null count, or compute and set it if it's not known
|
||||
int64_t GetNullCount() const;
|
||||
|
||||
bool MayHaveNulls() const {
|
||||
// If an ArrayData is slightly malformed it may have kUnknownNullCount set
|
||||
// but no buffer
|
||||
return null_count.load() != 0 && buffers[0] != NULLPTR;
|
||||
}
|
||||
|
||||
std::shared_ptr<DataType> type;
|
||||
int64_t length = 0;
|
||||
mutable std::atomic<int64_t> null_count{0};
|
||||
// The logical start point into the physical buffers (in values, not bytes).
|
||||
// Note that, for child data, this must be *added* to the child data's own offset.
|
||||
int64_t offset = 0;
|
||||
std::vector<std::shared_ptr<Buffer>> buffers;
|
||||
std::vector<std::shared_ptr<ArrayData>> child_data;
|
||||
|
||||
// The dictionary for this Array, if any. Only used for dictionary type
|
||||
std::shared_ptr<ArrayData> dictionary;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// Construct a zero-copy view of this ArrayData with the given type.
|
||||
///
|
||||
/// This method checks if the types are layout-compatible.
|
||||
/// Nested types are traversed in depth-first order. Data buffers must have
|
||||
/// the same item sizes, even though the logical types may be different.
|
||||
/// An error is returned if the types are not layout-compatible.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
|
||||
const std::shared_ptr<DataType>& type);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
76
.venv/Lib/site-packages/pyarrow/include/arrow/array/diff.h
Normal file
76
.venv/Lib/site-packages/pyarrow/include/arrow/array/diff.h
Normal file
@ -0,0 +1,76 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/array/array_base.h"
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Compare two arrays, returning an edit script which expresses the difference
|
||||
/// between them
|
||||
///
|
||||
/// An edit script is an array of struct(insert: bool, run_length: int64_t).
|
||||
/// Each element of "insert" determines whether an element was inserted into (true)
|
||||
/// or deleted from (false) base. Each insertion or deletion is followed by a run of
|
||||
/// elements which are unchanged from base to target; the length of this run is stored
|
||||
/// in "run_length". (Note that the edit script begins and ends with a run of shared
|
||||
/// elements but both fields of the struct must have the same length. To accommodate this
|
||||
/// the first element of "insert" should be ignored.)
|
||||
///
|
||||
/// For example for base "hlloo" and target "hello", the edit script would be
|
||||
/// [
|
||||
/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
|
||||
/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
|
||||
/// {"insert": false, "run_length": 0} // delete("o") then an empty run
|
||||
/// ]
|
||||
///
|
||||
/// Diffing arrays containing nulls is not currently supported.
|
||||
///
|
||||
/// \param[in] base baseline for comparison
|
||||
/// \param[in] target an array of identical type to base whose elements differ from base's
|
||||
/// \param[in] pool memory to store the result will be allocated from this memory pool
|
||||
/// \return an edit script array which can be applied to base to produce target
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief visitor interface for easy traversal of an edit script
|
||||
///
|
||||
/// visitor will be called for each hunk of insertions and deletions.
|
||||
ARROW_EXPORT Status VisitEditScript(
|
||||
const Array& edits,
|
||||
const std::function<Status(int64_t delete_begin, int64_t delete_end,
|
||||
int64_t insert_begin, int64_t insert_end)>& visitor);
|
||||
|
||||
/// \brief return a function which will format an edit script in unified
|
||||
/// diff format to os, given base and target arrays of the given type
|
||||
ARROW_EXPORT Result<
|
||||
std::function<Status(const Array& edits, const Array& base, const Array& target)>>
|
||||
MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
|
||||
|
||||
} // namespace arrow
|
89
.venv/Lib/site-packages/pyarrow/include/arrow/array/util.h
Normal file
89
.venv/Lib/site-packages/pyarrow/include/arrow/array/util.h
Normal file
@ -0,0 +1,89 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/data.h"
|
||||
#include "arrow/compare.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Create a strongly-typed Array instance from generic ArrayData
|
||||
/// \param[in] data the array contents
|
||||
/// \return the resulting Array instance
|
||||
ARROW_EXPORT
|
||||
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
/// \brief Create a strongly-typed Array instance with all elements null
|
||||
/// \param[in] type the array type
|
||||
/// \param[in] length the array length
|
||||
/// \param[in] pool the memory pool to allocate memory from
/// (defaults to default_memory_pool())
/// \return the resulting Array, wrapped in a Result
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
|
||||
int64_t length,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Create an Array instance whose slots are the given scalar
|
||||
/// \param[in] scalar the value with which to fill the array
|
||||
/// \param[in] length the array length
|
||||
/// \param[in] pool the memory pool to allocate memory from
/// (defaults to default_memory_pool())
/// \return the resulting Array, wrapped in a Result
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeArrayFromScalar(
|
||||
const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Create an empty Array of a given type
|
||||
///
|
||||
/// The output Array will be of the given type.
|
||||
///
|
||||
/// \param[in] type the data type of the empty Array
|
||||
/// \param[in] pool the memory pool to allocate memory from
/// (defaults to default_memory_pool())
|
||||
/// \return the resulting Array, wrapped in a Result
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Swap the endianness (byte order) of each element in a generic ArrayData
|
||||
///
|
||||
/// As dictionaries are often shared between different arrays, dictionaries
|
||||
/// are not swapped by this function and should be handled separately.
|
||||
///
|
||||
/// \param[in] data the array contents
|
||||
/// \return the resulting ArrayData whose elements were swapped, wrapped in a Result
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
|
||||
const std::shared_ptr<ArrayData>& data);
|
||||
|
||||
/// \brief Rechunk several ArrayVectors so that they share the same chunk layout
///
/// Given a number of ArrayVectors, treat each ArrayVector as the
|
||||
/// chunks of a chunked array.  Then rechunk each ArrayVector such that
|
||||
/// all ArrayVectors are chunked identically.  It is mandatory that
|
||||
/// all ArrayVectors contain the same total number of elements.
///
/// \param[in] (unnamed) the ArrayVectors to rechunk, one per chunked array
/// \return the rechunked ArrayVectors
/// (presumably in the same order as the input; confirm against the
/// implementation)
|
||||
ARROW_EXPORT
|
||||
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
@ -0,0 +1,56 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
// Internal functions implementing Array::Validate() and friends.
|
||||
|
||||
// O(1) array metadata validation
// Two overloads are provided: one taking an Array, one taking the underlying
// ArrayData. Both report the outcome of the validation as a Status.
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArray(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArray(const ArrayData& data);
|
||||
|
||||
// O(N) array data validation.
|
||||
// Note that, starting from 7.0.0, "full" routines also validate metadata.
|
||||
// Before that release, ValidateArray() needed to be called before
// ValidateArrayFull()
|
||||
// to ensure metadata correctness, otherwise invalid memory accesses
|
||||
// may occur.
// Two overloads are provided: one taking an Array, one taking the underlying
// ArrayData. Both report the outcome of the validation as a Status.
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArrayFull(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateArrayFull(const ArrayData& data);
|
||||
|
||||
// UTF-8 validation, available for both an Array and its underlying ArrayData.
// The outcome is reported as a Status.
// NOTE(review): this header does not document the exact contract; judging by
// the name, this presumably checks that string data is valid UTF-8 -- confirm
// against the implementation (supported types, cost).
ARROW_EXPORT
|
||||
Status ValidateUTF8(const Array& array);
|
||||
|
||||
ARROW_EXPORT
|
||||
Status ValidateUTF8(const ArrayData& data);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
Reference in New Issue
Block a user