2022-05-23 00:16:32 +04:00

270 lines
9.1 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor classes for Binary, LargeBinart, String, LargeString,
// FixedSizeBinary
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h" // IWYU pragma: export
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup binary-arrays
///
/// @{
// ----------------------------------------------------------------------
// Binary and String
/// Base class for variable-sized binary arrays, regardless of offset size
/// and logical interpretation.
template <typename TYPE>
class BaseBinaryArray : public FlatArray {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
/// Return the pointer to the given elements bytes
// XXX should GetValue(int64_t i) return a string_view?
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
// Account for base offset
i += data_->offset;
const offset_type pos = raw_value_offsets_[i];
*out_length = raw_value_offsets_[i + 1] - pos;
return raw_data_ + pos;
}
/// \brief Get binary value as a string_view
///
/// \param i the value index
/// \return the view over the selected value
util::string_view GetView(int64_t i) const {
// Account for base offset
i += data_->offset;
const offset_type pos = raw_value_offsets_[i];
return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
raw_value_offsets_[i + 1] - pos);
}
util::optional<util::string_view> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
/// \brief Get binary value as a string_view
/// Provided for consistency with other arrays.
///
/// \param i the value index
/// \return the view over the selected value
util::string_view Value(int64_t i) const { return GetView(i); }
/// \brief Get binary value as a std::string
///
/// \param i the value index
/// \return the value copied into a std::string
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
const offset_type* raw_value_offsets() const {
return raw_value_offsets_ + data_->offset;
}
const uint8_t* raw_data() const { return raw_data_; }
/// \brief Return the data buffer absolute offset of the data for the value
/// at the passed index.
///
/// Does not perform boundschecking
offset_type value_offset(int64_t i) const {
return raw_value_offsets_[i + data_->offset];
}
/// \brief Return the length of the data for the value at the passed index.
///
/// Does not perform boundschecking
offset_type value_length(int64_t i) const {
i += data_->offset;
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
/// \brief Return the total length of the memory in the data buffer
/// referenced by this array. If the array has been sliced then this may be
/// less than the size of the data buffer (data_->buffers[2]).
offset_type total_values_length() const {
if (data_->length > 0) {
return raw_value_offsets_[data_->length + data_->offset] -
raw_value_offsets_[data_->offset];
} else {
return 0;
}
}
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
// For subclasses
BaseBinaryArray() = default;
// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
}
const offset_type* raw_value_offsets_ = NULLPTR;
const uint8_t* raw_data_ = NULLPTR;
};
/// Concrete Array class for variable-size binary data
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
public:
explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
protected:
// For subclasses such as StringArray
BinaryArray() : BaseBinaryArray() {}
};
/// Concrete Array class for variable-size string (utf-8) data
class ARROW_EXPORT StringArray : public BinaryArray {
public:
using TypeClass = StringType;
explicit StringArray(const std::shared_ptr<ArrayData>& data);
StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};
/// Concrete Array class for large variable-size binary data
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
public:
explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
protected:
// For subclasses such as LargeStringArray
LargeBinaryArray() : BaseBinaryArray() {}
};
/// Concrete Array class for large variable-size string (utf-8) data
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
public:
using TypeClass = LargeStringType;
explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};
// ----------------------------------------------------------------------
// Fixed width binary
/// Concrete Array class for fixed-size binary data
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
public:
using TypeClass = FixedSizeBinaryType;
using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
const uint8_t* GetValue(int64_t i) const;
const uint8_t* Value(int64_t i) const { return GetValue(i); }
util::string_view GetView(int64_t i) const {
return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width());
}
util::optional<util::string_view> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
int32_t byte_width() const { return byte_width_; }
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
void SetData(const std::shared_ptr<ArrayData>& data) {
this->PrimitiveArray::SetData(data);
byte_width_ =
internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
}
int32_t byte_width_;
};
/// @}
} // namespace arrow