mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-22 10:28:02 +00:00
467 lines
18 KiB
C++
467 lines
18 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#pragma once
|
|
|
|
#include <algorithm>
|
|
#include <cstddef>
|
|
#include <memory>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <tuple>
|
|
#include <type_traits>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "arrow/array.h"
|
|
#include "arrow/array/builder_base.h"
|
|
#include "arrow/array/builder_binary.h"
|
|
#include "arrow/array/builder_nested.h"
|
|
#include "arrow/array/builder_primitive.h"
|
|
#include "arrow/chunked_array.h"
|
|
#include "arrow/compute/api.h"
|
|
#include "arrow/status.h"
|
|
#include "arrow/table.h"
|
|
#include "arrow/type_fwd.h"
|
|
#include "arrow/type_traits.h"
|
|
#include "arrow/util/checked_cast.h"
|
|
#include "arrow/util/macros.h"
|
|
|
|
namespace arrow {
|
|
|
|
class Schema;
|
|
|
|
namespace stl {
|
|
|
|
namespace internal {
|
|
|
|
/// Primary template: a type is not considered optional-like unless a
/// specialization of this trait says otherwise.
template <typename T, typename Enable = void>
struct is_optional_like : std::false_type {};
|
|
|
|
/// Primary template: a type is not considered dereferencable unless a
/// specialization of this trait says otherwise.
template <typename T, typename Enable = void>
struct is_dereferencable : std::false_type {};
|
|
|
|
template <typename T>
|
|
struct is_dereferencable<T, arrow::internal::void_t<decltype(*std::declval<T>())>>
|
|
: public std::true_type {};
|
|
|
|
template <typename T>
|
|
struct is_optional_like<
|
|
T, typename std::enable_if<
|
|
std::is_constructible<bool, T>::value && is_dereferencable<T>::value &&
|
|
!std::is_array<typename std::remove_reference<T>::type>::value>::type>
|
|
: public std::true_type {};
|
|
|
|
/// The decayed type (no reference, no cv-qualifiers) of the element at
/// position Index of TupleType.
template <size_t Index, typename TupleType>
using BareTupleElement =
    typename std::decay<typename std::tuple_element<Index, TupleType>::type>::type;
|
|
|
|
} // namespace internal
|
|
|
|
template <typename T, typename R = void>
|
|
using enable_if_optional_like =
|
|
typename std::enable_if<internal::is_optional_like<T>::value, R>::type;
|
|
|
|
/// Traits meta class mapping standard C/C++ types to their Arrow equivalents.
///
/// The primary template is intentionally empty; supported types are provided
/// through specializations. The second parameter exists so that partial
/// specializations can be constrained with enable_if.
template <typename CType, typename Enable = void>
struct ConversionTraits {};
|
|
|
|
/// Returns builder type for given standard C/C++ type.
|
|
template <typename CType>
|
|
using CBuilderType =
|
|
typename TypeTraits<typename ConversionTraits<CType>::ArrowType>::BuilderType;
|
|
|
|
/// Default implementation of AppendListValues.
|
|
///
|
|
/// This function can be specialized by user to take advantage of appending
|
|
/// contiguous ranges while appending. This default implementation will call
|
|
/// ConversionTraits<ValueCType>::AppendRow() for each value in the range.
|
|
template <typename ValueCType, typename Range>
|
|
inline Status AppendListValues(CBuilderType<ValueCType>& value_builder,
|
|
Range&& cell_range) {
|
|
for (auto const& value : cell_range) {
|
|
ARROW_RETURN_NOT_OK(ConversionTraits<ValueCType>::AppendRow(value_builder, value));
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
/// Generates the conversion glue for one primitive type:
///  - a ConversionTraits<CType_> specialization whose AppendRow appends a single
///    value to the builder and whose GetEntry reads one value from the array;
///  - an AppendListValues specialization for `const std::vector<CType_>&` that
///    hands the whole vector to the builder's AppendValues().
///
/// CType_ is the C/C++ value type, ArrowType_ the corresponding Arrow type class.
#define ARROW_STL_CONVERSION(CType_, ArrowType_) \
  template <> \
  struct ConversionTraits<CType_> : public CTypeTraits<CType_> { \
    static Status AppendRow(typename TypeTraits<ArrowType_>::BuilderType& builder, \
                            CType_ value) { \
      return builder.Append(value); \
    } \
    static CType_ GetEntry(const typename TypeTraits<ArrowType_>::ArrayType& array, \
                           size_t index) { \
      return array.Value(index); \
    } \
  }; \
  \
  template <> \
  inline Status AppendListValues<CType_, const std::vector<CType_>&>( \
      typename TypeTraits<ArrowType_>::BuilderType & value_builder, \
      const std::vector<CType_>& values) { \
    return value_builder.AppendValues(values); \
  }
|
|
|
|
ARROW_STL_CONVERSION(bool, BooleanType)
|
|
ARROW_STL_CONVERSION(int8_t, Int8Type)
|
|
ARROW_STL_CONVERSION(int16_t, Int16Type)
|
|
ARROW_STL_CONVERSION(int32_t, Int32Type)
|
|
ARROW_STL_CONVERSION(int64_t, Int64Type)
|
|
ARROW_STL_CONVERSION(uint8_t, UInt8Type)
|
|
ARROW_STL_CONVERSION(uint16_t, UInt16Type)
|
|
ARROW_STL_CONVERSION(uint32_t, UInt32Type)
|
|
ARROW_STL_CONVERSION(uint64_t, UInt64Type)
|
|
ARROW_STL_CONVERSION(float, FloatType)
|
|
ARROW_STL_CONVERSION(double, DoubleType)
|
|
|
|
template <>
|
|
struct ConversionTraits<std::string> : public CTypeTraits<std::string> {
|
|
static Status AppendRow(StringBuilder& builder, const std::string& cell) {
|
|
return builder.Append(cell);
|
|
}
|
|
static std::string GetEntry(const StringArray& array, size_t j) {
|
|
return array.GetString(j);
|
|
}
|
|
};
|
|
|
|
/// Append cell range elements as a single value to the list builder.
|
|
///
|
|
/// Cell range will be added to child builder using AppendListValues<ValueCType>()
|
|
/// if provided. AppendListValues<ValueCType>() has a default implementation, but
|
|
/// it can be specialized by users.
|
|
template <typename ValueCType, typename ListBuilderType, typename Range>
|
|
Status AppendCellRange(ListBuilderType& builder, Range&& cell_range) {
|
|
constexpr bool is_list_builder = std::is_same<ListBuilderType, ListBuilder>::value;
|
|
constexpr bool is_large_list_builder =
|
|
std::is_same<ListBuilderType, LargeListBuilder>::value;
|
|
static_assert(
|
|
is_list_builder || is_large_list_builder,
|
|
"Builder type must be either ListBuilder or LargeListBuilder for appending "
|
|
"multiple rows.");
|
|
|
|
using ChildBuilderType = CBuilderType<ValueCType>;
|
|
ARROW_RETURN_NOT_OK(builder.Append());
|
|
auto& value_builder =
|
|
::arrow::internal::checked_cast<ChildBuilderType&>(*builder.value_builder());
|
|
|
|
// XXX: Remove appended value before returning if status isn't OK?
|
|
return AppendListValues<ValueCType>(value_builder, std::forward<Range>(cell_range));
|
|
}
|
|
|
|
template <typename ValueCType>
|
|
struct ConversionTraits<std::vector<ValueCType>>
|
|
: public CTypeTraits<std::vector<ValueCType>> {
|
|
static Status AppendRow(ListBuilder& builder, const std::vector<ValueCType>& cell) {
|
|
return AppendCellRange<ValueCType>(builder, cell);
|
|
}
|
|
|
|
static std::vector<ValueCType> GetEntry(const ListArray& array, size_t j) {
|
|
using ElementArrayType =
|
|
typename TypeTraits<typename ConversionTraits<ValueCType>::ArrowType>::ArrayType;
|
|
|
|
const ElementArrayType& value_array =
|
|
::arrow::internal::checked_cast<const ElementArrayType&>(*array.values());
|
|
|
|
std::vector<ValueCType> vec(array.value_length(j));
|
|
for (int64_t i = 0; i < array.value_length(j); i++) {
|
|
vec[i] =
|
|
ConversionTraits<ValueCType>::GetEntry(value_array, array.value_offset(j) + i);
|
|
}
|
|
return vec;
|
|
}
|
|
};
|
|
|
|
template <typename Optional>
|
|
struct ConversionTraits<Optional, enable_if_optional_like<Optional>>
|
|
: public CTypeTraits<typename std::decay<decltype(*std::declval<Optional>())>::type> {
|
|
using OptionalInnerType =
|
|
typename std::decay<decltype(*std::declval<Optional>())>::type;
|
|
using typename CTypeTraits<OptionalInnerType>::ArrowType;
|
|
using CTypeTraits<OptionalInnerType>::type_singleton;
|
|
|
|
static Status AppendRow(typename TypeTraits<ArrowType>::BuilderType& builder,
|
|
const Optional& cell) {
|
|
if (cell) {
|
|
return ConversionTraits<OptionalInnerType>::AppendRow(builder, *cell);
|
|
} else {
|
|
return builder.AppendNull();
|
|
}
|
|
}
|
|
};
|
|
|
|
/// Build an arrow::Schema based upon the types defined in a std::tuple-like structure.
|
|
///
|
|
/// While the type information is available at compile-time, we still need to add the
|
|
/// column names at runtime, thus these methods are not constexpr.
|
|
template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
|
|
struct SchemaFromTuple {
|
|
using Element = internal::BareTupleElement<N - 1, Tuple>;
|
|
|
|
// Implementations that take a vector-like object for the column names.
|
|
|
|
/// Recursively build a vector of arrow::Field from the defined types.
|
|
///
|
|
/// In most cases MakeSchema is the better entrypoint for the Schema creation.
|
|
static std::vector<std::shared_ptr<Field>> MakeSchemaRecursion(
|
|
const std::vector<std::string>& names) {
|
|
std::vector<std::shared_ptr<Field>> ret =
|
|
SchemaFromTuple<Tuple, N - 1>::MakeSchemaRecursion(names);
|
|
auto type = ConversionTraits<Element>::type_singleton();
|
|
ret.push_back(field(names[N - 1], type, internal::is_optional_like<Element>::value));
|
|
return ret;
|
|
}
|
|
|
|
/// Build a Schema from the types of the tuple-like structure passed in as template
|
|
/// parameter assign the column names at runtime.
|
|
///
|
|
/// An example usage of this API can look like the following:
|
|
///
|
|
/// \code{.cpp}
|
|
/// using TupleType = std::tuple<int, std::vector<std::string>>;
|
|
/// std::shared_ptr<Schema> schema =
|
|
/// SchemaFromTuple<TupleType>::MakeSchema({"int_column", "list_of_strings_column"});
|
|
/// \endcode
|
|
static std::shared_ptr<Schema> MakeSchema(const std::vector<std::string>& names) {
|
|
return std::make_shared<Schema>(MakeSchemaRecursion(names));
|
|
}
|
|
|
|
// Implementations that take a tuple-like object for the column names.
|
|
|
|
/// Recursively build a vector of arrow::Field from the defined types.
|
|
///
|
|
/// In most cases MakeSchema is the better entrypoint for the Schema creation.
|
|
template <typename NamesTuple>
|
|
static std::vector<std::shared_ptr<Field>> MakeSchemaRecursionT(
|
|
const NamesTuple& names) {
|
|
using std::get;
|
|
|
|
std::vector<std::shared_ptr<Field>> ret =
|
|
SchemaFromTuple<Tuple, N - 1>::MakeSchemaRecursionT(names);
|
|
std::shared_ptr<DataType> type = ConversionTraits<Element>::type_singleton();
|
|
ret.push_back(
|
|
field(get<N - 1>(names), type, internal::is_optional_like<Element>::value));
|
|
return ret;
|
|
}
|
|
|
|
/// Build a Schema from the types of the tuple-like structure passed in as template
|
|
/// parameter assign the column names at runtime.
|
|
///
|
|
/// An example usage of this API can look like the following:
|
|
///
|
|
/// \code{.cpp}
|
|
/// using TupleType = std::tuple<int, std::vector<std::string>>;
|
|
/// std::shared_ptr<Schema> schema =
|
|
/// SchemaFromTuple<TupleType>::MakeSchema({"int_column", "list_of_strings_column"});
|
|
/// \endcode
|
|
template <typename NamesTuple>
|
|
static std::shared_ptr<Schema> MakeSchema(const NamesTuple& names) {
|
|
return std::make_shared<Schema>(MakeSchemaRecursionT<NamesTuple>(names));
|
|
}
|
|
};
|
|
|
|
template <typename Tuple>
|
|
struct SchemaFromTuple<Tuple, 0> {
|
|
static std::vector<std::shared_ptr<Field>> MakeSchemaRecursion(
|
|
const std::vector<std::string>& names) {
|
|
std::vector<std::shared_ptr<Field>> ret;
|
|
ret.reserve(names.size());
|
|
return ret;
|
|
}
|
|
|
|
template <typename NamesTuple>
|
|
static std::vector<std::shared_ptr<Field>> MakeSchemaRecursionT(
|
|
const NamesTuple& names) {
|
|
std::vector<std::shared_ptr<Field>> ret;
|
|
ret.reserve(std::tuple_size<NamesTuple>::value);
|
|
return ret;
|
|
}
|
|
};
|
|
|
|
namespace internal {
|
|
|
|
template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
|
|
struct CreateBuildersRecursive {
|
|
static Status Make(MemoryPool* pool,
|
|
std::vector<std::unique_ptr<ArrayBuilder>>* builders) {
|
|
using Element = BareTupleElement<N - 1, Tuple>;
|
|
std::shared_ptr<DataType> type = ConversionTraits<Element>::type_singleton();
|
|
ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &builders->at(N - 1)));
|
|
|
|
return CreateBuildersRecursive<Tuple, N - 1>::Make(pool, builders);
|
|
}
|
|
};
|
|
|
|
template <typename Tuple>
|
|
struct CreateBuildersRecursive<Tuple, 0> {
|
|
static Status Make(MemoryPool*, std::vector<std::unique_ptr<ArrayBuilder>>*) {
|
|
return Status::OK();
|
|
}
|
|
};
|
|
|
|
template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
|
|
struct RowIterator {
|
|
static Status Append(const std::vector<std::unique_ptr<ArrayBuilder>>& builders,
|
|
const Tuple& row) {
|
|
using std::get;
|
|
using Element = BareTupleElement<N - 1, Tuple>;
|
|
using BuilderType =
|
|
typename TypeTraits<typename ConversionTraits<Element>::ArrowType>::BuilderType;
|
|
|
|
BuilderType& builder =
|
|
::arrow::internal::checked_cast<BuilderType&>(*builders[N - 1]);
|
|
ARROW_RETURN_NOT_OK(ConversionTraits<Element>::AppendRow(builder, get<N - 1>(row)));
|
|
|
|
return RowIterator<Tuple, N - 1>::Append(builders, row);
|
|
}
|
|
};
|
|
|
|
template <typename Tuple>
|
|
struct RowIterator<Tuple, 0> {
|
|
static Status Append(const std::vector<std::unique_ptr<ArrayBuilder>>& builders,
|
|
const Tuple& row) {
|
|
return Status::OK();
|
|
}
|
|
};
|
|
|
|
template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
|
|
struct EnsureColumnTypes {
|
|
static Status Cast(const Table& table, std::shared_ptr<Table>* table_owner,
|
|
const compute::CastOptions& cast_options, compute::ExecContext* ctx,
|
|
std::reference_wrapper<const ::arrow::Table>* result) {
|
|
using Element = BareTupleElement<N - 1, Tuple>;
|
|
std::shared_ptr<DataType> expected_type = ConversionTraits<Element>::type_singleton();
|
|
|
|
if (!table.schema()->field(N - 1)->type()->Equals(*expected_type)) {
|
|
ARROW_ASSIGN_OR_RAISE(
|
|
Datum casted,
|
|
compute::Cast(table.column(N - 1), expected_type, cast_options, ctx));
|
|
auto new_field = table.schema()->field(N - 1)->WithType(expected_type);
|
|
ARROW_ASSIGN_OR_RAISE(*table_owner,
|
|
table.SetColumn(N - 1, new_field, casted.chunked_array()));
|
|
*result = **table_owner;
|
|
}
|
|
|
|
return EnsureColumnTypes<Tuple, N - 1>::Cast(result->get(), table_owner, cast_options,
|
|
ctx, result);
|
|
}
|
|
};
|
|
|
|
template <typename Tuple>
|
|
struct EnsureColumnTypes<Tuple, 0> {
|
|
static Status Cast(const Table& table, std::shared_ptr<Table>* table_owner,
|
|
const compute::CastOptions& cast_options, compute::ExecContext* ctx,
|
|
std::reference_wrapper<const ::arrow::Table>* result) {
|
|
return Status::OK();
|
|
}
|
|
};
|
|
|
|
template <typename Range, typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
|
|
struct TupleSetter {
|
|
static void Fill(const Table& table, Range* rows) {
|
|
using std::get;
|
|
using Element = typename std::tuple_element<N - 1, Tuple>::type;
|
|
using ArrayType =
|
|
typename TypeTraits<typename ConversionTraits<Element>::ArrowType>::ArrayType;
|
|
|
|
auto iter = rows->begin();
|
|
const ChunkedArray& chunked_array = *table.column(N - 1);
|
|
for (int i = 0; i < chunked_array.num_chunks(); i++) {
|
|
const ArrayType& array =
|
|
::arrow::internal::checked_cast<const ArrayType&>(*chunked_array.chunk(i));
|
|
for (int64_t j = 0; j < array.length(); j++) {
|
|
get<N - 1>(*iter++) = ConversionTraits<Element>::GetEntry(array, j);
|
|
}
|
|
}
|
|
|
|
return TupleSetter<Range, Tuple, N - 1>::Fill(table, rows);
|
|
}
|
|
};
|
|
|
|
template <typename Range, typename Tuple>
|
|
struct TupleSetter<Range, Tuple, 0> {
|
|
static void Fill(const Table& table, Range* rows) {}
|
|
};
|
|
|
|
} // namespace internal
|
|
|
|
template <typename Range>
|
|
Status TableFromTupleRange(MemoryPool* pool, Range&& rows,
|
|
const std::vector<std::string>& names,
|
|
std::shared_ptr<Table>* table) {
|
|
using row_type = typename std::iterator_traits<decltype(std::begin(rows))>::value_type;
|
|
constexpr std::size_t n_columns = std::tuple_size<row_type>::value;
|
|
|
|
std::shared_ptr<Schema> schema = SchemaFromTuple<row_type>::MakeSchema(names);
|
|
|
|
std::vector<std::unique_ptr<ArrayBuilder>> builders(n_columns);
|
|
ARROW_RETURN_NOT_OK(internal::CreateBuildersRecursive<row_type>::Make(pool, &builders));
|
|
|
|
for (auto const& row : rows) {
|
|
ARROW_RETURN_NOT_OK(internal::RowIterator<row_type>::Append(builders, row));
|
|
}
|
|
|
|
std::vector<std::shared_ptr<Array>> arrays;
|
|
for (auto const& builder : builders) {
|
|
std::shared_ptr<Array> array;
|
|
ARROW_RETURN_NOT_OK(builder->Finish(&array));
|
|
arrays.emplace_back(array);
|
|
}
|
|
|
|
*table = Table::Make(std::move(schema), std::move(arrays));
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
template <typename Range>
|
|
Status TupleRangeFromTable(const Table& table, const compute::CastOptions& cast_options,
|
|
compute::ExecContext* ctx, Range* rows) {
|
|
using row_type = typename std::decay<decltype(*std::begin(*rows))>::type;
|
|
constexpr std::size_t n_columns = std::tuple_size<row_type>::value;
|
|
|
|
if (table.schema()->num_fields() != n_columns) {
|
|
std::stringstream ss;
|
|
ss << "Number of columns in the table does not match the width of the target: ";
|
|
ss << table.schema()->num_fields() << " != " << n_columns;
|
|
return Status::Invalid(ss.str());
|
|
}
|
|
|
|
// TODO: Use std::size with C++17
|
|
if (rows->size() != static_cast<size_t>(table.num_rows())) {
|
|
std::stringstream ss;
|
|
ss << "Number of rows in the table does not match the size of the target: ";
|
|
ss << table.num_rows() << " != " << rows->size();
|
|
return Status::Invalid(ss.str());
|
|
}
|
|
|
|
// Check that all columns have the correct type, otherwise cast them.
|
|
std::shared_ptr<Table> table_owner;
|
|
std::reference_wrapper<const ::arrow::Table> current_table(table);
|
|
|
|
ARROW_RETURN_NOT_OK(internal::EnsureColumnTypes<row_type>::Cast(
|
|
table, &table_owner, cast_options, ctx, ¤t_table));
|
|
|
|
internal::TupleSetter<Range, row_type>::Fill(current_table.get(), rows);
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
} // namespace stl
|
|
} // namespace arrow
|