mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-22 18:32:15 +00:00
185 lines
6.1 KiB
C++
185 lines
6.1 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
|
|
|
|
#pragma once
|
|
|
|
#include <cassert>
|
|
#include <memory>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
|
|
#include "arrow/result.h"
|
|
#include "arrow/status.h"
|
|
#include "arrow/type.h"
|
|
#include "arrow/type_fwd.h"
|
|
|
|
#include "parquet/level_conversion.h"
|
|
#include "parquet/platform.h"
|
|
#include "parquet/schema.h"
|
|
|
|
namespace parquet {
|
|
|
|
// Forward declarations. Within this header these types are only used as
// `const T&` parameters, so their full definitions are not required here.
class ArrowReaderProperties;
class ArrowWriterProperties;
class WriterProperties;
|
|
|
|
namespace arrow {
|
|
|
|
/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
/// schema into a Parquet schema.
///
/// @{
|
|
|
|
/// \brief Convert a single Arrow field into a Parquet schema node.
///
/// \param[in] field the Arrow field to convert
/// \param[in] properties Parquet writer properties passed to the conversion
/// \param[in] arrow_properties Arrow-specific writer properties passed to the
///   conversion
/// \param[out] out receives the resulting schema node
/// \return a Status describing the outcome. NOTE(review): the implementation
///   is not part of this header; confirm there which inputs are rejected and
///   whether `out` is left untouched on failure.
PARQUET_EXPORT
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
                            const WriterProperties& properties,
                            const ArrowWriterProperties& arrow_properties,
                            schema::NodePtr* out);
|
|
|
|
/// \brief Convert an Arrow schema into a Parquet schema descriptor.
///
/// \param[in] arrow_schema the Arrow schema to convert
/// \param[in] properties Parquet writer properties passed to the conversion
/// \param[in] arrow_properties Arrow-specific writer properties passed to the
///   conversion
/// \param[out] out receives the resulting SchemaDescriptor
/// \return a Status describing the outcome. NOTE(review): failure conditions
///   and null-handling of `arrow_schema` are defined by the implementation,
///   which is not part of this header; confirm there.
PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                const ArrowWriterProperties& arrow_properties,
                                std::shared_ptr<SchemaDescriptor>* out);
|
|
|
|
/// \brief Convert an Arrow schema into a Parquet schema descriptor without
/// supplying Arrow-specific writer properties.
///
/// NOTE(review): presumably equivalent to the four-argument overload called
/// with default ArrowWriterProperties; confirm against the implementation.
///
/// \param[in] arrow_schema the Arrow schema to convert
/// \param[in] properties Parquet writer properties passed to the conversion
/// \param[out] out receives the resulting SchemaDescriptor
/// \return a Status describing the outcome
PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                std::shared_ptr<SchemaDescriptor>* out);
|
|
|
|
/// @}
|
|
|
|
/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
/// schema into an Arrow schema.
///
/// @{
|
|
|
|
/// \brief Convert a Parquet schema into an Arrow schema.
///
/// \param[in] parquet_schema the Parquet schema descriptor to convert
/// \param[in] properties Arrow reader properties passed to the conversion
/// \param[in] key_value_metadata key/value metadata passed to the conversion.
///   NOTE(review): presumably the Parquet file-level metadata, and possibly
///   allowed to be null; confirm against the implementation.
/// \param[out] out receives the resulting Arrow schema
/// \return a Status describing the outcome
PARQUET_EXPORT
::arrow::Status FromParquetSchema(
    const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
    const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
    std::shared_ptr<::arrow::Schema>* out);
|
|
|
|
/// \brief Convert a Parquet schema into an Arrow schema without supplying
/// key/value metadata.
///
/// NOTE(review): presumably equivalent to the four-argument overload called
/// with null metadata; confirm against the implementation.
///
/// \param[in] parquet_schema the Parquet schema descriptor to convert
/// \param[in] properties Arrow reader properties passed to the conversion
/// \param[out] out receives the resulting Arrow schema
/// \return a Status describing the outcome
PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  const ArrowReaderProperties& properties,
                                  std::shared_ptr<::arrow::Schema>* out);
|
|
|
|
/// \brief Convert a Parquet schema into an Arrow schema, supplying neither
/// reader properties nor key/value metadata.
///
/// NOTE(review): presumably uses default ArrowReaderProperties; confirm
/// against the implementation.
///
/// \param[in] parquet_schema the Parquet schema descriptor to convert
/// \param[out] out receives the resulting Arrow schema
/// \return a Status describing the outcome
PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  std::shared_ptr<::arrow::Schema>* out);
|
|
|
|
/// @}
|
|
|
|
/// \brief Bridge between an arrow::Field and parquet column indices.
|
|
struct PARQUET_EXPORT SchemaField {
|
|
std::shared_ptr<::arrow::Field> field;
|
|
std::vector<SchemaField> children;
|
|
|
|
// Only set for leaf nodes
|
|
int column_index = -1;
|
|
|
|
parquet::internal::LevelInfo level_info;
|
|
|
|
bool is_leaf() const { return column_index != -1; }
|
|
};
|
|
|
|
/// \brief Bridge between a parquet Schema and an arrow Schema.
|
|
///
|
|
/// Expose parquet columns as a tree structure. Useful traverse and link
|
|
/// between arrow's Schema and parquet's Schema.
|
|
struct PARQUET_EXPORT SchemaManifest {
|
|
static ::arrow::Status Make(
|
|
const SchemaDescriptor* schema,
|
|
const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
|
|
const ArrowReaderProperties& properties, SchemaManifest* manifest);
|
|
|
|
const SchemaDescriptor* descr;
|
|
std::shared_ptr<::arrow::Schema> origin_schema;
|
|
std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
|
|
std::vector<SchemaField> schema_fields;
|
|
|
|
std::unordered_map<int, const SchemaField*> column_index_to_field;
|
|
std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
|
|
|
|
::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
|
|
auto it = column_index_to_field.find(column_index);
|
|
if (it == column_index_to_field.end()) {
|
|
return ::arrow::Status::KeyError("Column index ", column_index,
|
|
" not found in schema manifest, may be malformed");
|
|
}
|
|
*out = it->second;
|
|
return ::arrow::Status::OK();
|
|
}
|
|
|
|
const SchemaField* GetParent(const SchemaField* field) const {
|
|
// Returns nullptr also if not found
|
|
auto it = child_to_parent.find(field);
|
|
if (it == child_to_parent.end()) {
|
|
return NULLPTR;
|
|
}
|
|
return it->second;
|
|
}
|
|
|
|
/// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
|
|
/// correspond to the column root (first node below the parquet schema's root group) of
|
|
/// each leaf referenced in column_indices.
|
|
///
|
|
/// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
|
|
/// the roots are `a` and `i` (return=[0,2]).
|
|
///
|
|
/// root
|
|
/// -- a <------
|
|
/// -- -- b | |
|
|
/// -- -- -- c |
|
|
/// -- -- -- d |
|
|
/// -- -- -- -- e
|
|
/// -- f
|
|
/// -- -- g
|
|
/// -- -- -- h
|
|
/// -- i <---
|
|
/// -- -- j |
|
|
/// -- -- -- k
|
|
::arrow::Result<std::vector<int>> GetFieldIndices(
|
|
const std::vector<int>& column_indices) const {
|
|
const schema::GroupNode* group = descr->group_node();
|
|
std::unordered_set<int> already_added;
|
|
|
|
std::vector<int> out;
|
|
for (int column_idx : column_indices) {
|
|
if (column_idx < 0 || column_idx >= descr->num_columns()) {
|
|
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
|
|
}
|
|
|
|
auto field_node = descr->GetColumnRoot(column_idx);
|
|
auto field_idx = group->FieldIndex(*field_node);
|
|
if (field_idx == -1) {
|
|
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
|
|
}
|
|
|
|
if (already_added.insert(field_idx).second) {
|
|
out.push_back(field_idx);
|
|
}
|
|
}
|
|
return out;
|
|
}
|
|
};
|
|
|
|
} // namespace arrow
|
|
} // namespace parquet
|