// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include #include #include #include #include #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_fwd.h" #include "parquet/level_conversion.h" #include "parquet/platform.h" #include "parquet/schema.h" namespace parquet { class ArrowReaderProperties; class ArrowWriterProperties; class WriterProperties; namespace arrow { /// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow /// schema into a Parquet schema. /// /// @{ PARQUET_EXPORT ::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, schema::NodePtr* out); PARQUET_EXPORT ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, std::shared_ptr* out); PARQUET_EXPORT ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, std::shared_ptr* out); /// @} /// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet /// schema into an Arrow schema. /// /// @{ PARQUET_EXPORT ::arrow::Status FromParquetSchema( const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties, const std::shared_ptr& key_value_metadata, std::shared_ptr<::arrow::Schema>* out); PARQUET_EXPORT ::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties, std::shared_ptr<::arrow::Schema>* out); PARQUET_EXPORT ::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out); /// @} /// \brief Bridge between an arrow::Field and parquet column indices. struct PARQUET_EXPORT SchemaField { std::shared_ptr<::arrow::Field> field; std::vector children; // Only set for leaf nodes int column_index = -1; parquet::internal::LevelInfo level_info; bool is_leaf() const { return column_index != -1; } }; /// \brief Bridge between a parquet Schema and an arrow Schema. /// /// Expose parquet columns as a tree structure. Useful traverse and link /// between arrow's Schema and parquet's Schema. struct PARQUET_EXPORT SchemaManifest { static ::arrow::Status Make( const SchemaDescriptor* schema, const std::shared_ptr& metadata, const ArrowReaderProperties& properties, SchemaManifest* manifest); const SchemaDescriptor* descr; std::shared_ptr<::arrow::Schema> origin_schema; std::shared_ptr schema_metadata; std::vector schema_fields; std::unordered_map column_index_to_field; std::unordered_map child_to_parent; ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const { auto it = column_index_to_field.find(column_index); if (it == column_index_to_field.end()) { return ::arrow::Status::KeyError("Column index ", column_index, " not found in schema manifest, may be malformed"); } *out = it->second; return ::arrow::Status::OK(); } const SchemaField* GetParent(const SchemaField* field) const { // Returns nullptr also if not found auto it = child_to_parent.find(field); if (it == child_to_parent.end()) { return NULLPTR; } return it->second; } /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which /// correspond to the column root (first node below the parquet schema's root group) of /// each leaf referenced in column_indices. /// /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3]) /// the roots are `a` and `i` (return=[0,2]). /// /// root /// -- a <------ /// -- -- b | | /// -- -- -- c | /// -- -- -- d | /// -- -- -- -- e /// -- f /// -- -- g /// -- -- -- h /// -- i <--- /// -- -- j | /// -- -- -- k ::arrow::Result> GetFieldIndices( const std::vector& column_indices) const { const schema::GroupNode* group = descr->group_node(); std::unordered_set already_added; std::vector out; for (int column_idx : column_indices) { if (column_idx < 0 || column_idx >= descr->num_columns()) { return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid"); } auto field_node = descr->GetColumnRoot(column_idx); auto field_idx = group->FieldIndex(*field_node); if (field_idx == -1) { return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid"); } if (already_added.insert(field_idx).second) { out.push_back(field_idx); } } return out; } }; } // namespace arrow } // namespace parquet