// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
// N.B. we don't include async_generator.h as it's relatively heavy
#include <functional>
#include <memory>
#include <vector>

#include "parquet/file_reader.h"
#include "parquet/platform.h"
#include "parquet/properties.h"

namespace arrow {

class ChunkedArray;
class KeyValueMetadata;
class RecordBatchReader;
struct Scalar;
class Schema;
class Table;
class RecordBatch;

}  // namespace arrow

namespace parquet {

class FileMetaData;
class SchemaDescriptor;

namespace arrow {

class ColumnChunkReader;
class ColumnReader;
struct SchemaManifest;
class RowGroupReader;

/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
///
/// This interface caters for different use cases and thus provides different
/// entry points. In its simplest form, it caters for a user that wants to
/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
///
/// More advanced users who want to implement parallelism on top of a single
/// Parquet file should do so at the RowGroup level. For this, they can call
/// `FileReader::RowGroup(i)->ReadTable` to receive only the specified
/// RowGroup as a table.
///
/// In the most advanced situation, where a consumer wants to independently read
/// RowGroups in parallel and consume each column individually, they can call
/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an
/// `arrow::ChunkedArray` instance.
///
/// The Parquet format supports an optional integer field_id which can be assigned
/// to a field.  Arrow will convert these field IDs to a metadata key named
/// PARQUET:field_id on the appropriate field.
// TODO(wesm): nested data does not always make sense with this user
// interface unless you are only reading a single leaf node from a branch of
// a table. For example:
//
// repeated group data {
//   optional group record {
//     optional int32 val1;
//     optional byte_array val2;
//     optional bool val3;
//   }
//   optional int32 val4;
// }
//
// In the Parquet file, there are 4 leaf nodes:
//
// * data.record.val1
// * data.record.val2
// * data.record.val3
// * data.val4
//
// When materializing this data in an Arrow array, we would have:
//
// data: list<struct<
//   record: struct<
//     val1: int32,
//     val2: string (= list<uint8>),
//     val3: bool,
//   >,
//   val4: int32
// >>
//
// However, in the Parquet format, each leaf node has its own repetition and
// definition levels describing the structure of the intermediate nodes in
// this array structure. Thus, we will need to scan the leaf data for a group
// of leaf nodes part of the same type tree to create a single result Arrow
// nested array structure.
//
// This is additionally complicated by "chunky" repeated fields or very large
// byte arrays.
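//
// Example usage of the three access levels described above (a minimal
// sketch; the `reader` variable and the PARQUET_THROW_NOT_OK error-handling
// macro from parquet/exception.h are illustrative, see OpenFile() below for
// how to obtain a FileReader):
//
//   std::shared_ptr<::arrow::Table> table;
//   // 1. Read the whole file at once
//   PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
//   // 2. Read a single row group as a table
//   PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table));
//   // 3. Read a single column of a single row group
//   std::shared_ptr<::arrow::ChunkedArray> column;
//   PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(1)->Read(&column));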
class PARQUET_EXPORT FileReader {
 public:
  /// Factory function to create a FileReader from a ParquetFileReader and properties
  static ::arrow::Status Make(::arrow::MemoryPool* pool,
                              std::unique_ptr<ParquetFileReader> reader,
                              const ArrowReaderProperties& properties,
                              std::unique_ptr<FileReader>* out);

  /// Factory function to create a FileReader from a ParquetFileReader
  static ::arrow::Status Make(::arrow::MemoryPool* pool,
                              std::unique_ptr<ParquetFileReader> reader,
                              std::unique_ptr<FileReader>* out);

  // Since the distribution of columns amongst a Parquet file's row groups may
  // be uneven (the number of values in each column chunk can be different), we
  // provide a column-oriented read interface. The ColumnReader hides the
  // details of paging through the file's row groups and yielding
  // fully-materialized arrow::Array instances.
  //
  // Returns an error status if the column of interest is not flat.
  virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;

  /// \brief Return the arrow schema for all the columns.
  virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;

  /// \brief Read a column as a whole into a chunked array.
  ///
  /// The indicated column index is relative to the schema.
  virtual ::arrow::Status ReadColumn(int i,
                                     std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

  // NOTE: Experimental API
  // Reads a specific top level schema field into a ChunkedArray.
  // The index i refers to the index of the top level schema field, which may
  // be nested or flat - e.g.
  //
  // 0 foo.bar
  //   foo.bar.baz
  //   foo.qux
  // 1 foo2
  // 2 foo3
  //
  // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc.
  virtual ::arrow::Status ReadSchemaField(
      int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
  ///
  /// Note that the ordering in row_group_indices matters. FileReaders must outlive
  /// their RecordBatchReaders.
  ///
  /// \returns error Status if row_group_indices contains an invalid index
  virtual ::arrow::Status GetRecordBatchReader(
      const std::vector<int>& row_group_indices,
      std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;

  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// \brief Return a RecordBatchReader of row groups selected from
  /// row_group_indices, whose columns are selected by column_indices.
  ///
  /// Note that the ordering in row_group_indices and column_indices
  /// matters. FileReaders must outlive their RecordBatchReaders.
  ///
  /// \returns error Status if either row_group_indices or column_indices
  /// contains an invalid index
  virtual ::arrow::Status GetRecordBatchReader(
      const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
      std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
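
  // Example: streaming selected row groups as record batches (a minimal
  // sketch; the `reader` variable and error handling are illustrative):
  //
  //   std::shared_ptr<::arrow::RecordBatchReader> batch_reader;
  //   PARQUET_THROW_NOT_OK(reader->GetRecordBatchReader({0, 1}, &batch_reader));
  //   std::shared_ptr<::arrow::RecordBatch> batch;
  //   while (true) {
  //     PARQUET_THROW_NOT_OK(batch_reader->ReadNext(&batch));
  //     if (batch == nullptr) break;  // stream exhausted
  //     // ... process batch ...
  //   }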

  /// \brief Return a generator of record batches.
  ///
  /// The FileReader must outlive the generator, so this requires that you pass in a
  /// shared_ptr.
  ///
  /// \returns error Result if either row_group_indices or column_indices contains an
  /// invalid index
  virtual ::arrow::Result<
      std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
  GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
                          const std::vector<int> row_group_indices,
                          const std::vector<int> column_indices,
                          ::arrow::internal::Executor* cpu_executor = NULLPTR,
                          int64_t rows_to_readahead = 0) = 0;

  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// Read all columns into a Table
  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Read the given columns into a Table
  ///
  /// The indicated column indices are relative to the schema.
  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                    std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                        const std::vector<int>& column_indices,
                                        std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                        std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Scan file contents with one thread, returning the number of rows
  virtual ::arrow::Status ScanContents(std::vector<int> columns,
                                       const int32_t column_batch_size,
                                       int64_t* num_rows) = 0;

  /// \brief Return a reader for the given row group; the returned object must
  /// not outlive the FileReader.
  virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;

  /// \brief The number of row groups in the file
  virtual int num_row_groups() const = 0;

  virtual ParquetFileReader* parquet_reader() const = 0;

  /// Set whether to use multiple threads during reads of multiple columns.
  /// By default only one thread is used.
  virtual void set_use_threads(bool use_threads) = 0;

  /// Set the number of records to read per batch for the RecordBatchReader.
  virtual void set_batch_size(int64_t batch_size) = 0;

  virtual const ArrowReaderProperties& properties() const = 0;

  virtual const SchemaManifest& manifest() const = 0;

  virtual ~FileReader() = default;
};

class RowGroupReader {
 public:
  virtual ~RowGroupReader() = default;
  virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                    std::shared_ptr<::arrow::Table>* out) = 0;
  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

 private:
  struct Iterator;
};

class ColumnChunkReader {
 public:
  virtual ~ColumnChunkReader() = default;
  virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};

// At this point, the column reader is a stream iterator. It only knows how to
// read the next batch of values for a particular column from the file until it
// runs out.
//
// We also do not expose any internal Parquet details, such as row groups. This
// might change in the future.
class PARQUET_EXPORT ColumnReader {
 public:
  virtual ~ColumnReader() = default;

  // Scan the next array of the indicated size. The actual size of the
  // returned array may be less than the passed size depending on how much data
  // is available in the file.
  //
  // When all the data in the file has been exhausted, the result is set to
  // nullptr.
  //
  // Returns Status::OK on a successful read, including when you have exhausted
  // the data available in the file.
  virtual ::arrow::Status NextBatch(int64_t batch_size,
                                    std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};
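
// Example: scanning a single flat column in fixed-size batches (a minimal
// sketch; the `reader` variable, the batch size and the error handling are
// illustrative):
//
//   std::unique_ptr<ColumnReader> column_reader;
//   PARQUET_THROW_NOT_OK(reader->GetColumn(0, &column_reader));
//   std::shared_ptr<::arrow::ChunkedArray> batch;
//   while (true) {
//     PARQUET_THROW_NOT_OK(column_reader->NextBatch(1024, &batch));
//     if (batch == nullptr) break;  // data exhausted
//     // ... process batch ...
//   }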

/// \brief Experimental helper class for bindings (like Python) that struggle
/// either with std::move or C++ exceptions
class PARQUET_EXPORT FileReaderBuilder {
 public:
  FileReaderBuilder();

  /// Create FileReaderBuilder from an Arrow file and optional properties / metadata
  ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
                       const ReaderProperties& properties = default_reader_properties(),
                       std::shared_ptr<FileMetaData> metadata = NULLPTR);

  ParquetFileReader* raw_reader() { return raw_reader_.get(); }

  /// Set Arrow MemoryPool for memory allocation
  FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
  /// Set Arrow reader properties
  FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
  /// Build FileReader instance
  ::arrow::Status Build(std::unique_ptr<FileReader>* out);

 private:
  ::arrow::MemoryPool* pool_;
  ArrowReaderProperties properties_;
  std::unique_ptr<ParquetFileReader> raw_reader_;
};

/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
///
/// @{

/// \brief Build FileReader from an Arrow file and MemoryPool
///
/// Advanced settings are supported through the FileReaderBuilder class.
PARQUET_EXPORT
::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file,
                         ::arrow::MemoryPool* allocator,
                         std::unique_ptr<FileReader>* reader);

/// @}

PARQUET_EXPORT
::arrow::Status StatisticsAsScalars(const Statistics& statistics,
                                    std::shared_ptr<::arrow::Scalar>* min,
                                    std::shared_ptr<::arrow::Scalar>* max);

namespace internal {

PARQUET_EXPORT
::arrow::Status FuzzReader(const uint8_t* data, int64_t size);

}  // namespace internal
}  // namespace arrow
}  // namespace parquet
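
// Example: opening a Parquet file for Arrow reads (a minimal sketch; the
// file path and the PARQUET_* error-handling macros from parquet/exception.h
// are illustrative):
//
//   std::shared_ptr<::arrow::io::RandomAccessFile> input;
//   PARQUET_ASSIGN_OR_THROW(input, ::arrow::io::ReadableFile::Open("data.parquet"));
//   std::unique_ptr<parquet::arrow::FileReader> reader;
//   PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(input, ::arrow::default_memory_pool(),
//                                                 &reader));
//   std::shared_ptr<::arrow::Table> table;
//   PARQUET_THROW_NOT_OK(reader->ReadTable(&table));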