mirror of https://github.com/aykhans/AzSuicideDataVisualization.git (synced 2025-07-01 14:07:48 +00:00)

Commit: first commit
.venv/Lib/site-packages/pyarrow/include/parquet/api/io.h (new file, 20 lines)
@@ -0,0 +1,20 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "parquet/exception.h"
.venv/Lib/site-packages/pyarrow/include/parquet/api/reader.h (new file, 35 lines)
@@ -0,0 +1,35 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

// Column reader API
#include "parquet/column_reader.h"
#include "parquet/column_scanner.h"
#include "parquet/exception.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/printer.h"
#include "parquet/properties.h"
#include "parquet/statistics.h"

// Schemas
#include "parquet/api/schema.h"

// IO
#include "parquet/api/io.h"
.venv/Lib/site-packages/pyarrow/include/parquet/api/schema.h (new file, 21 lines)
@@ -0,0 +1,21 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

// Schemas
#include "parquet/schema.h"
.venv/Lib/site-packages/pyarrow/include/parquet/api/writer.h (new file, 25 lines)
@@ -0,0 +1,25 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "parquet/api/io.h"
#include "parquet/api/schema.h"
#include "parquet/column_writer.h"
#include "parquet/exception.h"
#include "parquet/file_writer.h"
#include "parquet/statistics.h"
.venv/Lib/site-packages/pyarrow/include/parquet/arrow/reader.h (new file, 344 lines)
@@ -0,0 +1,344 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
// N.B. we don't include async_generator.h as it's relatively heavy
#include <functional>
#include <memory>
#include <vector>

#include "parquet/file_reader.h"
#include "parquet/platform.h"
#include "parquet/properties.h"

namespace arrow {

class ChunkedArray;
class KeyValueMetadata;
class RecordBatchReader;
struct Scalar;
class Schema;
class Table;
class RecordBatch;

}  // namespace arrow

namespace parquet {

class FileMetaData;
class SchemaDescriptor;

namespace arrow {

class ColumnChunkReader;
class ColumnReader;
struct SchemaManifest;
class RowGroupReader;

/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
///
/// This interface caters to different use cases and thus provides several
/// entry points. In its simplest form, it caters to a user that wants to
/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
///
/// More advanced users that also want to implement parallelism on top of each
/// single Parquet file should do this on the RowGroup level. For this, they can
/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
/// RowGroup as a table.
///
/// In the most advanced situation, where a consumer wants to independently read
/// RowGroups in parallel and consume each column individually, they can call
/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
/// instance.
///
/// The parquet format supports an optional integer field_id which can be assigned
/// to a field. Arrow will convert these field IDs to a metadata key named
/// PARQUET:field_id on the appropriate field.
// TODO(wesm): nested data does not always make sense with this user
// interface unless you are only reading a single leaf node from a branch of
// a table. For example:
//
// repeated group data {
//   optional group record {
//     optional int32 val1;
//     optional byte_array val2;
//     optional bool val3;
//   }
//   optional int32 val4;
// }
//
// In the Parquet file, there are 4 leaf nodes:
//
// * data.record.val1
// * data.record.val2
// * data.record.val3
// * data.val4
//
// When materializing this data in an Arrow array, we would have:
//
// data: list<struct<
//   record: struct<
//     val1: int32,
//     val2: string (= list<uint8>),
//     val3: bool,
//   >,
//   val4: int32
// >>
//
// However, in the Parquet format, each leaf node has its own repetition and
// definition levels describing the structure of the intermediate nodes in
// this array structure. Thus, we will need to scan the leaf data for a group
// of leaf nodes part of the same type tree to create a single result Arrow
// nested array structure.
//
// This is additionally complicated by "chunky" repeated fields or very large
// byte arrays.
class PARQUET_EXPORT FileReader {
 public:
  /// Factory function to create a FileReader from a ParquetFileReader and properties
  static ::arrow::Status Make(::arrow::MemoryPool* pool,
                              std::unique_ptr<ParquetFileReader> reader,
                              const ArrowReaderProperties& properties,
                              std::unique_ptr<FileReader>* out);

  /// Factory function to create a FileReader from a ParquetFileReader
  static ::arrow::Status Make(::arrow::MemoryPool* pool,
                              std::unique_ptr<ParquetFileReader> reader,
                              std::unique_ptr<FileReader>* out);

  // Since the distribution of columns amongst a Parquet file's row groups may
  // be uneven (the number of values in each column chunk can be different), we
  // provide a column-oriented read interface. The ColumnReader hides the
  // details of paging through the file's row groups and yielding
  // fully-materialized arrow::Array instances.
  //
  // Returns error status if the column of interest is not flat.
  virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;

  /// \brief Return arrow schema for all the columns.
  virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;

  /// \brief Read column as a whole into a chunked array.
  ///
  /// The indicated column index is relative to the schema
  virtual ::arrow::Status ReadColumn(int i,
                                     std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

  // NOTE: Experimental API
  // Reads a specific top level schema field into an Array
  // The index i refers to the index of the top level schema field, which may
  // be nested or flat - e.g.
  //
  // 0 foo.bar
  //   foo.bar.baz
  //   foo.qux
  // 1 foo2
  // 2 foo3
  //
  // i=0 will read the entire foo struct, i=1 the foo2 primitive column, etc.
  virtual ::arrow::Status ReadSchemaField(
      int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
  ///
  /// Note that the ordering in row_group_indices matters. FileReaders must outlive
  /// their RecordBatchReaders.
  ///
  /// \returns error Status if row_group_indices contains an invalid index
  virtual ::arrow::Status GetRecordBatchReader(
      const std::vector<int>& row_group_indices,
      std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;

  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// \brief Return a RecordBatchReader of row groups selected from
  /// row_group_indices, whose columns are selected by column_indices.
  ///
  /// Note that the ordering in row_group_indices and column_indices
  /// matters. FileReaders must outlive their RecordBatchReaders.
  ///
  /// \returns error Status if either row_group_indices or column_indices
  /// contains an invalid index
  virtual ::arrow::Status GetRecordBatchReader(
      const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
      std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;

  /// \brief Return a generator of record batches.
  ///
  /// The FileReader must outlive the generator, so this requires that you pass in a
  /// shared_ptr.
  ///
  /// \returns error Result if either row_group_indices or column_indices contains an
  /// invalid index
  virtual ::arrow::Result<
      std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
  GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
                          const std::vector<int> row_group_indices,
                          const std::vector<int> column_indices,
                          ::arrow::internal::Executor* cpu_executor = NULLPTR,
                          int64_t rows_to_readahead = 0) = 0;

  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// Read all columns into a Table
  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Read the given columns into a Table
  ///
  /// The indicated column indices are relative to the schema
  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                    std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                        const std::vector<int>& column_indices,
                                        std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                        std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Scan file contents with one thread, return number of rows
  virtual ::arrow::Status ScanContents(std::vector<int> columns,
                                       const int32_t column_batch_size,
                                       int64_t* num_rows) = 0;

  /// \brief Return a reader for the RowGroup; this object must not outlive the
  /// FileReader.
  virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;

  /// \brief The number of row groups in the file
  virtual int num_row_groups() const = 0;

  virtual ParquetFileReader* parquet_reader() const = 0;

  /// Set whether to use multiple threads during reads of multiple columns.
  /// By default only one thread is used.
  virtual void set_use_threads(bool use_threads) = 0;

  /// Set number of records to read per batch for the RecordBatchReader.
  virtual void set_batch_size(int64_t batch_size) = 0;

  virtual const ArrowReaderProperties& properties() const = 0;

  virtual const SchemaManifest& manifest() const = 0;

  virtual ~FileReader() = default;
};

class RowGroupReader {
 public:
  virtual ~RowGroupReader() = default;
  virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                    std::shared_ptr<::arrow::Table>* out) = 0;
  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

 private:
  struct Iterator;
};

class ColumnChunkReader {
 public:
  virtual ~ColumnChunkReader() = default;
  virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};

// At this point, the column reader is a stream iterator. It only knows how to
// read the next batch of values for a particular column from the file until it
// runs out.
//
// We also do not expose any internal Parquet details, such as row groups. This
// might change in the future.
class PARQUET_EXPORT ColumnReader {
 public:
  virtual ~ColumnReader() = default;

  // Scan the next array of the indicated size. The actual size of the
  // returned array may be less than the passed size depending on how much data
  // is available in the file.
  //
  // When all the data in the file has been exhausted, the result is set to
  // nullptr.
  //
  // Returns Status::OK on a successful read, including if you have exhausted
  // the data available in the file.
  virtual ::arrow::Status NextBatch(int64_t batch_size,
                                    std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};

/// \brief Experimental helper class for bindings (like Python) that struggle
/// either with std::move or C++ exceptions
class PARQUET_EXPORT FileReaderBuilder {
 public:
  FileReaderBuilder();

  /// Create FileReaderBuilder from Arrow file and optional properties / metadata
  ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
                       const ReaderProperties& properties = default_reader_properties(),
                       std::shared_ptr<FileMetaData> metadata = NULLPTR);

  ParquetFileReader* raw_reader() { return raw_reader_.get(); }

  /// Set Arrow MemoryPool for memory allocation
  FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
  /// Set Arrow reader properties
  FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
  /// Build FileReader instance
  ::arrow::Status Build(std::unique_ptr<FileReader>* out);

 private:
  ::arrow::MemoryPool* pool_;
  ArrowReaderProperties properties_;
  std::unique_ptr<ParquetFileReader> raw_reader_;
};

/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
///
/// @{

/// \brief Build FileReader from Arrow file and MemoryPool
///
/// Advanced settings are supported through the FileReaderBuilder class.
PARQUET_EXPORT
::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>,
                         ::arrow::MemoryPool* allocator,
                         std::unique_ptr<FileReader>* reader);

/// @}

PARQUET_EXPORT
::arrow::Status StatisticsAsScalars(const Statistics& statistics,
                                    std::shared_ptr<::arrow::Scalar>* min,
                                    std::shared_ptr<::arrow::Scalar>* max);

namespace internal {

PARQUET_EXPORT
::arrow::Status FuzzReader(const uint8_t* data, int64_t size);

}  // namespace internal
}  // namespace arrow
}  // namespace parquet
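The FileReader comment above describes tiers of use: whole-file ReadTable, per-row-group reads, and batched streaming. A minimal sketch of the first and third tiers follows, using only the OpenFile factory and FileReader methods declared in this header; the file name "example.parquet" and the availability of pyarrow's bundled Arrow/Parquet C++ libraries on the build path are assumptions, not part of this diff.

#include <iostream>
#include <memory>

#include "arrow/io/file.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"

::arrow::Status ReadExample() {
  ARROW_ASSIGN_OR_RAISE(auto input,
                        ::arrow::io::ReadableFile::Open("example.parquet"));

  // OpenFile is the factory declared near the bottom of this header.
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(
      input, ::arrow::default_memory_pool(), &reader));

  // Simplest tier: materialize the whole file as one arrow::Table.
  std::shared_ptr<::arrow::Table> table;
  ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
  std::cout << "rows: " << table->num_rows() << "\n";

  // Streaming tier: iterate record batches over row group 0 only.
  // Note the FileReader must outlive the RecordBatchReader.
  reader->set_batch_size(4096);
  std::unique_ptr<::arrow::RecordBatchReader> batches;
  ARROW_RETURN_NOT_OK(reader->GetRecordBatchReader({0}, &batches));
  std::shared_ptr<::arrow::RecordBatch> batch;
  while (true) {
    ARROW_RETURN_NOT_OK(batches->ReadNext(&batch));
    if (batch == nullptr) break;  // end of stream
    std::cout << "batch rows: " << batch->num_rows() << "\n";
  }
  return ::arrow::Status::OK();
}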
.venv/Lib/site-packages/pyarrow/include/parquet/arrow/schema.h (new file, 184 lines)
@@ -0,0 +1,184 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cassert>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"

#include "parquet/level_conversion.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

namespace parquet {

class ArrowReaderProperties;
class ArrowWriterProperties;
class WriterProperties;

namespace arrow {

/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
/// schema into a Parquet schema.
///
/// @{

PARQUET_EXPORT
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
                            const WriterProperties& properties,
                            const ArrowWriterProperties& arrow_properties,
                            schema::NodePtr* out);

PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                const ArrowWriterProperties& arrow_properties,
                                std::shared_ptr<SchemaDescriptor>* out);

PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                std::shared_ptr<SchemaDescriptor>* out);

/// @}

/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
/// schema into an Arrow schema.
///
/// @{

PARQUET_EXPORT
::arrow::Status FromParquetSchema(
    const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
    const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
    std::shared_ptr<::arrow::Schema>* out);

PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  const ArrowReaderProperties& properties,
                                  std::shared_ptr<::arrow::Schema>* out);

PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  std::shared_ptr<::arrow::Schema>* out);

/// @}

/// \brief Bridge between an arrow::Field and parquet column indices.
struct PARQUET_EXPORT SchemaField {
  std::shared_ptr<::arrow::Field> field;
  std::vector<SchemaField> children;

  // Only set for leaf nodes
  int column_index = -1;

  parquet::internal::LevelInfo level_info;

  bool is_leaf() const { return column_index != -1; }
};

/// \brief Bridge between a parquet Schema and an arrow Schema.
///
/// Expose parquet columns as a tree structure. Useful to traverse and link
/// between arrow's Schema and parquet's Schema.
struct PARQUET_EXPORT SchemaManifest {
  static ::arrow::Status Make(
      const SchemaDescriptor* schema,
      const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
      const ArrowReaderProperties& properties, SchemaManifest* manifest);

  const SchemaDescriptor* descr;
  std::shared_ptr<::arrow::Schema> origin_schema;
  std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
  std::vector<SchemaField> schema_fields;

  std::unordered_map<int, const SchemaField*> column_index_to_field;
  std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;

  ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
    auto it = column_index_to_field.find(column_index);
    if (it == column_index_to_field.end()) {
      return ::arrow::Status::KeyError("Column index ", column_index,
                                       " not found in schema manifest, may be malformed");
    }
    *out = it->second;
    return ::arrow::Status::OK();
  }

  const SchemaField* GetParent(const SchemaField* field) const {
    // Returns nullptr also if not found
    auto it = child_to_parent.find(field);
    if (it == child_to_parent.end()) {
      return NULLPTR;
    }
    return it->second;
  }

  /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
  /// correspond to the column root (first node below the parquet schema's root group) of
  /// each leaf referenced in column_indices.
  ///
  /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
  /// the roots are `a` and `i` (return=[0,2]).
  ///
  /// root
  /// -- a  <------
  /// -- -- b  |  |
  /// -- -- -- c  |
  /// -- -- -- d  |
  /// -- -- -- -- e
  /// -- f
  /// -- -- g
  /// -- -- -- h
  /// -- i  <---
  /// -- -- j  |
  /// -- -- -- k
  ::arrow::Result<std::vector<int>> GetFieldIndices(
      const std::vector<int>& column_indices) const {
    const schema::GroupNode* group = descr->group_node();
    std::unordered_set<int> already_added;

    std::vector<int> out;
    for (int column_idx : column_indices) {
      if (column_idx < 0 || column_idx >= descr->num_columns()) {
        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
      }

      auto field_node = descr->GetColumnRoot(column_idx);
      auto field_idx = group->FieldIndex(*field_node);
      if (field_idx == -1) {
        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
      }

      if (already_added.insert(field_idx).second) {
        out.push_back(field_idx);
      }
    }
    return out;
  }
};

}  // namespace arrow
}  // namespace parquet
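The GetFieldIndices comment above explains how leaf column indices are coalesced to root field indices. A short sketch of that path together with the simplest FromParquetSchema overload; `descr` is assumed to come from an already-opened file's metadata (FileMetaData::schema()), and default_arrow_reader_properties() from parquet/properties.h is an assumption not shown in this hunk.

#include <iostream>
#include <memory>

#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "parquet/arrow/schema.h"
#include "parquet/properties.h"

::arrow::Status InspectSchema(const parquet::SchemaDescriptor* descr) {
  // Parquet -> Arrow schema conversion (simplest overload above).
  std::shared_ptr<::arrow::Schema> schema;
  ARROW_RETURN_NOT_OK(parquet::arrow::FromParquetSchema(descr, &schema));
  std::cout << schema->ToString() << "\n";

  // Build the manifest linking parquet columns to arrow fields.
  parquet::arrow::SchemaManifest manifest;
  ARROW_RETURN_NOT_OK(parquet::arrow::SchemaManifest::Make(
      descr, /*metadata=*/nullptr, parquet::default_arrow_reader_properties(),
      &manifest));

  // Leaves 0 and 1 may share a root; duplicate roots are coalesced away.
  ARROW_ASSIGN_OR_RAISE(auto roots, manifest.GetFieldIndices({0, 1}));
  for (int idx : roots) std::cout << "root field " << idx << "\n";
  return ::arrow::Status::OK();
}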
@@ -0,0 +1,507 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <limits>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/decimal.h"
#include "parquet/column_reader.h"

namespace parquet {

using internal::RecordReader;

namespace arrow {

using ::arrow::Array;
using ::arrow::ChunkedArray;
using ::arrow::Status;

template <int32_t PRECISION>
struct DecimalWithPrecisionAndScale {
  static_assert(PRECISION >= 1 && PRECISION <= 38, "Invalid precision value");

  using type = ::arrow::Decimal128Type;
  static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id;
  static constexpr int32_t precision = PRECISION;
  static constexpr int32_t scale = PRECISION - 1;
};

template <int32_t PRECISION>
struct Decimal256WithPrecisionAndScale {
  static_assert(PRECISION >= 1 && PRECISION <= 76, "Invalid precision value");

  using type = ::arrow::Decimal256Type;
  static constexpr ::arrow::Type::type type_id = ::arrow::Decimal256Type::type_id;
  static constexpr int32_t precision = PRECISION;
  static constexpr int32_t scale = PRECISION - 1;
};

template <class ArrowType>
::arrow::enable_if_floating_point<ArrowType, Status> NonNullArray(
    size_t size, std::shared_ptr<Array>* out) {
  using c_type = typename ArrowType::c_type;
  std::vector<c_type> values;
  ::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1), &values);
  ::arrow::NumericBuilder<ArrowType> builder;
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
  return builder.Finish(out);
}

template <class ArrowType>
::arrow::enable_if_integer<ArrowType, Status> NonNullArray(size_t size,
                                                           std::shared_ptr<Array>* out) {
  std::vector<typename ArrowType::c_type> values;
  ::arrow::randint(size, 0, 64, &values);

  // Passing data type so this will work with TimestampType too
  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
                                             ::arrow::default_memory_pool());
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
  return builder.Finish(out);
}

template <class ArrowType>
::arrow::enable_if_date<ArrowType, Status> NonNullArray(size_t size,
                                                        std::shared_ptr<Array>* out) {
  std::vector<typename ArrowType::c_type> values;
  ::arrow::randint(size, 0, 24, &values);
  for (size_t i = 0; i < size; i++) {
    values[i] *= 86400000;
  }

  // Passing data type so this will work with TimestampType too
  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
                                             ::arrow::default_memory_pool());
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
  return builder.Finish(out);
}

template <class ArrowType>
::arrow::enable_if_base_binary<ArrowType, Status> NonNullArray(
    size_t size, std::shared_ptr<Array>* out) {
  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
  BuilderType builder;
  for (size_t i = 0; i < size; i++) {
    RETURN_NOT_OK(builder.Append("test-string"));
  }
  return builder.Finish(out);
}

template <typename ArrowType>
::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
    size_t size, std::shared_ptr<Array>* out) {
  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
  // set byte_width to the length of "fixed": 5
  // TODO: find a way to generate test data with more diversity.
  BuilderType builder(::arrow::fixed_size_binary(5));
  for (size_t i = 0; i < size; i++) {
    RETURN_NOT_OK(builder.Append("fixed"));
  }
  return builder.Finish(out);
}

static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
  auto gen = ::arrow::random::RandomArrayGenerator(seed);
  std::shared_ptr<Array> decimals;
  int32_t byte_width = 0;
  if (precision <= ::arrow::Decimal128Type::kMaxPrecision) {
    decimals = gen.Decimal128(::arrow::decimal128(precision, 0), n);
    byte_width = ::arrow::Decimal128Type::kByteWidth;
  } else {
    decimals = gen.Decimal256(::arrow::decimal256(precision, 0), n);
    byte_width = ::arrow::Decimal256Type::kByteWidth;
  }
  std::memcpy(out, decimals->data()->GetValues<uint8_t>(1, 0), byte_width * n);
}

template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<
    std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>
NonNullArray(size_t size, std::shared_ptr<Array>* out) {
  constexpr int32_t kDecimalPrecision = precision;
  constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;

  const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
  ::arrow::Decimal128Builder builder(type);
  const int32_t byte_width =
      static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();

  constexpr int32_t seed = 0;

  ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
  random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data());

  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
  return builder.Finish(out);
}

template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<
    std::is_same<ArrowType, Decimal256WithPrecisionAndScale<precision>>::value, Status>
NonNullArray(size_t size, std::shared_ptr<Array>* out) {
  constexpr int32_t kDecimalPrecision = precision;
  constexpr int32_t kDecimalScale = Decimal256WithPrecisionAndScale<precision>::scale;

  const auto type = ::arrow::decimal256(kDecimalPrecision, kDecimalScale);
  ::arrow::Decimal256Builder builder(type);
  const int32_t byte_width =
      static_cast<const ::arrow::Decimal256Type&>(*type).byte_width();

  constexpr int32_t seed = 0;

  ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
  random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data());

  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
  return builder.Finish(out);
}

template <class ArrowType>
::arrow::enable_if_boolean<ArrowType, Status> NonNullArray(size_t size,
                                                           std::shared_ptr<Array>* out) {
  std::vector<uint8_t> values;
  ::arrow::randint(size, 0, 1, &values);
  ::arrow::BooleanBuilder builder;
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls.
template <typename ArrowType>
::arrow::enable_if_floating_point<ArrowType, Status> NullableArray(
    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
  using c_type = typename ArrowType::c_type;
  std::vector<c_type> values;
  ::arrow::random_real(size, seed, static_cast<c_type>(-1e10), static_cast<c_type>(1e10),
                       &values);
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  ::arrow::NumericBuilder<ArrowType> builder;
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls.
template <typename ArrowType>
::arrow::enable_if_integer<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
                                                            uint32_t seed,
                                                            std::shared_ptr<Array>* out) {
  std::vector<typename ArrowType::c_type> values;

  // Seed is random in Arrow right now
  (void)seed;
  ::arrow::randint(size, 0, 64, &values);
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  // Passing data type so this will work with TimestampType too
  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
                                             ::arrow::default_memory_pool());
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
  return builder.Finish(out);
}

template <typename ArrowType>
::arrow::enable_if_date<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
                                                         uint32_t seed,
                                                         std::shared_ptr<Array>* out) {
  std::vector<typename ArrowType::c_type> values;

  // Seed is random in Arrow right now
  (void)seed;
  ::arrow::randint(size, 0, 24, &values);
  for (size_t i = 0; i < size; i++) {
    values[i] *= 86400000;
  }
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  // Passing data type so this will work with TimestampType too
  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
                                             ::arrow::default_memory_pool());
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls.
template <typename ArrowType>
::arrow::enable_if_base_binary<ArrowType, Status> NullableArray(
    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
  BuilderType builder;

  const int kBufferSize = 10;
  uint8_t buffer[kBufferSize];
  for (size_t i = 0; i < size; i++) {
    if (!valid_bytes[i]) {
      RETURN_NOT_OK(builder.AppendNull());
    } else {
      ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
      if (ArrowType::is_utf8) {
        // Trivially force data to be valid UTF8 by making it all ASCII
        for (auto& byte : buffer) {
          byte &= 0x7f;
        }
      }
      RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
    }
  }
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls,
// same as NullableArray<String|Binary>(..)
template <typename ArrowType>
::arrow::enable_if_fixed_size_binary<ArrowType, Status> NullableArray(
    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
  const int byte_width = 10;
  BuilderType builder(::arrow::fixed_size_binary(byte_width));

  const int kBufferSize = byte_width;
  uint8_t buffer[kBufferSize];
  for (size_t i = 0; i < size; i++) {
    if (!valid_bytes[i]) {
      RETURN_NOT_OK(builder.AppendNull());
    } else {
      ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
      RETURN_NOT_OK(builder.Append(buffer));
    }
  }
  return builder.Finish(out);
}

template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<
    std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>
NullableArray(size_t size, size_t num_nulls, uint32_t seed,
              std::shared_ptr<::arrow::Array>* out) {
  std::vector<uint8_t> valid_bytes(size, '\1');

  for (size_t i = 0; i < num_nulls; ++i) {
    valid_bytes[i * 2] = '\0';
  }

  constexpr int32_t kDecimalPrecision = precision;
  constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;
  const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
  const int32_t byte_width =
      static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();

  ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));

  random_decimals(size, seed, precision, out_buf->mutable_data());

  ::arrow::Decimal128Builder builder(type);
  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
  return builder.Finish(out);
}

template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<
    std::is_same<ArrowType, Decimal256WithPrecisionAndScale<precision>>::value, Status>
NullableArray(size_t size, size_t num_nulls, uint32_t seed,
              std::shared_ptr<::arrow::Array>* out) {
  std::vector<uint8_t> valid_bytes(size, '\1');

  for (size_t i = 0; i < num_nulls; ++i) {
    valid_bytes[i * 2] = '\0';
  }

  constexpr int32_t kDecimalPrecision = precision;
  constexpr int32_t kDecimalScale = Decimal256WithPrecisionAndScale<precision>::scale;
  const auto type = ::arrow::decimal256(kDecimalPrecision, kDecimalScale);
  const int32_t byte_width =
      static_cast<const ::arrow::Decimal256Type&>(*type).byte_width();

  ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));

  random_decimals(size, seed, precision, out_buf->mutable_data());

  ::arrow::Decimal256Builder builder(type);
  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls.
template <class ArrowType>
::arrow::enable_if_boolean<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
                                                            uint32_t seed,
                                                            std::shared_ptr<Array>* out) {
  std::vector<uint8_t> values;

  // Seed is random in Arrow right now
  (void)seed;

  ::arrow::randint(size, 0, 1, &values);
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  ::arrow::BooleanBuilder builder;
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
  return builder.Finish(out);
}

/// Wrap an Array into a ListArray by splitting it up into size lists.
///
/// This helper function only supports (size/2) nulls.
Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
                     int64_t null_count, const std::string& item_name,
                     bool nullable_values, std::shared_ptr<::arrow::ListArray>* out) {
  // We always include an empty list
  int64_t non_null_entries = size - null_count - 1;
  int64_t length_per_entry = values->length() / non_null_entries;

  auto offsets = AllocateBuffer();
  RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t)));
  int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());

  auto null_bitmap = AllocateBuffer();
  int64_t bitmap_size = ::arrow::bit_util::BytesForBits(size);
  RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
  uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
  memset(null_bitmap_ptr, 0, bitmap_size);

  int32_t current_offset = 0;
  for (int64_t i = 0; i < size; i++) {
    offsets_ptr[i] = current_offset;
    if (!(((i % 2) == 0) && ((i / 2) < null_count))) {
      // Non-null list (list with index 1 is always empty).
      ::arrow::bit_util::SetBit(null_bitmap_ptr, i);
      if (i != 1) {
        current_offset += static_cast<int32_t>(length_per_entry);
      }
    }
  }
  offsets_ptr[size] = static_cast<int32_t>(values->length());

  auto value_field = ::arrow::field(item_name, values->type(), nullable_values);
  *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets,
                                              values, null_bitmap, null_count);

  return Status::OK();
}

// Make an array containing only empty lists, with a null values array
Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
  // Allocate an offsets buffer containing only zeroes
  const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
  ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, ::arrow::AllocateBuffer(offsets_nbytes));
  memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);

  auto value_field =
      ::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
  auto list_type = ::arrow::list(value_field);

  std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
                                                        nullptr /* values */};
  auto child_data =
      ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));

  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
                                                  std::move(offsets_buffer)};
  auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
  array_data->child_data.push_back(child_data);

  *out_array = ::arrow::MakeArray(array_data);
  return Status::OK();
}

std::shared_ptr<::arrow::Table> MakeSimpleTable(
    const std::shared_ptr<ChunkedArray>& values, bool nullable) {
  auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)});
  return ::arrow::Table::Make(schema, {values});
}

std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values,
                                                bool nullable) {
  auto carr = std::make_shared<::arrow::ChunkedArray>(values);
  return MakeSimpleTable(carr, nullable);
}

template <typename T>
void ExpectArray(T* expected, Array* result) {
  auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
  for (int i = 0; i < result->length(); i++) {
    EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
  }
}

template <typename ArrowType>
void ExpectArrayT(void* expected, Array* result) {
  ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
  for (int64_t i = 0; i < result->length(); i++) {
    EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
              reinterpret_cast<const typename ArrowType::c_type*>(
                  p_array->values()->data())[i]);
  }
}

template <>
void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
  ::arrow::BooleanBuilder builder;
  ARROW_EXPECT_OK(
      builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length()));

  std::shared_ptr<Array> expected_array;
  ARROW_EXPECT_OK(builder.Finish(&expected_array));
  EXPECT_TRUE(result->Equals(*expected_array));
}

}  // namespace arrow

}  // namespace parquet
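A sketch of how the builders above compose in a GoogleTest case. The include path is an assumption, since this hunk's file name is missing from the captured diff; ASSERT_OK comes from arrow/testing/gtest_util.h, and the binary is assumed to link against gtest_main.

#include <memory>

#include "arrow/testing/gtest_util.h"
#include "arrow/type.h"
// Assumed path; this hunk's header does not name the file.
#include "parquet/arrow/test_util.h"

TEST(TestUtilSketch, BuildSimpleTable) {
  // 100 random non-null float64 values, via the floating-point
  // NonNullArray overload defined above...
  std::shared_ptr<::arrow::Array> values;
  ASSERT_OK(parquet::arrow::NonNullArray<::arrow::DoubleType>(100, &values));

  // ...wrapped as a single-column table named "col" by MakeSimpleTable.
  auto table = parquet::arrow::MakeSimpleTable(values, /*nullable=*/false);
  ASSERT_EQ(table->num_rows(), 100);
  ASSERT_EQ(table->num_columns(), 1);
}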
.venv/Lib/site-packages/pyarrow/include/parquet/arrow/writer.h (new file, 109 lines)
@@ -0,0 +1,109 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>

#include "parquet/platform.h"
#include "parquet/properties.h"

namespace arrow {

class Array;
class ChunkedArray;
class Schema;
class Table;

}  // namespace arrow

namespace parquet {

class FileMetaData;
class ParquetFileWriter;

namespace arrow {

/// \brief Iterative FileWriter class
///
/// Start a new RowGroup or Chunk with NewRowGroup.
/// Write the whole column chunk, column by column.
///
/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
/// value is a nonnegative integer, then it will be used as the field_id in the parquet
/// file.
class PARQUET_EXPORT FileWriter {
 public:
  static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
                              std::shared_ptr<::arrow::Schema> schema,
                              std::shared_ptr<ArrowWriterProperties> arrow_properties,
                              std::unique_ptr<FileWriter>* out);

  static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
                              std::shared_ptr<::arrow::io::OutputStream> sink,
                              std::shared_ptr<WriterProperties> properties,
                              std::unique_ptr<FileWriter>* writer);

  static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
                              std::shared_ptr<::arrow::io::OutputStream> sink,
                              std::shared_ptr<WriterProperties> properties,
                              std::shared_ptr<ArrowWriterProperties> arrow_properties,
                              std::unique_ptr<FileWriter>* writer);

  virtual std::shared_ptr<::arrow::Schema> schema() const = 0;

  /// \brief Write a Table to Parquet.
  virtual ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size) = 0;

  virtual ::arrow::Status NewRowGroup(int64_t chunk_size) = 0;
  virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;

  /// \brief Write ColumnChunk in row group using slice of a ChunkedArray
  virtual ::arrow::Status WriteColumnChunk(
      const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
      int64_t size) = 0;

  virtual ::arrow::Status WriteColumnChunk(
      const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
  virtual ::arrow::Status Close() = 0;
  virtual ~FileWriter();

  virtual MemoryPool* memory_pool() const = 0;
  virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
};

/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
PARQUET_EXPORT
::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
                                  ::arrow::io::OutputStream* sink);

/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
PARQUET_EXPORT
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
                                  ::arrow::io::OutputStream* sink);

/// \brief Write a Table to Parquet.
::arrow::Status PARQUET_EXPORT
WriteTable(const ::arrow::Table& table, MemoryPool* pool,
           std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
           std::shared_ptr<WriterProperties> properties = default_writer_properties(),
           std::shared_ptr<ArrowWriterProperties> arrow_properties =
               default_arrow_writer_properties());

}  // namespace arrow
}  // namespace parquet
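The FileWriter comment above describes the iterative NewRowGroup/WriteColumnChunk path; for the common one-shot case the free WriteTable declared at the bottom of the header suffices. A minimal sketch, assuming the output file name "out.parquet" (an assumption, not part of this diff); arrow::io::FileOutputStream comes from the Arrow IO layer.

#include <memory>

#include "arrow/io/file.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/writer.h"

::arrow::Status WriteWholeTable(const std::shared_ptr<::arrow::Table>& table) {
  ARROW_ASSIGN_OR_RAISE(auto sink,
                        ::arrow::io::FileOutputStream::Open("out.parquet"));
  // chunk_size caps the number of rows placed in each Parquet row group.
  return parquet::arrow::WriteTable(*table, ::arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/64 * 1024);
}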
.venv/Lib/site-packages/pyarrow/include/parquet/bloom_filter.h (new file, 247 lines)
@@ -0,0 +1,247 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cmath>
#include <cstdint>
#include <memory>

#include "arrow/util/bit_util.h"
#include "arrow/util/logging.h"
#include "parquet/hasher.h"
#include "parquet/platform.h"
#include "parquet/types.h"

namespace parquet {

// A Bloom filter is a compact structure to indicate whether an item is not in a set or
// probably in a set. The Bloom filter usually consists of a bit set that represents a
// set of elements, a hash strategy and a Bloom filter algorithm.
class PARQUET_EXPORT BloomFilter {
 public:
  // Maximum Bloom filter size; set to the HDFS default block size of 128MB.
  // This value will be reconsidered when implementing the Bloom filter producer.
  static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;

  /// Determine whether an element exists in the set or not.
  ///
  /// @param hash the hash of the element to look up.
  /// @return false if the value is definitely not in the set; true means it is
  /// PROBABLY in the set.
  virtual bool FindHash(uint64_t hash) const = 0;

  /// Insert an element into the set represented by the Bloom filter bitset.
  /// @param hash the hash of the value to insert into the Bloom filter.
  virtual void InsertHash(uint64_t hash) = 0;

  /// Write this Bloom filter to an output stream. A Bloom filter structure should
  /// include bitset length, hash strategy, algorithm, and bitset.
  ///
  /// @param sink the output stream to write to
  virtual void WriteTo(ArrowOutputStream* sink) const = 0;

  /// Get the number of bytes of the bitset
  virtual uint32_t GetBitsetSize() const = 0;

  /// Compute the hash for a 32-bit value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(int32_t value) const = 0;

  /// Compute the hash for a 64-bit value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(int64_t value) const = 0;

  /// Compute the hash for a float value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(float value) const = 0;

  /// Compute the hash for a double value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(double value) const = 0;

  /// Compute the hash for an Int96 value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(const Int96* value) const = 0;

  /// Compute the hash for a ByteArray value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(const ByteArray* value) const = 0;

  /// Compute the hash for a fixed byte array value using its plain encoding result.
  ///
  /// @param value the value address.
  /// @param len the value length.
  /// @return hash result.
  virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;

  virtual ~BloomFilter() {}

 protected:
  // Hash strategy available for the Bloom filter.
  enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };

  // Bloom filter algorithm.
  enum class Algorithm : uint32_t { BLOCK = 0 };
};
||||
|
||||
// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
|
||||
// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
|
||||
// hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
|
||||
//
|
||||
// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
|
||||
// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
|
||||
class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
|
||||
public:
|
||||
/// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
|
||||
BlockSplitBloomFilter();
|
||||
|
||||
/// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
|
||||
/// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
|
||||
/// rounded up/down to lower/upper bound if num_bytes is out of range and also
|
||||
/// will be rounded up to a power of 2.
|
||||
///
|
||||
/// @param num_bytes The number of bytes to store Bloom filter bitset.
|
||||
void Init(uint32_t num_bytes);
|
||||
|
||||
/// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
|
||||
/// bitset because the given bitset may not satisfy the 32-byte alignment requirement
|
||||
/// which may lead to segfault when performing SIMD instructions. It is the caller's
|
||||
/// responsibility to free the bitset passed in. This is used when reconstructing
|
||||
/// a Bloom filter from a parquet file.
|
||||
///
|
||||
/// @param bitset The given bitset to initialize the Bloom filter.
|
||||
/// @param num_bytes The number of bytes of given bitset.
|
||||
void Init(const uint8_t* bitset, uint32_t num_bytes);
|
||||
|
||||
// Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
|
||||
static constexpr uint32_t kMinimumBloomFilterBytes = 32;
|
||||
|
||||
/// Calculate optimal size according to the number of distinct values and false
|
||||
/// positive probability.
|
||||
///
|
||||
/// @param ndv The number of distinct values.
|
||||
/// @param fpp The false positive probability.
|
||||
/// @return it always return a value between kMinimumBloomFilterBytes and
|
||||
/// kMaximumBloomFilterBytes, and the return value is always a power of 2
|
||||
static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
|
||||
DCHECK(fpp > 0.0 && fpp < 1.0);
|
||||
const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
|
||||
uint32_t num_bits;
|
||||
|
||||
// Handle overflow.
|
||||
if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
|
||||
num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
|
||||
} else {
|
||||
num_bits = static_cast<uint32_t>(m);
|
||||
}
|
||||
|
||||
// Round up to lower bound
|
||||
if (num_bits < kMinimumBloomFilterBytes << 3) {
|
||||
num_bits = kMinimumBloomFilterBytes << 3;
|
||||
}
|
||||
|
||||
// Get next power of 2 if bits is not power of 2.
|
||||
if ((num_bits & (num_bits - 1)) != 0) {
|
||||
num_bits = static_cast<uint32_t>(::arrow::bit_util::NextPower2(num_bits));
|
||||
}
|
||||
|
||||
// Round down to upper bound
|
||||
if (num_bits > kMaximumBloomFilterBytes << 3) {
|
||||
num_bits = kMaximumBloomFilterBytes << 3;
|
||||
}
|
||||
|
||||
return num_bits;
|
||||
}
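
  // Derivation note (annotation, not upstream text): approximating the filter
  // by a classic Bloom filter with k = 8 hash functions, the false positive
  // probability for m bits and ndv insertions is roughly
  // fpp = (1 - e^(-8 * ndv / m))^8. Solving for m gives
  // m = -8 * ndv / ln(1 - fpp^(1/8)), which is the expression used above.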

  bool FindHash(uint64_t hash) const override;
  void InsertHash(uint64_t hash) override;
  void WriteTo(ArrowOutputStream* sink) const override;
  uint32_t GetBitsetSize() const override { return num_bytes_; }

  uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
  uint64_t Hash(float value) const override { return hasher_->Hash(value); }
  uint64_t Hash(double value) const override { return hasher_->Hash(value); }
  uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
  uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
  uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
  uint64_t Hash(const FLBA* value, uint32_t len) const override {
    return hasher_->Hash(value, len);
  }

  /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
  /// a Bloom filter from a parquet file.
  ///
  /// @param input_stream The input stream from which to construct the Bloom filter.
  /// @return The BlockSplitBloomFilter.
  static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);

 private:
  // Bytes in a tiny Bloom filter block.
  static constexpr int kBytesPerFilterBlock = 32;

  // The number of bits to be set in each tiny Bloom filter.
  static constexpr int kBitsSetPerBlock = 8;

  // A mask structure used to set bits in each tiny Bloom filter.
  struct BlockMask {
    uint32_t item[kBitsSetPerBlock];
  };

  // The block-based algorithm needs eight odd SALT values to calculate eight indexes
  // of bits to set, one bit in each 32-bit word.
  static constexpr uint32_t SALT[kBitsSetPerBlock] = {
      0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
      0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};

  /// Set bits in the mask array according to the input key.
  /// @param key the value used to calculate the mask values.
  /// @param mask the mask array used to set bits inside a block.
  void SetMask(uint32_t key, BlockMask& mask) const;

  // Memory pool to allocate an aligned buffer for the bitset.
  ::arrow::MemoryPool* pool_;

  // The underlying buffer of the bitset.
  std::shared_ptr<Buffer> data_;

  // The number of bytes of the Bloom filter bitset.
  uint32_t num_bytes_;

  // Hash strategy used in this Bloom filter.
  HashStrategy hash_strategy_;

  // Algorithm used in this Bloom filter.
  Algorithm algorithm_;

  // The hash pointer points to the actual hash class used.
  std::unique_ptr<Hasher> hasher_;
};

}  // namespace parquet
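
// A minimal usage sketch of the API above (annotation, not part of this
// header); error handling is omitted and the sizing numbers are arbitrary:
//
//   parquet::BlockSplitBloomFilter filter;
//   // Size the bitset for ~1000 distinct values at a 1% false positive rate.
//   uint32_t num_bytes =
//       parquet::BlockSplitBloomFilter::OptimalNumOfBits(1000, 0.01) / 8;
//   filter.Init(num_bytes);
//
//   int64_t value = 42;
//   filter.InsertHash(filter.Hash(value));
//
//   // false => definitely absent; true => probably present.
//   bool maybe_present = filter.FindHash(filter.Hash(value));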
160
.venv/Lib/site-packages/pyarrow/include/parquet/column_page.h
Normal file
@ -0,0 +1,160 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This module defines an abstract interface for iterating through pages in a
// Parquet column chunk within a row group. It could be extended in the future
// to iterate through all data pages in all chunks in a file.

#pragma once

#include <cstdint>
#include <memory>
#include <string>

#include "parquet/statistics.h"
#include "parquet/types.h"

namespace parquet {

// TODO: Parallel processing is not yet safe because of memory-ownership
// semantics (the PageReader may or may not own the memory referenced by a
// page)
//
// TODO(wesm): In the future Parquet implementations may store the crc code
// in format::PageHeader. parquet-mr currently does not, so we also skip it
// here, both on the read and write path
class Page {
 public:
  Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
      : buffer_(buffer), type_(type) {}

  PageType::type type() const { return type_; }

  std::shared_ptr<Buffer> buffer() const { return buffer_; }

  // @returns: a pointer to the page's data
  const uint8_t* data() const { return buffer_->data(); }

  // @returns: the total size in bytes of the page's data buffer
  int32_t size() const { return static_cast<int32_t>(buffer_->size()); }

 private:
  std::shared_ptr<Buffer> buffer_;
  PageType::type type_;
};

/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
class DataPage : public Page {
 public:
  int32_t num_values() const { return num_values_; }
  Encoding::type encoding() const { return encoding_; }
  int64_t uncompressed_size() const { return uncompressed_size_; }
  const EncodedStatistics& statistics() const { return statistics_; }

  virtual ~DataPage() = default;

 protected:
  DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
           Encoding::type encoding, int64_t uncompressed_size,
           const EncodedStatistics& statistics = EncodedStatistics())
      : Page(buffer, type),
        num_values_(num_values),
        encoding_(encoding),
        uncompressed_size_(uncompressed_size),
        statistics_(statistics) {}

  int32_t num_values_;
  Encoding::type encoding_;
  int64_t uncompressed_size_;
  EncodedStatistics statistics_;
};

class DataPageV1 : public DataPage {
 public:
  DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
             Encoding::type encoding, Encoding::type definition_level_encoding,
             Encoding::type repetition_level_encoding, int64_t uncompressed_size,
             const EncodedStatistics& statistics = EncodedStatistics())
      : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
                 statistics),
        definition_level_encoding_(definition_level_encoding),
        repetition_level_encoding_(repetition_level_encoding) {}

  Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }

  Encoding::type definition_level_encoding() const { return definition_level_encoding_; }

 private:
  Encoding::type definition_level_encoding_;
  Encoding::type repetition_level_encoding_;
};

class DataPageV2 : public DataPage {
 public:
  DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
             int32_t num_rows, Encoding::type encoding,
             int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
             int64_t uncompressed_size, bool is_compressed = false,
             const EncodedStatistics& statistics = EncodedStatistics())
      : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
                 statistics),
        num_nulls_(num_nulls),
        num_rows_(num_rows),
        definition_levels_byte_length_(definition_levels_byte_length),
        repetition_levels_byte_length_(repetition_levels_byte_length),
        is_compressed_(is_compressed) {}

  int32_t num_nulls() const { return num_nulls_; }

  int32_t num_rows() const { return num_rows_; }

  int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }

  int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }

  bool is_compressed() const { return is_compressed_; }

 private:
  int32_t num_nulls_;
  int32_t num_rows_;
  int32_t definition_levels_byte_length_;
  int32_t repetition_levels_byte_length_;
  bool is_compressed_;
};

class DictionaryPage : public Page {
 public:
  DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
                 Encoding::type encoding, bool is_sorted = false)
      : Page(buffer, PageType::DICTIONARY_PAGE),
        num_values_(num_values),
        encoding_(encoding),
        is_sorted_(is_sorted) {}

  int32_t num_values() const { return num_values_; }

  Encoding::type encoding() const { return encoding_; }

  bool is_sorted() const { return is_sorted_; }

 private:
  int32_t num_values_;
  Encoding::type encoding_;
  bool is_sorted_;
};

}  // namespace parquet
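
// A minimal dispatch sketch for the classes above (annotation, not part of
// this header): a page consumer typically switches on Page::type() and
// downcasts, e.g.:
//
//   void InspectPage(const std::shared_ptr<parquet::Page>& page) {
//     switch (page->type()) {
//       case parquet::PageType::DICTIONARY_PAGE: {
//         auto* dict = static_cast<parquet::DictionaryPage*>(page.get());
//         // dict->num_values(), dict->encoding(), dict->is_sorted() ...
//         break;
//       }
//       case parquet::PageType::DATA_PAGE: {
//         auto* data = static_cast<parquet::DataPageV1*>(page.get());
//         // data->num_values(), data->definition_level_encoding() ...
//         break;
//       }
//       default:
//         break;
//     }
//   }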
376
.venv/Lib/site-packages/pyarrow/include/parquet/column_reader.h
Normal file
@ -0,0 +1,376 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

#include "parquet/exception.h"
#include "parquet/level_conversion.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
#include "parquet/types.h"

namespace arrow {

class Array;
class ChunkedArray;

namespace bit_util {
class BitReader;
}  // namespace bit_util

namespace util {
class RleDecoder;
}  // namespace util

}  // namespace arrow

namespace parquet {

class Decryptor;
class Page;

// 16 MB is the default maximum page header size
static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;

// 16 KB is the default expected page header size
static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;

class PARQUET_EXPORT LevelDecoder {
 public:
  LevelDecoder();
  ~LevelDecoder();

  // Initialize the LevelDecoder state with new data
  // and return the number of bytes consumed
  int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
              const uint8_t* data, int32_t data_size);

  void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
                 const uint8_t* data);

  // Decodes a batch of levels into an array and returns the number of levels decoded
  int Decode(int batch_size, int16_t* levels);

 private:
  int bit_width_;
  int num_values_remaining_;
  Encoding::type encoding_;
  std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_;
  std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_;
  int16_t max_level_;
};

struct CryptoContext {
  CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal,
                std::shared_ptr<Decryptor> meta, std::shared_ptr<Decryptor> data)
      : start_decrypt_with_dictionary_page(start_with_dictionary_page),
        row_group_ordinal(rg_ordinal),
        column_ordinal(col_ordinal),
        meta_decryptor(std::move(meta)),
        data_decryptor(std::move(data)) {}
  CryptoContext() {}

  bool start_decrypt_with_dictionary_page = false;
  int16_t row_group_ordinal = -1;
  int16_t column_ordinal = -1;
  std::shared_ptr<Decryptor> meta_decryptor;
  std::shared_ptr<Decryptor> data_decryptor;
};

// Abstract page iterator interface. This way, we can feed column pages to the
// ColumnReader through whatever mechanism we choose
class PARQUET_EXPORT PageReader {
 public:
  virtual ~PageReader() = default;

  static std::unique_ptr<PageReader> Open(
      std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
      Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
      const CryptoContext* ctx = NULLPTR);

  // @returns: shared_ptr<Page>(nullptr) on EOS, std::shared_ptr<Page>
  // containing new Page otherwise
  virtual std::shared_ptr<Page> NextPage() = 0;

  virtual void set_max_page_header_size(uint32_t size) = 0;
};

class PARQUET_EXPORT ColumnReader {
 public:
  virtual ~ColumnReader() = default;

  static std::shared_ptr<ColumnReader> Make(
      const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  // Returns true if there are still values in this column.
  virtual bool HasNext() = 0;

  virtual Type::type type() const = 0;

  virtual const ColumnDescriptor* descr() const = 0;

  // Get the encoding that can be exposed by this reader. If it returns
  // dictionary encoding, then ReadBatchWithDictionary can be used to read data.
  //
  // \note API EXPERIMENTAL
  virtual ExposedEncoding GetExposedEncoding() = 0;

 protected:
  friend class RowGroupReader;
  // Set the encoding that can be exposed by this reader.
  //
  // \note API EXPERIMENTAL
  virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
};

// API to read values from a single column. This is the main client-facing API.
template <typename DType>
class TypedColumnReader : public ColumnReader {
 public:
  typedef typename DType::c_type T;

  // Read a batch of repetition levels, definition levels, and values from the
  // column.
  //
  // Since null values are not stored in the values, the number of values read
  // may be less than the number of repetition and definition levels. With
  // nested data this is almost certainly true.
  //
  // Set def_levels or rep_levels to nullptr if you want to skip reading them.
  // This is only safe if you know through some other source that there are no
  // undefined values.
  //
  // To fully exhaust a row group, you must read batches until the number of
  // values read reaches the number of stored values according to the metadata.
  //
  // This API is the same for both V1 and V2 of the DataPage
  //
  // @returns: actual number of levels read (see values_read for number of values read)
  virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
                            T* values, int64_t* values_read) = 0;

  /// Read a batch of repetition levels, definition levels, and values from the
  /// column and leave spaces for null entries on the lowest level in the values
  /// buffer.
  ///
  /// In comparison to ReadBatch the length of repetition and definition levels
  /// is the same as the number of values read for max_definition_level == 1.
  /// In the case of max_definition_level > 1, the repetition and definition
  /// levels are larger than the values but the values include the null entries
  /// with definition_level == (max_definition_level - 1).
  ///
  /// To fully exhaust a row group, you must read batches until the number of
  /// values read reaches the number of stored values according to the metadata.
  ///
  /// @param batch_size the number of levels to read
  /// @param[out] def_levels The Parquet definition levels, output has
  /// the length levels_read.
  /// @param[out] rep_levels The Parquet repetition levels, output has
  /// the length levels_read.
  /// @param[out] values The values in the lowest nested level including
  /// spacing for nulls on the lowest levels; output has the length
  /// values_read.
  /// @param[out] valid_bits Memory allocated for a bitmap that indicates if
  /// the row is null or on the maximum definition level. For performance
  /// reasons the underlying buffer should be able to store 1 bit more than
  /// required. If this requires an additional byte, this byte is only read
  /// but never written to.
  /// @param valid_bits_offset The offset in bits of the valid_bits where the
  /// first relevant bit resides.
  /// @param[out] levels_read The number of repetition/definition levels that were read.
  /// @param[out] values_read The number of values read, this includes all
  /// non-null entries as well as all null-entries on the lowest level
  /// (i.e. definition_level == max_definition_level - 1)
  /// @param[out] null_count The number of nulls on the lowest levels.
  /// (i.e. (values_read - null_count) is total number of non-null entries)
  ///
  /// \deprecated Since 4.0.0
  ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.")
  virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels,
                                  int16_t* rep_levels, T* values, uint8_t* valid_bits,
                                  int64_t valid_bits_offset, int64_t* levels_read,
                                  int64_t* values_read, int64_t* null_count) = 0;

  // Skip reading levels
  // Returns the number of levels skipped
  virtual int64_t Skip(int64_t num_rows_to_skip) = 0;

  // Read a batch of repetition levels, definition levels, and indices from the
  // column, and read the dictionary if a dictionary page is encountered while
  // reading pages. This API is similar to ReadBatch(), with the ability to read
  // the dictionary and indices. It is only valid to call this method when the reader
  // can expose dictionary encoding (i.e., the reader's GetExposedEncoding() returns
  // DICTIONARY).
  //
  // The dictionary is read along with the data page. When there's no data page,
  // the dictionary won't be returned.
  //
  // @param batch_size The batch size to read
  // @param[out] def_levels The Parquet definition levels.
  // @param[out] rep_levels The Parquet repetition levels.
  // @param[out] indices The dictionary indices.
  // @param[out] indices_read The number of indices read.
  // @param[out] dict The pointer to dictionary values. It will return nullptr if
  // there's no data page. Each column chunk only has one dictionary page. The dictionary
  // is owned by the reader, so the caller is responsible for copying the dictionary
  // values before the reader gets destroyed.
  // @param[out] dict_len The dictionary length. It will return 0 if there's no data
  // page.
  // @returns: actual number of levels read (see indices_read for number of
  // indices read)
  //
  // \note API EXPERIMENTAL
  virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
                                          int16_t* rep_levels, int32_t* indices,
                                          int64_t* indices_read, const T** dict,
                                          int32_t* dict_len) = 0;
};
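
// A minimal read-loop sketch for TypedColumnReader (annotation, not part of
// this header). Assumes `reader` wraps an INT64 column, e.g. obtained from a
// RowGroupReader, using the Int64Reader alias declared below; error handling
// is omitted:
//
//   auto* typed = static_cast<parquet::Int64Reader*>(reader.get());
//   constexpr int64_t kBatch = 1024;
//   std::vector<int16_t> def_levels(kBatch), rep_levels(kBatch);
//   std::vector<int64_t> values(kBatch);
//   while (typed->HasNext()) {
//     int64_t values_read = 0;
//     // Returns the number of levels read; values_read can be smaller since
//     // nulls are not materialized in `values`.
//     typed->ReadBatch(kBatch, def_levels.data(), rep_levels.data(),
//                      values.data(), &values_read);
//   }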

namespace internal {

/// \brief Stateful column reader that delimits semantic records for both flat
/// and nested columns
///
/// \note API EXPERIMENTAL
/// \since 1.3.0
class RecordReader {
 public:
  static std::shared_ptr<RecordReader> Make(
      const ColumnDescriptor* descr, LevelInfo leaf_info,
      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
      const bool read_dictionary = false);

  virtual ~RecordReader() = default;

  /// \brief Attempt to read indicated number of records from column chunk
  /// \return number of records read
  virtual int64_t ReadRecords(int64_t num_records) = 0;

  /// \brief Pre-allocate space for data. Results in better flat read performance
  virtual void Reserve(int64_t num_values) = 0;

  /// \brief Clear consumed values and repetition/definition levels as the
  /// result of calling ReadRecords
  virtual void Reset() = 0;

  /// \brief Transfer filled values buffer to caller. A new one will be
  /// allocated in subsequent ReadRecords calls
  virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;

  /// \brief Transfer filled validity bitmap buffer to caller. A new one will
  /// be allocated in subsequent ReadRecords calls
  virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;

  /// \brief Return true if the record reader has more internal data yet to
  /// process
  virtual bool HasMoreData() const = 0;

  /// \brief Advance record reader to the next row group
  /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
  virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;

  virtual void DebugPrintState() = 0;

  /// \brief Decoded definition levels
  int16_t* def_levels() const {
    return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
  }

  /// \brief Decoded repetition levels
  int16_t* rep_levels() const {
    return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
  }

  /// \brief Decoded values, including nulls, if any
  uint8_t* values() const { return values_->mutable_data(); }

  /// \brief Number of values written including nulls (if any)
  int64_t values_written() const { return values_written_; }

  /// \brief Number of definition / repetition levels (from those that have
  /// been decoded) that have been consumed inside the reader.
  int64_t levels_position() const { return levels_position_; }

  /// \brief Number of definition / repetition levels that have been written
  /// internally in the reader
  int64_t levels_written() const { return levels_written_; }

  /// \brief Number of nulls in the leaf
  int64_t null_count() const { return null_count_; }

  /// \brief True if the leaf values are nullable
  bool nullable_values() const { return nullable_values_; }

  /// \brief True if reading directly as Arrow dictionary-encoded
  bool read_dictionary() const { return read_dictionary_; }

 protected:
  bool nullable_values_;

  bool at_record_start_;
  int64_t records_read_;

  int64_t values_written_;
  int64_t values_capacity_;
  int64_t null_count_;

  int64_t levels_written_;
  int64_t levels_position_;
  int64_t levels_capacity_;

  std::shared_ptr<::arrow::ResizableBuffer> values_;
  // In the case of false, don't allocate the values buffer (when we directly read into
  // builder classes).
  bool uses_values_;

  std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
  std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
  std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;

  bool read_dictionary_ = false;
};

class BinaryRecordReader : virtual public RecordReader {
 public:
  virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
};

/// \brief Read records directly to dictionary-encoded Arrow form (int32
/// indices). Only valid for BYTE_ARRAY columns
class DictionaryRecordReader : virtual public RecordReader {
 public:
  virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
};

}  // namespace internal

using BoolReader = TypedColumnReader<BooleanType>;
using Int32Reader = TypedColumnReader<Int32Type>;
using Int64Reader = TypedColumnReader<Int64Type>;
using Int96Reader = TypedColumnReader<Int96Type>;
using FloatReader = TypedColumnReader<FloatType>;
using DoubleReader = TypedColumnReader<DoubleType>;
using ByteArrayReader = TypedColumnReader<ByteArrayType>;
using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;

}  // namespace parquet
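
// A minimal sketch of driving internal::RecordReader (annotation, not part of
// this header). The LevelInfo value is assumed to come from the column's
// schema; the API is experimental, per the notes above:
//
//   auto record_reader = parquet::internal::RecordReader::Make(descr, leaf_info);
//   record_reader->SetPageReader(std::move(pager));
//   while (record_reader->HasMoreData()) {
//     if (record_reader->ReadRecords(/*num_records=*/1024) == 0) break;
//     // Consume record_reader->values() / def_levels() / rep_levels(), then:
//     record_reader->Reset();
//   }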
262
.venv/Lib/site-packages/pyarrow/include/parquet/column_scanner.h
Normal file
@ -0,0 +1,262 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <stdio.h>

#include <cstdint>
#include <memory>
#include <ostream>
#include <string>
#include <utility>
#include <vector>

#include "parquet/column_reader.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
#include "parquet/types.h"

namespace parquet {

static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;

class PARQUET_EXPORT Scanner {
 public:
  explicit Scanner(std::shared_ptr<ColumnReader> reader,
                   int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
                   ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
      : batch_size_(batch_size),
        level_offset_(0),
        levels_buffered_(0),
        value_buffer_(AllocateBuffer(pool)),
        value_offset_(0),
        values_buffered_(0),
        reader_(std::move(reader)) {
    def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
    rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
  }

  virtual ~Scanner() {}

  static std::shared_ptr<Scanner> Make(
      std::shared_ptr<ColumnReader> col_reader,
      int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;

  bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }

  const ColumnDescriptor* descr() const { return reader_->descr(); }

  int64_t batch_size() const { return batch_size_; }

  void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }

 protected:
  int64_t batch_size_;

  std::vector<int16_t> def_levels_;
  std::vector<int16_t> rep_levels_;
  int level_offset_;
  int levels_buffered_;

  std::shared_ptr<ResizableBuffer> value_buffer_;
  int value_offset_;
  int64_t values_buffered_;
  std::shared_ptr<ColumnReader> reader_;
};

template <typename DType>
class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
 public:
  typedef typename DType::c_type T;

  explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
                        int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
                        ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
      : Scanner(std::move(reader), batch_size, pool) {
    typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
    int value_byte_size = type_traits<DType::type_num>::value_byte_size;
    PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
    values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
  }

  virtual ~TypedScanner() {}

  bool NextLevels(int16_t* def_level, int16_t* rep_level) {
    if (level_offset_ == levels_buffered_) {
      levels_buffered_ = static_cast<int>(
          typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
                                   rep_levels_.data(), values_, &values_buffered_));

      value_offset_ = 0;
      level_offset_ = 0;
      if (!levels_buffered_) {
        return false;
      }
    }
    *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
    *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
    level_offset_++;
    return true;
  }

  bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
    if (level_offset_ == levels_buffered_) {
      if (!HasNext()) {
        // Out of data pages
        return false;
      }
    }

    NextLevels(def_level, rep_level);
    *is_null = *def_level < descr()->max_definition_level();

    if (*is_null) {
      return true;
    }

    if (value_offset_ == values_buffered_) {
      throw ParquetException("Value was non-null, but has not been buffered");
    }
    *val = values_[value_offset_++];
    return true;
  }

  // Returns true if there is a next value
  bool NextValue(T* val, bool* is_null) {
    if (level_offset_ == levels_buffered_) {
      if (!HasNext()) {
        // Out of data pages
        return false;
      }
    }

    // Out of values
    int16_t def_level = -1;
    int16_t rep_level = -1;
    NextLevels(&def_level, &rep_level);
    *is_null = def_level < descr()->max_definition_level();

    if (*is_null) {
      return true;
    }

    if (value_offset_ == values_buffered_) {
      throw ParquetException("Value was non-null, but has not been buffered");
    }
    *val = values_[value_offset_++];
    return true;
  }

  virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
    T val{};
    int16_t def_level = -1;
    int16_t rep_level = -1;
    bool is_null = false;
    char buffer[80];

    if (!Next(&val, &def_level, &rep_level, &is_null)) {
      throw ParquetException("No more values buffered");
    }

    if (with_levels) {
      out << "  D:" << def_level << " R:" << rep_level << " ";
      if (!is_null) {
        out << "V:";
      }
    }

    if (is_null) {
      std::string null_fmt = format_fwf<ByteArrayType>(width);
      snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
    } else {
      FormatValue(&val, buffer, sizeof(buffer), width);
    }
    out << buffer;
  }

 private:
  // The ownership of this object is expressed through the reader_ variable in the base
  TypedColumnReader<DType>* typed_reader_;

  inline void FormatValue(void* val, char* buffer, int bufsize, int width);

  T* values_;
};

template <typename DType>
inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
                                             int width) {
  std::string fmt = format_fwf<DType>(width);
  snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
}

template <>
inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
                                                 int width) {
  std::string fmt = format_fwf<Int96Type>(width);
  std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
  snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}

template <>
inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
                                                     int width) {
  std::string fmt = format_fwf<ByteArrayType>(width);
  std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
  snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}

template <>
inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
                                                int width) {
  std::string fmt = format_fwf<FLBAType>(width);
  std::string result = FixedLenByteArrayToString(
      *reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
  snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}

typedef TypedScanner<BooleanType> BoolScanner;
typedef TypedScanner<Int32Type> Int32Scanner;
typedef TypedScanner<Int64Type> Int64Scanner;
typedef TypedScanner<Int96Type> Int96Scanner;
typedef TypedScanner<FloatType> FloatScanner;
typedef TypedScanner<DoubleType> DoubleScanner;
typedef TypedScanner<ByteArrayType> ByteArrayScanner;
typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;

template <typename RType>
int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
                uint8_t* values, int64_t* values_buffered,
                parquet::ColumnReader* reader) {
  typedef typename RType::T Type;
  auto typed_reader = static_cast<RType*>(reader);
  auto vals = reinterpret_cast<Type*>(&values[0]);
  return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
                                 values_buffered);
}

int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
                                     int16_t* rep_levels, uint8_t* values,
                                     int64_t* values_buffered,
                                     parquet::ColumnReader* reader);

}  // namespace parquet
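
// A minimal sketch of the Scanner API above (annotation, not part of this
// header): pretty-print every value in a column; the field width is arbitrary:
//
//   std::shared_ptr<parquet::Scanner> scanner =
//       parquet::Scanner::Make(col_reader);
//   while (scanner->HasNext()) {
//     scanner->PrintNext(std::cout, /*width=*/12);
//   }
//   std::cout << std::endl;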
270
.venv/Lib/site-packages/pyarrow/include/parquet/column_writer.h
Normal file
@ -0,0 +1,270 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <cstring>
#include <memory>

#include "parquet/exception.h"
#include "parquet/platform.h"
#include "parquet/types.h"

namespace arrow {

class Array;

namespace bit_util {
class BitWriter;
}  // namespace bit_util

namespace util {
class RleEncoder;
}  // namespace util

}  // namespace arrow

namespace parquet {

struct ArrowWriteContext;
class ColumnDescriptor;
class DataPage;
class DictionaryPage;
class ColumnChunkMetaDataBuilder;
class Encryptor;
class WriterProperties;

class PARQUET_EXPORT LevelEncoder {
 public:
  LevelEncoder();
  ~LevelEncoder();

  static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
                           int num_buffered_values);

  // Initialize the LevelEncoder.
  void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
            uint8_t* data, int data_size);

  // Encodes a batch of levels from an array and returns the number of levels encoded
  int Encode(int batch_size, const int16_t* levels);

  int32_t len() {
    if (encoding_ != Encoding::RLE) {
      throw ParquetException("Only implemented for RLE encoding");
    }
    return rle_length_;
  }

 private:
  int bit_width_;
  int rle_length_;
  Encoding::type encoding_;
  std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_;
  std::unique_ptr<::arrow::bit_util::BitWriter> bit_packed_encoder_;
};

class PARQUET_EXPORT PageWriter {
 public:
  virtual ~PageWriter() {}

  static std::unique_ptr<PageWriter> Open(
      std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
      int compression_level, ColumnChunkMetaDataBuilder* metadata,
      int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
      bool buffered_row_group = false,
      std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
      std::shared_ptr<Encryptor> data_encryptor = NULLPTR);

  // The column writer decides whether dictionary encoding was used and whether
  // the dictionary encoding has fallen back to the default encoding on reaching
  // the dictionary page limit
  virtual void Close(bool has_dictionary, bool fallback) = 0;

  // Return the number of uncompressed bytes written (including header size)
  virtual int64_t WriteDataPage(const DataPage& page) = 0;

  // Return the number of uncompressed bytes written (including header size)
  virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;

  virtual bool has_compressor() = 0;

  virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
};

static constexpr int WRITE_BATCH_SIZE = 1000;
class PARQUET_EXPORT ColumnWriter {
 public:
  virtual ~ColumnWriter() = default;

  static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
                                            std::unique_ptr<PageWriter>,
                                            const WriterProperties* properties);

  /// \brief Closes the ColumnWriter, commits any buffered values to pages.
  /// \return Total size of the column in bytes
  virtual int64_t Close() = 0;

  /// \brief The physical Parquet type of the column
  virtual Type::type type() const = 0;

  /// \brief The schema for the column
  virtual const ColumnDescriptor* descr() const = 0;

  /// \brief The number of rows written so far
  virtual int64_t rows_written() const = 0;

  /// \brief The total size of the compressed pages + page headers. Some values
  /// might still be buffered and not written to a page yet
  virtual int64_t total_compressed_bytes() const = 0;

  /// \brief The total number of bytes written as serialized data and
  /// dictionary pages to the ColumnChunk so far
  virtual int64_t total_bytes_written() const = 0;

  /// \brief The file-level writer properties
  virtual const WriterProperties* properties() = 0;

  /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
  /// error status if the array data type is not compatible with the concrete
  /// writer type.
  ///
  /// leaf_array is always a primitive (possibly dictionary-encoded) type.
  /// leaf_field_nullable indicates whether the leaf array is considered nullable
  /// according to its schema in a Table or its parent array.
  virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
                                     int64_t num_levels, const ::arrow::Array& leaf_array,
                                     ArrowWriteContext* ctx,
                                     bool leaf_field_nullable) = 0;
};

// API to write values to a single column. This is the main client-facing API.
template <typename DType>
class TypedColumnWriter : public ColumnWriter {
 public:
  using T = typename DType::c_type;

  // Write a batch of repetition levels, definition levels, and values to the
  // column.
  // `num_values` is the number of logical leaf values.
  // `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
  // (resp. max repetition level) is 0.
  // If not null, each of `def_levels` and `rep_levels` must contain at least
  // `num_values` entries.
  //
  // The number of physical values written (taken from `values`) is returned.
  // It can be smaller than `num_values` if there are some undefined values.
  virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
                             const int16_t* rep_levels, const T* values) = 0;

  /// Write a batch of repetition levels, definition levels, and values to the
  /// column.
  ///
  /// In comparison to WriteBatch the length of repetition and definition levels
  /// is the same as the number of values read for max_definition_level == 1.
  /// In the case of max_definition_level > 1, the repetition and definition
  /// levels are larger than the values but the values include the null entries
  /// with definition_level == (max_definition_level - 1). Thus we have to differentiate
  /// in the parameters of this function if the input has the length of num_values or the
  /// _number of rows in the lowest nesting level_.
  ///
  /// In the case that the innermost node in the Parquet schema is required, the _number
  /// of rows in the lowest nesting level_ is equal to the number of non-null values. If
  /// the innermost schema node is optional, the _number of rows in the lowest nesting
  /// level_ also includes all values with definition_level == (max_definition_level - 1).
  ///
  /// @param num_values number of levels to write.
  /// @param def_levels The Parquet definition levels, length is num_values
  /// @param rep_levels The Parquet repetition levels, length is num_values
  /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
  /// level. The length is the number of rows in the lowest nesting level.
  /// @param valid_bits_offset The offset in bits of the valid_bits where the
  /// first relevant bit resides.
  /// @param values The values in the lowest nested level including
  /// spacing for nulls on the lowest levels; input has the length
  /// of the number of rows on the lowest nesting level.
  virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
                                const int16_t* rep_levels, const uint8_t* valid_bits,
                                int64_t valid_bits_offset, const T* values) = 0;

  // Estimated size of the values that are not written to a page yet
  virtual int64_t EstimatedBufferedValueBytes() const = 0;
};
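
// A minimal write sketch for TypedColumnWriter (annotation, not part of this
// header). Assumes `writer` came from a RowGroupWriter and the column is a
// required INT64 (max definition/repetition level 0, so the level arrays may
// be null), using the Int64Writer alias declared below; error handling is
// omitted:
//
//   auto* typed = static_cast<parquet::Int64Writer*>(writer);
//   std::vector<int64_t> values = {1, 2, 3, 4, 5};
//   typed->WriteBatch(static_cast<int64_t>(values.size()),
//                     /*def_levels=*/nullptr, /*rep_levels=*/nullptr,
//                     values.data());
//   typed->Close();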

using BoolWriter = TypedColumnWriter<BooleanType>;
using Int32Writer = TypedColumnWriter<Int32Type>;
using Int64Writer = TypedColumnWriter<Int64Type>;
using Int96Writer = TypedColumnWriter<Int96Type>;
using FloatWriter = TypedColumnWriter<FloatType>;
using DoubleWriter = TypedColumnWriter<DoubleType>;
using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;

namespace internal {

/**
 * Timestamp conversion constants
 */
constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);

template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
  int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
  (*impala_timestamp).value[2] = (uint32_t)julian_days;

  int64_t last_day_units = time % UnitPerDay;
  auto last_day_nanos = last_day_units * NanosecondsPerUnit;
  // impala_timestamp will be unaligned every other entry so do memcpy instead
  // of assign and reinterpret cast to avoid undefined behavior.
  std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
}

constexpr int64_t kSecondsInNanos = INT64_C(1000000000);

inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
  ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
                                                                   impala_timestamp);
}

constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);

inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
                                          Int96* impala_timestamp) {
  ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
      milliseconds, impala_timestamp);
}

constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);

inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
                                          Int96* impala_timestamp) {
  ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
      microseconds, impala_timestamp);
}

constexpr int64_t kNanosecondsInNanos = INT64_C(1);

inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
                                         Int96* impala_timestamp) {
  ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
      nanoseconds, impala_timestamp);
}

}  // namespace internal
}  // namespace parquet
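
// A minimal sketch of the Int96 timestamp helpers above (annotation, not part
// of this header): convert a seconds-since-epoch value into the Impala/Parquet
// INT96 layout:
//
//   parquet::Int96 ts;
//   // 2021-01-01T00:00:00Z as seconds since the Unix epoch.
//   parquet::internal::SecondsToImpalaTimestamp(INT64_C(1609459200), &ts);
//   // ts.value[2] now holds the Julian day number; the first two 32-bit
//   // words hold the nanoseconds elapsed within that day.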
460
.venv/Lib/site-packages/pyarrow/include/parquet/encoding.h
Normal file
@ -0,0 +1,460 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

#include "arrow/util/spaced.h"

#include "parquet/exception.h"
#include "parquet/platform.h"
#include "parquet/types.h"

namespace arrow {

class Array;
class ArrayBuilder;
class BinaryArray;
class BinaryBuilder;
class BooleanBuilder;
class Int32Type;
class Int64Type;
class FloatType;
class DoubleType;
class FixedSizeBinaryType;
template <typename T>
class NumericBuilder;
class FixedSizeBinaryBuilder;
template <typename T>
class Dictionary32Builder;

}  // namespace arrow

namespace parquet {

template <typename DType>
class TypedEncoder;

using BooleanEncoder = TypedEncoder<BooleanType>;
using Int32Encoder = TypedEncoder<Int32Type>;
using Int64Encoder = TypedEncoder<Int64Type>;
using Int96Encoder = TypedEncoder<Int96Type>;
using FloatEncoder = TypedEncoder<FloatType>;
using DoubleEncoder = TypedEncoder<DoubleType>;
using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
using FLBAEncoder = TypedEncoder<FLBAType>;

template <typename DType>
class TypedDecoder;

class BooleanDecoder;
using Int32Decoder = TypedDecoder<Int32Type>;
using Int64Decoder = TypedDecoder<Int64Type>;
using Int96Decoder = TypedDecoder<Int96Type>;
using FloatDecoder = TypedDecoder<FloatType>;
using DoubleDecoder = TypedDecoder<DoubleType>;
using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
class FLBADecoder;

template <typename T>
struct EncodingTraits;

template <>
struct EncodingTraits<BooleanType> {
  using Encoder = BooleanEncoder;
  using Decoder = BooleanDecoder;

  using ArrowType = ::arrow::BooleanType;
  using Accumulator = ::arrow::BooleanBuilder;
  struct DictAccumulator {};
};

template <>
struct EncodingTraits<Int32Type> {
  using Encoder = Int32Encoder;
  using Decoder = Int32Decoder;

  using ArrowType = ::arrow::Int32Type;
  using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
};

template <>
struct EncodingTraits<Int64Type> {
  using Encoder = Int64Encoder;
  using Decoder = Int64Decoder;

  using ArrowType = ::arrow::Int64Type;
  using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
};

template <>
struct EncodingTraits<Int96Type> {
  using Encoder = Int96Encoder;
  using Decoder = Int96Decoder;

  struct Accumulator {};
  struct DictAccumulator {};
};

template <>
struct EncodingTraits<FloatType> {
  using Encoder = FloatEncoder;
  using Decoder = FloatDecoder;

  using ArrowType = ::arrow::FloatType;
  using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
};

template <>
struct EncodingTraits<DoubleType> {
  using Encoder = DoubleEncoder;
  using Decoder = DoubleDecoder;

  using ArrowType = ::arrow::DoubleType;
  using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
};

template <>
struct EncodingTraits<ByteArrayType> {
  using Encoder = ByteArrayEncoder;
  using Decoder = ByteArrayDecoder;

  /// \brief Internal helper class for decoding BYTE_ARRAY data where we can
  /// overflow the capacity of a single arrow::BinaryArray
  struct Accumulator {
    std::unique_ptr<::arrow::BinaryBuilder> builder;
    std::vector<std::shared_ptr<::arrow::Array>> chunks;
  };
  using ArrowType = ::arrow::BinaryType;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
};

template <>
struct EncodingTraits<FLBAType> {
  using Encoder = FLBAEncoder;
  using Decoder = FLBADecoder;

  using ArrowType = ::arrow::FixedSizeBinaryType;
  using Accumulator = ::arrow::FixedSizeBinaryBuilder;
  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
};

class ColumnDescriptor;

// Untyped base for all encoders
class Encoder {
 public:
  virtual ~Encoder() = default;

  virtual int64_t EstimatedDataEncodedSize() = 0;
  virtual std::shared_ptr<Buffer> FlushValues() = 0;
  virtual Encoding::type encoding() const = 0;

  virtual void Put(const ::arrow::Array& values) = 0;

  virtual MemoryPool* memory_pool() const = 0;
};

// Base class for value encoders. Since encoders may or may not have state (e.g.,
// dictionary encoding) we use a class instance to maintain any state.
//
// Encode interfaces are internal, subject to change without deprecation.
template <typename DType>
class TypedEncoder : virtual public Encoder {
 public:
  typedef typename DType::c_type T;

  using Encoder::Put;

  virtual void Put(const T* src, int num_values) = 0;

  virtual void Put(const std::vector<T>& src, int num_values = -1);

  virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
                         int64_t valid_bits_offset) = 0;
};

template <typename DType>
void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
  if (num_values == -1) {
    num_values = static_cast<int>(src.size());
  }
  Put(src.data(), num_values);
}
|
||||
|
||||
template <>
|
||||
inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
|
||||
// NOTE(wesm): This stub is here only to satisfy the compiler; it is
|
||||
// overridden later with the actual implementation
|
||||
}
|
||||
|
||||
// Base class for dictionary encoders
|
||||
template <typename DType>
|
||||
class DictEncoder : virtual public TypedEncoder<DType> {
|
||||
public:
|
||||
/// Writes out any buffered indices to buffer preceded by the bit width of this data.
|
||||
/// Returns the number of bytes written.
|
||||
/// If the supplied buffer is not big enough, returns -1.
|
||||
/// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
|
||||
/// to size buffer.
|
||||
virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
|
||||
|
||||
virtual int dict_encoded_size() = 0;
|
||||
// virtual int dict_encoded_size() { return dict_encoded_size_; }
|
||||
|
||||
virtual int bit_width() const = 0;
|
||||
|
||||
/// Writes out the encoded dictionary to buffer. buffer must be preallocated to
|
||||
/// dict_encoded_size() bytes.
|
||||
virtual void WriteDict(uint8_t* buffer) = 0;
|
||||
|
||||
virtual int num_entries() const = 0;
|
||||
|
||||
/// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
|
||||
/// assumed (without any boundschecking) that the indices reference
|
||||
/// pre-existing dictionary values
|
||||
/// \param[in] indices the dictionary index values. Only Int32Array currently
|
||||
/// supported
|
||||
virtual void PutIndices(const ::arrow::Array& indices) = 0;
|
||||
|
||||
/// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
|
||||
/// separately. Currently throws exception if the current dictionary memo is
|
||||
/// non-empty
|
||||
/// \param[in] values the dictionary values. Only valid for certain
|
||||
/// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
|
||||
virtual void PutDictionary(const ::arrow::Array& values) = 0;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Value decoding
|
||||
|
||||
class Decoder {
|
||||
public:
|
||||
virtual ~Decoder() = default;
|
||||
|
||||
// Sets the data for a new page. This will be called multiple times on the same
|
||||
// decoder and should reset all internal state.
|
||||
virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
|
||||
|
||||
// Returns the number of values left (for the last call to SetData()). This is
|
||||
// the number of values left in this page.
|
||||
virtual int values_left() const = 0;
|
||||
virtual Encoding::type encoding() const = 0;
|
||||
};
|
||||
|
||||
template <typename DType>
|
||||
class TypedDecoder : virtual public Decoder {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
/// \brief Decode values into a buffer
|
||||
///
|
||||
/// Subclasses may override the more specialized Decode methods below.
|
||||
///
|
||||
/// \param[in] buffer destination for decoded values
|
||||
/// \param[in] max_values maximum number of values to decode
|
||||
/// \return The number of values decoded. Should be identical to max_values except
|
||||
/// at the end of the current data page.
|
||||
virtual int Decode(T* buffer, int max_values) = 0;
|
||||
|
||||
/// \brief Decode the values in this data page but leave spaces for null entries.
|
||||
///
|
||||
/// \param[in] buffer destination for decoded values
|
||||
/// \param[in] num_values size of the def_levels and buffer arrays including the number
|
||||
/// of null slots
|
||||
/// \param[in] null_count number of null slots
|
||||
/// \param[in] valid_bits bitmap data indicating position of valid slots
|
||||
/// \param[in] valid_bits_offset offset into valid_bits
|
||||
/// \return The number of values decoded, including nulls.
|
||||
virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
|
||||
const uint8_t* valid_bits, int64_t valid_bits_offset) {
|
||||
if (null_count > 0) {
|
||||
int values_to_read = num_values - null_count;
|
||||
int values_read = Decode(buffer, values_to_read);
|
||||
if (values_read != values_to_read) {
|
||||
throw ParquetException("Number of values / definition_levels read did not match");
|
||||
}
|
||||
|
||||
return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count,
|
||||
valid_bits, valid_bits_offset);
|
||||
} else {
|
||||
return Decode(buffer, num_values);
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Decode into an ArrayBuilder or other accumulator
|
||||
///
|
||||
/// This function assumes the definition levels were already decoded
|
||||
/// as a validity bitmap in the given `valid_bits`. `null_count`
|
||||
/// is the number of 0s in `valid_bits`.
|
||||
/// As a space optimization, it is allowed for `valid_bits` to be null
|
||||
/// if `null_count` is zero.
|
||||
///
|
||||
/// \return number of values decoded
|
||||
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset,
|
||||
typename EncodingTraits<DType>::Accumulator* out) = 0;
|
||||
|
||||
/// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
|
||||
///
|
||||
/// \return number of values decoded
|
||||
int DecodeArrowNonNull(int num_values,
|
||||
typename EncodingTraits<DType>::Accumulator* out) {
|
||||
return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
|
||||
}
|
||||
|
||||
/// \brief Decode into a DictionaryBuilder
|
||||
///
|
||||
/// This function assumes the definition levels were already decoded
|
||||
/// as a validity bitmap in the given `valid_bits`. `null_count`
|
||||
/// is the number of 0s in `valid_bits`.
|
||||
/// As a space optimization, it is allowed for `valid_bits` to be null
|
||||
/// if `null_count` is zero.
|
||||
///
|
||||
/// \return number of values decoded
|
||||
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset,
|
||||
typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
|
||||
|
||||
/// \brief Decode into a DictionaryBuilder ignoring nulls
|
||||
///
|
||||
/// \return number of values decoded
|
||||
int DecodeArrowNonNull(int num_values,
|
||||
typename EncodingTraits<DType>::DictAccumulator* builder) {
|
||||
return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename DType>
|
||||
class DictDecoder : virtual public TypedDecoder<DType> {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
|
||||
|
||||
/// \brief Insert dictionary values into the Arrow dictionary builder's memo,
|
||||
/// but do not append any indices
|
||||
virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
|
||||
|
||||
/// \brief Decode only dictionary indices and append to dictionary
|
||||
/// builder. The builder must have had the dictionary from this decoder
|
||||
/// inserted already.
|
||||
///
|
||||
/// \warning Remember to reset the builder each time the dict decoder is initialized
|
||||
/// with a new dictionary page
|
||||
virtual int DecodeIndicesSpaced(int num_values, int null_count,
|
||||
const uint8_t* valid_bits, int64_t valid_bits_offset,
|
||||
::arrow::ArrayBuilder* builder) = 0;
|
||||
|
||||
/// \brief Decode only dictionary indices (no nulls)
|
||||
///
|
||||
/// \warning Remember to reset the builder each time the dict decoder is initialized
|
||||
/// with a new dictionary page
|
||||
virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
|
||||
|
||||
/// \brief Decode only dictionary indices (no nulls). Same as above
|
||||
/// DecodeIndices but target is an array instead of a builder.
|
||||
///
|
||||
/// \note API EXPERIMENTAL
|
||||
virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
|
||||
|
||||
/// \brief Get dictionary. The reader will call this API when it encounters a
|
||||
/// new dictionary.
|
||||
///
|
||||
/// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
|
||||
/// the decoder and is destroyed when the decoder is destroyed.
|
||||
/// @param[out] dictionary_length The dictionary length.
|
||||
///
|
||||
/// \note API EXPERIMENTAL
|
||||
virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// TypedEncoder specializations, traits, and factory functions
|
||||
|
||||
class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
|
||||
public:
|
||||
using TypedDecoder<BooleanType>::Decode;
|
||||
virtual int Decode(uint8_t* buffer, int max_values) = 0;
|
||||
};
|
||||
|
||||
class FLBADecoder : virtual public TypedDecoder<FLBAType> {
|
||||
public:
|
||||
using TypedDecoder<FLBAType>::DecodeSpaced;
|
||||
|
||||
// TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
|
||||
// there is value in adding specialized read methods for
|
||||
// FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
|
||||
// then perhaps not
|
||||
};
|
||||
|
||||
PARQUET_EXPORT
|
||||
std::unique_ptr<Encoder> MakeEncoder(
|
||||
Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
|
||||
const ColumnDescriptor* descr = NULLPTR,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
template <typename DType>
|
||||
std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
|
||||
Encoding::type encoding, bool use_dictionary = false,
|
||||
const ColumnDescriptor* descr = NULLPTR,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
|
||||
using OutType = typename EncodingTraits<DType>::Encoder;
|
||||
std::unique_ptr<Encoder> base =
|
||||
MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
|
||||
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
|
||||
}
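
// A minimal usage sketch for the factory above (illustrative only, not part of
// the original header; assumes an Int32 column and PLAIN encoding):
//
//   auto encoder = MakeTypedEncoder<Int32Type>(Encoding::PLAIN);
//   std::vector<int32_t> values = {1, 2, 3, 4};
//   encoder->Put(values.data(), static_cast<int>(values.size()));
//   std::shared_ptr<Buffer> encoded = encoder->FlushValues();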

PARQUET_EXPORT
std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
                                     const ColumnDescriptor* descr = NULLPTR);

namespace detail {

PARQUET_EXPORT
std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
                                         const ColumnDescriptor* descr,
                                         ::arrow::MemoryPool* pool);

} // namespace detail

template <typename DType>
std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
    const ColumnDescriptor* descr = NULLPTR,
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
  using OutType = DictDecoder<DType>;
  auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
  return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
}
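
// A hedged sketch of the dictionary decoding flow implied by DictDecoder (the
// page buffer names are hypothetical; the dictionary page is decoded first,
// then installed into the indices decoder via SetDict):
//
//   auto dict_page_decoder = MakeTypedDecoder<Int32Type>(Encoding::PLAIN);
//   dict_page_decoder->SetData(dict_num_values, dict_page_data, dict_page_len);
//   auto indices_decoder = MakeDictDecoder<Int32Type>();
//   indices_decoder->SetDict(dict_page_decoder.get());
//   indices_decoder->SetData(data_num_values, data_page_data, data_page_len);
//   std::vector<int32_t> indices(data_num_values);
//   indices_decoder->DecodeIndices(data_num_values, indices.data());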

template <typename DType>
std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
    Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
  using OutType = typename EncodingTraits<DType>::Decoder;
  std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
  return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
}
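
// A matching decode sketch (illustrative; reuses `encoded` from the encoder
// example above):
//
//   auto decoder = MakeTypedDecoder<Int32Type>(Encoding::PLAIN);
//   decoder->SetData(/*num_values=*/4, encoded->data(),
//                    static_cast<int>(encoded->size()));
//   std::vector<int32_t> out(4);
//   int num_decoded = decoder->Decode(out.data(), /*max_values=*/4);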

} // namespace parquet
@ -0,0 +1,135 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>

#include "parquet/encryption/encryption.h"
#include "parquet/encryption/file_key_wrapper.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
    ParquetCipher::AES_GCM_V1;
static constexpr bool kDefaultPlaintextFooter = false;
static constexpr bool kDefaultDoubleWrapping = true;
static constexpr double kDefaultCacheLifetimeSeconds = 600;  // 10 minutes
static constexpr bool kDefaultInternalKeyMaterial = true;
static constexpr bool kDefaultUniformEncryption = false;
static constexpr int32_t kDefaultDataKeyLengthBits = 128;

struct PARQUET_EXPORT EncryptionConfiguration {
  explicit EncryptionConfiguration(const std::string& footer_key)
      : footer_key(footer_key) {}

  /// ID of the master key for footer encryption/signing
  std::string footer_key;

  /// List of columns to encrypt, with master key IDs (see HIVE-21848).
  /// Format: "masterKeyID:colName,colName;masterKeyID:colName..."
  /// Either
  /// (1) column_keys must be set, or
  /// (2) uniform_encryption must be set to true.
  /// If neither or both are set, an exception will be thrown.
  std::string column_keys;

  /// Encrypt footer and all columns with the same encryption key.
  bool uniform_encryption = kDefaultUniformEncryption;

  /// Parquet encryption algorithm. Can be "AES_GCM_V1" (default) or "AES_GCM_CTR_V1".
  ParquetCipher::type encryption_algorithm = kDefaultEncryptionAlgorithm;

  /// Write files with plaintext footer.
  /// The default is false - files are written with encrypted footer.
  bool plaintext_footer = kDefaultPlaintextFooter;

  /// Use double wrapping - where data encryption keys (DEKs) are encrypted with key
  /// encryption keys (KEKs), which in turn are encrypted with master keys.
  /// The default is true. If set to false, use single wrapping - where DEKs are
  /// encrypted directly with master keys.
  bool double_wrapping = kDefaultDoubleWrapping;

  /// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
  /// objects).
  /// The default is 600 (10 minutes).
  double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;

  /// Store key material inside Parquet file footers; this mode doesn’t produce
  /// additional files. By default, true. If set to false, key material is stored in
  /// separate files in the same folder, which enables key rotation for immutable
  /// Parquet files.
  bool internal_key_material = kDefaultInternalKeyMaterial;

  /// Length of data encryption keys (DEKs), randomly generated by parquet key
  /// management tools. Can be 128, 192 or 256 bits.
  /// The default is 128 bits.
  int32_t data_key_length_bits = kDefaultDataKeyLengthBits;
};
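
// An illustrative column_keys value for the format documented above (the key
// IDs and column names are hypothetical):
//
//   EncryptionConfiguration config("footer_master_key_id");
//   config.column_keys = "key1:ssn,address;key2:salary";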

struct PARQUET_EXPORT DecryptionConfiguration {
  /// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
  /// objects).
  /// The default is 600 (10 minutes).
  double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
};

/// This is a core class that translates the parameters of high-level encryption (like
/// the names of encrypted columns, names of master keys, etc.) into parameters of
/// low-level encryption (like the key metadata, DEK, etc.). A factory that produces
/// the low-level FileEncryptionProperties and FileDecryptionProperties objects from
/// the high-level parameters.
class PARQUET_EXPORT CryptoFactory {
 public:
  /// A KmsClientFactory object must be registered via this method before calling any
  /// of the GetFileEncryptionProperties()/GetFileDecryptionProperties() methods.
  void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory);

  std::shared_ptr<FileEncryptionProperties> GetFileEncryptionProperties(
      const KmsConnectionConfig& kms_connection_config,
      const EncryptionConfiguration& encryption_config);

  /// The returned FileDecryptionProperties object will use the cache inside this
  /// CryptoFactory object, so please keep this CryptoFactory object alive along
  /// with the returned FileDecryptionProperties object.
  std::shared_ptr<FileDecryptionProperties> GetFileDecryptionProperties(
      const KmsConnectionConfig& kms_connection_config,
      const DecryptionConfiguration& decryption_config);

  void RemoveCacheEntriesForToken(const std::string& access_token) {
    key_toolkit_.RemoveCacheEntriesForToken(access_token);
  }

  void RemoveCacheEntriesForAllTokens() { key_toolkit_.RemoveCacheEntriesForAllTokens(); }

 private:
  ColumnPathToEncryptionPropertiesMap GetColumnEncryptionProperties(
      int dek_length, const std::string& column_keys, FileKeyWrapper* key_wrapper);

  /// Key utilities object for KMS client initialization and cache control
  KeyToolkit key_toolkit_;
};
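
// A hedged end-to-end sketch (the KMS client factory is user-supplied, and
// KmsConnectionConfig is assumed to be declared in kms_client.h with its
// endpoint/credentials configured elsewhere):
//
//   CryptoFactory crypto_factory;
//   crypto_factory.RegisterKmsClientFactory(my_kms_client_factory);
//   KmsConnectionConfig kms_config;
//   EncryptionConfiguration enc_config("footer_master_key_id");
//   enc_config.column_keys = "key1:ssn;key2:salary";
//   auto file_encryption_properties =
//       crypto_factory.GetFileEncryptionProperties(kms_config, enc_config);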

} // namespace encryption
} // namespace parquet
@ -0,0 +1,510 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <map>
#include <memory>
#include <string>
#include <utility>

#include "parquet/exception.h"
#include "parquet/schema.h"
#include "parquet/types.h"

namespace parquet {

static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
    ParquetCipher::AES_GCM_V1;
static constexpr int32_t kMaximalAadMetadataLength = 256;
static constexpr bool kDefaultEncryptedFooter = true;
static constexpr bool kDefaultCheckSignature = true;
static constexpr bool kDefaultAllowPlaintextFiles = false;
static constexpr int32_t kAadFileUniqueLength = 8;

class ColumnDecryptionProperties;
using ColumnPathToDecryptionPropertiesMap =
    std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;

class ColumnEncryptionProperties;
using ColumnPathToEncryptionPropertiesMap =
    std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;

class PARQUET_EXPORT DecryptionKeyRetriever {
 public:
  virtual std::string GetKey(const std::string& key_metadata) = 0;
  virtual ~DecryptionKeyRetriever() {}
};

/// Simple integer key retriever
class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
 public:
  void PutKey(uint32_t key_id, const std::string& key);
  std::string GetKey(const std::string& key_metadata) override;

 private:
  std::map<uint32_t, std::string> key_map_;
};

// Simple string key retriever
class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
 public:
  void PutKey(const std::string& key_id, const std::string& key);
  std::string GetKey(const std::string& key_metadata) override;

 private:
  std::map<std::string, std::string> key_map_;
};

class PARQUET_EXPORT HiddenColumnException : public ParquetException {
 public:
  explicit HiddenColumnException(const std::string& columnPath)
      : ParquetException(columnPath.c_str()) {}
};

class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
 public:
  explicit KeyAccessDeniedException(const std::string& columnPath)
      : ParquetException(columnPath.c_str()) {}
};

inline const uint8_t* str2bytes(const std::string& str) {
  if (str.empty()) return NULLPTR;

  char* cbytes = const_cast<char*>(str.c_str());
  return reinterpret_cast<const uint8_t*>(cbytes);
}

class PARQUET_EXPORT ColumnEncryptionProperties {
 public:
  class PARQUET_EXPORT Builder {
   public:
    /// Convenience builder for encrypted columns.
    explicit Builder(const std::string& name) : Builder(name, true) {}

    /// Convenience builder for encrypted columns.
    explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
        : Builder(path->ToDotString(), true) {}

    /// Set a column-specific key.
    /// If a key is not set on an encrypted column, the column will
    /// be encrypted with the footer key.
    /// The key length must be either 16, 24 or 32 bytes.
    /// The key is cloned, and will be wiped out (array values set to 0) upon
    /// completion of file writing.
    /// The caller is responsible for wiping out the input key array.
    Builder* key(std::string column_key);

    /// Set key retrieval metadata.
    /// Use either key_metadata() or key_id(), not both.
    Builder* key_metadata(const std::string& key_metadata);

    /// A convenience function to set key metadata using a string id.
    /// Sets key retrieval metadata (converted from a string).
    /// Use either key_metadata() or key_id(), not both.
    /// key_id will be converted to metadata (UTF-8 array).
    Builder* key_id(const std::string& key_id);

    std::shared_ptr<ColumnEncryptionProperties> build() {
      return std::shared_ptr<ColumnEncryptionProperties>(
          new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
    }

   private:
    const std::string column_path_;
    bool encrypted_;
    std::string key_;
    std::string key_metadata_;

    Builder(const std::string path, bool encrypted)
        : column_path_(path), encrypted_(encrypted) {}
  };

  std::string column_path() const { return column_path_; }
  bool is_encrypted() const { return encrypted_; }
  bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
  std::string key() const { return key_; }
  std::string key_metadata() const { return key_metadata_; }

  /// Upon completion of file writing, the encryption key
  /// will be wiped out.
  void WipeOutEncryptionKey() { key_.clear(); }

  bool is_utilized() {
    if (key_.empty())
      return false;  // can re-use column properties without encryption keys
    return utilized_;
  }

  /// A ColumnEncryptionProperties object can be used for writing one file only.
  /// Mark ColumnEncryptionProperties as utilized once it is used in
  /// FileEncryptionProperties, as the encryption key will be wiped out upon
  /// completion of file writing.
  void set_utilized() { utilized_ = true; }

  std::shared_ptr<ColumnEncryptionProperties> DeepClone() {
    std::string key_copy = key_;
    return std::shared_ptr<ColumnEncryptionProperties>(new ColumnEncryptionProperties(
        encrypted_, column_path_, key_copy, key_metadata_));
  }

  ColumnEncryptionProperties() = default;
  ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
  ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;

 private:
  const std::string column_path_;
  bool encrypted_;
  bool encrypted_with_footer_key_;
  std::string key_;
  std::string key_metadata_;
  bool utilized_;
  explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path,
                                      const std::string& key,
                                      const std::string& key_metadata);
};
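
// A minimal builder sketch (the column name and key bytes are hypothetical;
// the key must be 16, 24 or 32 bytes, per the comment above):
//
//   std::string column_key = "0123456789012345";  // 16 bytes, illustration only
//   ColumnEncryptionProperties::Builder col_builder("a.b.c");
//   col_builder.key(column_key)->key_id("key1");
//   std::shared_ptr<ColumnEncryptionProperties> col_props = col_builder.build();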

class PARQUET_EXPORT ColumnDecryptionProperties {
 public:
  class PARQUET_EXPORT Builder {
   public:
    explicit Builder(const std::string& name) : column_path_(name) {}

    explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
        : Builder(path->ToDotString()) {}

    /// Set an explicit column key. If applied on a file that contains
    /// key metadata for this column, the metadata will be ignored and
    /// the column will be decrypted with this key.
    /// The key length must be either 16, 24 or 32 bytes.
    Builder* key(const std::string& key);

    std::shared_ptr<ColumnDecryptionProperties> build();

   private:
    const std::string column_path_;
    std::string key_;
  };

  ColumnDecryptionProperties() = default;
  ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
  ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;

  std::string column_path() const { return column_path_; }
  std::string key() const { return key_; }
  bool is_utilized() { return utilized_; }

  /// A ColumnDecryptionProperties object can be used for reading one file only.
  /// Mark ColumnDecryptionProperties as utilized once it is used in
  /// FileDecryptionProperties, as the encryption key will be wiped out upon
  /// completion of file reading.
  void set_utilized() { utilized_ = true; }

  /// Upon completion of file reading, the encryption key
  /// will be wiped out.
  void WipeOutDecryptionKey();

  std::shared_ptr<ColumnDecryptionProperties> DeepClone();

 private:
  const std::string column_path_;
  std::string key_;
  bool utilized_;

  /// This class is only required for setting explicit column decryption keys -
  /// to override the key retriever (or to provide keys when key metadata and/or
  /// a key retriever are not available)
  explicit ColumnDecryptionProperties(const std::string& column_path,
                                      const std::string& key);
};

class PARQUET_EXPORT AADPrefixVerifier {
 public:
  /// Verifies the identity (AAD Prefix) of an individual file,
  /// or of a file collection in a data set.
  /// Throws an exception if an AAD prefix is wrong.
  /// In a data set, AAD prefixes should be collected,
  /// and then checked for missing files.
  virtual void Verify(const std::string& aad_prefix) = 0;
  virtual ~AADPrefixVerifier() {}
};

class PARQUET_EXPORT FileDecryptionProperties {
 public:
  class PARQUET_EXPORT Builder {
   public:
    Builder() {
      check_plaintext_footer_integrity_ = kDefaultCheckSignature;
      plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
    }

    /// Set an explicit footer key. If applied on a file that contains
    /// footer key metadata, the metadata will be ignored and the footer
    /// will be decrypted/verified with this key.
    /// If an explicit key is not set, the footer key will be fetched from
    /// the key retriever.
    /// With explicit keys or an AAD prefix, a new encryption properties object
    /// must be created for each encrypted file.
    /// Explicit encryption keys (footer and column) are cloned.
    /// Upon completion of file reading, the cloned encryption keys in the properties
    /// will be wiped out (array values set to 0).
    /// The caller is responsible for wiping out the input key array.
    /// The footer key length must be either 16, 24 or 32 bytes.
    Builder* footer_key(const std::string footer_key);

    /// Set explicit column keys (decryption properties).
    /// It's also possible to set a key retriever on this property object.
    /// Upon file decryption, availability of explicit keys is checked before
    /// invocation of the retriever callback.
    /// If an explicit key is available for a footer or a column,
    /// its key metadata will be ignored.
    Builder* column_keys(
        const ColumnPathToDecryptionPropertiesMap& column_decryption_properties);

    /// Set a key retriever callback. It's also possible to
    /// set explicit footer or column keys on this file property object.
    /// Upon file decryption, availability of explicit keys is checked before
    /// invocation of the retriever callback.
    /// If an explicit key is available for a footer or a column,
    /// its key metadata will be ignored.
    Builder* key_retriever(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever);

    /// Skip integrity verification of plaintext footers.
    /// If not called, integrity of plaintext footers will be checked at runtime,
    /// and an exception will be thrown in the following situations:
    /// - the footer signing key is not available
    ///   (not passed, or not found by the key retriever)
    /// - the footer content and signature don't match
    Builder* disable_footer_signature_verification() {
      check_plaintext_footer_integrity_ = false;
      return this;
    }

    /// Explicitly supply the file AAD prefix.
    /// A must when a prefix is used for file encryption, but not stored in the file.
    /// If the AAD prefix is stored in the file, it will be compared to the explicitly
    /// supplied value and an exception will be thrown if they differ.
    Builder* aad_prefix(const std::string& aad_prefix);

    /// Set a callback for verification of AAD prefixes stored in the file.
    Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);

    /// By default, reading plaintext (unencrypted) files is not
    /// allowed when using a decryptor
    /// - in order to detect files that were not encrypted by mistake.
    /// However, the default behavior can be overridden by calling this method.
    /// The caller should then use a different method to ensure encryption
    /// of files with sensitive data.
    Builder* plaintext_files_allowed() {
      plaintext_files_allowed_ = true;
      return this;
    }

    std::shared_ptr<FileDecryptionProperties> build() {
      return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
          footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
          aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
    }

   private:
    std::string footer_key_;
    std::string aad_prefix_;
    std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
    ColumnPathToDecryptionPropertiesMap column_decryption_properties_;

    std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
    bool check_plaintext_footer_integrity_;
    bool plaintext_files_allowed_;
  };

  std::string column_key(const std::string& column_path) const;

  std::string footer_key() const { return footer_key_; }

  std::string aad_prefix() const { return aad_prefix_; }

  const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
    return key_retriever_;
  }

  bool check_plaintext_footer_integrity() const {
    return check_plaintext_footer_integrity_;
  }

  bool plaintext_files_allowed() const { return plaintext_files_allowed_; }

  const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
    return aad_prefix_verifier_;
  }

  /// Upon completion of file reading, the encryption keys in the properties
  /// will be wiped out (array values set to 0).
  void WipeOutDecryptionKeys();

  bool is_utilized();

  /// A FileDecryptionProperties object can be used for reading one file only.
  /// Mark FileDecryptionProperties as utilized once it is used to read a file, as the
  /// encryption keys will be wiped out upon completion of file reading.
  void set_utilized() { utilized_ = true; }

  /// A FileDecryptionProperties object can be used for reading one file only
  /// (unless this object keeps only the keyRetrieval callback, and no explicit
  /// keys or aadPrefix).
  /// At the end, the keys are wiped out in memory.
  /// This method allows cloning identical properties for another file,
  /// with an option to update the aadPrefix (if new_aad_prefix is empty,
  /// the aadPrefix will be cloned too).
  std::shared_ptr<FileDecryptionProperties> DeepClone(std::string new_aad_prefix = "");

 private:
  std::string footer_key_;
  std::string aad_prefix_;
  std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;

  const std::string empty_string_ = "";
  ColumnPathToDecryptionPropertiesMap column_decryption_properties_;

  std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
  bool check_plaintext_footer_integrity_;
  bool plaintext_files_allowed_;
  bool utilized_;

  FileDecryptionProperties(
      const std::string& footer_key,
      std::shared_ptr<DecryptionKeyRetriever> key_retriever,
      bool check_plaintext_footer_integrity, const std::string& aad_prefix,
      std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
      const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
      bool plaintext_files_allowed);
};
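
// A hedged decryption-properties sketch (the retriever key IDs and key bytes
// are hypothetical):
//
//   auto key_retriever = std::make_shared<StringKeyIdRetriever>();
//   key_retriever->PutKey("kf", footer_key_bytes);
//   key_retriever->PutKey("k1", column_key_bytes);
//   FileDecryptionProperties::Builder dec_builder;
//   auto decryption_properties = dec_builder.key_retriever(key_retriever)->build();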

class PARQUET_EXPORT FileEncryptionProperties {
 public:
  class PARQUET_EXPORT Builder {
   public:
    explicit Builder(const std::string& footer_key)
        : parquet_cipher_(kDefaultEncryptionAlgorithm),
          encrypted_footer_(kDefaultEncryptedFooter) {
      footer_key_ = footer_key;
      store_aad_prefix_in_file_ = false;
    }

    /// Create files with a plaintext footer.
    /// If not called, the files will be created with an encrypted footer (default).
    Builder* set_plaintext_footer() {
      encrypted_footer_ = false;
      return this;
    }

    /// Set the encryption algorithm.
    /// If not called, files will be encrypted with AES_GCM_V1 (default).
    Builder* algorithm(ParquetCipher::type parquet_cipher) {
      parquet_cipher_ = parquet_cipher;
      return this;
    }

    /// Set key retrieval metadata (converted from a string).
    /// Use either footer_key_metadata or footer_key_id, not both.
    Builder* footer_key_id(const std::string& key_id);

    /// Set key retrieval metadata.
    /// Use either footer_key_metadata or footer_key_id, not both.
    Builder* footer_key_metadata(const std::string& footer_key_metadata);

    /// Set the file AAD prefix.
    Builder* aad_prefix(const std::string& aad_prefix);

    /// Skip storing the AAD prefix in the file.
    /// If not called, and if an AAD prefix is set, it will be stored.
    Builder* disable_aad_prefix_storage();

    /// Set the list of encrypted columns and their properties (keys etc.).
    /// If not called, all columns will be encrypted with the footer key.
    /// If called, the file columns not in the list will be left unencrypted.
    Builder* encrypted_columns(
        const ColumnPathToEncryptionPropertiesMap& encrypted_columns);

    std::shared_ptr<FileEncryptionProperties> build() {
      return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
          parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
          aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
    }

   private:
    ParquetCipher::type parquet_cipher_;
    bool encrypted_footer_;
    std::string footer_key_;
    std::string footer_key_metadata_;

    std::string aad_prefix_;
    bool store_aad_prefix_in_file_;
    ColumnPathToEncryptionPropertiesMap encrypted_columns_;
  };
  bool encrypted_footer() const { return encrypted_footer_; }

  EncryptionAlgorithm algorithm() const { return algorithm_; }

  std::string footer_key() const { return footer_key_; }

  std::string footer_key_metadata() const { return footer_key_metadata_; }

  std::string file_aad() const { return file_aad_; }

  std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
      const std::string& column_path);

  bool is_utilized() const { return utilized_; }

  /// A FileEncryptionProperties object can be used for writing one file only.
  /// Mark FileEncryptionProperties as utilized once it is used to write a file, as the
  /// encryption keys will be wiped out upon completion of file writing.
  void set_utilized() { utilized_ = true; }

  /// Upon completion of file writing, the encryption keys
  /// will be wiped out (array values set to 0).
  void WipeOutEncryptionKeys();

  /// A FileEncryptionProperties object can be used for writing one file only
  /// (at the end, the keys are wiped out in memory).
  /// This method allows cloning identical properties for another file,
  /// with an option to update the aadPrefix (if new_aad_prefix is empty,
  /// the aadPrefix will be cloned too).
  std::shared_ptr<FileEncryptionProperties> DeepClone(std::string new_aad_prefix = "");

  ColumnPathToEncryptionPropertiesMap encrypted_columns() const {
    return encrypted_columns_;
  }

 private:
  EncryptionAlgorithm algorithm_;
  std::string footer_key_;
  std::string footer_key_metadata_;
  bool encrypted_footer_;
  std::string file_aad_;
  std::string aad_prefix_;
  bool utilized_;
  bool store_aad_prefix_in_file_;
  ColumnPathToEncryptionPropertiesMap encrypted_columns_;

  FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key,
                           const std::string& footer_key_metadata, bool encrypted_footer,
                           const std::string& aad_prefix, bool store_aad_prefix_in_file,
                           const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
};
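
// A hedged file-encryption sketch tying the builders together (the key bytes
// and key IDs are hypothetical):
//
//   ColumnPathToEncryptionPropertiesMap encrypted_columns;
//   ColumnEncryptionProperties::Builder col_builder("a.b.c");
//   col_builder.key(column_key_bytes)->key_id("k1");
//   encrypted_columns["a.b.c"] = col_builder.build();
//
//   FileEncryptionProperties::Builder file_builder(footer_key_bytes);
//   auto encryption_properties = file_builder.footer_key_id("kf")
//                                    ->encrypted_columns(encrypted_columns)
//                                    ->build();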

} // namespace parquet
@ -0,0 +1,31 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

namespace parquet {
namespace encryption {

// Key material can be stored outside the Parquet file, for example in a separate small
// file in the same folder. This is important for “key rotation”, when MEKs have to be
// changed (if compromised, or periodically, just in case) - without modifying the
// Parquet files (often immutable).
// TODO: details will be implemented later
class FileKeyMaterialStore {};

} // namespace encryption
} // namespace parquet
@ -0,0 +1,66 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/util/concurrent_map.h"

#include "parquet/encryption/encryption.h"
#include "parquet/encryption/key_material.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/key_toolkit_internal.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

// This class retrieves the key from "key metadata", following these steps:
// 1. Parse the "key metadata" (see the structure in the KeyMetadata class).
// 2. Retrieve the "key material", which can be stored inside or outside the
//    "key metadata". Currently we don't support the case where "key material"
//    is stored outside "key metadata" yet.
// 3. Unwrap the "data encryption key" from the "key material". There are 2 modes:
//    3.1. single wrapping: decrypt the wrapped "data encryption key" directly with
//         the "master encryption key"
//    3.2. double wrapping, in 2 steps:
//         3.2.1. the "key encryption key" is decrypted with the "master encryption key"
//         3.2.2. the "data encryption key" is decrypted with the above
//                "key encryption key"
class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever {
 public:
  /// key_toolkit and kms_connection_config are used to get a KmsClient from the
  /// cache, or to create a KmsClient if it's not in the cache yet.
  /// cache_entry_lifetime_seconds is the lifetime of a KmsClient in the cache.
  FileKeyUnwrapper(KeyToolkit* key_toolkit,
                   const KmsConnectionConfig& kms_connection_config,
                   double cache_lifetime_seconds);

  std::string GetKey(const std::string& key_metadata) override;

 private:
  internal::KeyWithMasterId GetDataEncryptionKey(const KeyMaterial& key_material);
  std::shared_ptr<KmsClient> GetKmsClientFromConfigOrKeyMaterial(
      const KeyMaterial& key_material);

  /// A map of Key Encryption Key (KEK) ID -> KEK bytes, for the current token
  std::shared_ptr<::arrow::util::ConcurrentMap<std::string, std::string>> kek_per_kek_id_;
  KeyToolkit* key_toolkit_;
  KmsConnectionConfig kms_connection_config_;
  const double cache_entry_lifetime_seconds_;
};
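
// A hedged retrieval sketch (assumes key_toolkit and kms_config were set up as
// for CryptoFactory above, and that key_metadata came from a file's footer or
// column crypto metadata):
//
//   FileKeyUnwrapper unwrapper(&key_toolkit, kms_config,
//                              /*cache_lifetime_seconds=*/600);
//   std::string data_encryption_key = unwrapper.GetKey(key_metadata);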

} // namespace encryption
} // namespace parquet
@ -0,0 +1,82 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "arrow/util/concurrent_map.h"

#include "parquet/encryption/file_key_material_store.h"
#include "parquet/encryption/key_encryption_key.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

// This class generates "key metadata" from a "data encryption key" and a "master key",
// following these steps:
// 1. Wrap the "data encryption key". There are 2 modes:
//    1.1. single wrapping: encrypt the "data encryption key" directly with the
//         "master encryption key"
//    1.2. double wrapping, in 2 steps:
//         1.2.1. a "key encryption key" is randomly generated (see the
//                KeyEncryptionKey class)
//         1.2.2. the "data encryption key" is encrypted with the above
//                "key encryption key"
// 2. Create the "key material" (see the structure in the KeyMaterial class)
// 3. Create the "key metadata" with the "key material" inside, or with a reference to
//    outside "key material" (see the structure in the KeyMetadata class).
//    We don't support the case where "key material" is stored outside "key metadata"
//    yet.
class PARQUET_EXPORT FileKeyWrapper {
 public:
  static constexpr int kKeyEncryptionKeyLength = 16;
  static constexpr int kKeyEncryptionKeyIdLength = 16;

  /// key_toolkit and kms_connection_config are used to get a KmsClient from the
  /// cache, or to create a KmsClient if it's not in the cache yet.
  /// cache_entry_lifetime_seconds is the lifetime of a KmsClient in the cache.
  /// key_material_store is used to store "key material" outside the parquet file;
  /// NULL if "key material" is stored inside the parquet file.
  FileKeyWrapper(KeyToolkit* key_toolkit,
                 const KmsConnectionConfig& kms_connection_config,
                 std::shared_ptr<FileKeyMaterialStore> key_material_store,
                 double cache_entry_lifetime_seconds, bool double_wrapping);

  /// Creates the key_metadata field for a given data key, by wrapping the key with
  /// the master key
  std::string GetEncryptionKeyMetadata(const std::string& data_key,
                                       const std::string& master_key_id,
                                       bool is_footer_key);

 private:
  KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id);

  /// A map of Master Encryption Key ID -> KeyEncryptionKey, for the current token
  std::shared_ptr<::arrow::util::ConcurrentMap<std::string, KeyEncryptionKey>>
      kek_per_master_key_id_;

  std::shared_ptr<KmsClient> kms_client_;
  KmsConnectionConfig kms_connection_config_;
  std::shared_ptr<FileKeyMaterialStore> key_material_store_;
  const double cache_entry_lifetime_seconds_;
  const bool double_wrapping_;
};
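
// A hedged wrapping sketch (the DEK bytes and master key ID are hypothetical;
// internal key material storage is assumed, hence the NULLPTR store):
//
//   FileKeyWrapper wrapper(&key_toolkit, kms_config,
//                          /*key_material_store=*/NULLPTR,
//                          /*cache_entry_lifetime_seconds=*/600,
//                          /*double_wrapping=*/true);
//   std::string key_metadata = wrapper.GetEncryptionKeyMetadata(
//       data_encryption_key, "footer_master_key_id", /*is_footer_key=*/true);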

} // namespace encryption
} // namespace parquet
@ -0,0 +1,59 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <vector>

#include "arrow/util/base64.h"

namespace parquet {
namespace encryption {

// In the double wrapping mode, each "data encryption key" (DEK) is encrypted with a
// “key encryption key” (KEK), that in turn is encrypted with a "master encryption key"
// (MEK). In a writer process, a random KEK is generated for each MEK ID, and cached in
// a <MEK-ID : KEK> map. This allows the KMS server to be contacted only once per MEK,
// in order to wrap its KEK. Wrapping of the "data encryption key" (DEK) is performed
// locally, and does not involve an interaction with the KMS server.
class KeyEncryptionKey {
 public:
  KeyEncryptionKey(std::string kek_bytes, std::string kek_id,
                   std::string encoded_wrapped_kek)
      : kek_bytes_(std::move(kek_bytes)),
        kek_id_(std::move(kek_id)),
        encoded_kek_id_(::arrow::util::base64_encode(kek_id_)),
        encoded_wrapped_kek_(std::move(encoded_wrapped_kek)) {}

  const std::string& kek_bytes() const { return kek_bytes_; }

  const std::string& kek_id() const { return kek_id_; }

  const std::string& encoded_kek_id() const { return encoded_kek_id_; }

  const std::string& encoded_wrapped_kek() const { return encoded_wrapped_kek_; }

 private:
  std::string kek_bytes_;
  std::string kek_id_;
  std::string encoded_kek_id_;
  std::string encoded_wrapped_kek_;
};
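
// A minimal construction sketch (all byte strings are hypothetical; the
// wrapped KEK would normally come back from the KMS, base64-encoded):
//
//   KeyEncryptionKey kek(/*kek_bytes=*/random_16_bytes,
//                        /*kek_id=*/random_16_byte_id,
//                        /*encoded_wrapped_kek=*/kms_wrapped_kek_base64);
//   std::string cache_key = kek.encoded_kek_id();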

} // namespace encryption
} // namespace parquet
@ -0,0 +1,131 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <string>

#include "parquet/platform.h"

namespace arrow {
namespace json {
namespace internal {
class ObjectParser;
} // namespace internal
} // namespace json
} // namespace arrow

namespace parquet {
namespace encryption {

// The KeyMaterial class represents the "key material", keeping the information that
// allows readers to recover an encryption key (see the description of the KeyMetadata
// class). The keytools package (PARQUET-1373) implements the "envelope encryption"
// pattern, in a "single wrapping" or "double wrapping" mode. In the single wrapping
// mode, the key material is generated by encrypting the "data encryption key" (DEK)
// by a "master key". In the double wrapping mode, the key material is generated by
// encrypting the DEK by a "key encryption key" (KEK), that in turn is encrypted by a
// "master key".
//
// Key material is kept in a flat json object, with the following fields:
// 1. "keyMaterialType" - a String, with the type of key material. In the current
//    version, only one value is allowed - "PKMT1" (stands for "parquet key management
//    tools, version 1"). For external key material storage, this field is written in
//    both "key metadata" and "key material" jsons. For internal key material storage,
//    this field is written only once in the common json.
// 2. "isFooterKey" - a boolean. If true, the material belongs to a file footer key,
//    and keeps additional information (such as the KMS instance ID and URL). If
//    false, the material belongs to a column key.
// 3. "kmsInstanceID" - a String, with the KMS Instance ID. Written only in footer key
//    material.
// 4. "kmsInstanceURL" - a String, with the KMS Instance URL. Written only in footer
//    key material.
// 5. "masterKeyID" - a String, with the ID of the master key used to generate the
//    material.
// 6. "wrappedDEK" - a String, with the wrapped DEK (base64 encoding).
// 7. "doubleWrapping" - a boolean. If true, the material was generated in double
//    wrapping mode. If false - in single wrapping mode.
// 8. "keyEncryptionKeyID" - a String, with the ID of the KEK used to generate the
//    material. Written only in double wrapping mode.
// 9. "wrappedKEK" - a String, with the wrapped KEK (base64 encoding). Written only in
//    double wrapping mode.
class PARQUET_EXPORT KeyMaterial {
 public:
  // these fields are defined in a specification and should never be changed
  static constexpr const char kKeyMaterialTypeField[] = "keyMaterialType";
  static constexpr const char kKeyMaterialType1[] = "PKMT1";

  static constexpr const char kFooterKeyIdInFile[] = "footerKey";
  static constexpr const char kColumnKeyIdInFilePrefix[] = "columnKey";

  static constexpr const char kIsFooterKeyField[] = "isFooterKey";
  static constexpr const char kDoubleWrappingField[] = "doubleWrapping";
  static constexpr const char kKmsInstanceIdField[] = "kmsInstanceID";
  static constexpr const char kKmsInstanceUrlField[] = "kmsInstanceURL";
  static constexpr const char kMasterKeyIdField[] = "masterKeyID";
  static constexpr const char kWrappedDataEncryptionKeyField[] = "wrappedDEK";
  static constexpr const char kKeyEncryptionKeyIdField[] = "keyEncryptionKeyID";
  static constexpr const char kWrappedKeyEncryptionKeyField[] = "wrappedKEK";

 public:
  KeyMaterial() = default;

  static KeyMaterial Parse(const std::string& key_material_string);

  static KeyMaterial Parse(
      const ::arrow::json::internal::ObjectParser* key_material_json);

  /// This method returns a json string that will be stored either inside a parquet
  /// file or in a key material store outside the parquet file.
  static std::string SerializeToJson(bool is_footer_key,
                                     const std::string& kms_instance_id,
                                     const std::string& kms_instance_url,
                                     const std::string& master_key_id,
                                     bool is_double_wrapped, const std::string& kek_id,
                                     const std::string& encoded_wrapped_kek,
                                     const std::string& encoded_wrapped_dek,
                                     bool is_internal_storage);

  bool is_footer_key() const { return is_footer_key_; }
  bool is_double_wrapped() const { return is_double_wrapped_; }
  const std::string& master_key_id() const { return master_key_id_; }
  const std::string& wrapped_dek() const { return encoded_wrapped_dek_; }
  const std::string& kek_id() const { return kek_id_; }
  const std::string& wrapped_kek() const { return encoded_wrapped_kek_; }
  const std::string& kms_instance_id() const { return kms_instance_id_; }
  const std::string& kms_instance_url() const { return kms_instance_url_; }

 private:
  KeyMaterial(bool is_footer_key, const std::string& kms_instance_id,
              const std::string& kms_instance_url, const std::string& master_key_id,
              bool is_double_wrapped, const std::string& kek_id,
              const std::string& encoded_wrapped_kek,
              const std::string& encoded_wrapped_dek);

  bool is_footer_key_;
  std::string kms_instance_id_;
  std::string kms_instance_url_;
  std::string master_key_id_;
  bool is_double_wrapped_;
  std::string kek_id_;
  std::string encoded_wrapped_kek_;
  std::string encoded_wrapped_dek_;
};
|
||||
|
||||
} // namespace encryption
|
||||
} // namespace parquet
|
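As an illustration of the API above, a minimal sketch that round-trips footer-key material through SerializeToJson and Parse; every ID and base64 value below is a made-up placeholder, not the output of a real KMS.

#include <iostream>
#include <string>

#include "parquet/encryption/key_material.h"

int main() {
  using parquet::encryption::KeyMaterial;
  // Double-wrapped footer key material: the DEK is wrapped by a KEK,
  // the KEK by a master key (all values here are placeholders).
  std::string json = KeyMaterial::SerializeToJson(
      /*is_footer_key=*/true, /*kms_instance_id=*/"DEFAULT",
      /*kms_instance_url=*/"DEFAULT", /*master_key_id=*/"kf",
      /*is_double_wrapped=*/true, /*kek_id=*/"kek0",
      /*encoded_wrapped_kek=*/"AAAA", /*encoded_wrapped_dek=*/"BBBB",
      /*is_internal_storage=*/true);
  KeyMaterial material = KeyMaterial::Parse(json);
  std::cout << material.master_key_id() << std::endl;  // prints "kf"
  return 0;
}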
@ -0,0 +1,94 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <string>

#include "arrow/util/variant.h"

#include "parquet/encryption/key_material.h"
#include "parquet/exception.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

// Parquet encryption specification defines "key metadata" as an arbitrary byte array,
// generated by file writers for each encryption key, and passed to the low level API for
// storage in the file footer. The "key metadata" field is made available to file readers
// to enable recovery of the key. This interface can be utilized for implementation
// of any key management scheme.
//
// The keytools package (PARQUET-1373) implements one approach, of many possible, to key
// management and to generation of the "key metadata" fields. This approach, based on the
// "envelope encryption" pattern, allows integration with KMS servers. It keeps the
// actual material, required to recover a key, in a "key material" object (see the
// KeyMaterial class for details). This class is implemented to support version 1 of the
// parquet key management tools specification.
//
// KeyMetadata writes (and reads) the "key metadata" field as a flat json object,
// with the following fields:
// 1. "keyMaterialType" - a String, with the type of key material.
// 2. "internalStorage" - a boolean. If true, means that "key material" is kept inside
//    the "key metadata" field. If false, "key material" is kept externally (outside
//    Parquet files) - in this case, "key metadata" keeps a reference to the external
//    "key material".
// 3. "keyReference" - a String, with the reference to the external "key material".
//    Written only if internalStorage is false.
//
// If internalStorage is true, "key material" is a part of "key metadata", and the json
// keeps additional fields, described in the KeyMaterial class.
class PARQUET_EXPORT KeyMetadata {
 public:
  static constexpr const char kKeyMaterialInternalStorageField[] = "internalStorage";
  static constexpr const char kKeyReferenceField[] = "keyReference";

  /// key_metadata_bytes is the key metadata field stored in the parquet file,
  /// in the serialized json object format.
  static KeyMetadata Parse(const std::string& key_metadata_bytes);

  static std::string CreateSerializedForExternalMaterial(
      const std::string& key_reference);

  bool key_material_stored_internally() const { return is_internal_storage_; }

  const KeyMaterial& key_material() const {
    if (!is_internal_storage_) {
      throw ParquetException("key material is stored externally.");
    }
    return ::arrow::util::get<KeyMaterial>(key_material_or_reference_);
  }

  const std::string& key_reference() const {
    if (is_internal_storage_) {
      throw ParquetException("key material is stored internally.");
    }
    return ::arrow::util::get<std::string>(key_material_or_reference_);
  }

 private:
  explicit KeyMetadata(const KeyMaterial& key_material);
  explicit KeyMetadata(const std::string& key_reference);

  bool is_internal_storage_;
  /// If is_internal_storage_ is true, a KeyMaterial is set,
  /// else a string referencing an outside "key material" is set.
  ::arrow::util::Variant<KeyMaterial, std::string> key_material_or_reference_;
};

}  // namespace encryption
}  // namespace parquet
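A short reader-side sketch of handling the two storage modes described above; the helper name is hypothetical.

#include <string>

#include "parquet/encryption/key_metadata.h"

// Hypothetical helper: branch on internal vs. external key material storage.
std::string DescribeKeyMetadata(const std::string& key_metadata_bytes) {
  using parquet::encryption::KeyMetadata;
  KeyMetadata metadata = KeyMetadata::Parse(key_metadata_bytes);
  if (metadata.key_material_stored_internally()) {
    // Key material travels inside the parquet footer.
    return metadata.key_material().master_key_id();
  }
  // External storage: the file only keeps a reference to the material.
  return metadata.key_reference();
}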
@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>

#include "parquet/encryption/key_encryption_key.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/encryption/two_level_cache_with_expiration.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

// KeyToolkit is a utility that keeps various tools for key management (such as key
// rotation, kms client instantiation, cache control, etc), plus a number of auxiliary
// classes for internal use.
class PARQUET_EXPORT KeyToolkit {
 public:
  /// KMS client two level cache: token -> KMSInstanceId -> KmsClient
  TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>>& kms_client_cache_per_token() {
    return kms_client_cache_;
  }
  /// Key encryption key two level cache for wrapping: token -> MasterEncryptionKeyId ->
  /// KeyEncryptionKey
  TwoLevelCacheWithExpiration<KeyEncryptionKey>& kek_write_cache_per_token() {
    return key_encryption_key_write_cache_;
  }

  /// Key encryption key two level cache for unwrapping: token -> KeyEncryptionKeyId ->
  /// KeyEncryptionKeyBytes
  TwoLevelCacheWithExpiration<std::string>& kek_read_cache_per_token() {
    return key_encryption_key_read_cache_;
  }

  std::shared_ptr<KmsClient> GetKmsClient(
      const KmsConnectionConfig& kms_connection_config, double cache_entry_lifetime_ms);

  /// Flush any caches that are tied to the (compromised) access_token
  void RemoveCacheEntriesForToken(const std::string& access_token);

  void RemoveCacheEntriesForAllTokens();

  void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory) {
    if (kms_client_factory_ != NULL) {
      throw ParquetException("KMS client factory has already been registered.");
    }
    kms_client_factory_ = kms_client_factory;
  }

 private:
  TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>> kms_client_cache_;
  TwoLevelCacheWithExpiration<KeyEncryptionKey> key_encryption_key_write_cache_;
  TwoLevelCacheWithExpiration<std::string> key_encryption_key_read_cache_;
  std::shared_ptr<KmsClientFactory> kms_client_factory_;
};

}  // namespace encryption
}  // namespace parquet
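A sketch of wiring the toolkit together. The factory argument stands in for a user-provided KmsClientFactory subclass; note that RegisterKmsClientFactory throws if a factory is already registered, and the cache lifetime parameter is in milliseconds per the declaration above.

#include <memory>

#include "parquet/encryption/key_toolkit.h"

void ConfigureToolkit(
    parquet::encryption::KeyToolkit* toolkit,
    std::shared_ptr<parquet::encryption::KmsClientFactory> factory) {
  toolkit->RegisterKmsClientFactory(std::move(factory));
  parquet::encryption::KmsConnectionConfig config;
  config.SetDefaultIfEmpty();
  // Clients are cached per access token and KMS instance (see caches above).
  auto client = toolkit->GetKmsClient(config, /*cache_entry_lifetime_ms=*/600000.0);
  (void)client;
}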
@ -0,0 +1,95 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "arrow/util/mutex.h"

#include "parquet/exception.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

/// This class wraps the key access token of a KMS server. If your token changes over
/// time, you should keep the reference to the KeyAccessToken object and call the
/// Refresh() method every time you have a new token.
class PARQUET_EXPORT KeyAccessToken {
 public:
  KeyAccessToken() = default;

  explicit KeyAccessToken(const std::string value) : value_(value) {}

  void Refresh(const std::string& new_value) {
    auto lock = mutex_.Lock();
    value_ = new_value;
  }

  const std::string& value() const {
    auto lock = mutex_.Lock();
    return value_;
  }

 private:
  std::string value_;
  mutable ::arrow::util::Mutex mutex_;
};

struct PARQUET_EXPORT KmsConnectionConfig {
  std::string kms_instance_id;
  std::string kms_instance_url;
  /// If the access token is changed in the future, you should keep a reference to
  /// this object and call Refresh() on it whenever there is a new access token.
  std::shared_ptr<KeyAccessToken> refreshable_key_access_token;
  std::unordered_map<std::string, std::string> custom_kms_conf;

  KmsConnectionConfig();

  const std::string& key_access_token() const {
    if (refreshable_key_access_token == NULL ||
        refreshable_key_access_token->value().empty()) {
      throw ParquetException("key access token is not set!");
    }
    return refreshable_key_access_token->value();
  }

  void SetDefaultIfEmpty();
};

class PARQUET_EXPORT KmsClient {
 public:
  static constexpr const char kKmsInstanceIdDefault[] = "DEFAULT";
  static constexpr const char kKmsInstanceUrlDefault[] = "DEFAULT";
  static constexpr const char kKeyAccessTokenDefault[] = "DEFAULT";

  /// Wraps a key - encrypts it with the master key, encodes the result
  /// and potentially adds KMS-specific metadata.
  virtual std::string WrapKey(const std::string& key_bytes,
                              const std::string& master_key_identifier) = 0;

  /// Decrypts (unwraps) a key with the master key.
  virtual std::string UnwrapKey(const std::string& wrapped_key,
                                const std::string& master_key_identifier) = 0;
  virtual ~KmsClient() {}
};

}  // namespace encryption
}  // namespace parquet
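For illustration only, a trivially "wrapping" KmsClient subclass that shows the two pure virtual methods a real integration must implement; it provides no actual security and is not an example from the library itself.

#include <string>

#include "parquet/encryption/kms_client.h"

class IdentityKmsClient : public parquet::encryption::KmsClient {
 public:
  std::string WrapKey(const std::string& key_bytes,
                      const std::string& master_key_identifier) override {
    // Placeholder "wrapping": a real client would encrypt with the master key.
    return master_key_identifier + ":" + key_bytes;
  }
  std::string UnwrapKey(const std::string& wrapped_key,
                        const std::string& master_key_identifier) override {
    return wrapped_key.substr(master_key_identifier.size() + 1);
  }
};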
@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

class PARQUET_EXPORT KmsClientFactory {
 public:
  explicit KmsClientFactory(bool wrap_locally = false) : wrap_locally_(wrap_locally) {}

  virtual ~KmsClientFactory() = default;

  virtual std::shared_ptr<KmsClient> CreateKmsClient(
      const KmsConnectionConfig& kms_connection_config) = 0;

 protected:
  bool wrap_locally_;
};

}  // namespace encryption
}  // namespace parquet
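A matching factory sketch; IdentityKmsClient refers to the illustrative client from the previous example, not to anything shipped with the library.

#include <memory>

#include "parquet/encryption/kms_client_factory.h"

class IdentityKmsClientFactory : public parquet::encryption::KmsClientFactory {
 public:
  using KmsClientFactory::KmsClientFactory;  // inherit wrap_locally constructor

  std::shared_ptr<parquet::encryption::KmsClient> CreateKmsClient(
      const parquet::encryption::KmsConnectionConfig& kms_connection_config) override {
    return std::make_shared<IdentityKmsClient>();
  }
};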
@ -0,0 +1,96 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <unordered_map>
#include <vector>

#include "arrow/util/concurrent_map.h"

#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

/// This class supports the local wrapping mode, in which master keys are fetched from
/// the KMS server and used to encrypt other keys (data encryption keys or key
/// encryption keys).
class PARQUET_EXPORT LocalWrapKmsClient : public KmsClient {
 public:
  static constexpr const char kLocalWrapNoKeyVersion[] = "NO_VERSION";

  explicit LocalWrapKmsClient(const KmsConnectionConfig& kms_connection_config);

  std::string WrapKey(const std::string& key_bytes,
                      const std::string& master_key_identifier) override;

  std::string UnwrapKey(const std::string& wrapped_key,
                        const std::string& master_key_identifier) override;

 protected:
  /// Get master key from the remote KMS server.
  /// Note: this function might be called by multiple threads
  virtual std::string GetMasterKeyFromServer(
      const std::string& master_key_identifier) = 0;

 private:
  /// KMS systems wrap keys by encrypting them by master keys, and attaching additional
  /// information (such as the version number of the master key) to the result of
  /// encryption. The master key version is required in key rotation. Currently, the
  /// local wrapping mode does not support key rotation (because not all KMS systems
  /// allow fetching a master key by its ID and version number). Still, the local
  /// wrapping mode adds a placeholder for the master key version, that will enable
  /// support for key rotation in this mode in the future, with appropriate KMS systems.
  /// This will also enable backward compatibility, where future readers will be able
  /// to extract the master key version in the files written by the current code.
  ///
  /// LocalKeyWrap class writes (and reads) the "key wrap" as a flat json with the
  /// following fields:
  /// 1. "masterKeyVersion" - a String, with the master key version. In the current
  ///    version, only one value is allowed - "NO_VERSION".
  /// 2. "encryptedKey" - a String, with the key encrypted by the master key
  ///    (base64-encoded).
  class LocalKeyWrap {
   public:
    static constexpr const char kLocalWrapKeyVersionField[] = "masterKeyVersion";
    static constexpr const char kLocalWrapEncryptedKeyField[] = "encryptedKey";

    LocalKeyWrap(std::string master_key_version, std::string encrypted_encoded_key);

    static std::string CreateSerialized(const std::string& encrypted_encoded_key);

    static LocalKeyWrap Parse(const std::string& wrapped_key);

    const std::string& master_key_version() const { return master_key_version_; }

    const std::string& encrypted_encoded_key() const { return encrypted_encoded_key_; }

   private:
    std::string encrypted_encoded_key_;
    std::string master_key_version_;
  };

  std::string GetKeyFromServer(const std::string& key_identifier);

 protected:
  KmsConnectionConfig kms_connection_config_;
  ::arrow::util::ConcurrentMap<std::string, std::string> master_key_cache_;
};

}  // namespace encryption
}  // namespace parquet
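A sketch of the single hook a local-wrap integration implements: fetching master key bytes by identifier, here from an assumed in-process map. MapBackedLocalWrapKms is a hypothetical name; the base class keeps a master_key_cache_, which presumably avoids repeated fetches.

#include <string>
#include <unordered_map>
#include <utility>

#include "parquet/encryption/local_wrap_kms_client.h"
#include "parquet/exception.h"

class MapBackedLocalWrapKms : public parquet::encryption::LocalWrapKmsClient {
 public:
  MapBackedLocalWrapKms(
      const parquet::encryption::KmsConnectionConfig& kms_connection_config,
      std::unordered_map<std::string, std::string> master_keys)
      : LocalWrapKmsClient(kms_connection_config),
        master_keys_(std::move(master_keys)) {}

 protected:
  // Called from WrapKey/UnwrapKey, possibly by multiple threads.
  std::string GetMasterKeyFromServer(
      const std::string& master_key_identifier) override {
    auto it = master_keys_.find(master_key_identifier);
    if (it == master_keys_.end()) {
      throw parquet::ParquetException("Unknown master key: ", master_key_identifier);
    }
    return it->second;
  }

 private:
  std::unordered_map<std::string, std::string> master_keys_;
};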
@ -0,0 +1,118 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This module defines an abstract interface for iterating through pages in a
// Parquet column chunk within a row group. It could be extended in the future
// to iterate through all data pages in all chunks in a file.

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include <gtest/gtest.h>

#include "arrow/util/io_util.h"

#include "parquet/encryption/encryption.h"
#include "parquet/test_util.h"

namespace parquet {
class ParquetFileReader;
namespace encryption {
namespace test {

using ::arrow::internal::TemporaryDir;

constexpr int kFixedLength = 10;

const char kFooterEncryptionKey[] = "0123456789012345";  // 128bit/16
const char kColumnEncryptionKey1[] = "1234567890123450";
const char kColumnEncryptionKey2[] = "1234567890123451";
const char kFileName[] = "tester";

// Get the path of file inside parquet test data directory
std::string data_file(const char* file);

// A temporary directory that contains the encrypted files generated in the tests.
extern std::unique_ptr<TemporaryDir> temp_dir;

inline ::arrow::Result<std::unique_ptr<TemporaryDir>> temp_data_dir() {
  return TemporaryDir::Make("parquet-encryption-test-");
}

const char kDoubleFieldName[] = "double_field";
const char kFloatFieldName[] = "float_field";
const char kBooleanFieldName[] = "boolean_field";
const char kInt32FieldName[] = "int32_field";
const char kInt64FieldName[] = "int64_field";
const char kInt96FieldName[] = "int96_field";
const char kByteArrayFieldName[] = "ba_field";
const char kFixedLenByteArrayFieldName[] = "flba_field";

const char kFooterMasterKey[] = "0123456789112345";
const char kFooterMasterKeyId[] = "kf";
const char* const kColumnMasterKeys[] = {"1234567890123450", "1234567890123451",
                                         "1234567890123452", "1234567890123453",
                                         "1234567890123454", "1234567890123455"};
const char* const kColumnMasterKeyIds[] = {"kc1", "kc2", "kc3", "kc4", "kc5", "kc6"};

// The result of this function is passed to TestOnlyInMemoryKmsClientFactory
// as the key mapping to look up.
std::unordered_map<std::string, std::string> BuildKeyMap(const char* const* column_ids,
                                                         const char* const* column_keys,
                                                         const char* footer_id,
                                                         const char* footer_key);

// The result of this function is used to set the column keys in an
// EncryptionConfiguration.
std::string BuildColumnKeyMapping();

// FileEncryptor and FileDecryptor are helper classes to write/read an encrypted parquet
// file corresponding to each pair of FileEncryptionProperties/FileDecryptionProperties.
// FileEncryptor writes the file with fixed data values and FileDecryptor reads the file
// and verifies the correctness of data values.
class FileEncryptor {
 public:
  FileEncryptor();

  void EncryptFile(
      std::string file,
      std::shared_ptr<parquet::FileEncryptionProperties> encryption_configurations);

 private:
  std::shared_ptr<schema::GroupNode> SetupEncryptionSchema();

  int num_rowgroups_ = 5;
  int rows_per_rowgroup_ = 50;
  std::shared_ptr<schema::GroupNode> schema_;
};

class FileDecryptor {
 public:
  void DecryptFile(std::string file_name,
                   std::shared_ptr<FileDecryptionProperties> file_decryption_properties);

 private:
  void CheckFile(parquet::ParquetFileReader* file_reader,
                 FileDecryptionProperties* file_decryption_properties);
};

}  // namespace test
}  // namespace encryption
}  // namespace parquet
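The tests presumably assemble the mock key mapping from the constants above roughly as follows; the header name in the include is an assumption about where these declarations live.

#include <string>
#include <unordered_map>

// Assumed header name for the declarations above.
#include "parquet/encryption/test_encryption_util.h"

std::unordered_map<std::string, std::string> MakeTestKeyMap() {
  using namespace parquet::encryption::test;
  return BuildKeyMap(kColumnMasterKeyIds, kColumnMasterKeys, kFooterMasterKeyId,
                     kFooterMasterKey);
}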
@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <unordered_map>

#include "arrow/util/base64.h"

#include "parquet/encryption/kms_client_factory.h"
#include "parquet/encryption/local_wrap_kms_client.h"
#include "parquet/platform.h"

namespace parquet {
namespace encryption {

// This is a mock class, built for testing only. Don't use it as an example of a
// LocalWrapKmsClient implementation.
class TestOnlyLocalWrapInMemoryKms : public LocalWrapKmsClient {
 public:
  explicit TestOnlyLocalWrapInMemoryKms(const KmsConnectionConfig& kms_connection_config);

  static void InitializeMasterKeys(
      const std::unordered_map<std::string, std::string>& master_keys_map);

 protected:
  std::string GetMasterKeyFromServer(const std::string& master_key_identifier) override;

 private:
  static std::unordered_map<std::string, std::string> master_key_map_;
};

// This is a mock class, built for testing only. Don't use it as an example of a
// KmsClient implementation.
class TestOnlyInServerWrapKms : public KmsClient {
 public:
  static void InitializeMasterKeys(
      const std::unordered_map<std::string, std::string>& master_keys_map);

  std::string WrapKey(const std::string& key_bytes,
                      const std::string& master_key_identifier) override;

  std::string UnwrapKey(const std::string& wrapped_key,
                        const std::string& master_key_identifier) override;

 private:
  std::string GetMasterKeyFromServer(const std::string& master_key_identifier);

  static std::unordered_map<std::string, std::string> master_key_map_;
};

// This is a mock class, built for testing only. Don't use it as an example of a
// KmsClientFactory implementation.
class TestOnlyInMemoryKmsClientFactory : public KmsClientFactory {
 public:
  TestOnlyInMemoryKmsClientFactory(
      bool wrap_locally,
      const std::unordered_map<std::string, std::string>& master_keys_map)
      : KmsClientFactory(wrap_locally) {
    TestOnlyLocalWrapInMemoryKms::InitializeMasterKeys(master_keys_map);
    TestOnlyInServerWrapKms::InitializeMasterKeys(master_keys_map);
  }

  std::shared_ptr<KmsClient> CreateKmsClient(
      const KmsConnectionConfig& kms_connection_config) {
    if (wrap_locally_) {
      return std::make_shared<TestOnlyLocalWrapInMemoryKms>(kms_connection_config);
    } else {
      return std::make_shared<TestOnlyInServerWrapKms>();
    }
  }
};

}  // namespace encryption
}  // namespace parquet
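The mock keys would then plug into the test-only factory, e.g. in local wrapping mode; MakeTestKeyMap is the hypothetical helper from the previous sketch.

#include <memory>

std::shared_ptr<parquet::encryption::KmsClientFactory> MakeTestFactory() {
  return std::make_shared<parquet::encryption::TestOnlyInMemoryKmsClientFactory>(
      /*wrap_locally=*/true, MakeTestKeyMap());
}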
@ -0,0 +1,159 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <chrono>
#include <unordered_map>

#include "arrow/util/concurrent_map.h"
#include "arrow/util/mutex.h"

namespace parquet {
namespace encryption {

using ::arrow::util::ConcurrentMap;

namespace internal {

using TimePoint =
    std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<double>>;

inline TimePoint CurrentTimePoint() { return std::chrono::system_clock::now(); }

template <typename E>
class ExpiringCacheEntry {
 public:
  ExpiringCacheEntry() = default;

  ExpiringCacheEntry(E cached_item, double expiration_interval_seconds)
      : expiration_timestamp_(CurrentTimePoint() +
                              std::chrono::duration<double>(expiration_interval_seconds)),
        cached_item_(std::move(cached_item)) {}

  bool IsExpired() const {
    const auto now = CurrentTimePoint();
    return (now > expiration_timestamp_);
  }

  E cached_item() { return cached_item_; }

 private:
  const TimePoint expiration_timestamp_;
  E cached_item_;
};

// This class is to avoid the below warning when compiling KeyToolkit class with VS2015
// warning C4503: decorated name length exceeded, name was truncated
template <typename V>
class ExpiringCacheMapEntry {
 public:
  ExpiringCacheMapEntry() = default;

  explicit ExpiringCacheMapEntry(
      std::shared_ptr<ConcurrentMap<std::string, V>> cached_item,
      double expiration_interval_seconds)
      : map_cache_(cached_item, expiration_interval_seconds) {}

  bool IsExpired() { return map_cache_.IsExpired(); }

  std::shared_ptr<ConcurrentMap<std::string, V>> cached_item() {
    return map_cache_.cached_item();
  }

 private:
  // ConcurrentMap object may be accessed and modified at many places at the same time,
  // from multiple threads, or even removed from cache.
  ExpiringCacheEntry<std::shared_ptr<ConcurrentMap<std::string, V>>> map_cache_;
};

}  // namespace internal

// Two-level cache with expiration of internal caches according to token lifetime.
// External cache is per token, internal is per string key.
// Wrapper class around:
//   std::unordered_map<std::string,
//       internal::ExpiringCacheEntry<std::unordered_map<std::string, V>>>
// This cache is safe to be shared between threads.
template <typename V>
class TwoLevelCacheWithExpiration {
 public:
  TwoLevelCacheWithExpiration() {
    last_cache_cleanup_timestamp_ = internal::CurrentTimePoint();
  }

  std::shared_ptr<ConcurrentMap<std::string, V>> GetOrCreateInternalCache(
      const std::string& access_token, double cache_entry_lifetime_seconds) {
    auto lock = mutex_.Lock();

    auto external_cache_entry = cache_.find(access_token);
    if (external_cache_entry == cache_.end() ||
        external_cache_entry->second.IsExpired()) {
      cache_.insert({access_token, internal::ExpiringCacheMapEntry<V>(
                                       std::shared_ptr<ConcurrentMap<std::string, V>>(
                                           new ConcurrentMap<std::string, V>()),
                                       cache_entry_lifetime_seconds)});
    }

    return cache_[access_token].cached_item();
  }

  void CheckCacheForExpiredTokens(double cache_cleanup_period_seconds) {
    auto lock = mutex_.Lock();

    const auto now = internal::CurrentTimePoint();
    if (now > (last_cache_cleanup_timestamp_ +
               std::chrono::duration<double>(cache_cleanup_period_seconds))) {
      RemoveExpiredEntriesNoMutex();
      last_cache_cleanup_timestamp_ =
          now + std::chrono::duration<double>(cache_cleanup_period_seconds);
    }
  }

  void RemoveExpiredEntriesFromCache() {
    auto lock = mutex_.Lock();

    RemoveExpiredEntriesNoMutex();
  }

  void Remove(const std::string& access_token) {
    auto lock = mutex_.Lock();
    cache_.erase(access_token);
  }

  void Clear() {
    auto lock = mutex_.Lock();
    cache_.clear();
  }

 private:
  void RemoveExpiredEntriesNoMutex() {
    for (auto it = cache_.begin(); it != cache_.end();) {
      if (it->second.IsExpired()) {
        it = cache_.erase(it);
      } else {
        ++it;
      }
    }
  }
  std::unordered_map<std::string, internal::ExpiringCacheMapEntry<V>> cache_;
  internal::TimePoint last_cache_cleanup_timestamp_;
  ::arrow::util::Mutex mutex_;
};

}  // namespace encryption
}  // namespace parquet
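A usage sketch of the per-token cache above: one internal map per access token, expiring with the token lifetime. The Insert call assumes arrow::util::ConcurrentMap provides an Insert(key, value) method.

#include <string>

#include "parquet/encryption/two_level_cache_with_expiration.h"

void CacheKekForToken(
    parquet::encryption::TwoLevelCacheWithExpiration<std::string>* cache,
    const std::string& access_token) {
  // One internal map per access token; it expires with the token lifetime.
  auto per_token = cache->GetOrCreateInternalCache(
      access_token, /*cache_entry_lifetime_seconds=*/600);
  per_token->Insert("kek-id", "kek-bytes");  // assumes ConcurrentMap::Insert
  // Periodic housekeeping; only sweeps once per cleanup period.
  cache->CheckCacheForExpiredTokens(/*cache_cleanup_period_seconds=*/60);
}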
158
.venv/Lib/site-packages/pyarrow/include/parquet/exception.h
Normal file
@ -0,0 +1,158 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <exception>
#include <sstream>
#include <string>
#include <utility>

#include "arrow/type_fwd.h"
#include "arrow/util/string_builder.h"
#include "parquet/platform.h"

// PARQUET-1085
#if !defined(ARROW_UNUSED)
#define ARROW_UNUSED(x) UNUSED(x)
#endif

// Parquet exception to Arrow Status

#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
#define END_PARQUET_CATCH_EXCEPTIONS                   \
  }                                                    \
  catch (const ::parquet::ParquetStatusException& e) { \
    return e.status();                                 \
  }                                                    \
  catch (const ::parquet::ParquetException& e) {       \
    return ::arrow::Status::IOError(e.what());         \
  }

// clang-format off

#define PARQUET_CATCH_NOT_OK(s)  \
  BEGIN_PARQUET_CATCH_EXCEPTIONS \
  (s);                           \
  END_PARQUET_CATCH_EXCEPTIONS

// clang-format on

#define PARQUET_CATCH_AND_RETURN(s) \
  BEGIN_PARQUET_CATCH_EXCEPTIONS    \
  return (s);                       \
  END_PARQUET_CATCH_EXCEPTIONS

// Arrow Status to Parquet exception

#define PARQUET_IGNORE_NOT_OK(s)                                \
  do {                                                          \
    ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
    ARROW_UNUSED(_s);                                           \
  } while (0)

#define PARQUET_THROW_NOT_OK(s)                                 \
  do {                                                          \
    ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
    if (!_s.ok()) {                                             \
      throw ::parquet::ParquetStatusException(std::move(_s));   \
    }                                                           \
  } while (0)

#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
  auto status_name = (rexpr);                                 \
  PARQUET_THROW_NOT_OK(status_name.status());                 \
  lhs = std::move(status_name).ValueOrDie();

#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr)                                              \
  PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
                               lhs, rexpr);

namespace parquet {

class ParquetException : public std::exception {
 public:
  PARQUET_NORETURN static void EofException(const std::string& msg = "") {
    static std::string prefix = "Unexpected end of stream";
    if (msg.empty()) {
      throw ParquetException(prefix);
    }
    throw ParquetException(prefix, ": ", msg);
  }

  PARQUET_NORETURN static void NYI(const std::string& msg = "") {
    throw ParquetException("Not yet implemented: ", msg, ".");
  }

  template <typename... Args>
  explicit ParquetException(Args&&... args)
      : msg_(::arrow::util::StringBuilder(std::forward<Args>(args)...)) {}

  explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}

  explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}

  ParquetException(const ParquetException&) = default;
  ParquetException& operator=(const ParquetException&) = default;
  ParquetException(ParquetException&&) = default;
  ParquetException& operator=(ParquetException&&) = default;

  const char* what() const noexcept override { return msg_.c_str(); }

 private:
  std::string msg_;
};

// Support printing a ParquetException.
// This is needed for clang-on-MSVC as there operator<< is not defined for
// std::exception.
PARQUET_EXPORT
std::ostream& operator<<(std::ostream& os, const ParquetException& exception);

class ParquetStatusException : public ParquetException {
 public:
  explicit ParquetStatusException(::arrow::Status status)
      : ParquetException(status.ToString()), status_(std::move(status)) {}

  const ::arrow::Status& status() const { return status_; }

 private:
  ::arrow::Status status_;
};

// This class exists for the purpose of detecting an invalid or corrupted file.
class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
 public:
  ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
      default;

  template <typename Arg,
            typename std::enable_if<
                !std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
                int>::type = 0,
            typename... Args>
  explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
      : ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
                                                        std::forward<Args>(args)...)) {}
};

template <typename StatusReturnBlock>
void ThrowNotOk(StatusReturnBlock&& b) {
  PARQUET_THROW_NOT_OK(b());
}

}  // namespace parquet
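Typical use of the status-to-exception bridges above, e.g. unwrapping an arrow::Result at a throwing boundary:

#include <memory>
#include <string>

#include "arrow/io/file.h"
#include "parquet/exception.h"

std::shared_ptr<::arrow::io::ReadableFile> OpenOrThrow(const std::string& path) {
  std::shared_ptr<::arrow::io::ReadableFile> file;
  // Throws ParquetStatusException if the underlying Status is not OK.
  PARQUET_ASSIGN_OR_THROW(file, ::arrow::io::ReadableFile::Open(path));
  return file;
}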
188
.venv/Lib/site-packages/pyarrow/include/parquet/file_reader.h
Normal file
@ -0,0 +1,188 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "arrow/io/caching.h"
#include "arrow/util/type_fwd.h"
#include "parquet/metadata.h"  // IWYU pragma: keep
#include "parquet/platform.h"
#include "parquet/properties.h"

namespace parquet {

class ColumnReader;
class FileMetaData;
class PageReader;
class RowGroupMetaData;

class PARQUET_EXPORT RowGroupReader {
 public:
  // Forward declare a virtual class 'Contents' to aid dependency injection and more
  // easily create test fixtures
  // An implementation of the Contents class is defined in the .cc file
  struct Contents {
    virtual ~Contents() {}
    virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
    virtual const RowGroupMetaData* metadata() const = 0;
    virtual const ReaderProperties* properties() const = 0;
  };

  explicit RowGroupReader(std::unique_ptr<Contents> contents);

  // Returns the rowgroup metadata
  const RowGroupMetaData* metadata() const;

  // Construct a ColumnReader for the indicated row group-relative
  // column. Ownership is shared with the RowGroupReader.
  std::shared_ptr<ColumnReader> Column(int i);

  // Construct a ColumnReader, trying to enable exposed encoding.
  //
  // For dictionary encoding, currently we only support column chunks that are fully
  // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
  // If a column chunk uses dictionary encoding but then falls back to plain encoding,
  // the encoding will not be exposed.
  //
  // The returned column reader provides an API GetExposedEncoding() for the
  // users to check the exposed encoding and determine how to read the batches.
  //
  // \note API EXPERIMENTAL
  std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
      int i, ExposedEncoding encoding_to_expose);

  std::unique_ptr<PageReader> GetColumnPageReader(int i);

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
};

class PARQUET_EXPORT ParquetFileReader {
 public:
  // Declare a virtual class 'Contents' to aid dependency injection and more
  // easily create test fixtures
  // An implementation of the Contents class is defined in the .cc file
  struct PARQUET_EXPORT Contents {
    static std::unique_ptr<Contents> Open(
        std::shared_ptr<::arrow::io::RandomAccessFile> source,
        const ReaderProperties& props = default_reader_properties(),
        std::shared_ptr<FileMetaData> metadata = NULLPTR);

    static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
        std::shared_ptr<::arrow::io::RandomAccessFile> source,
        const ReaderProperties& props = default_reader_properties(),
        std::shared_ptr<FileMetaData> metadata = NULLPTR);

    virtual ~Contents() = default;
    // Perform any cleanup associated with the file contents
    virtual void Close() = 0;
    virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
    virtual std::shared_ptr<FileMetaData> metadata() const = 0;
  };

  ParquetFileReader();
  ~ParquetFileReader();

  // Create a file reader instance from an Arrow file object. Thread-safety is
  // the responsibility of the file implementation
  static std::unique_ptr<ParquetFileReader> Open(
      std::shared_ptr<::arrow::io::RandomAccessFile> source,
      const ReaderProperties& props = default_reader_properties(),
      std::shared_ptr<FileMetaData> metadata = NULLPTR);

  // API Convenience to open a serialized Parquet file on disk, using Arrow IO
  // interfaces.
  static std::unique_ptr<ParquetFileReader> OpenFile(
      const std::string& path, bool memory_map = true,
      const ReaderProperties& props = default_reader_properties(),
      std::shared_ptr<FileMetaData> metadata = NULLPTR);

  // Asynchronously open a file reader from an Arrow file object.
  // Does not throw - all errors are reported through the Future.
  static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
      std::shared_ptr<::arrow::io::RandomAccessFile> source,
      const ReaderProperties& props = default_reader_properties(),
      std::shared_ptr<FileMetaData> metadata = NULLPTR);

  void Open(std::unique_ptr<Contents> contents);
  void Close();

  // The RowGroupReader is owned by the FileReader
  std::shared_ptr<RowGroupReader> RowGroup(int i);

  // Returns the file metadata. Only one instance is ever created
  std::shared_ptr<FileMetaData> metadata() const;

  /// Pre-buffer the specified column indices in all row groups.
  ///
  /// Readers can optionally call this to cache the necessary slices
  /// of the file in-memory before deserialization. Arrow readers can
  /// automatically do this via an option. This is intended to
  /// increase performance when reading from high-latency filesystems
  /// (e.g. Amazon S3).
  ///
  /// After calling this, creating readers for row groups/column
  /// indices that were not buffered may fail. Creating multiple
  /// readers for a subset of the buffered regions is
  /// acceptable. This may be called again to buffer a different set
  /// of row groups/columns.
  ///
  /// If memory usage is a concern, note that data will remain
  /// buffered in memory until either \a PreBuffer() is called again,
  /// or the reader itself is destructed. Reading - and buffering -
  /// only one row group at a time may be useful.
  ///
  /// This method may throw.
  void PreBuffer(const std::vector<int>& row_groups,
                 const std::vector<int>& column_indices,
                 const ::arrow::io::IOContext& ctx,
                 const ::arrow::io::CacheOptions& options);

  /// Wait for the specified row groups and column indices to be pre-buffered.
  ///
  /// After the returned Future completes, reading the specified row
  /// groups/columns will not block.
  ///
  /// PreBuffer must be called first. This method does not throw.
  ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
                                 const std::vector<int>& column_indices) const;

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
};

// Read only Parquet file metadata
std::shared_ptr<FileMetaData> PARQUET_EXPORT
ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);

/// \brief Scan all values in file. Useful for performance testing
/// \param[in] columns the column numbers to scan. If empty scans all
/// \param[in] column_batch_size number of values to read at a time when scanning column
/// \param[in] reader a ParquetFileReader instance
/// \return number of semantic rows in file
PARQUET_EXPORT
int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
                         ParquetFileReader* reader);

}  // namespace parquet
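A minimal read path against the declarations above:

#include <memory>
#include <string>

#include "parquet/file_reader.h"

void InspectFile(const std::string& path) {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path);
  std::shared_ptr<parquet::FileMetaData> metadata = reader->metadata();
  for (int i = 0; i < metadata->num_row_groups(); ++i) {
    std::shared_ptr<parquet::RowGroupReader> row_group = reader->RowGroup(i);
    // Per-column readers would be obtained via row_group->Column(j).
    (void)row_group;
  }
  reader->Close();
}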
234
.venv/Lib/site-packages/pyarrow/include/parquet/file_writer.h
Normal file
@ -0,0 +1,234 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "parquet/metadata.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/properties.h"
|
||||
#include "parquet/schema.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class ColumnWriter;
|
||||
|
||||
// FIXME: copied from reader-internal.cc
|
||||
static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
|
||||
static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
|
||||
|
||||
class PARQUET_EXPORT RowGroupWriter {
|
||||
public:
|
||||
// Forward declare a virtual class 'Contents' to aid dependency injection and more
|
||||
// easily create test fixtures
|
||||
// An implementation of the Contents class is defined in the .cc file
|
||||
struct Contents {
|
||||
virtual ~Contents() = default;
|
||||
virtual int num_columns() const = 0;
|
||||
virtual int64_t num_rows() const = 0;
|
||||
|
||||
// to be used only with ParquetFileWriter::AppendRowGroup
|
||||
virtual ColumnWriter* NextColumn() = 0;
|
||||
// to be used only with ParquetFileWriter::AppendBufferedRowGroup
|
||||
virtual ColumnWriter* column(int i) = 0;
|
||||
|
||||
virtual int current_column() const = 0;
|
||||
virtual void Close() = 0;
|
||||
|
||||
// total bytes written by the page writer
|
||||
virtual int64_t total_bytes_written() const = 0;
|
||||
// total bytes still compressed but not written
|
||||
virtual int64_t total_compressed_bytes() const = 0;
|
||||
};
|
||||
|
||||
explicit RowGroupWriter(std::unique_ptr<Contents> contents);
|
||||
|
||||
/// Construct a ColumnWriter for the indicated row group-relative column.
|
||||
///
|
||||
/// To be used only with ParquetFileWriter::AppendRowGroup
|
||||
/// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
|
||||
/// valid until the next call to NextColumn or Close. As the contents are
|
||||
/// directly written to the sink, once a new column is started, the contents
|
||||
/// of the previous one cannot be modified anymore.
|
||||
ColumnWriter* NextColumn();
|
||||
/// Index of currently written column. Equal to -1 if NextColumn()
|
||||
/// has not been called yet.
|
||||
int current_column();
|
||||
void Close();
|
||||
|
||||
int num_columns() const;
|
||||
|
||||
/// Construct a ColumnWriter for the indicated row group column.
|
||||
///
|
||||
/// To be used only with ParquetFileWriter::AppendBufferedRowGroup
|
||||
/// Ownership is solely within the RowGroupWriter. The ColumnWriter is
|
||||
/// valid until Close. The contents are buffered in memory and written to sink
|
||||
/// on Close
|
||||
ColumnWriter* column(int i);
|
||||
|
||||
/**
|
||||
   * Number of rows that shall be written as part of this RowGroup.
   */
  int64_t num_rows() const;

  int64_t total_bytes_written() const;
  int64_t total_compressed_bytes() const;

 private:
  // Holds a pointer to an instance of a Contents implementation
  std::unique_ptr<Contents> contents_;
};

PARQUET_EXPORT
void WriteFileMetaData(const FileMetaData& file_metadata,
                       ::arrow::io::OutputStream* sink);

PARQUET_EXPORT
void WriteMetaDataFile(const FileMetaData& file_metadata,
                       ::arrow::io::OutputStream* sink);

PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
                                ArrowOutputStream* sink,
                                const std::shared_ptr<Encryptor>& encryptor,
                                bool encrypt_footer);

PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
                                ::arrow::io::OutputStream* sink,
                                const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
                                bool encrypt_footer = false);

PARQUET_EXPORT
void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
                             ::arrow::io::OutputStream* sink);

class PARQUET_EXPORT ParquetFileWriter {
 public:
  // Forward declare a virtual class 'Contents' to aid dependency injection and to more
  // easily create test fixtures.
  // An implementation of the Contents class is defined in the .cc file.
  struct Contents {
    Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
             std::shared_ptr<const KeyValueMetadata> key_value_metadata)
        : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
      schema_.Init(std::move(schema));
    }
    virtual ~Contents() {}
    // Perform any cleanup associated with the file contents
    virtual void Close() = 0;

    /// \note Deprecated since 1.3.0
    RowGroupWriter* AppendRowGroup(int64_t num_rows);

    virtual RowGroupWriter* AppendRowGroup() = 0;
    virtual RowGroupWriter* AppendBufferedRowGroup() = 0;

    virtual int64_t num_rows() const = 0;
    virtual int num_columns() const = 0;
    virtual int num_row_groups() const = 0;

    virtual const std::shared_ptr<WriterProperties>& properties() const = 0;

    const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
      return key_value_metadata_;
    }

    // Return const-pointer to make it clear that this object is not to be copied
    const SchemaDescriptor* schema() const { return &schema_; }

    SchemaDescriptor schema_;

    /// This should be the only place this is stored. Everything else is a const reference
    std::shared_ptr<const KeyValueMetadata> key_value_metadata_;

    const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
    std::shared_ptr<FileMetaData> file_metadata_;
  };

  ParquetFileWriter();
  ~ParquetFileWriter();

  static std::unique_ptr<ParquetFileWriter> Open(
      std::shared_ptr<::arrow::io::OutputStream> sink,
      std::shared_ptr<schema::GroupNode> schema,
      std::shared_ptr<WriterProperties> properties = default_writer_properties(),
      std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);

  void Open(std::unique_ptr<Contents> contents);
  void Close();

  // Construct a RowGroupWriter for the indicated number of rows.
  //
  // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
  // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
  // @param num_rows The number of rows that are stored in the new RowGroup
  //
  // \deprecated Since 1.3.0
  RowGroupWriter* AppendRowGroup(int64_t num_rows);

  /// Construct a RowGroupWriter with an arbitrary number of rows.
  ///
  /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
  /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
  RowGroupWriter* AppendRowGroup();

  /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
  /// Use this if you want to write a RowGroup based on a certain size.
  ///
  /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
  /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
  RowGroupWriter* AppendBufferedRowGroup();

  /// Number of columns.
  ///
  /// This number is fixed during the lifetime of the writer as it is determined via
  /// the schema.
  int num_columns() const;

  /// Number of rows in the RowGroups started so far.
  ///
  /// Changes on the addition of a new RowGroup.
  int64_t num_rows() const;

  /// Number of started RowGroups.
  int num_row_groups() const;

  /// Configuration passed to the writer, e.g. the used Parquet format version.
  const std::shared_ptr<WriterProperties>& properties() const;

  /// Returns the file schema descriptor
  const SchemaDescriptor* schema() const;

  /// Returns a column descriptor in the schema
  const ColumnDescriptor* descr(int i) const;

  /// Returns the file custom metadata
  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;

  /// Returns the file metadata, only available after calling Close().
  const std::shared_ptr<FileMetaData> metadata() const;

 private:
  // Holds a pointer to an instance of a Contents implementation
  std::unique_ptr<Contents> contents_;
  std::shared_ptr<FileMetaData> file_metadata_;
};

} // namespace parquet
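As a quick illustration of the writer API declared above — a minimal sketch, not taken from the library's documentation: the output file name `example.parquet` and the one-column schema are made up; Int32Writer, PARQUET_ASSIGN_OR_THROW, and default_writer_properties() come from the surrounding parquet headers.

#include <memory>

#include "arrow/io/file.h"
#include "parquet/api/writer.h"  // pulls in file_writer.h, schema.h, properties.h

int main() {
  using parquet::schema::GroupNode;
  using parquet::schema::PrimitiveNode;

  // Hypothetical one-column schema: a required INT32 field named "x".
  parquet::schema::NodeVector fields;
  fields.push_back(PrimitiveNode::Make("x", parquet::Repetition::REQUIRED,
                                       parquet::Type::INT32));
  auto schema = std::static_pointer_cast<GroupNode>(
      GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));

  std::shared_ptr<::arrow::io::FileOutputStream> sink;
  PARQUET_ASSIGN_OR_THROW(sink, ::arrow::io::FileOutputStream::Open("example.parquet"));

  // Mirrors the static factory above; properties default to default_writer_properties().
  std::unique_ptr<parquet::ParquetFileWriter> writer =
      parquet::ParquetFileWriter::Open(sink, schema);

  // One buffered row group; column writers are obtained from the RowGroupWriter.
  parquet::RowGroupWriter* rg = writer->AppendBufferedRowGroup();
  auto* col = static_cast<parquet::Int32Writer*>(rg->column(0));
  for (int32_t i = 0; i < 100; ++i) {
    col->WriteBatch(1, nullptr, nullptr, &i);  // no def/rep levels: required field
  }

  writer->Close();  // finalizes row groups and writes the footer metadata
  return 0;
}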
72
.venv/Lib/site-packages/pyarrow/include/parquet/hasher.h
Normal file
@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include "parquet/types.h"

namespace parquet {
// Abstract class for hash
class Hasher {
 public:
  /// Compute the hash for a 32-bit value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(int32_t value) const = 0;

  /// Compute the hash for a 64-bit value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(int64_t value) const = 0;

  /// Compute the hash for a float value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(float value) const = 0;

  /// Compute the hash for a double value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(double value) const = 0;

  /// Compute the hash for an Int96 value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(const Int96* value) const = 0;

  /// Compute the hash for a ByteArray value using its plain encoding result.
  ///
  /// @param value the value to hash.
  /// @return hash result.
  virtual uint64_t Hash(const ByteArray* value) const = 0;

  /// Compute the hash for a fixed-length byte array value using its plain
  /// encoding result.
  ///
  /// @param value the value address.
  /// @param len the value length.
  /// @return hash result.
  virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;

  virtual ~Hasher() = default;
};

} // namespace parquet
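Hasher is consumed polymorphically (for example by Bloom filter code). A minimal sketch of writing against the interface, using the concrete MurmurHash3 implementation declared in murmur3.h further below; the values are made up.

#include <cstdint>
#include <iostream>

#include "parquet/murmur3.h"  // concrete Hasher, declared later in this dump
#include "parquet/types.h"

// Any code written against Hasher works with every concrete hash.
uint64_t HashAll(const parquet::Hasher& hasher, const int32_t* values, int n) {
  uint64_t combined = 0;
  for (int i = 0; i < n; ++i) {
    combined ^= hasher.Hash(values[i]);  // dispatches to the int32_t overload
  }
  return combined;
}

int main() {
  parquet::MurmurHash3 murmur;
  const int32_t values[] = {1, 2, 3};
  std::cout << HashAll(murmur, values, 3) << std::endl;
  return 0;
}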
@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once

#include <algorithm>
#include <cstdint>

#include "parquet/platform.h"

namespace parquet {
namespace internal {

/// Builds a bitmap where each set bit indicates the corresponding level is greater
/// than rhs.
uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
                                          int16_t rhs);

struct MinMax {
  int16_t min;
  int16_t max;
};

MinMax FindMinMax(const int16_t* levels, int64_t num_levels);

} // namespace internal
} // namespace parquet
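To make the contract concrete — a small sketch of the expected bit pattern (least-significant-bit-first ordering), assuming the program is built against these headers:

#include <cstdint>
#include <iostream>

#include "parquet/level_comparison.h"

int main() {
  // Definition levels for four values; bits are set where level > 1.
  const int16_t levels[] = {0, 1, 2, 2};
  uint64_t bitmap = parquet::internal::GreaterThanBitmap(levels, 4, /*rhs=*/1);
  // LSB-first: levels[2] and levels[3] qualify -> 0b1100.
  std::cout << std::hex << bitmap << std::endl;  // prints c
  return 0;
}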
@ -0,0 +1,65 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once

#include <algorithm>
#include <limits>

#include "arrow/util/bit_util.h"
#include "arrow/util/endian.h"
#include "parquet/level_comparison.h"

// Used to make sure the ODR rule isn't violated.
#ifndef PARQUET_IMPL_NAMESPACE
#error "PARQUET_IMPL_NAMESPACE must be defined"
#endif
namespace parquet {
namespace internal {
namespace PARQUET_IMPL_NAMESPACE {
/// Builds a bitmap by applying the predicate to the level vector provided.
///
/// \param[in] levels Rep or def level array.
/// \param[in] num_levels The number of levels to process (must be in [0, 64]).
/// \param[in] predicate The predicate to apply (must have the signature `bool
///   predicate(int16_t)`).
/// \returns The bitmap using least significant "bit" ordering.
///
template <typename Predicate>
inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
                               Predicate predicate) {
  // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
  uint64_t mask = 0;
  for (int x = 0; x < num_levels; x++) {
    mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
  }
  return ::arrow::bit_util::ToLittleEndian(mask);
}

inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
  MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
  for (int x = 0; x < num_levels; x++) {
    out.min = std::min(levels[x], out.min);
    out.max = std::max(levels[x], out.max);
  }
  return out;
}

inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
                                      int16_t rhs) {
  return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
}

} // namespace PARQUET_IMPL_NAMESPACE
} // namespace internal
} // namespace parquet
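The template above is internal (it lives behind PARQUET_IMPL_NAMESPACE), but its LSB-first packing is easy to replicate standalone — a sketch mirroring the loop above; the host is assumed little-endian so the ToLittleEndian step is a no-op here.

#include <cstdint>
#include <iostream>

// Standalone replica of the LevelsToBitmap packing shown above.
template <typename Predicate>
uint64_t LevelsToBitmapDemo(const int16_t* levels, int64_t num_levels, Predicate pred) {
  uint64_t mask = 0;
  for (int64_t x = 0; x < num_levels; x++) {
    mask |= static_cast<uint64_t>(pred(levels[x]) ? 1 : 0) << x;
  }
  return mask;
}

int main() {
  const int16_t levels[] = {3, 0, 2, 3};
  // Bits are set where level >= 3 -> positions 0 and 3 -> 0b1001 = 9.
  std::cout << LevelsToBitmapDemo(levels, 4, [](int16_t v) { return v >= 3; })
            << std::endl;
  return 0;
}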
@ -0,0 +1,199 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>

#include "arrow/util/endian.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

namespace parquet {
namespace internal {

struct PARQUET_EXPORT LevelInfo {
  LevelInfo()
      : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
  LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
            int32_t repeated_ancestor_definition_level)
      : null_slot_usage(null_slots),
        def_level(definition_level),
        rep_level(repetition_level),
        repeated_ancestor_def_level(repeated_ancestor_definition_level) {}

  bool operator==(const LevelInfo& b) const {
    return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
           rep_level == b.rep_level &&
           repeated_ancestor_def_level == b.repeated_ancestor_def_level;
  }

  bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }

  // How many slots an undefined but present (i.e. null) element in
  // parquet consumes when decoding to Arrow.
  // "Slot" is used in the same context as the Arrow specification
  // (i.e. a value holder).
  // This is only ever >1 for descendants of FixedSizeList.
  int32_t null_slot_usage = 1;

  // The definition level at which the value for the field
  // is considered not null (definition levels greater than
  // or equal to this value indicate a not-null
  // value for the field). For list fields, definition levels
  // greater than or equal to this value indicate a present,
  // possibly null, child value.
  int16_t def_level = 0;

  // The repetition level corresponding to this element
  // or the closest repeated ancestor. Any repetition
  // level less than this indicates either a new list OR
  // an empty list (which is determined in conjunction
  // with definition levels).
  int16_t rep_level = 0;

  // The definition level indicating the level at which the closest
  // repeated ancestor is not empty. This is used to discriminate
  // between a value less than |def_level| being null or excluded entirely.
  // For instance, consider an arrow schema like:
  // list(struct(f0: int)). Then there are the following
  // definition levels:
  //   0 = null list
  //   1 = present but empty list
  //   2 = a null value in the list
  //   3 = a non-null struct but null integer
  //   4 = a present integer
  // When reconstructing, the struct and integer arrays'
  // repeated_ancestor_def_level would be 2. Any
  // def_level < 2 indicates that there isn't a corresponding
  // child value in the list.
  // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
  // has the def levels [0, 1, 2, 3, 4]. The actual
  // struct array is only of length 3: [not-set, set, set] and
  // the int array is also of length 3: [N/A, null, 1].
  int16_t repeated_ancestor_def_level = 0;

  /// Increments levels according to the cardinality of the node.
  void Increment(const schema::Node& node) {
    if (node.is_repeated()) {
      IncrementRepeated();
      return;
    }
    if (node.is_optional()) {
      IncrementOptional();
      return;
    }
  }

  /// Increments the definition level for an optional node.
  void IncrementOptional() { def_level++; }

  /// Increments levels for a repeated node. Returns
  /// the previous repeated_ancestor_def_level.
  int16_t IncrementRepeated() {
    int16_t last_repeated_ancestor = repeated_ancestor_def_level;

    // Repeated fields add both a repetition and definition level. This is used
    // to distinguish between an empty list and a list with an item in it.
    ++rep_level;
    ++def_level;
    // Levels >= repeated_ancestor_def_level indicate the list was
    // non-null and had at least one element. This is important
    // for later decoding because we need to add a slot for these
    // values. For levels < the current def_level, no slots are added
    // to arrays.
    repeated_ancestor_def_level = def_level;
    return last_repeated_ancestor;
  }

  friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
    // This print method is to silence valgrind issues. What's printed
    // is not important because all asserts happen directly on
    // members.
    os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
       << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
    if (levels.null_slot_usage > 1) {
      os << ", null_slot_usage=" << levels.null_slot_usage;
    }
    os << "}";
    return os;
  }
};

// Input/Output structure for reconstructed validity bitmaps.
struct PARQUET_EXPORT ValidityBitmapInputOutput {
  // Input only.
  // The maximum number of values_read expected (actual
  // values read must be less than or equal to this value).
  // If this number is exceeded, methods will throw a
  // ParquetException. Exceeding this limit indicates
  // either a corrupt or incorrectly written file.
  int64_t values_read_upper_bound = 0;
  // Output only. The number of values encountered (logically,
  // this is the element count of the resulting Arrow array).
  int64_t values_read = 0;
  // Input/Output. The number of nulls encountered.
  int64_t null_count = 0;
  // Output only. The validity bitmap to populate. May be null only
  // for DefRepLevelsToListInfo (if all that is needed is list offsets).
  uint8_t* valid_bits = NULLPTR;
  // Input only, offset into valid_bits to start at.
  int64_t valid_bits_offset = 0;
};

// Converts def_levels to validity bitmaps for non-list arrays and structs that have
// at least one member that is not a list and has no list descendants.
// For lists, use DefRepLevelsToList; for structs where all descendants contain
// a list, use DefRepLevelsToBitmap.
void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
                                      LevelInfo level_info,
                                      ValidityBitmapInputOutput* output);

// Reconstructs a validity bitmap and list offsets for list arrays based on
// def/rep levels. The first element of offsets will not be modified if rep_levels
// starts with a new list. The first element of offsets will be used when calculating
// the next offset. See the documentation on DefLevelsToBitmap for when to use this
// method vs. the other ones in this file for reconstruction.
//
// Offsets must be sized to 1 + values_read_upper_bound.
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
                                       const int16_t* rep_levels, int64_t num_def_levels,
                                       LevelInfo level_info,
                                       ValidityBitmapInputOutput* output,
                                       int32_t* offsets);
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
                                       const int16_t* rep_levels, int64_t num_def_levels,
                                       LevelInfo level_info,
                                       ValidityBitmapInputOutput* output,
                                       int64_t* offsets);

// Reconstructs a validity bitmap for a struct whose every member is a list or has
// a list descendant. See the documentation on DefLevelsToBitmap for more details
// on this method compared to the other ones defined above.
void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
                                         const int16_t* rep_levels,
                                         int64_t num_def_levels, LevelInfo level_info,
                                         ValidityBitmapInputOutput* output);

// This is exposed to ensure we can properly test a software-simulated pext function
// (i.e. it isn't hidden by runtime dispatch).
uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);

} // namespace internal
} // namespace parquet
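To make the level machinery concrete — a sketch driving DefLevelsToBitmap for a plain optional leaf column directly under the root (def_level 1 means present, 0 means null); the level values here are made up:

#include <cstdint>
#include <iostream>

#include "parquet/level_conversion.h"

int main() {
  using parquet::internal::LevelInfo;
  using parquet::internal::ValidityBitmapInputOutput;

  // Optional leaf under the root: def_level 1, no repeated ancestor.
  LevelInfo info(/*null_slots=*/1, /*definition_level=*/1, /*repetition_level=*/0,
                 /*repeated_ancestor_definition_level=*/0);

  const int16_t def_levels[] = {1, 0, 1, 1, 0};  // null where def_level < 1
  uint8_t valid_bits[1] = {0};

  ValidityBitmapInputOutput io;
  io.values_read_upper_bound = 5;
  io.valid_bits = valid_bits;
  io.valid_bits_offset = 0;

  parquet::internal::DefLevelsToBitmap(def_levels, 5, info, &io);
  // Expect valid_bits == 0b00001101, values_read == 5, null_count == 2.
  std::cout << "valid_bits=0x" << std::hex << int(valid_bits[0])
            << " nulls=" << std::dec << io.null_count << std::endl;
  return 0;
}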
@ -0,0 +1,357 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once

#include "parquet/level_conversion.h"

#include <algorithm>
#include <cstdint>
#include <limits>

#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_writer.h"
#include "arrow/util/logging.h"
#include "arrow/util/simd.h"
#include "parquet/exception.h"
#include "parquet/level_comparison.h"

namespace parquet {
namespace internal {
#ifndef PARQUET_IMPL_NAMESPACE
#error "PARQUET_IMPL_NAMESPACE must be defined"
#endif
namespace PARQUET_IMPL_NAMESPACE {

// clang-format off
/* Python code to generate the lookup table:

kLookupBits = 5
count = 0
print('constexpr int kLookupBits = {};'.format(kLookupBits))
print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
print(' ', end = '')
for mask in range(1 << kLookupBits):
    for data in range(1 << kLookupBits):
        bit_value = 0
        bit_len = 0
        for i in range(kLookupBits):
            if mask & (1 << i):
                bit_value |= (((data >> i) & 1) << bit_len)
                bit_len += 1
        out = '0x{:02X},'.format(bit_value)
        count += 1
        if count % (1 << kLookupBits) == 1:
            print('  {')
        if count % 8 == 1:
            print('    ', end = '')
        if count % 8 == 0:
            print(out, end = '\n')
        else:
            print(out, end = ' ')
        if count % (1 << kLookupBits) == 0:
            print('  },', end = '')
print('\n};')

*/
// clang-format on

constexpr int kLookupBits = 5;
constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
    {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    },
    {
        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
        0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
    },
    {
        0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
        0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
        0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
    },
    {
        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
        0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
        0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
    },
    {
        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
        0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
        0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
    },
    {
        0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
        0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
        0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
    },
    {
        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
        0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
        0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
    },
    {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
        0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
        0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    },
    {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    },
    {
        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
        0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
        0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
    },
    {
        0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
        0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
        0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
    },
    {
        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
        0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
        0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
    },
    {
        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
        0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
        0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
    },
    {
        0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
        0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
        0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
    },
    {
        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
        0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
        0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
    },
    {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
        0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
        0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    },
    {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    },
    {
        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
        0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
        0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
    },
    {
        0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
        0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
        0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
    },
    {
        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
        0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
        0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
    },
    {
        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
        0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
        0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
    },
    {
        0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
        0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
        0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
    },
    {
        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
        0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
        0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
    },
    {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
        0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
        0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    },
    {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
    },
    {
        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
        0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
        0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
    },
    {
        0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
        0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
        0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
    },
    {
        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
        0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
        0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
    },
    {
        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
        0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
        0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
    },
    {
        0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
        0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
        0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
    },
    {
        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
        0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
        0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
    },
    {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
        0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
        0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    },
};

inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
  // A software emulation of _pext_u64

  // These checks should be inline and are likely to be common cases.
  if (select_bitmap == ~uint64_t{0}) {
    return bitmap;
  } else if (select_bitmap == 0) {
    return 0;
  }

  // Fall back to the lookup table method
  uint64_t bit_value = 0;
  int bit_len = 0;
  constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
  while (select_bitmap != 0) {
    const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
    const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
    bit_value |= (value << bit_len);
    bit_len += mask_len;
    bitmap >>= kLookupBits;
    select_bitmap >>= kLookupBits;
  }
  return bit_value;
}

#ifdef ARROW_HAVE_BMI2

// Use _pext_u64 on 64-bit builds, _pext_u32 on 32-bit builds.
#if UINTPTR_MAX == 0xFFFFFFFF

using extract_bitmap_t = uint32_t;
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
                                    extract_bitmap_t select_bitmap) {
  return _pext_u32(bitmap, select_bitmap);
}

#else

using extract_bitmap_t = uint64_t;
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
                                    extract_bitmap_t select_bitmap) {
  return _pext_u64(bitmap, select_bitmap);
}

#endif

#else  // !defined(ARROW_HAVE_BMI2)

// Use 64-bit pext emulation when BMI2 isn't available.
using extract_bitmap_t = uint64_t;
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
                                    extract_bitmap_t select_bitmap) {
  return ExtractBitsSoftware(bitmap, select_bitmap);
}

#endif

static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);

template <bool has_repeated_parent>
int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
                               int64_t upper_bound_remaining, LevelInfo level_info,
                               ::arrow::internal::FirstTimeBitmapWriter* writer) {
  DCHECK_LE(batch_size, kExtractBitsSize);

  // Greater than level_info.def_level - 1 implies >= the def_level
  auto defined_bitmap = static_cast<extract_bitmap_t>(
      internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));

  if (has_repeated_parent) {
    // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
    // repeated_ancestor_def_level
    auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
        def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
    auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
    int64_t selected_count = ::arrow::bit_util::PopCount(present_bitmap);
    if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
      throw ParquetException("Values read exceeded upper bound");
    }
    writer->AppendWord(selected_bits, selected_count);
    return ::arrow::bit_util::PopCount(selected_bits);
  } else {
    if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
      throw ParquetException("Values read exceeded upper bound");
    }

    writer->AppendWord(defined_bitmap, batch_size);
    return ::arrow::bit_util::PopCount(defined_bitmap);
  }
}

template <bool has_repeated_parent>
void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
                           LevelInfo level_info, ValidityBitmapInputOutput* output) {
  ::arrow::internal::FirstTimeBitmapWriter writer(
      output->valid_bits,
      /*start_offset=*/output->valid_bits_offset,
      /*length=*/output->values_read_upper_bound);
  int64_t set_count = 0;
  output->values_read = 0;
  int64_t values_read_remaining = output->values_read_upper_bound;
  while (num_def_levels > kExtractBitsSize) {
    set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
        def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
    def_levels += kExtractBitsSize;
    num_def_levels -= kExtractBitsSize;
    values_read_remaining = output->values_read_upper_bound - writer.position();
  }
  set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
      def_levels, num_def_levels, values_read_remaining, level_info, &writer);

  output->values_read = writer.position();
  output->null_count += output->values_read - set_count;
  writer.Finish();
}

} // namespace PARQUET_IMPL_NAMESPACE
} // namespace internal
} // namespace parquet
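A quick check of the emulated pext semantics via the exported test hook declared in level_conversion.h above — a small sketch:

#include <cstdint>
#include <iostream>

#include "parquet/level_conversion.h"  // declares TestOnlyExtractBitsSoftware

int main() {
  // pext gathers the bitmap bits selected by the mask and packs them LSB-first:
  // bitmap 0b1011 under mask 0b0110 selects bits 1 and 2 (values 1 and 0),
  // packing to 0b01.
  uint64_t r = parquet::internal::TestOnlyExtractBitsSoftware(0b1011, 0b0110);
  std::cout << r << std::endl;  // prints 1
  return 0;
}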
489
.venv/Lib/site-packages/pyarrow/include/parquet/metadata.h
Normal file
@ -0,0 +1,489 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "parquet/platform.h"
#include "parquet/properties.h"
#include "parquet/schema.h"
#include "parquet/types.h"

namespace parquet {

class ColumnDescriptor;
class EncodedStatistics;
class Statistics;
class SchemaDescriptor;

class FileCryptoMetaData;
class InternalFileDecryptor;
class Decryptor;
class Encryptor;
class FooterSigningEncryptor;

namespace schema {

class ColumnPath;

} // namespace schema

using KeyValueMetadata = ::arrow::KeyValueMetadata;

class PARQUET_EXPORT ApplicationVersion {
 public:
  // Known versions with issues
  static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
  static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
  static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
  static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();

  // Application that wrote the file, e.g. "IMPALA"
  std::string application_;
  // Build name
  std::string build_;

  // Version of the application that wrote the file, expressed as
  // (<major>.<minor>.<patch>). Unmatched parts default to 0.
  // "1.2.3"    => {1, 2, 3}
  // "1.2"      => {1, 2, 0}
  // "1.2-cdh5" => {1, 2, 0}
  struct {
    int major;
    int minor;
    int patch;
    std::string unknown;
    std::string pre_release;
    std::string build_info;
  } version;

  ApplicationVersion() = default;
  explicit ApplicationVersion(const std::string& created_by);
  ApplicationVersion(std::string application, int major, int minor, int patch);

  // Returns true if version is strictly less than other_version
  bool VersionLt(const ApplicationVersion& other_version) const;

  // Returns true if version is strictly equal to other_version
  bool VersionEq(const ApplicationVersion& other_version) const;

  // Checks if the version has the correct statistics for a given column
  bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
                            SortOrder::type sort_order = SortOrder::SIGNED) const;
};

class PARQUET_EXPORT ColumnCryptoMetaData {
 public:
  static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
  ~ColumnCryptoMetaData();

  bool Equals(const ColumnCryptoMetaData& other) const;

  std::shared_ptr<schema::ColumnPath> path_in_schema() const;
  bool encrypted_with_footer_key() const;
  const std::string& key_metadata() const;

 private:
  explicit ColumnCryptoMetaData(const uint8_t* metadata);

  class ColumnCryptoMetaDataImpl;
  std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
};

/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
struct PageEncodingStats {
  PageType::type page_type;
  Encoding::type encoding;
  int32_t count;
};

/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
class PARQUET_EXPORT ColumnChunkMetaData {
 public:
  // API convenience to get a MetaData accessor
  static std::unique_ptr<ColumnChunkMetaData> Make(
      const void* metadata, const ColumnDescriptor* descr,
      const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
      int16_t column_ordinal = -1,
      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);

  ~ColumnChunkMetaData();

  bool Equals(const ColumnChunkMetaData& other) const;

  // column chunk
  int64_t file_offset() const;

  // parameter is only used when a dataset is spread across multiple files
  const std::string& file_path() const;

  // column metadata
  bool is_metadata_set() const;
  Type::type type() const;
  int64_t num_values() const;
  std::shared_ptr<schema::ColumnPath> path_in_schema() const;
  bool is_stats_set() const;
  std::shared_ptr<Statistics> statistics() const;

  Compression::type compression() const;
  // Indicates whether the ColumnChunk compression is supported by the
  // currently compiled parquet library.
  bool can_decompress() const;

  const std::vector<Encoding::type>& encodings() const;
  const std::vector<PageEncodingStats>& encoding_stats() const;
  bool has_dictionary_page() const;
  int64_t dictionary_page_offset() const;
  int64_t data_page_offset() const;
  bool has_index_page() const;
  int64_t index_page_offset() const;
  int64_t total_compressed_size() const;
  int64_t total_uncompressed_size() const;
  std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;

 private:
  explicit ColumnChunkMetaData(
      const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
      int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR,
      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
  // PIMPL Idiom
  class ColumnChunkMetaDataImpl;
  std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
};

/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
class PARQUET_EXPORT RowGroupMetaData {
 public:
  /// \brief Create a RowGroupMetaData from a serialized thrift message.
  static std::unique_ptr<RowGroupMetaData> Make(
      const void* metadata, const SchemaDescriptor* schema,
      const ApplicationVersion* writer_version = NULLPTR,
      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);

  ~RowGroupMetaData();

  bool Equals(const RowGroupMetaData& other) const;

  /// \brief The number of columns in this row group. The order must match the
  /// parent's column ordering.
  int num_columns() const;

  /// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
  ///
  /// WARNING: the returned object references a memory location in its parent
  /// (RowGroupMetaData) object. Hence, the parent must outlive the returned
  /// object.
  ///
  /// \param[in] index of the ColumnChunkMetaData to retrieve.
  ///
  /// \throws ParquetException if the index is out of bounds.
  std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;

  /// \brief Number of rows in this row group.
  int64_t num_rows() const;

  /// \brief Total byte size of all the uncompressed column data in this row group.
  int64_t total_byte_size() const;

  /// \brief Total byte size of all the compressed (and potentially encrypted)
  /// column data in this row group.
  ///
  /// This information is optional and may be 0 if omitted.
  int64_t total_compressed_size() const;

  /// \brief Byte offset from the beginning of the file to the first page (data or
  /// dictionary) in this row group.
  ///
  /// The file_offset field that this method exposes is optional. This method
  /// will return 0 if that field is not set to a meaningful value.
  int64_t file_offset() const;
  // Return const-pointer to make it clear that this object is not to be copied
  const SchemaDescriptor* schema() const;
  // Indicates whether all of the RowGroup's ColumnChunks can be decompressed.
  bool can_decompress() const;

 private:
  explicit RowGroupMetaData(
      const void* metadata, const SchemaDescriptor* schema,
      const ApplicationVersion* writer_version = NULLPTR,
      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
  // PIMPL Idiom
  class RowGroupMetaDataImpl;
  std::unique_ptr<RowGroupMetaDataImpl> impl_;
};

class FileMetaDataBuilder;

/// \brief FileMetaData is a proxy around format::FileMetaData.
class PARQUET_EXPORT FileMetaData {
 public:
  /// \brief Create a FileMetaData from a serialized thrift message.
  static std::shared_ptr<FileMetaData> Make(
      const void* serialized_metadata, uint32_t* inout_metadata_len,
      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);

  ~FileMetaData();

  bool Equals(const FileMetaData& other) const;

  /// \brief The number of top-level columns in the schema.
  ///
  /// The Parquet thrift definition requires that nested schema elements are
  /// flattened. This method returns the number of columns in the un-flattened
  /// version.
  int num_columns() const;

  /// \brief The number of flattened schema elements.
  ///
  /// The Parquet thrift definition requires that nested schema elements are
  /// flattened. This method returns the total number of elements in the
  /// flattened list.
  int num_schema_elements() const;

  /// \brief The total number of rows.
  int64_t num_rows() const;

  /// \brief The number of row groups in the file.
  int num_row_groups() const;

  /// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
  ///
  /// WARNING: the returned object references a memory location in its parent
  /// (FileMetaData) object. Hence, the parent must outlive the returned object.
  ///
  /// \param[in] index of the RowGroup to retrieve.
  ///
  /// \throws ParquetException if the index is out of bounds.
  std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;

  /// \brief Return the "version" of the file
  ///
  /// WARNING: The value returned by this method is unreliable as 1) the Parquet
  /// file metadata stores the version as a single integer and 2) some producers
  /// are known to always write a hardcoded value. Therefore, you cannot use
  /// this value to know which features are used in the file.
  ParquetVersion::type version() const;

  /// \brief Return the application's user-agent string of the writer.
  const std::string& created_by() const;

  /// \brief Return the application's version of the writer.
  const ApplicationVersion& writer_version() const;

  /// \brief Size of the original thrift-encoded metadata footer.
  uint32_t size() const;

  /// \brief Indicates whether all of the FileMetadata's RowGroups can be decompressed.
  ///
  /// This will return false if any of the RowGroups' pages is compressed with a
  /// compression format which is not compiled into the current parquet library.
  bool can_decompress() const;

  bool is_encryption_algorithm_set() const;
  EncryptionAlgorithm encryption_algorithm() const;
  const std::string& footer_signing_key_metadata() const;

  /// \brief Verify the signature of the FileMetaData when the file is encrypted
  /// but the footer is not encrypted (plaintext footer).
  bool VerifySignature(const void* signature);

  void WriteTo(::arrow::io::OutputStream* dst,
               const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;

  /// \brief Return a Thrift-serialized representation of the metadata as a
  /// string
  std::string SerializeToString() const;

  // Return const-pointer to make it clear that this object is not to be copied
  const SchemaDescriptor* schema() const;

  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;

  /// \brief Set a path to all ColumnChunks for all RowGroups.
  ///
  /// Commonly used by systems (Dask, Spark) that generate a metadata-only
  /// parquet file. The path is usually relative to said index file.
  ///
  /// \param[in] path to set.
  void set_file_path(const std::string& path);

  /// \brief Merge row groups from another metadata file into this one.
  ///
  /// The schema of the input FileMetaData must be equal to the
  /// schema of this object.
  ///
  /// This is used by systems that create an aggregate metadata-only file by
  /// concatenating the row groups of multiple files. This newly created
  /// metadata file acts as an index of all available row groups.
  ///
  /// \param[in] other FileMetaData to merge the row groups from.
  ///
  /// \throws ParquetException if the schemas are not equal.
  void AppendRowGroups(const FileMetaData& other);

  /// \brief Return a FileMetaData containing a subset of the row groups in this
  /// FileMetaData.
  std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;

 private:
  friend FileMetaDataBuilder;
  friend class SerializedFile;

  explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
                        std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);

  void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);

  // PIMPL Idiom
  FileMetaData();
  class FileMetaDataImpl;
  std::unique_ptr<FileMetaDataImpl> impl_;
};

class PARQUET_EXPORT FileCryptoMetaData {
 public:
  // API convenience to get a MetaData accessor
  static std::shared_ptr<FileCryptoMetaData> Make(const uint8_t* serialized_metadata,
                                                  uint32_t* metadata_len);
  ~FileCryptoMetaData();

  EncryptionAlgorithm encryption_algorithm() const;
  const std::string& key_metadata() const;

  void WriteTo(::arrow::io::OutputStream* dst) const;

 private:
  friend FileMetaDataBuilder;
  FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);

  // PIMPL Idiom
  FileCryptoMetaData();
  class FileCryptoMetaDataImpl;
  std::unique_ptr<FileCryptoMetaDataImpl> impl_;
};

// Builder API
class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
 public:
  // API convenience to get a MetaData reader
  static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
      std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);

  static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
      std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
      void* contents);

  ~ColumnChunkMetaDataBuilder();

  // column chunk
  // Used when a dataset is spread across multiple files
  void set_file_path(const std::string& path);
  // column metadata
  void SetStatistics(const EncodedStatistics& stats);
  // get the column descriptor
  const ColumnDescriptor* descr() const;

  int64_t total_compressed_size() const;

  // commit the metadata
  void Finish(int64_t num_values, int64_t dictionary_page_offset,
              int64_t index_page_offset, int64_t data_page_offset,
              int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
              bool dictionary_fallback,
              const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
              const std::map<Encoding::type, int32_t>& data_encoding_stats_,
              const std::shared_ptr<Encryptor>& encryptor = NULLPTR);

  // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
  const void* contents() const;

  // For writing metadata at the end of a column chunk
  void WriteTo(::arrow::io::OutputStream* sink);

 private:
  explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
                                      const ColumnDescriptor* column);
  explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
                                      const ColumnDescriptor* column, void* contents);
  // PIMPL Idiom
  class ColumnChunkMetaDataBuilderImpl;
  std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
};

class PARQUET_EXPORT RowGroupMetaDataBuilder {
 public:
  // API convenience to get a MetaData reader
  static std::unique_ptr<RowGroupMetaDataBuilder> Make(
      std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
      void* contents);

  ~RowGroupMetaDataBuilder();

  ColumnChunkMetaDataBuilder* NextColumnChunk();
  int num_columns();
  int64_t num_rows();
  int current_column() const;

  void set_num_rows(int64_t num_rows);

  // commit the metadata
  void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);

 private:
  explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
                                   const SchemaDescriptor* schema_, void* contents);
  // PIMPL Idiom
  class RowGroupMetaDataBuilderImpl;
  std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
};

class PARQUET_EXPORT FileMetaDataBuilder {
 public:
  // API convenience to get a MetaData reader
  static std::unique_ptr<FileMetaDataBuilder> Make(
      const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
      std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);

  ~FileMetaDataBuilder();

  // The prior RowGroupMetaDataBuilder (if any) is destroyed
  RowGroupMetaDataBuilder* AppendRowGroup();

  // Complete the Thrift structure
  std::unique_ptr<FileMetaData> Finish();

  // crypto metadata
  std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();

 private:
  explicit FileMetaDataBuilder(
      const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
      std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
  // PIMPL Idiom
  class FileMetaDataBuilderImpl;
  std::unique_ptr<FileMetaDataBuilderImpl> impl_;
};

PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);

} // namespace parquet
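A minimal sketch of inspecting these proxies from an existing file, assuming a local example.parquet; ParquetFileReader is declared in file_reader.h and is pulled in via the reader API header:

#include <iostream>
#include <memory>

#include "parquet/api/reader.h"

int main() {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile("example.parquet");

  std::shared_ptr<parquet::FileMetaData> md = reader->metadata();
  std::cout << "created_by: " << md->created_by() << "\n"
            << "rows: " << md->num_rows() << "\n"
            << "row groups: " << md->num_row_groups() << std::endl;

  // RowGroup()/ColumnChunk() return proxies that borrow from their parent,
  // so `md` must outlive them (see the WARNINGs above).
  for (int i = 0; i < md->num_row_groups(); ++i) {
    std::unique_ptr<parquet::RowGroupMetaData> rg = md->RowGroup(i);
    std::cout << "row group " << i << ": " << rg->num_rows() << " rows, "
              << rg->total_byte_size() << " uncompressed bytes" << std::endl;
  }
  return 0;
}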
54
.venv/Lib/site-packages/pyarrow/include/parquet/murmur3.h
Normal file
@ -0,0 +1,54 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

#pragma once

#include <cstdint>

#include "parquet/hasher.h"
#include "parquet/platform.h"
#include "parquet/types.h"

namespace parquet {

/// Source:
/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class)
class PARQUET_EXPORT MurmurHash3 : public Hasher {
 public:
  MurmurHash3() : seed_(DEFAULT_SEED) {}
  uint64_t Hash(int32_t value) const override;
  uint64_t Hash(int64_t value) const override;
  uint64_t Hash(float value) const override;
  uint64_t Hash(double value) const override;
  uint64_t Hash(const Int96* value) const override;
  uint64_t Hash(const ByteArray* value) const override;
  uint64_t Hash(const FLBA* val, uint32_t len) const override;

 private:
  // Default seed for the hash, taken from the Bloom filter code in parquet-mr;
  // it was generated with Java's System.nanoTime().
  static constexpr int DEFAULT_SEED = 1361930890;

  uint32_t seed_;
};

} // namespace parquet
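Driving the overload set above directly — a small sketch; the byte contents are made up:

#include <cstdint>
#include <iostream>

#include "parquet/murmur3.h"
#include "parquet/types.h"

int main() {
  parquet::MurmurHash3 hasher;

  std::cout << hasher.Hash(int32_t{42}) << std::endl;  // int32_t overload
  std::cout << hasher.Hash(3.14) << std::endl;         // double overload

  const uint8_t bytes[] = {'a', 'b', 'c'};
  parquet::ByteArray ba(/*len=*/3, bytes);
  std::cout << hasher.Hash(&ba) << std::endl;          // ByteArray overload
  return 0;
}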
@ -0,0 +1,31 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef PARQUET_VERSION_H
#define PARQUET_VERSION_H

#define PARQUET_VERSION_MAJOR 8
#define PARQUET_VERSION_MINOR 0
#define PARQUET_VERSION_PATCH 0

#define PARQUET_SO_VERSION "800"
#define PARQUET_FULL_SO_VERSION "800.0.0"

// The created_by string recorded in the Parquet file metadata
#define CREATED_BY_VERSION "parquet-cpp-arrow version 8.0.0"

#endif  // PARQUET_VERSION_H
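As a quick illustration, the version macros above might be consumed like so (a sketch; the output format is arbitrary):

#include <iostream>
#include "parquet/parquet_version.h"

int main() {
  // CREATED_BY_VERSION is the string stamped into file metadata by this build.
  std::cout << CREATED_BY_VERSION << " (" << PARQUET_VERSION_MAJOR << "."
            << PARQUET_VERSION_MINOR << "." << PARQUET_VERSION_PATCH << ")"
            << std::endl;
  return 0;
}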
28
.venv/Lib/site-packages/pyarrow/include/parquet/pch.h
Normal file
@ -0,0 +1,28 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Often-used headers, for precompiling.
// If updating this header, please make sure you check compilation speed
// before checking in. Adding headers which are not used extremely often
// may incur a slowdown, since it makes the precompiled header heavier to load.

#include "parquet/encoding.h"
#include "parquet/exception.h"
#include "parquet/metadata.h"
#include "parquet/properties.h"
#include "parquet/schema.h"
#include "parquet/types.h"
111
.venv/Lib/site-packages/pyarrow/include/parquet/platform.h
Normal file
@ -0,0 +1,111 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>

#include "arrow/buffer.h"         // IWYU pragma: export
#include "arrow/io/interfaces.h"  // IWYU pragma: export
#include "arrow/status.h"         // IWYU pragma: export
#include "arrow/type_fwd.h"       // IWYU pragma: export
#include "arrow/util/macros.h"    // IWYU pragma: export

#if defined(_WIN32) || defined(__CYGWIN__)

#if defined(_MSC_VER)
#pragma warning(push)
// Disable warning for STL types usage in DLL interface
// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
#pragma warning(disable : 4275 4251)
// Disable diamond inheritance warnings
#pragma warning(disable : 4250)
// Disable macro redefinition warnings
#pragma warning(disable : 4005)
// Disable extern before exported template warnings
#pragma warning(disable : 4910)
#else
#pragma GCC diagnostic ignored "-Wattributes"
#endif

#ifdef PARQUET_STATIC
#define PARQUET_EXPORT
#elif defined(PARQUET_EXPORTING)
#define PARQUET_EXPORT __declspec(dllexport)
#else
#define PARQUET_EXPORT __declspec(dllimport)
#endif

#define PARQUET_NO_EXPORT

#else  // Not Windows
#ifndef PARQUET_EXPORT
#define PARQUET_EXPORT __attribute__((visibility("default")))
#endif
#ifndef PARQUET_NO_EXPORT
#define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif  // Non-Windows

// This is a complicated topic, some reading on it:
// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
#if defined(_MSC_VER) || defined(__clang__)
#define PARQUET_TEMPLATE_CLASS_EXPORT
#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
#else
#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
#define PARQUET_TEMPLATE_EXPORT
#endif

#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN

#define PARQUET_NORETURN ARROW_NORETURN
#define PARQUET_DEPRECATED ARROW_DEPRECATED

// If ARROW_VALGRIND is set when compiling unit tests, also define
// PARQUET_VALGRIND
#ifdef ARROW_VALGRIND
#define PARQUET_VALGRIND
#endif

namespace parquet {

using Buffer = ::arrow::Buffer;
using Codec = ::arrow::util::Codec;
using Compression = ::arrow::Compression;
using MemoryPool = ::arrow::MemoryPool;
using MutableBuffer = ::arrow::MutableBuffer;
using ResizableBuffer = ::arrow::ResizableBuffer;
using ArrowInputFile = ::arrow::io::RandomAccessFile;
using ArrowInputStream = ::arrow::io::InputStream;
using ArrowOutputStream = ::arrow::io::OutputStream;

constexpr int64_t kDefaultOutputStreamSize = 1024;

constexpr int16_t kNonPageOrdinal = static_cast<int16_t>(-1);

PARQUET_EXPORT
std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

PARQUET_EXPORT
std::shared_ptr<ResizableBuffer> AllocateBuffer(
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);

}  // namespace parquet
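A brief sketch of the two allocation helpers declared above (error handling elided):

#include "parquet/platform.h"

void Demo() {
  // Growable in-memory output sink backed by the default memory pool.
  std::shared_ptr<::arrow::io::BufferOutputStream> sink =
      parquet::CreateOutputStream();
  // Resizable scratch buffer with an initial size of 64 bytes.
  std::shared_ptr<parquet::ResizableBuffer> scratch =
      parquet::AllocateBuffer(::arrow::default_memory_pool(), 64);
}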
46
.venv/Lib/site-packages/pyarrow/include/parquet/printer.h
Normal file
@ -0,0 +1,46 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <iosfwd>
#include <list>

#include "parquet/platform.h"

namespace parquet {

class ParquetFileReader;

class PARQUET_EXPORT ParquetFilePrinter {
 private:
  ParquetFileReader* fileReader;

 public:
  explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
  ~ParquetFilePrinter() {}

  void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
                  bool print_values = false, bool format_dump = false,
                  bool print_key_value_metadata = false,
                  const char* filename = "No Name");

  void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
                 const char* filename = "No Name");
};

}  // namespace parquet
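A usage sketch for the printer (assuming ParquetFileReader::OpenFile from parquet/file_reader.h; the file path is hypothetical):

#include <iostream>
#include <list>
#include <string>
#include "parquet/api/reader.h"

void PrintFile(const std::string& path) {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path);
  parquet::ParquetFilePrinter printer(reader.get());
  // An empty column list selects all columns.
  printer.DebugPrint(std::cout, std::list<int>(), /*print_values=*/false,
                     /*format_dump=*/false, /*print_key_value_metadata=*/true,
                     path.c_str());
}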
837
.venv/Lib/site-packages/pyarrow/include/parquet/properties.h
Normal file
@ -0,0 +1,837 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>

#include "arrow/io/caching.h"
#include "arrow/type.h"
#include "arrow/util/compression.h"
#include "parquet/encryption/encryption.h"
#include "parquet/exception.h"
#include "parquet/parquet_version.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
#include "parquet/type_fwd.h"
#include "parquet/types.h"

namespace parquet {

/// Controls serialization format of data pages. parquet-format v2.0.0
/// introduced a new data page metadata type DataPageV2 and serialized page
/// structure (for example, encoded levels are no longer compressed). Prior to
/// the completion of PARQUET-457 in 2020, this library did not implement
/// DataPageV2 correctly, so if you use the V2 data page format, you may have
/// forward compatibility issues (older versions of the library will be unable
/// to read the files). Note that some Parquet implementations do not implement
/// DataPageV2 at all.
enum class ParquetDataPageVersion { V1, V2 };

/// Align the default buffer size to a small multiple of a page size.
constexpr int64_t kDefaultBufferSize = 4096 * 4;

class PARQUET_EXPORT ReaderProperties {
 public:
  explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
      : pool_(pool) {}

  MemoryPool* memory_pool() const { return pool_; }

  std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
                                              int64_t start, int64_t num_bytes);

  /// Buffered stream reading allows the user to control the memory usage of
  /// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls
  /// are wrapped in a buffered reader that uses a fixed-size buffer (of size
  /// `buffer_size()`) instead of the full size of the ReadAt.
  ///
  /// The primary reason for this control knob is resource control, not
  /// performance.
  bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
  void enable_buffered_stream() { buffered_stream_enabled_ = true; }
  void disable_buffered_stream() { buffered_stream_enabled_ = false; }

  int64_t buffer_size() const { return buffer_size_; }
  void set_buffer_size(int64_t size) { buffer_size_ = size; }

  void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
    file_decryption_properties_ = std::move(decryption);
  }

  const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
    return file_decryption_properties_;
  }

 private:
  MemoryPool* pool_;
  int64_t buffer_size_ = kDefaultBufferSize;
  bool buffered_stream_enabled_ = false;
  std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
};

ReaderProperties PARQUET_EXPORT default_reader_properties();

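For instance, capping reader memory with the knobs above might look like this (a sketch using only the declared setters):

#include "parquet/properties.h"

parquet::ReaderProperties MakeBoundedReaderProperties() {
  parquet::ReaderProperties props = parquet::default_reader_properties();
  // Wrap each ReadAt in a fixed-size buffered reader to bound memory use.
  props.enable_buffered_stream();
  props.set_buffer_size(64 * 1024);  // 64 KiB instead of whole-range reads
  return props;
}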
static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;

class PARQUET_EXPORT ColumnProperties {
 public:
  ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
                   Compression::type codec = DEFAULT_COMPRESSION_TYPE,
                   bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
                   bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
                   size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
      : encoding_(encoding),
        codec_(codec),
        dictionary_enabled_(dictionary_enabled),
        statistics_enabled_(statistics_enabled),
        max_stats_size_(max_stats_size),
        compression_level_(Codec::UseDefaultCompressionLevel()) {}

  void set_encoding(Encoding::type encoding) { encoding_ = encoding; }

  void set_compression(Compression::type codec) { codec_ = codec; }

  void set_dictionary_enabled(bool dictionary_enabled) {
    dictionary_enabled_ = dictionary_enabled;
  }

  void set_statistics_enabled(bool statistics_enabled) {
    statistics_enabled_ = statistics_enabled;
  }

  void set_max_statistics_size(size_t max_stats_size) {
    max_stats_size_ = max_stats_size;
  }

  void set_compression_level(int compression_level) {
    compression_level_ = compression_level;
  }

  Encoding::type encoding() const { return encoding_; }

  Compression::type compression() const { return codec_; }

  bool dictionary_enabled() const { return dictionary_enabled_; }

  bool statistics_enabled() const { return statistics_enabled_; }

  size_t max_statistics_size() const { return max_stats_size_; }

  int compression_level() const { return compression_level_; }

 private:
  Encoding::type encoding_;
  Compression::type codec_;
  bool dictionary_enabled_;
  bool statistics_enabled_;
  size_t max_stats_size_;
  int compression_level_;
};

class PARQUET_EXPORT WriterProperties {
 public:
  class Builder {
   public:
    Builder()
        : pool_(::arrow::default_memory_pool()),
          dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
          write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
          max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
          pagesize_(kDefaultDataPageSize),
          version_(ParquetVersion::PARQUET_1_0),
          data_page_version_(ParquetDataPageVersion::V1),
          created_by_(DEFAULT_CREATED_BY) {}
    virtual ~Builder() {}

    /// Specify the memory pool for the writer. Default default_memory_pool.
    Builder* memory_pool(MemoryPool* pool) {
      pool_ = pool;
      return this;
    }

    /// Enable dictionary encoding in general for all columns. Default enabled.
    Builder* enable_dictionary() {
      default_column_properties_.set_dictionary_enabled(true);
      return this;
    }

    /// Disable dictionary encoding in general for all columns. Default enabled.
    Builder* disable_dictionary() {
      default_column_properties_.set_dictionary_enabled(false);
      return this;
    }

    /// Enable dictionary encoding for column specified by `path`. Default enabled.
    Builder* enable_dictionary(const std::string& path) {
      dictionary_enabled_[path] = true;
      return this;
    }

    /// Enable dictionary encoding for column specified by `path`. Default enabled.
    Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
      return this->enable_dictionary(path->ToDotString());
    }

    /// Disable dictionary encoding for column specified by `path`. Default enabled.
    Builder* disable_dictionary(const std::string& path) {
      dictionary_enabled_[path] = false;
      return this;
    }

    /// Disable dictionary encoding for column specified by `path`. Default enabled.
    Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
      return this->disable_dictionary(path->ToDotString());
    }

    /// Specify the dictionary page size limit per row group. Default 1MB.
    Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
      dictionary_pagesize_limit_ = dictionary_psize_limit;
      return this;
    }

    /// Specify the write batch size while writing batches of Arrow values into Parquet.
    /// Default 1024.
    Builder* write_batch_size(int64_t write_batch_size) {
      write_batch_size_ = write_batch_size;
      return this;
    }

    /// Specify the max row group length.
    /// Default 64M rows.
    Builder* max_row_group_length(int64_t max_row_group_length) {
      max_row_group_length_ = max_row_group_length;
      return this;
    }

    /// Specify the data page size.
    /// Default 1MB.
    Builder* data_pagesize(int64_t pg_size) {
      pagesize_ = pg_size;
      return this;
    }

    /// Specify the data page version.
    /// Default V1.
    Builder* data_page_version(ParquetDataPageVersion data_page_version) {
      data_page_version_ = data_page_version;
      return this;
    }

    /// Specify the Parquet file version.
    /// Default PARQUET_1_0.
    Builder* version(ParquetVersion::type version) {
      version_ = version;
      return this;
    }

    Builder* created_by(const std::string& created_by) {
      created_by_ = created_by;
      return this;
    }

    /// \brief Define the encoding that is used when we don't use dictionary encoding.
    ///
    /// This applies either if dictionary encoding is disabled or if we fall back
    /// because the dictionary grew too large.
    Builder* encoding(Encoding::type encoding_type) {
      if (encoding_type == Encoding::PLAIN_DICTIONARY ||
          encoding_type == Encoding::RLE_DICTIONARY) {
        throw ParquetException("Can't use dictionary encoding as fallback encoding");
      }

      default_column_properties_.set_encoding(encoding_type);
      return this;
    }

    /// \brief Define the encoding that is used when we don't use dictionary encoding.
    ///
    /// This applies either if dictionary encoding is disabled or if we fall back
    /// because the dictionary grew too large.
    Builder* encoding(const std::string& path, Encoding::type encoding_type) {
      if (encoding_type == Encoding::PLAIN_DICTIONARY ||
          encoding_type == Encoding::RLE_DICTIONARY) {
        throw ParquetException("Can't use dictionary encoding as fallback encoding");
      }

      encodings_[path] = encoding_type;
      return this;
    }

    /// \brief Define the encoding that is used when we don't use dictionary encoding.
    ///
    /// This applies either if dictionary encoding is disabled or if we fall back
    /// because the dictionary grew too large.
    Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
                      Encoding::type encoding_type) {
      return this->encoding(path->ToDotString(), encoding_type);
    }

    /// Specify compression codec in general for all columns.
    /// Default UNCOMPRESSED.
    Builder* compression(Compression::type codec) {
      default_column_properties_.set_compression(codec);
      return this;
    }

    /// Specify the maximum statistics size for storing min/max values.
    /// Default 4KB.
    Builder* max_statistics_size(size_t max_stats_sz) {
      default_column_properties_.set_max_statistics_size(max_stats_sz);
      return this;
    }

    /// Specify compression codec for the column specified by `path`.
    /// Default UNCOMPRESSED.
    Builder* compression(const std::string& path, Compression::type codec) {
      codecs_[path] = codec;
      return this;
    }

    /// Specify compression codec for the column specified by `path`.
    /// Default UNCOMPRESSED.
    Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
                         Compression::type codec) {
      return this->compression(path->ToDotString(), codec);
    }

    /// \brief Specify the default compression level for the compressor in
    /// every column. In case a column does not have an explicitly specified
    /// compression level, the default one would be used.
    ///
    /// The provided compression level is compressor specific. Users need to
    /// familiarize themselves with the available levels for the selected
    /// compressor. If the compressor does not allow for selecting different
    /// compression levels, calling this function would not have any effect.
    /// Parquet and Arrow do not validate the passed compression level. If no
    /// level is selected by the user or if the special
    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
    /// compression level.
    Builder* compression_level(int compression_level) {
      default_column_properties_.set_compression_level(compression_level);
      return this;
    }

    /// \brief Specify a compression level for the compressor for the column
    /// described by path.
    ///
    /// The provided compression level is compressor specific. Users need to
    /// familiarize themselves with the available levels for the selected
    /// compressor. If the compressor does not allow for selecting different
    /// compression levels, calling this function would not have any effect.
    /// Parquet and Arrow do not validate the passed compression level. If no
    /// level is selected by the user or if the special
    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
    /// compression level.
    Builder* compression_level(const std::string& path, int compression_level) {
      codecs_compression_level_[path] = compression_level;
      return this;
    }

    /// \brief Specify a compression level for the compressor for the column
    /// described by path.
    ///
    /// The provided compression level is compressor specific. Users need to
    /// familiarize themselves with the available levels for the selected
    /// compressor. If the compressor does not allow for selecting different
    /// compression levels, calling this function would not have any effect.
    /// Parquet and Arrow do not validate the passed compression level. If no
    /// level is selected by the user or if the special
    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
    /// compression level.
    Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
                               int compression_level) {
      return this->compression_level(path->ToDotString(), compression_level);
    }

    /// Define the file encryption properties.
    /// Default NULL.
    Builder* encryption(
        std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
      file_encryption_properties_ = std::move(file_encryption_properties);
      return this;
    }

    /// Enable statistics in general.
    /// Default enabled.
    Builder* enable_statistics() {
      default_column_properties_.set_statistics_enabled(true);
      return this;
    }

    /// Disable statistics in general.
    /// Default enabled.
    Builder* disable_statistics() {
      default_column_properties_.set_statistics_enabled(false);
      return this;
    }

    /// Enable statistics for the column specified by `path`.
    /// Default enabled.
    Builder* enable_statistics(const std::string& path) {
      statistics_enabled_[path] = true;
      return this;
    }

    /// Enable statistics for the column specified by `path`.
    /// Default enabled.
    Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
      return this->enable_statistics(path->ToDotString());
    }

    /// Disable statistics for the column specified by `path`.
    /// Default enabled.
    Builder* disable_statistics(const std::string& path) {
      statistics_enabled_[path] = false;
      return this;
    }

    /// Disable statistics for the column specified by `path`.
    /// Default enabled.
    Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
      return this->disable_statistics(path->ToDotString());
    }

    /// \brief Build the WriterProperties with the builder parameters.
    /// \return The WriterProperties defined by the builder.
    std::shared_ptr<WriterProperties> build() {
      std::unordered_map<std::string, ColumnProperties> column_properties;
      auto get = [&](const std::string& key) -> ColumnProperties& {
        auto it = column_properties.find(key);
        if (it == column_properties.end())
          return column_properties[key] = default_column_properties_;
        else
          return it->second;
      };

      for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
      for (const auto& item : codecs_) get(item.first).set_compression(item.second);
      for (const auto& item : codecs_compression_level_)
        get(item.first).set_compression_level(item.second);
      for (const auto& item : dictionary_enabled_)
        get(item.first).set_dictionary_enabled(item.second);
      for (const auto& item : statistics_enabled_)
        get(item.first).set_statistics_enabled(item.second);

      return std::shared_ptr<WriterProperties>(new WriterProperties(
          pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
          pagesize_, version_, created_by_, std::move(file_encryption_properties_),
          default_column_properties_, column_properties, data_page_version_));
    }

   private:
    MemoryPool* pool_;
    int64_t dictionary_pagesize_limit_;
    int64_t write_batch_size_;
    int64_t max_row_group_length_;
    int64_t pagesize_;
    ParquetVersion::type version_;
    ParquetDataPageVersion data_page_version_;
    std::string created_by_;

    std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;

    // Settings used for each column unless overridden in any of the maps below
    ColumnProperties default_column_properties_;
    std::unordered_map<std::string, Encoding::type> encodings_;
    std::unordered_map<std::string, Compression::type> codecs_;
    std::unordered_map<std::string, int32_t> codecs_compression_level_;
    std::unordered_map<std::string, bool> dictionary_enabled_;
    std::unordered_map<std::string, bool> statistics_enabled_;
  };

  inline MemoryPool* memory_pool() const { return pool_; }

  inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }

  inline int64_t write_batch_size() const { return write_batch_size_; }

  inline int64_t max_row_group_length() const { return max_row_group_length_; }

  inline int64_t data_pagesize() const { return pagesize_; }

  inline ParquetDataPageVersion data_page_version() const {
    return parquet_data_page_version_;
  }

  inline ParquetVersion::type version() const { return parquet_version_; }

  inline std::string created_by() const { return parquet_created_by_; }

  inline Encoding::type dictionary_index_encoding() const {
    if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
      return Encoding::PLAIN_DICTIONARY;
    } else {
      return Encoding::RLE_DICTIONARY;
    }
  }

  inline Encoding::type dictionary_page_encoding() const {
    if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
      return Encoding::PLAIN_DICTIONARY;
    } else {
      return Encoding::PLAIN;
    }
  }

  const ColumnProperties& column_properties(
      const std::shared_ptr<schema::ColumnPath>& path) const {
    auto it = column_properties_.find(path->ToDotString());
    if (it != column_properties_.end()) return it->second;
    return default_column_properties_;
  }

  Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).encoding();
  }

  Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).compression();
  }

  int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).compression_level();
  }

  bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).dictionary_enabled();
  }

  bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).statistics_enabled();
  }

  size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).max_statistics_size();
  }

  inline FileEncryptionProperties* file_encryption_properties() const {
    return file_encryption_properties_.get();
  }

  std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
      const std::string& path) const {
    if (file_encryption_properties_) {
      return file_encryption_properties_->column_encryption_properties(path);
    } else {
      return NULLPTR;
    }
  }

 private:
  explicit WriterProperties(
      MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
      int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
      const std::string& created_by,
      std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
      const ColumnProperties& default_column_properties,
      const std::unordered_map<std::string, ColumnProperties>& column_properties,
      ParquetDataPageVersion data_page_version)
      : pool_(pool),
        dictionary_pagesize_limit_(dictionary_pagesize_limit),
        write_batch_size_(write_batch_size),
        max_row_group_length_(max_row_group_length),
        pagesize_(pagesize),
        parquet_data_page_version_(data_page_version),
        parquet_version_(version),
        parquet_created_by_(created_by),
        file_encryption_properties_(file_encryption_properties),
        default_column_properties_(default_column_properties),
        column_properties_(column_properties) {}

  MemoryPool* pool_;
  int64_t dictionary_pagesize_limit_;
  int64_t write_batch_size_;
  int64_t max_row_group_length_;
  int64_t pagesize_;
  ParquetDataPageVersion parquet_data_page_version_;
  ParquetVersion::type parquet_version_;
  std::string parquet_created_by_;

  std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;

  ColumnProperties default_column_properties_;
  std::unordered_map<std::string, ColumnProperties> column_properties_;
};

PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();

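Putting the builder together, a writer configuration might be assembled like this (a sketch; the dotted column names are hypothetical):

#include "parquet/properties.h"

std::shared_ptr<parquet::WriterProperties> MakeWriterProperties() {
  parquet::WriterProperties::Builder builder;
  builder.compression(parquet::Compression::SNAPPY)  // default for all columns
      ->compression("metrics.trace", parquet::Compression::ZSTD)  // per-column
      ->disable_dictionary("metrics.unique_id")
      ->max_row_group_length(1 << 20)
      ->data_page_version(parquet::ParquetDataPageVersion::V1);
  return builder.build();
}

Note that the setters return Builder*, so calls after the first are chained with "->" rather than ".".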
// ----------------------------------------------------------------------
// Properties specific to Apache Arrow columnar read and write

static constexpr bool kArrowDefaultUseThreads = false;

// Default number of rows to read when using ::arrow::RecordBatchReader
static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;

/// EXPERIMENTAL: Properties for configuring FileReader behavior.
class PARQUET_EXPORT ArrowReaderProperties {
 public:
  explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
      : use_threads_(use_threads),
        read_dict_indices_(),
        batch_size_(kArrowDefaultBatchSize),
        pre_buffer_(false),
        cache_options_(::arrow::io::CacheOptions::Defaults()),
        coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}

  void set_use_threads(bool use_threads) { use_threads_ = use_threads; }

  bool use_threads() const { return use_threads_; }

  void set_read_dictionary(int column_index, bool read_dict) {
    if (read_dict) {
      read_dict_indices_.insert(column_index);
    } else {
      read_dict_indices_.erase(column_index);
    }
  }
  bool read_dictionary(int column_index) const {
    if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) {
      return true;
    } else {
      return false;
    }
  }

  void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }

  int64_t batch_size() const { return batch_size_; }

  /// Enable read coalescing.
  ///
  /// When enabled, the Arrow reader will pre-buffer necessary regions
  /// of the file in-memory. This is intended to improve performance on
  /// high-latency filesystems (e.g. Amazon S3).
  void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }

  bool pre_buffer() const { return pre_buffer_; }

  /// Set options for read coalescing. This can be used to tune the
  /// implementation for characteristics of different filesystems.
  void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }

  const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }

  /// Set execution context for read coalescing.
  void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }

  const ::arrow::io::IOContext& io_context() const { return io_context_; }

  /// Set timestamp unit to use for deprecated INT96-encoded timestamps
  /// (default is NANO).
  void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
    coerce_int96_timestamp_unit_ = unit;
  }

  ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
    return coerce_int96_timestamp_unit_;
  }

 private:
  bool use_threads_;
  std::unordered_set<int> read_dict_indices_;
  int64_t batch_size_;
  bool pre_buffer_;
  ::arrow::io::IOContext io_context_;
  ::arrow::io::CacheOptions cache_options_;
  ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
};

/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
PARQUET_EXPORT
ArrowReaderProperties default_arrow_reader_properties();

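As an illustration, tuning the Arrow reader for a high-latency filesystem could look like this (a sketch; the column index is hypothetical):

#include "parquet/properties.h"

parquet::ArrowReaderProperties MakeRemoteReaderProperties() {
  parquet::ArrowReaderProperties props(/*use_threads=*/true);
  props.set_batch_size(128 * 1024);
  // Coalesce and pre-buffer byte ranges to cut round trips to remote storage.
  props.set_pre_buffer(true);
  props.set_cache_options(::arrow::io::CacheOptions::Defaults());
  // Read column 0 directly as a dictionary-encoded Arrow array.
  props.set_read_dictionary(0, true);
  return props;
}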
class PARQUET_EXPORT ArrowWriterProperties {
 public:
  enum EngineVersion {
    V1,  // Supports only nested lists.
    V2   // Full support for all nesting combinations
  };
  class Builder {
   public:
    Builder()
        : write_timestamps_as_int96_(false),
          coerce_timestamps_enabled_(false),
          coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
          truncated_timestamps_allowed_(false),
          store_schema_(false),
          // TODO: At some point we should flip this.
          compliant_nested_types_(false),
          engine_version_(V2) {}
    virtual ~Builder() = default;

    Builder* disable_deprecated_int96_timestamps() {
      write_timestamps_as_int96_ = false;
      return this;
    }

    Builder* enable_deprecated_int96_timestamps() {
      write_timestamps_as_int96_ = true;
      return this;
    }

    Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
      coerce_timestamps_enabled_ = true;
      coerce_timestamps_unit_ = unit;
      return this;
    }

    Builder* allow_truncated_timestamps() {
      truncated_timestamps_allowed_ = true;
      return this;
    }

    Builder* disallow_truncated_timestamps() {
      truncated_timestamps_allowed_ = false;
      return this;
    }

    /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
    /// to enable certain read options (like "read_dictionary") to be set
    /// automatically
    Builder* store_schema() {
      store_schema_ = true;
      return this;
    }

    Builder* enable_compliant_nested_types() {
      compliant_nested_types_ = true;
      return this;
    }

    Builder* disable_compliant_nested_types() {
      compliant_nested_types_ = false;
      return this;
    }

    Builder* set_engine_version(EngineVersion version) {
      engine_version_ = version;
      return this;
    }

    std::shared_ptr<ArrowWriterProperties> build() {
      return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
          write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
          truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
          engine_version_));
    }

   private:
    bool write_timestamps_as_int96_;

    bool coerce_timestamps_enabled_;
    ::arrow::TimeUnit::type coerce_timestamps_unit_;
    bool truncated_timestamps_allowed_;

    bool store_schema_;
    bool compliant_nested_types_;
    EngineVersion engine_version_;
  };

  bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }

  bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
  ::arrow::TimeUnit::type coerce_timestamps_unit() const {
    return coerce_timestamps_unit_;
  }

  bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }

  bool store_schema() const { return store_schema_; }

  /// \brief Enable nested type naming according to the parquet specification.
  ///
  /// Older versions of arrow wrote out field names for nested lists based on the name
  /// of the field. According to the parquet specification they should always be
  /// "element".
  bool compliant_nested_types() const { return compliant_nested_types_; }

  /// \brief The underlying engine version to use when writing Arrow data.
  ///
  /// V2 is currently the latest; V1 is considered deprecated but left in
  /// place in case bugs are detected in V2.
  EngineVersion engine_version() const { return engine_version_; }

 private:
  explicit ArrowWriterProperties(bool write_nanos_as_int96,
                                 bool coerce_timestamps_enabled,
                                 ::arrow::TimeUnit::type coerce_timestamps_unit,
                                 bool truncated_timestamps_allowed, bool store_schema,
                                 bool compliant_nested_types,
                                 EngineVersion engine_version)
      : write_timestamps_as_int96_(write_nanos_as_int96),
        coerce_timestamps_enabled_(coerce_timestamps_enabled),
        coerce_timestamps_unit_(coerce_timestamps_unit),
        truncated_timestamps_allowed_(truncated_timestamps_allowed),
        store_schema_(store_schema),
        compliant_nested_types_(compliant_nested_types),
        engine_version_(engine_version) {}

  const bool write_timestamps_as_int96_;
  const bool coerce_timestamps_enabled_;
  const ::arrow::TimeUnit::type coerce_timestamps_unit_;
  const bool truncated_timestamps_allowed_;
  const bool store_schema_;
  const bool compliant_nested_types_;
  const EngineVersion engine_version_;
};

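A sketch of configuring the Arrow-to-Parquet conversion layer with the builder above:

#include "parquet/properties.h"

std::shared_ptr<parquet::ArrowWriterProperties> MakeArrowWriterProperties() {
  parquet::ArrowWriterProperties::Builder builder;
  builder.coerce_timestamps(::arrow::TimeUnit::MILLI)
      ->allow_truncated_timestamps()
      // Embed the serialized Arrow schema so readers can restore read options.
      ->store_schema();
  return builder.build();
}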
/// \brief State object used for writing Arrow data directly to a Parquet
/// column chunk. API possibly not stable
struct ArrowWriteContext {
  ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
      : memory_pool(memory_pool),
        properties(properties),
        data_buffer(AllocateBuffer(memory_pool)),
        def_levels_buffer(AllocateBuffer(memory_pool)) {}

  template <typename T>
  ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
    ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
    *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
    return ::arrow::Status::OK();
  }

  MemoryPool* memory_pool;
  const ArrowWriterProperties* properties;

  // Buffer used for storing the data of an array converted to the physical type
  // as expected by parquet-cpp.
  std::shared_ptr<ResizableBuffer> data_buffer;

  // We use the shared ownership of this buffer
  std::shared_ptr<ResizableBuffer> def_levels_buffer;
};

PARQUET_EXPORT
std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();

}  // namespace parquet
492
.venv/Lib/site-packages/pyarrow/include/parquet/schema.h
Normal file
@ -0,0 +1,492 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This module contains the logical parquet-cpp types (independent of Thrift
// structures), schema nodes, and related type tools

#pragma once

#include <cstdint>
#include <memory>
#include <ostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "parquet/platform.h"
#include "parquet/types.h"
#include "parquet/windows_fixup.h"  // for OPTIONAL

namespace parquet {

class SchemaDescriptor;

namespace schema {

class Node;

// List encodings: using the terminology from Impala to define different styles
// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
// the converted type named in the Parquet metadata is ConvertedType::LIST we
// use that terminology here. It also helps distinguish from the *_ARRAY
// primitive types.
//
// One-level encoding: Only allows required lists with required cells
//   repeated value_type name
//
// Two-level encoding: Enables optional lists with only required cells
//   <required/optional> group list
//     repeated value_type item
//
// Three-level encoding: Enables optional lists with optional cells
//   <required/optional> group bag
//     repeated group list
//       <required/optional> value_type item
//
// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
// the non-repeated nodes set to required.
//
// The "official" encoding recommended in the Parquet spec is the 3-level, and
// we use that as the default when creating list types. For semantic completeness
// we allow the other two. Since all types of encodings will occur "in the
// wild" we need to be able to interpret the associated definition levels in
// the context of the actual encoding used in the file.
//
// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
// SchemaElement, which could make things challenging if we are trying to infer
// that a sequence of nodes semantically represents an array according to one
// of these encodings (versus a struct containing an array). We should refuse
// the temptation to guess, as they say.
struct ListEncoding {
  enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
};

class PARQUET_EXPORT ColumnPath {
 public:
  ColumnPath() : path_() {}
  explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
  explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}

  static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
  static std::shared_ptr<ColumnPath> FromNode(const Node& node);

  std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
  std::string ToDotString() const;
  const std::vector<std::string>& ToDotVector() const;

 protected:
  std::vector<std::string> path_;
};

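A quick sketch of the dot-string round trip (the path components are hypothetical):

#include <memory>
#include <string>
#include "parquet/schema.h"

void ColumnPathDemo() {
  using parquet::schema::ColumnPath;
  std::shared_ptr<ColumnPath> path = ColumnPath::FromDotString("a.b.c");
  std::shared_ptr<ColumnPath> child = path->extend("d");
  std::string dotted = child->ToDotString();  // "a.b.c.d"
}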
// Base class for logical schema types. A type has a name, repetition level,
|
||||
// and optionally a logical type (ConvertedType in Parquet metadata parlance)
|
||||
class PARQUET_EXPORT Node {
|
||||
public:
|
||||
enum type { PRIMITIVE, GROUP };
|
||||
|
||||
virtual ~Node() {}
|
||||
|
||||
bool is_primitive() const { return type_ == Node::PRIMITIVE; }
|
||||
|
||||
bool is_group() const { return type_ == Node::GROUP; }
|
||||
|
||||
bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
|
||||
|
||||
bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
|
||||
|
||||
bool is_required() const { return repetition_ == Repetition::REQUIRED; }
|
||||
|
||||
virtual bool Equals(const Node* other) const = 0;
|
||||
|
||||
const std::string& name() const { return name_; }
|
||||
|
||||
Node::type node_type() const { return type_; }
|
||||
|
||||
Repetition::type repetition() const { return repetition_; }
|
||||
|
||||
ConvertedType::type converted_type() const { return converted_type_; }
|
||||
|
||||
const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
|
||||
|
||||
/// \brief The field_id value for the serialized SchemaElement. If the
|
||||
/// field_id is less than 0 (e.g. -1), it will not be set when serialized to
|
||||
/// Thrift.
|
||||
int field_id() const { return field_id_; }
|
||||
|
||||
const Node* parent() const { return parent_; }
|
||||
|
||||
const std::shared_ptr<ColumnPath> path() const;
|
||||
|
||||
virtual void ToParquet(void* element) const = 0;
|
||||
|
||||
// Node::Visitor abstract class for walking schemas with the visitor pattern
|
||||
class Visitor {
|
||||
public:
|
||||
virtual ~Visitor() {}
|
||||
|
||||
virtual void Visit(Node* node) = 0;
|
||||
};
|
||||
class ConstVisitor {
|
||||
public:
|
||||
virtual ~ConstVisitor() {}
|
||||
|
||||
virtual void Visit(const Node* node) = 0;
|
||||
};
|
||||
|
||||
virtual void Visit(Visitor* visitor) = 0;
|
||||
virtual void VisitConst(ConstVisitor* visitor) const = 0;
|
||||
|
||||
protected:
|
||||
friend class GroupNode;
|
||||
|
||||
Node(Node::type type, const std::string& name, Repetition::type repetition,
|
||||
ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
|
||||
: type_(type),
|
||||
name_(name),
|
||||
repetition_(repetition),
|
||||
converted_type_(converted_type),
|
||||
field_id_(field_id),
|
||||
parent_(NULLPTR) {}
|
||||
|
||||
Node(Node::type type, const std::string& name, Repetition::type repetition,
|
||||
std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
|
||||
: type_(type),
|
||||
name_(name),
|
||||
repetition_(repetition),
|
||||
logical_type_(std::move(logical_type)),
|
||||
field_id_(field_id),
|
||||
parent_(NULLPTR) {}
|
||||
|
||||
Node::type type_;
|
||||
std::string name_;
|
||||
Repetition::type repetition_;
|
||||
ConvertedType::type converted_type_;
|
||||
std::shared_ptr<const LogicalType> logical_type_;
|
||||
int field_id_;
|
||||
// Nodes should not be shared, they have a single parent.
|
||||
const Node* parent_;
|
||||
|
||||
bool EqualsInternal(const Node* other) const;
|
||||
void SetParent(const Node* p_parent);
|
||||
|
||||
private:
|
||||
PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
|
||||
};
|
||||
|
||||
// Save our breath all over the place with these typedefs
|
||||
typedef std::shared_ptr<Node> NodePtr;
|
||||
typedef std::vector<NodePtr> NodeVector;
|
||||
|
||||
// A type that is one of the primitive Parquet storage types. In addition to
|
||||
// the other type metadata (name, repetition level, logical type), also has the
|
||||
// physical storage type and their type-specific metadata (byte width, decimal
|
||||
// parameters)
|
||||
class PARQUET_EXPORT PrimitiveNode : public Node {
|
||||
public:
|
||||
static std::unique_ptr<Node> FromParquet(const void* opaque_element);
|
||||
|
||||
// A field_id -1 (or any negative value) will be serialized as null in Thrift
|
||||
static inline NodePtr Make(const std::string& name, Repetition::type repetition,
|
||||
Type::type type,
|
||||
ConvertedType::type converted_type = ConvertedType::NONE,
|
||||
int length = -1, int precision = -1, int scale = -1,
|
||||
int field_id = -1) {
|
||||
return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
|
||||
precision, scale, field_id));
|
||||
}
|
||||
|
||||
// If no logical type, pass LogicalType::None() or nullptr
|
||||
// A field_id -1 (or any negative value) will be serialized as null in Thrift
|
||||
static inline NodePtr Make(const std::string& name, Repetition::type repetition,
|
||||
std::shared_ptr<const LogicalType> logical_type,
|
||||
Type::type primitive_type, int primitive_length = -1,
|
||||
int field_id = -1) {
|
||||
return NodePtr(new PrimitiveNode(name, repetition, logical_type, primitive_type,
|
||||
primitive_length, field_id));
|
||||
}
|
||||
|
||||
bool Equals(const Node* other) const override;
|
||||
|
||||
Type::type physical_type() const { return physical_type_; }
|
||||
|
||||
ColumnOrder column_order() const { return column_order_; }
|
||||
|
||||
void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
|
||||
|
||||
int32_t type_length() const { return type_length_; }
|
||||
|
||||
const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
|
||||
|
||||
void ToParquet(void* element) const override;
|
||||
void Visit(Visitor* visitor) override;
|
||||
void VisitConst(ConstVisitor* visitor) const override;
|
||||
|
||||
private:
|
||||
PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
|
||||
ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
|
||||
                             int precision = -1, int scale = -1, int field_id = -1);

  PrimitiveNode(const std::string& name, Repetition::type repetition,
                std::shared_ptr<const LogicalType> logical_type,
                Type::type primitive_type, int primitive_length = -1, int field_id = -1);

  Type::type physical_type_;
  int32_t type_length_;
  DecimalMetadata decimal_metadata_;
  ColumnOrder column_order_;

  // For FIXED_LEN_BYTE_ARRAY
  void SetTypeLength(int32_t length) { type_length_ = length; }

  bool EqualsInternal(const PrimitiveNode* other) const;

  FRIEND_TEST(TestPrimitiveNode, Attrs);
  FRIEND_TEST(TestPrimitiveNode, Equals);
  FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
  FRIEND_TEST(TestPrimitiveNode, FromParquet);
};

class PARQUET_EXPORT GroupNode : public Node {
 public:
  static std::unique_ptr<Node> FromParquet(const void* opaque_element,
                                           NodeVector fields = {});

  // A field_id -1 (or any negative value) will be serialized as null in Thrift
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             const NodeVector& fields,
                             ConvertedType::type converted_type = ConvertedType::NONE,
                             int field_id = -1) {
    return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
  }

  // If no logical type, pass nullptr
  // A field_id -1 (or any negative value) will be serialized as null in Thrift
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             const NodeVector& fields,
                             std::shared_ptr<const LogicalType> logical_type,
                             int field_id = -1) {
    return NodePtr(new GroupNode(name, repetition, fields, logical_type, field_id));
  }

  bool Equals(const Node* other) const override;

  NodePtr field(int i) const { return fields_[i]; }
  // Get the index of a field by its name, or negative value if not found.
  // If several fields share the same name, it is unspecified which one
  // is returned.
  int FieldIndex(const std::string& name) const;
  // Get the index of a field by its node, or negative value if not found.
  int FieldIndex(const Node& node) const;

  int field_count() const { return static_cast<int>(fields_.size()); }

  void ToParquet(void* element) const override;
  void Visit(Visitor* visitor) override;
  void VisitConst(ConstVisitor* visitor) const override;

  /// \brief Return true if this node or any child node has REPEATED repetition
  /// type
  bool HasRepeatedFields() const;

 private:
  GroupNode(const std::string& name, Repetition::type repetition,
            const NodeVector& fields,
            ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);

  GroupNode(const std::string& name, Repetition::type repetition,
            const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
            int field_id = -1);

  NodeVector fields_;
  bool EqualsInternal(const GroupNode* other) const;

  // Mapping from field name to field index
  std::unordered_multimap<std::string, int> field_name_to_idx_;

  FRIEND_TEST(TestGroupNode, Attrs);
  FRIEND_TEST(TestGroupNode, Equals);
  FRIEND_TEST(TestGroupNode, FieldIndex);
  FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
};

// ----------------------------------------------------------------------
// Convenience primitive type factory functions

#define PRIMITIVE_FACTORY(FuncName, TYPE)                                             \
  static inline NodePtr FuncName(const std::string& name,                            \
                                 Repetition::type repetition = Repetition::OPTIONAL, \
                                 int field_id = -1) {                                \
    return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE,    \
                               /*length=*/-1, /*precision=*/-1, /*scale=*/-1,        \
                               field_id);                                            \
  }

PRIMITIVE_FACTORY(Boolean, BOOLEAN)
PRIMITIVE_FACTORY(Int32, INT32)
PRIMITIVE_FACTORY(Int64, INT64)
PRIMITIVE_FACTORY(Int96, INT96)
PRIMITIVE_FACTORY(Float, FLOAT)
PRIMITIVE_FACTORY(Double, DOUBLE)
PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)

void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
                                int indent_width = 2);

}  // namespace schema

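// Example (editor's addition; a minimal sketch, not part of the upstream
// header): building a small schema with GroupNode::Make and the factory
// helpers above, then printing it. The field names are illustrative.
//
//   #include <iostream>
//   #include "parquet/schema.h"
//
//   using parquet::Repetition;
//   using namespace parquet::schema;
//
//   int main() {
//     NodeVector fields;
//     fields.push_back(Int64("id", Repetition::REQUIRED));
//     fields.push_back(ByteArray("name", Repetition::OPTIONAL));
//     fields.push_back(Double("score"));  // repetition defaults to OPTIONAL
//     NodePtr root = GroupNode::Make("schema", Repetition::REQUIRED, fields);
//     PrintSchema(root.get(), std::cout);
//     return 0;
//   }
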
// The ColumnDescriptor encapsulates information necessary to interpret
// primitive column data in the context of a particular schema. We have to
// examine the node structure of a column's path to the root in the schema tree
// to be able to reassemble the nested structure from the repetition and
// definition levels.
class PARQUET_EXPORT ColumnDescriptor {
 public:
  ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
                   int16_t max_repetition_level,
                   const SchemaDescriptor* schema_descr = NULLPTR);

  bool Equals(const ColumnDescriptor& other) const;

  int16_t max_definition_level() const { return max_definition_level_; }

  int16_t max_repetition_level() const { return max_repetition_level_; }

  Type::type physical_type() const { return primitive_node_->physical_type(); }

  ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }

  const std::shared_ptr<const LogicalType>& logical_type() const {
    return primitive_node_->logical_type();
  }

  ColumnOrder column_order() const { return primitive_node_->column_order(); }

  SortOrder::type sort_order() const {
    auto la = logical_type();
    auto pt = physical_type();
    return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
  }

  const std::string& name() const { return primitive_node_->name(); }

  const std::shared_ptr<schema::ColumnPath> path() const;

  const schema::NodePtr& schema_node() const { return node_; }

  std::string ToString() const;

  int type_length() const;

  int type_precision() const;

  int type_scale() const;

 private:
  schema::NodePtr node_;
  const schema::PrimitiveNode* primitive_node_;

  int16_t max_definition_level_;
  int16_t max_repetition_level_;
};

// Container for the converted Parquet schema with computed information from
// the schema analysis needed for file reading
//
// * Column index to Node
// * Max repetition / definition levels for each primitive node
//
// The ColumnDescriptor objects produced by this class can be used to assist in
// the reconstruction of fully materialized data structures from the
// repetition-definition level encoding of nested data
//
// TODO(wesm): this object can be recomputed from a Schema
class PARQUET_EXPORT SchemaDescriptor {
 public:
  SchemaDescriptor() {}
  ~SchemaDescriptor() {}

  // Analyze the schema
  void Init(std::unique_ptr<schema::Node> schema);
  void Init(schema::NodePtr schema);

  const ColumnDescriptor* Column(int i) const;

  // Get the index of a column by its dotstring path, or negative value if not found.
  // If several columns share the same dotstring path, it is unspecified which one
  // is returned.
  int ColumnIndex(const std::string& node_path) const;
  // Get the index of a column by its node, or negative value if not found.
  int ColumnIndex(const schema::Node& node) const;

  bool Equals(const SchemaDescriptor& other) const;

  // The number of physical columns appearing in the file
  int num_columns() const { return static_cast<int>(leaves_.size()); }

  const schema::NodePtr& schema_root() const { return schema_; }

  const schema::GroupNode* group_node() const { return group_node_; }

  // Returns the root (child of the schema root) node of the leaf (column) node
  const schema::Node* GetColumnRoot(int i) const;

  const std::string& name() const { return group_node_->name(); }

  std::string ToString() const;

  void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);

  /// \brief Return column index corresponding to a particular
  /// PrimitiveNode. Returns -1 if not found
  int GetColumnIndex(const schema::PrimitiveNode& node) const;

  /// \brief Return true if any field or their children have REPEATED repetition
  /// type
  bool HasRepeatedFields() const;

 private:
  friend class ColumnDescriptor;

  // Root node of the schema tree
  schema::NodePtr schema_;
  // Root node cast to a GroupNode
  const schema::GroupNode* group_node_;

  void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
                 int16_t max_rep_level, const schema::NodePtr& base);

  // Result of leaf node / tree analysis
  std::vector<ColumnDescriptor> leaves_;

  std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;

  // Mapping between leaf nodes and root group of leaf (first node
  // below the schema's root group)
  //
  // For example, the leaf `a.b.c.d` would have a link back to `a`
  //
  // -- a  <------
  // -- -- b     |
  // -- -- -- c  |
  // -- -- -- -- d
  std::unordered_map<int, schema::NodePtr> leaf_to_base_;

  // Mapping between ColumnPath DotString to the leaf index
  std::unordered_multimap<std::string, int> leaf_to_idx_;
};

}  // namespace parquet
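// Example (editor's addition; a minimal sketch, not from the upstream header):
// analyzing a schema with SchemaDescriptor and walking its leaf columns.
//
//   using parquet::SchemaDescriptor;
//   using namespace parquet::schema;
//
//   NodePtr root = GroupNode::Make(
//       "schema", parquet::Repetition::REQUIRED,
//       {Int32("id", parquet::Repetition::REQUIRED), ByteArray("name")});
//   SchemaDescriptor descr;
//   descr.Init(root);
//   for (int i = 0; i < descr.num_columns(); ++i) {
//     const parquet::ColumnDescriptor* col = descr.Column(i);
//     // Inspect col->name(), col->physical_type(),
//     // col->max_definition_level(), col->max_repetition_level(), ...
//   }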
367
.venv/Lib/site-packages/pyarrow/include/parquet/statistics.h
Normal file
@ -0,0 +1,367 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>

#include "parquet/platform.h"
#include "parquet/types.h"

namespace arrow {

class Array;
class BinaryArray;

}  // namespace arrow

namespace parquet {

class ColumnDescriptor;

// ----------------------------------------------------------------------
// Value comparator interfaces

/// \brief Base class for value comparators. Generally used with
/// TypedComparator<T>
class PARQUET_EXPORT Comparator {
 public:
  virtual ~Comparator() {}

  /// \brief Create a comparator explicitly from physical type and
  /// sort order
  /// \param[in] physical_type the physical type for the typed
  /// comparator
  /// \param[in] sort_order either SortOrder::SIGNED or
  /// SortOrder::UNSIGNED
  /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
  static std::shared_ptr<Comparator> Make(Type::type physical_type,
                                          SortOrder::type sort_order,
                                          int type_length = -1);

  /// \brief Create typed comparator inferring default sort order from
  /// ColumnDescriptor
  /// \param[in] descr the Parquet column schema
  static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
};

/// \brief Interface for comparison of physical types according to the
/// semantics of a particular logical type.
template <typename DType>
class TypedComparator : public Comparator {
 public:
  using T = typename DType::c_type;

  /// \brief Scalar comparison of two elements; returns true if the first
  /// is strictly less than the second
  virtual bool Compare(const T& a, const T& b) = 0;

  /// \brief Compute maximum and minimum elements in a batch of
  /// elements without any nulls
  virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;

  /// \brief Compute minimum and maximum elements from an Arrow array. Only
  /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
  /// / arrow::BinaryArray
  virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;

  /// \brief Compute maximum and minimum elements in a batch of
  /// elements with an accompanying bitmap indicating which elements are
  /// included (bit set) and excluded (bit not set)
  ///
  /// \param[in] values the sequence of values
  /// \param[in] length the length of the sequence
  /// \param[in] valid_bits a bitmap indicating which elements are
  /// included (1) or excluded (0)
  /// \param[in] valid_bits_offset the bit offset into the bitmap of
  /// the first element in the sequence
  virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
                                          const uint8_t* valid_bits,
                                          int64_t valid_bits_offset) = 0;
};

/// \brief Typed version of Comparator::Make
template <typename DType>
std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
                                                       SortOrder::type sort_order,
                                                       int type_length = -1) {
  return std::static_pointer_cast<TypedComparator<DType>>(
      Comparator::Make(physical_type, sort_order, type_length));
}

/// \brief Typed version of Comparator::Make
template <typename DType>
std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
  return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
}

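// Example (editor's addition; a minimal sketch, not from the upstream header):
// building a typed comparator and computing a min/max pair.
//
//   using parquet::Int32Type;
//   auto cmp = parquet::MakeComparator<Int32Type>(parquet::Type::INT32,
//                                                 parquet::SortOrder::SIGNED);
//   int32_t values[] = {5, -1, 9, 3};
//   bool less = cmp->Compare(values[1], values[0]);            // true: -1 < 5
//   std::pair<int32_t, int32_t> min_max = cmp->GetMinMax(values, 4);
//   // min_max.first == -1, min_max.second == 9
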
// ----------------------------------------------------------------------

/// \brief Structure representing encoded statistics to be written to
/// and read from Parquet serialized metadata
class PARQUET_EXPORT EncodedStatistics {
  std::shared_ptr<std::string> max_, min_;
  bool is_signed_ = false;

 public:
  EncodedStatistics()
      : max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}

  const std::string& max() const { return *max_; }
  const std::string& min() const { return *min_; }

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  // From parquet-mr
  // Don't write stats larger than the max size rather than truncating. The
  // rationale is that some engines may use the minimum value in the page as
  // the true minimum for aggregations and there is no way to mark that a
  // value has been truncated and is a lower bound and not in the page.
  void ApplyStatSizeLimits(size_t length) {
    if (max_->length() > length) {
      has_max = false;
    }
    if (min_->length() > length) {
      has_min = false;
    }
  }

  bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  bool is_signed() const { return is_signed_; }

  void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

  EncodedStatistics& set_max(const std::string& value) {
    *max_ = value;
    has_max = true;
    return *this;
  }

  EncodedStatistics& set_min(const std::string& value) {
    *min_ = value;
    has_min = true;
    return *this;
  }

  EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }
};

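// Example (editor's addition; a minimal sketch, not from the upstream header):
// populating encoded statistics and dropping oversized min/max values before
// serialization.
//
//   parquet::EncodedStatistics stats;
//   stats.set_min("aardvark").set_max("zebra").set_null_count(0);
//   stats.ApplyStatSizeLimits(4);  // both values exceed 4 bytes
//   // stats.has_min == false and stats.has_max == false now, but
//   // stats.is_set() is still true because has_null_count is set.
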
/// \brief Base type for computing column statistics while writing a file
class PARQUET_EXPORT Statistics {
 public:
  virtual ~Statistics() {}

  /// \brief Create a new statistics instance given a column schema
  /// definition
  /// \param[in] descr the column schema
  /// \param[in] pool a memory pool to use for any memory allocations, optional
  static std::shared_ptr<Statistics> Make(
      const ColumnDescriptor* descr,
      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  /// \brief Create a new statistics instance given a column schema
  /// definition and pre-existing state
  /// \param[in] descr the column schema
  /// \param[in] encoded_min the encoded minimum value
  /// \param[in] encoded_max the encoded maximum value
  /// \param[in] num_values total number of values
  /// \param[in] null_count number of null values
  /// \param[in] distinct_count number of distinct values
  /// \param[in] has_min_max whether the min/max statistics are set
  /// \param[in] has_null_count whether the null_count statistics are set
  /// \param[in] has_distinct_count whether the distinct_count statistics are set
  /// \param[in] pool a memory pool to use for any memory allocations, optional
  static std::shared_ptr<Statistics> Make(
      const ColumnDescriptor* descr, const std::string& encoded_min,
      const std::string& encoded_max, int64_t num_values, int64_t null_count,
      int64_t distinct_count, bool has_min_max, bool has_null_count,
      bool has_distinct_count,
      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  /// \brief Return true if the count of null values is set
  virtual bool HasNullCount() const = 0;

  /// \brief The number of null values; may not be set
  virtual int64_t null_count() const = 0;

  /// \brief Return true if the count of distinct values is set
  virtual bool HasDistinctCount() const = 0;

  /// \brief The number of distinct values; may not be set
  virtual int64_t distinct_count() const = 0;

  /// \brief The total number of values in the column
  virtual int64_t num_values() const = 0;

  /// \brief Return true if the min and max statistics are set. Obtain
  /// with TypedStatistics<T>::min and max
  virtual bool HasMinMax() const = 0;

  /// \brief Reset state of object to initial (no data observed) state
  virtual void Reset() = 0;

  /// \brief Plain-encoded minimum value
  virtual std::string EncodeMin() const = 0;

  /// \brief Plain-encoded maximum value
  virtual std::string EncodeMax() const = 0;

  /// \brief The finalized encoded form of the statistics for transport
  virtual EncodedStatistics Encode() = 0;

  /// \brief The physical type of the column schema
  virtual Type::type physical_type() const = 0;

  /// \brief The full type descriptor from the column schema
  virtual const ColumnDescriptor* descr() const = 0;

  /// \brief Check two Statistics for equality
  virtual bool Equals(const Statistics& other) const = 0;

 protected:
  static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
                                          const void* max, int64_t num_values,
                                          int64_t null_count, int64_t distinct_count);
};

/// \brief A typed implementation of Statistics
template <typename DType>
class TypedStatistics : public Statistics {
 public:
  using T = typename DType::c_type;

  /// \brief The current minimum value
  virtual const T& min() const = 0;

  /// \brief The current maximum value
  virtual const T& max() const = 0;

  /// \brief Update state with state of another Statistics object
  virtual void Merge(const TypedStatistics<DType>& other) = 0;

  /// \brief Batch statistics update
  virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;

  /// \brief Batch statistics update with supplied validity bitmap
  /// \param[in] values pointer to column values
  /// \param[in] valid_bits pointer to bitmap representing if values are non-null.
  /// \param[in] valid_bits_offset offset into valid_bits where the slice of
  ///            data begins.
  /// \param[in] num_spaced_values the length of values in values/valid_bits to inspect
  ///            when calculating statistics. This can be smaller than
  ///            num_not_null+num_null as num_null can include nulls
  ///            from parents while num_spaced_values does not.
  /// \param[in] num_not_null number of values that are not null.
  /// \param[in] num_null number of values that are null.
  virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
                            int64_t valid_bits_offset, int64_t num_spaced_values,
                            int64_t num_not_null, int64_t num_null) = 0;

  /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
  /// conversion to a primitive Parquet C type. Only implemented for certain
  /// Parquet type / Arrow type combinations like BYTE_ARRAY /
  /// arrow::BinaryArray
  ///
  /// If update_counts is true then the null_count and num_values will be updated
  /// based on the null_count of values. Set to false if these are updated
  /// elsewhere (e.g. when updating a dictionary where the counts are taken from
  /// the indices and not the values)
  virtual void Update(const ::arrow::Array& values, bool update_counts = true) = 0;

  /// \brief Set min and max values to particular values
  virtual void SetMinMax(const T& min, const T& max) = 0;

  /// \brief Increments the null count directly
  /// Use Update to extract the null count from data. Use this if you determine
  /// the null count through some other means (e.g. dictionary arrays where the
  /// null count is determined from the indices)
  virtual void IncrementNullCount(int64_t n) = 0;

  /// \brief Increments the number of values directly
  /// The same note on IncrementNullCount applies here
  virtual void IncrementNumValues(int64_t n) = 0;
};

using BoolStatistics = TypedStatistics<BooleanType>;
using Int32Statistics = TypedStatistics<Int32Type>;
using Int64Statistics = TypedStatistics<Int64Type>;
using FloatStatistics = TypedStatistics<FloatType>;
using DoubleStatistics = TypedStatistics<DoubleType>;
using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
using FLBAStatistics = TypedStatistics<FLBAType>;

/// \brief Typed version of Statistics::Make
template <typename DType>
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
    const ColumnDescriptor* descr,
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
  return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
}

/// \brief Create Statistics initialized to a particular state
/// \param[in] min the minimum value
/// \param[in] max the maximum value
/// \param[in] num_values number of values
/// \param[in] null_count number of null values
/// \param[in] distinct_count number of distinct values
template <typename DType>
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
                                                       const typename DType::c_type& max,
                                                       int64_t num_values,
                                                       int64_t null_count,
                                                       int64_t distinct_count) {
  return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
      DType::type_num, &min, &max, num_values, null_count, distinct_count));
}

/// \brief Typed version of Statistics::Make
template <typename DType>
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
    const ColumnDescriptor* descr, const std::string& encoded_min,
    const std::string& encoded_max, int64_t num_values, int64_t null_count,
    int64_t distinct_count, bool has_min_max, bool has_null_count,
    bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
  return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
      descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
      has_min_max, has_null_count, has_distinct_count, pool));
}

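// Example (editor's addition; a minimal sketch, not from the upstream header):
// creating typed statistics from known values and encoding them for metadata.
//
//   using parquet::Int64Type;
//   auto stats = parquet::MakeStatistics<Int64Type>(
//       /*min=*/int64_t{-10}, /*max=*/int64_t{100}, /*num_values=*/50,
//       /*null_count=*/2, /*distinct_count=*/0);
//   parquet::EncodedStatistics encoded = stats->Encode();
//   // encoded.min() and encoded.max() now hold the plain-encoded bounds.
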
} // namespace parquet
299
.venv/Lib/site-packages/pyarrow/include/parquet/stream_reader.h
Normal file
@ -0,0 +1,299 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <array>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "arrow/util/optional.h"
#include "parquet/column_reader.h"
#include "parquet/file_reader.h"
#include "parquet/stream_writer.h"

namespace parquet {

/// \brief A class for reading Parquet files using an input stream type API.
///
/// The values read must be of the correct type, i.e. the type must
/// match the file schema exactly, otherwise a ParquetException will be
/// thrown.
///
/// The user must explicitly advance to the next row using the
/// EndRow() function or EndRow input manipulator.
///
/// Required and optional fields are supported:
/// - Required fields are read using operator>>(T)
/// - Optional fields are read with
///   operator>>(arrow::util::optional<T>)
///
/// Note that operator>>(arrow::util::optional<T>) can be used to read
/// required fields.
///
/// Similarly operator>>(T) can be used to read optional fields.
/// However, if the value is not present then a ParquetException will
/// be raised.
///
/// Currently there is no support for repeated fields.
///
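/// Example usage (editor's addition; a minimal sketch, not from the upstream
/// header, assuming "data.parquet" exists with a matching schema of one
/// required int64 column and one optional string column):
///
/// \code{.cpp}
/// parquet::StreamReader stream{
///     parquet::ParquetFileReader::OpenFile("data.parquet")};
/// int64_t id;
/// parquet::StreamReader::optional<std::string> name;
/// while (!stream.eof()) {
///   stream >> id >> name >> parquet::EndRow;
/// }
/// \endcode
///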
class PARQUET_EXPORT StreamReader {
 public:
  template <typename T>
  using optional = ::arrow::util::optional<T>;

  // N.B. Default constructed objects are not usable. This
  // constructor is provided so that the object may be move
  // assigned afterwards.
  StreamReader() = default;

  explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);

  ~StreamReader() = default;

  bool eof() const { return eof_; }

  int current_column() const { return column_index_; }

  int64_t current_row() const { return current_row_; }

  int num_columns() const;

  int64_t num_rows() const;

  // Moving is possible.
  StreamReader(StreamReader&&) = default;
  StreamReader& operator=(StreamReader&&) = default;

  // Copying is not allowed.
  StreamReader(const StreamReader&) = delete;
  StreamReader& operator=(const StreamReader&) = delete;

  StreamReader& operator>>(bool& v);

  StreamReader& operator>>(int8_t& v);

  StreamReader& operator>>(uint8_t& v);

  StreamReader& operator>>(int16_t& v);

  StreamReader& operator>>(uint16_t& v);

  StreamReader& operator>>(int32_t& v);

  StreamReader& operator>>(uint32_t& v);

  StreamReader& operator>>(int64_t& v);

  StreamReader& operator>>(uint64_t& v);

  StreamReader& operator>>(std::chrono::milliseconds& v);

  StreamReader& operator>>(std::chrono::microseconds& v);

  StreamReader& operator>>(float& v);

  StreamReader& operator>>(double& v);

  StreamReader& operator>>(char& v);

  template <int N>
  StreamReader& operator>>(char (&v)[N]) {
    ReadFixedLength(v, N);
    return *this;
  }

  template <std::size_t N>
  StreamReader& operator>>(std::array<char, N>& v) {
    ReadFixedLength(v.data(), static_cast<int>(N));
    return *this;
  }

  // N.B. Cannot allow for reading to an arbitrary char pointer as the
  // length cannot be verified. Also it would overshadow the
  // char[N] input operator.
  // StreamReader& operator>>(char * v);

  StreamReader& operator>>(std::string& v);

  // Input operators for optional fields.

  StreamReader& operator>>(optional<bool>& v);

  StreamReader& operator>>(optional<int8_t>& v);

  StreamReader& operator>>(optional<uint8_t>& v);

  StreamReader& operator>>(optional<int16_t>& v);

  StreamReader& operator>>(optional<uint16_t>& v);

  StreamReader& operator>>(optional<int32_t>& v);

  StreamReader& operator>>(optional<uint32_t>& v);

  StreamReader& operator>>(optional<int64_t>& v);

  StreamReader& operator>>(optional<uint64_t>& v);

  StreamReader& operator>>(optional<float>& v);

  StreamReader& operator>>(optional<double>& v);

  StreamReader& operator>>(optional<std::chrono::milliseconds>& v);

  StreamReader& operator>>(optional<std::chrono::microseconds>& v);

  StreamReader& operator>>(optional<char>& v);

  StreamReader& operator>>(optional<std::string>& v);

  template <std::size_t N>
  StreamReader& operator>>(optional<std::array<char, N>>& v) {
    CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
    FixedLenByteArray flba;
    if (ReadOptional(&flba)) {
      v = std::array<char, N>{};
      std::memcpy(v->data(), flba.ptr, N);
    } else {
      v.reset();
    }
    return *this;
  }

  /// \brief Terminate current row and advance to next one.
  /// \throws ParquetException if all columns in the row were not
  /// read or skipped.
  void EndRow();

  /// \brief Skip the data in the next columns.
  /// If the number of columns exceeds the columns remaining on the
  /// current row then skipping is terminated - it does _not_ continue
  /// skipping columns on the next row.
  /// Skipping of columns still requires the use of 'EndRow' even if all
  /// remaining columns were skipped.
  /// \return Number of columns actually skipped.
  int64_t SkipColumns(int64_t num_columns_to_skip);

  /// \brief Skip the data in the next rows.
  /// Skipping of rows is not allowed if reading of data for the
  /// current row is not finished.
  /// Skipping of rows will be terminated if the end of file is
  /// reached.
  /// \return Number of rows actually skipped.
  int64_t SkipRows(int64_t num_rows_to_skip);

 protected:
  [[noreturn]] void ThrowReadFailedException(
      const std::shared_ptr<schema::PrimitiveNode>& node);

  template <typename ReaderType, typename T>
  void Read(T* v) {
    const auto& node = nodes_[column_index_];
    auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
    int16_t def_level;
    int16_t rep_level;
    int64_t values_read;

    reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);

    if (values_read != 1) {
      ThrowReadFailedException(node);
    }
  }

  template <typename ReaderType, typename ReadType, typename T>
  void Read(T* v) {
    const auto& node = nodes_[column_index_];
    auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
    int16_t def_level;
    int16_t rep_level;
    ReadType tmp;
    int64_t values_read;

    reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);

    if (values_read == 1) {
      *v = tmp;
    } else {
      ThrowReadFailedException(node);
    }
  }

  template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
  void ReadOptional(optional<T>* v) {
    const auto& node = nodes_[column_index_];
    auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
    int16_t def_level;
    int16_t rep_level;
    ReadType tmp;
    int64_t values_read;

    reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);

    if (values_read == 1) {
      *v = T(tmp);
    } else if ((values_read == 0) && (def_level == 0)) {
      v->reset();
    } else {
      ThrowReadFailedException(node);
    }
  }

  void ReadFixedLength(char* ptr, int len);

  void Read(ByteArray* v);

  void Read(FixedLenByteArray* v);

  bool ReadOptional(ByteArray* v);

  bool ReadOptional(FixedLenByteArray* v);

  void NextRowGroup();

  void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
                   int length = 0);

  void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);

  void SetEof();

 private:
  std::unique_ptr<ParquetFileReader> file_reader_;
  std::shared_ptr<FileMetaData> file_metadata_;
  std::shared_ptr<RowGroupReader> row_group_reader_;
  std::vector<std::shared_ptr<ColumnReader>> column_readers_;
  std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;

  bool eof_{true};
  int row_group_index_{0};
  int column_index_{0};
  int64_t current_row_{0};
  int64_t row_group_row_offset_{0};

  static constexpr int64_t kBatchSizeOne = 1;
};

PARQUET_EXPORT
StreamReader& operator>>(StreamReader&, EndRowType);

}  // namespace parquet
243
.venv/Lib/site-packages/pyarrow/include/parquet/stream_writer.h
Normal file
@ -0,0 +1,243 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <array>
#include <chrono>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "arrow/util/optional.h"
#include "arrow/util/string_view.h"
#include "parquet/column_writer.h"
#include "parquet/file_writer.h"

namespace parquet {

/// \brief A class for writing Parquet files using an output stream type API.
///
/// The values given must be of the correct type, i.e. the type must
/// match the file schema exactly, otherwise a ParquetException will be
/// thrown.
///
/// The user must explicitly indicate the end of the row using the
/// EndRow() function or EndRow output manipulator.
///
/// A maximum row group size can be configured; the default size is
/// 512MB. Alternatively the row group size can be set to zero and the
/// user can create new row groups by calling the EndRowGroup()
/// function or using the EndRowGroup output manipulator.
///
/// Required and optional fields are supported:
/// - Required fields are written using operator<<(T)
/// - Optional fields are written using
///   operator<<(arrow::util::optional<T>).
///
/// Note that operator<<(T) can be used to write optional fields.
///
/// Similarly, operator<<(arrow::util::optional<T>) can be used to
/// write required fields. However, if the optional parameter does not
/// have a value (i.e. it is nullopt) then a ParquetException will be
/// raised.
///
/// Currently there is no support for repeated fields.
///
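/// Example usage (editor's addition; a minimal sketch, not from the upstream
/// header, assuming `file_writer` is a std::unique_ptr<ParquetFileWriter>
/// built elsewhere from a schema of one required int64 column and one
/// optional string column):
///
/// \code{.cpp}
/// parquet::StreamWriter stream{std::move(file_writer)};
/// stream << int64_t{42} << std::string("alice") << parquet::EndRow;
/// stream << int64_t{43}
///        << parquet::StreamWriter::optional<std::string>{}  // null value
///        << parquet::EndRow;
/// stream << parquet::EndRowGroup;
/// \endcode
///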
class PARQUET_EXPORT StreamWriter {
 public:
  template <typename T>
  using optional = ::arrow::util::optional<T>;

  // N.B. Default constructed objects are not usable. This
  // constructor is provided so that the object may be move
  // assigned afterwards.
  StreamWriter() = default;

  explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);

  ~StreamWriter() = default;

  static void SetDefaultMaxRowGroupSize(int64_t max_size);

  void SetMaxRowGroupSize(int64_t max_size);

  int current_column() const { return column_index_; }

  int64_t current_row() const { return current_row_; }

  int num_columns() const;

  // Moving is possible.
  StreamWriter(StreamWriter&&) = default;
  StreamWriter& operator=(StreamWriter&&) = default;

  // Copying is not allowed.
  StreamWriter(const StreamWriter&) = delete;
  StreamWriter& operator=(const StreamWriter&) = delete;

  /// \brief Output operators for required fields.
  /// These can also be used for optional fields when a value must be set.
  StreamWriter& operator<<(bool v);

  StreamWriter& operator<<(int8_t v);

  StreamWriter& operator<<(uint8_t v);

  StreamWriter& operator<<(int16_t v);

  StreamWriter& operator<<(uint16_t v);

  StreamWriter& operator<<(int32_t v);

  StreamWriter& operator<<(uint32_t v);

  StreamWriter& operator<<(int64_t v);

  StreamWriter& operator<<(uint64_t v);

  StreamWriter& operator<<(const std::chrono::milliseconds& v);

  StreamWriter& operator<<(const std::chrono::microseconds& v);

  StreamWriter& operator<<(float v);

  StreamWriter& operator<<(double v);

  StreamWriter& operator<<(char v);

  /// \brief Helper class to write fixed length strings.
  /// This is useful as the standard string view (such as
  /// arrow::util::string_view) is for variable length data.
  struct PARQUET_EXPORT FixedStringView {
    FixedStringView() = default;

    explicit FixedStringView(const char* data_ptr);

    FixedStringView(const char* data_ptr, std::size_t data_len);

    const char* data{NULLPTR};
    std::size_t size{0};
  };

  /// \brief Output operators for fixed length strings.
  template <int N>
  StreamWriter& operator<<(const char (&v)[N]) {
    return WriteFixedLength(v, N);
  }
  template <std::size_t N>
  StreamWriter& operator<<(const std::array<char, N>& v) {
    return WriteFixedLength(v.data(), N);
  }
  StreamWriter& operator<<(FixedStringView v);

  /// \brief Output operators for variable length strings.
  StreamWriter& operator<<(const char* v);
  StreamWriter& operator<<(const std::string& v);
  StreamWriter& operator<<(::arrow::util::string_view v);

  /// \brief Output operator for optional fields.
  template <typename T>
  StreamWriter& operator<<(const optional<T>& v) {
    if (v) {
      return operator<<(*v);
    }
    SkipOptionalColumn();
    return *this;
  }

  /// \brief Skip the next N columns of optional data. If there are
  /// fewer than N columns remaining then the excess columns are
  /// ignored.
  /// \throws ParquetException if there is an attempt to skip any
  /// required column.
  /// \return Number of columns actually skipped.
  int64_t SkipColumns(int num_columns_to_skip);

  /// \brief Terminate the current row and advance to next one.
  /// \throws ParquetException if all columns in the row were not
  /// written or skipped.
  void EndRow();

  /// \brief Terminate the current row group and create new one.
  void EndRowGroup();

 protected:
  template <typename WriterType, typename T>
  StreamWriter& Write(const T v) {
    auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));

    writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);

    if (max_row_group_size_ > 0) {
      row_group_size_ += writer->EstimatedBufferedValueBytes();
    }
    return *this;
  }

  StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len);

  StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);

  void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
                   int length = -1);

  /// \brief Skip the next column which must be optional.
  /// \throws ParquetException if the next column does not exist or is
  /// not optional.
  void SkipOptionalColumn();

  void WriteNullValue(ColumnWriter* writer);

 private:
  using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;

  struct null_deleter {
    void operator()(void*) {}
  };

  int32_t column_index_{0};
  int64_t current_row_{0};
  int64_t row_group_size_{0};
  int64_t max_row_group_size_{default_row_group_size_};

  std::unique_ptr<ParquetFileWriter> file_writer_;
  std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
  std::vector<node_ptr_type> nodes_;

  static constexpr int16_t kDefLevelZero = 0;
  static constexpr int16_t kDefLevelOne = 1;
  static constexpr int16_t kRepLevelZero = 0;
  static constexpr int64_t kBatchSizeOne = 1;

  static int64_t default_row_group_size_;
};

struct PARQUET_EXPORT EndRowType {};
constexpr EndRowType EndRow = {};

struct PARQUET_EXPORT EndRowGroupType {};
constexpr EndRowGroupType EndRowGroup = {};

PARQUET_EXPORT
StreamWriter& operator<<(StreamWriter&, EndRowType);

PARQUET_EXPORT
StreamWriter& operator<<(StreamWriter&, EndRowGroupType);

}  // namespace parquet
715
.venv/Lib/site-packages/pyarrow/include/parquet/test_util.h
Normal file
@ -0,0 +1,715 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This module defines utilities shared by the Parquet unit tests: random data
// generators, a mock page reader, and builders for data and dictionary pages.

#pragma once

#include <algorithm>
#include <cstring>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include <gtest/gtest.h>

#include "arrow/io/memory.h"
#include "arrow/testing/util.h"

#include "parquet/column_page.h"
#include "parquet/column_reader.h"
#include "parquet/column_writer.h"
#include "parquet/encoding.h"
#include "parquet/platform.h"

namespace parquet {

static constexpr int FLBA_LENGTH = 12;

inline bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
  return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
}

namespace test {

typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type, FloatType,
                         DoubleType, ByteArrayType, FLBAType>
    ParquetTypes;

class ParquetTestException : public parquet::ParquetException {
  using ParquetException::ParquetException;
};

const char* get_data_dir();
std::string get_bad_data_dir();

std::string get_data_file(const std::string& filename, bool is_good = true);

template <typename T>
static inline void assert_vector_equal(const std::vector<T>& left,
                                       const std::vector<T>& right) {
  ASSERT_EQ(left.size(), right.size());

  for (size_t i = 0; i < left.size(); ++i) {
    ASSERT_EQ(left[i], right[i]) << i;
  }
}

template <typename T>
static inline bool vector_equal(const std::vector<T>& left, const std::vector<T>& right) {
  if (left.size() != right.size()) {
    return false;
  }

  for (size_t i = 0; i < left.size(); ++i) {
    if (left[i] != right[i]) {
      std::cerr << "index " << i << " left was " << left[i] << " right was " << right[i]
                << std::endl;
      return false;
    }
  }

  return true;
}

template <typename T>
static std::vector<T> slice(const std::vector<T>& values, int start, int end) {
  if (end < start) {
    return std::vector<T>(0);
  }

  std::vector<T> out(end - start);
  for (int i = start; i < end; ++i) {
    out[i - start] = values[i];
  }
  return out;
}

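// Example (editor's addition; a minimal sketch, not from the upstream header):
// the helpers above compose naturally in tests, e.g. comparing a slice of a
// vector against expected values.
//
//   std::vector<int> values = {1, 2, 3, 4, 5};
//   std::vector<int> expected = {2, 3, 4};
//   bool ok = vector_equal(slice(values, 1, 4), expected);  // true
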
void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out);
void random_bools(int n, double p, uint32_t seed, bool* out);

template <typename T>
inline void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) {
  std::default_random_engine gen(seed);
  std::uniform_int_distribution<T> d(min_value, max_value);
  for (int i = 0; i < n; ++i) {
    out[i] = d(gen);
  }
}

template <>
inline void random_numbers(int n, uint32_t seed, float min_value, float max_value,
                           float* out) {
  std::default_random_engine gen(seed);
  std::uniform_real_distribution<float> d(min_value, max_value);
  for (int i = 0; i < n; ++i) {
    out[i] = d(gen);
  }
}

template <>
inline void random_numbers(int n, uint32_t seed, double min_value, double max_value,
                           double* out) {
  std::default_random_engine gen(seed);
  std::uniform_real_distribution<double> d(min_value, max_value);
  for (int i = 0; i < n; ++i) {
    out[i] = d(gen);
  }
}

void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value,
                          Int96* out);

void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out);

void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size,
                       int max_size);

void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size);

template <typename Type, typename Sequence>
std::shared_ptr<Buffer> EncodeValues(Encoding::type encoding, bool use_dictionary,
                                     const Sequence& values, int length,
                                     const ColumnDescriptor* descr) {
  auto encoder = MakeTypedEncoder<Type>(encoding, use_dictionary, descr);
  encoder->Put(values, length);
  return encoder->FlushValues();
}

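// Example (editor's addition; a minimal sketch, not from the upstream header):
// plain-encoding a batch of int32 values into a buffer. A suitable
// ColumnDescriptor pointer `descr` is assumed to come from the test schema.
//
//   std::vector<int32_t> values = {1, 2, 3};
//   std::shared_ptr<parquet::Buffer> buf = EncodeValues<parquet::Int32Type>(
//       parquet::Encoding::PLAIN, /*use_dictionary=*/false, values.data(),
//       static_cast<int>(values.size()), descr);
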
template <typename T>
static void InitValues(int num_values, std::vector<T>& values,
                       std::vector<uint8_t>& buffer) {
  random_numbers(num_values, 0, std::numeric_limits<T>::min(),
                 std::numeric_limits<T>::max(), values.data());
}

template <typename T>
static void InitDictValues(int num_values, int num_dicts, std::vector<T>& values,
                           std::vector<uint8_t>& buffer) {
  int repeat_factor = num_values / num_dicts;
  InitValues<T>(num_dicts, values, buffer);
  // add some repeated values
  for (int j = 1; j < repeat_factor; ++j) {
    for (int i = 0; i < num_dicts; ++i) {
      std::memcpy(&values[num_dicts * j + i], &values[i], sizeof(T));
    }
  }
  // Only num_dicts * repeat_factor values (which may be fewer than num_values)
  // have been populated so far; fill in the remainder.
  for (int i = num_dicts * repeat_factor; i < num_values; ++i) {
    std::memcpy(&values[i], &values[i - num_dicts * repeat_factor], sizeof(T));
  }
}

template <>
inline void InitDictValues<bool>(int num_values, int num_dicts, std::vector<bool>& values,
                                 std::vector<uint8_t>& buffer) {
  // No op for bool
}

class MockPageReader : public PageReader {
 public:
  explicit MockPageReader(const std::vector<std::shared_ptr<Page>>& pages)
      : pages_(pages), page_index_(0) {}

  std::shared_ptr<Page> NextPage() override {
    if (page_index_ == static_cast<int>(pages_.size())) {
      // EOS to consumer
      return std::shared_ptr<Page>(nullptr);
    }
    return pages_[page_index_++];
  }

  // No-op
  void set_max_page_header_size(uint32_t size) override {}

 private:
  std::vector<std::shared_ptr<Page>> pages_;
  int page_index_;
};

// TODO(wesm): this is only used for testing for now. Refactor to form part of
// primary file write path
template <typename Type>
class DataPageBuilder {
 public:
  using c_type = typename Type::c_type;

  // This class writes data and metadata to the passed inputs
  explicit DataPageBuilder(ArrowOutputStream* sink)
      : sink_(sink),
        num_values_(0),
        encoding_(Encoding::PLAIN),
        definition_level_encoding_(Encoding::RLE),
        repetition_level_encoding_(Encoding::RLE),
        have_def_levels_(false),
        have_rep_levels_(false),
        have_values_(false) {}

  void AppendDefLevels(const std::vector<int16_t>& levels, int16_t max_level,
                       Encoding::type encoding = Encoding::RLE) {
    AppendLevels(levels, max_level, encoding);

    num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
    definition_level_encoding_ = encoding;
    have_def_levels_ = true;
  }

  void AppendRepLevels(const std::vector<int16_t>& levels, int16_t max_level,
                       Encoding::type encoding = Encoding::RLE) {
    AppendLevels(levels, max_level, encoding);

    num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
    repetition_level_encoding_ = encoding;
    have_rep_levels_ = true;
  }

  void AppendValues(const ColumnDescriptor* d, const std::vector<c_type>& values,
                    Encoding::type encoding = Encoding::PLAIN) {
    std::shared_ptr<Buffer> values_sink = EncodeValues<Type>(
        encoding, false, values.data(), static_cast<int>(values.size()), d);
    PARQUET_THROW_NOT_OK(sink_->Write(values_sink->data(), values_sink->size()));

    num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
    encoding_ = encoding;
    have_values_ = true;
  }

  int32_t num_values() const { return num_values_; }

  Encoding::type encoding() const { return encoding_; }

  Encoding::type rep_level_encoding() const { return repetition_level_encoding_; }

  Encoding::type def_level_encoding() const { return definition_level_encoding_; }

 private:
  ArrowOutputStream* sink_;

  int32_t num_values_;
  Encoding::type encoding_;
  Encoding::type definition_level_encoding_;
  Encoding::type repetition_level_encoding_;

  bool have_def_levels_;
  bool have_rep_levels_;
  bool have_values_;

  // Used internally for both repetition and definition levels
  void AppendLevels(const std::vector<int16_t>& levels, int16_t max_level,
                    Encoding::type encoding) {
    if (encoding != Encoding::RLE) {
      ParquetException::NYI("only rle encoding currently implemented");
    }

    std::vector<uint8_t> encode_buffer(LevelEncoder::MaxBufferSize(
        Encoding::RLE, max_level, static_cast<int>(levels.size())));

    // We encode into separate memory from the output stream because the
    // RLE-encoded bytes have to be preceded in the stream by their absolute
    // size.
    LevelEncoder encoder;
    encoder.Init(encoding, max_level, static_cast<int>(levels.size()),
                 encode_buffer.data(), static_cast<int>(encode_buffer.size()));

    encoder.Encode(static_cast<int>(levels.size()), levels.data());

    int32_t rle_bytes = encoder.len();
    PARQUET_THROW_NOT_OK(
        sink_->Write(reinterpret_cast<const uint8_t*>(&rle_bytes), sizeof(int32_t)));
    PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes));
  }
};

template <>
inline void DataPageBuilder<BooleanType>::AppendValues(const ColumnDescriptor* d,
                                                       const std::vector<bool>& values,
                                                       Encoding::type encoding) {
  if (encoding != Encoding::PLAIN) {
    ParquetException::NYI("only plain encoding currently implemented");
  }

  auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN, false, d);
  dynamic_cast<BooleanEncoder*>(encoder.get())
      ->Put(values, static_cast<int>(values.size()));
  std::shared_ptr<Buffer> buffer = encoder->FlushValues();
  PARQUET_THROW_NOT_OK(sink_->Write(buffer->data(), buffer->size()));

  num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
  encoding_ = encoding;
  have_values_ = true;
}

template <typename Type>
static std::shared_ptr<DataPageV1> MakeDataPage(
    const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
    int num_vals, Encoding::type encoding, const uint8_t* indices, int indices_size,
    const std::vector<int16_t>& def_levels, int16_t max_def_level,
    const std::vector<int16_t>& rep_levels, int16_t max_rep_level) {
  int num_values = 0;

  auto page_stream = CreateOutputStream();
  test::DataPageBuilder<Type> page_builder(page_stream.get());

  if (!rep_levels.empty()) {
    page_builder.AppendRepLevels(rep_levels, max_rep_level);
  }
  if (!def_levels.empty()) {
    page_builder.AppendDefLevels(def_levels, max_def_level);
  }

  if (encoding == Encoding::PLAIN) {
    page_builder.AppendValues(d, values, encoding);
    num_values = std::max(page_builder.num_values(), num_vals);
  } else {  // DICTIONARY PAGES
    PARQUET_THROW_NOT_OK(page_stream->Write(indices, indices_size));
    num_values = std::max(page_builder.num_values(), num_vals);
  }

  PARQUET_ASSIGN_OR_THROW(auto buffer, page_stream->Finish());

  return std::make_shared<DataPageV1>(buffer, num_values, encoding,
                                      page_builder.def_level_encoding(),
                                      page_builder.rep_level_encoding(), buffer->size());
}

template <typename TYPE>
class DictionaryPageBuilder {
 public:
  typedef typename TYPE::c_type TC;
  static constexpr int TN = TYPE::type_num;
  using SpecializedEncoder = typename EncodingTraits<TYPE>::Encoder;

  // This class writes data and metadata to the passed inputs
  explicit DictionaryPageBuilder(const ColumnDescriptor* d)
      : num_dict_values_(0), have_values_(false) {
    auto encoder = MakeTypedEncoder<TYPE>(Encoding::PLAIN, true, d);
    dict_traits_ = dynamic_cast<DictEncoder<TYPE>*>(encoder.get());
    encoder_.reset(dynamic_cast<SpecializedEncoder*>(encoder.release()));
  }

  ~DictionaryPageBuilder() {}

  std::shared_ptr<Buffer> AppendValues(const std::vector<TC>& values) {
    int num_values = static_cast<int>(values.size());
    // Dictionary encoding
    encoder_->Put(values.data(), num_values);
    num_dict_values_ = dict_traits_->num_entries();
    have_values_ = true;
    return encoder_->FlushValues();
  }

  std::shared_ptr<Buffer> WriteDict() {
    std::shared_ptr<Buffer> dict_buffer =
        AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size());
    dict_traits_->WriteDict(dict_buffer->mutable_data());
    return dict_buffer;
  }

  int32_t num_values() const { return num_dict_values_; }

 private:
  DictEncoder<TYPE>* dict_traits_;
  std::unique_ptr<SpecializedEncoder> encoder_;
  int32_t num_dict_values_;
  bool have_values_;
};

template <>
inline DictionaryPageBuilder<BooleanType>::DictionaryPageBuilder(
    const ColumnDescriptor* d) {
  ParquetException::NYI("only plain encoding currently implemented for boolean");
}

template <>
inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::WriteDict() {
  ParquetException::NYI("only plain encoding currently implemented for boolean");
  return nullptr;
}

template <>
inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::AppendValues(
    const std::vector<TC>& values) {
  ParquetException::NYI("only plain encoding currently implemented for boolean");
  return nullptr;
}

template <typename Type>
inline static std::shared_ptr<DictionaryPage> MakeDictPage(
    const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
    const std::vector<int>& values_per_page, Encoding::type encoding,
    std::vector<std::shared_ptr<Buffer>>& rle_indices) {
  test::DictionaryPageBuilder<Type> page_builder(d);
  int num_pages = static_cast<int>(values_per_page.size());
  int value_start = 0;

  for (int i = 0; i < num_pages; i++) {
    rle_indices.push_back(page_builder.AppendValues(
        slice(values, value_start, value_start + values_per_page[i])));
    value_start += values_per_page[i];
  }

  auto buffer = page_builder.WriteDict();

  return std::make_shared<DictionaryPage>(buffer, page_builder.num_values(),
                                          Encoding::PLAIN);
}

// Given def/rep levels and values create multiple dict pages
template <typename Type>
inline static void PaginateDict(const ColumnDescriptor* d,
                                const std::vector<typename Type::c_type>& values,
                                const std::vector<int16_t>& def_levels,
                                int16_t max_def_level,
                                const std::vector<int16_t>& rep_levels,
                                int16_t max_rep_level, int num_levels_per_page,
                                const std::vector<int>& values_per_page,
                                std::vector<std::shared_ptr<Page>>& pages,
                                Encoding::type encoding = Encoding::RLE_DICTIONARY) {
  int num_pages = static_cast<int>(values_per_page.size());
  std::vector<std::shared_ptr<Buffer>> rle_indices;
  std::shared_ptr<DictionaryPage> dict_page =
      MakeDictPage<Type>(d, values, values_per_page, encoding, rle_indices);
  pages.push_back(dict_page);
  int def_level_start = 0;
  int def_level_end = 0;
  int rep_level_start = 0;
  int rep_level_end = 0;
  for (int i = 0; i < num_pages; i++) {
    if (max_def_level > 0) {
      def_level_start = i * num_levels_per_page;
      def_level_end = (i + 1) * num_levels_per_page;
    }
    if (max_rep_level > 0) {
      rep_level_start = i * num_levels_per_page;
      rep_level_end = (i + 1) * num_levels_per_page;
    }
    std::shared_ptr<DataPageV1> data_page = MakeDataPage<Int32Type>(
        d, {}, values_per_page[i], encoding, rle_indices[i]->data(),
        static_cast<int>(rle_indices[i]->size()),
        slice(def_levels, def_level_start, def_level_end), max_def_level,
        slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
    pages.push_back(data_page);
  }
}

// Given def/rep levels and values create multiple plain pages
template <typename Type>
static inline void PaginatePlain(const ColumnDescriptor* d,
|
||||
const std::vector<typename Type::c_type>& values,
|
||||
const std::vector<int16_t>& def_levels,
|
||||
int16_t max_def_level,
|
||||
const std::vector<int16_t>& rep_levels,
|
||||
int16_t max_rep_level, int num_levels_per_page,
|
||||
const std::vector<int>& values_per_page,
|
||||
std::vector<std::shared_ptr<Page>>& pages,
|
||||
Encoding::type encoding = Encoding::PLAIN) {
|
||||
int num_pages = static_cast<int>(values_per_page.size());
|
||||
int def_level_start = 0;
|
||||
int def_level_end = 0;
|
||||
int rep_level_start = 0;
|
||||
int rep_level_end = 0;
|
||||
int value_start = 0;
|
||||
for (int i = 0; i < num_pages; i++) {
|
||||
if (max_def_level > 0) {
|
||||
def_level_start = i * num_levels_per_page;
|
||||
def_level_end = (i + 1) * num_levels_per_page;
|
||||
}
|
||||
if (max_rep_level > 0) {
|
||||
rep_level_start = i * num_levels_per_page;
|
||||
rep_level_end = (i + 1) * num_levels_per_page;
|
||||
}
|
||||
std::shared_ptr<DataPage> page = MakeDataPage<Type>(
|
||||
d, slice(values, value_start, value_start + values_per_page[i]),
|
||||
values_per_page[i], encoding, nullptr, 0,
|
||||
slice(def_levels, def_level_start, def_level_end), max_def_level,
|
||||
slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
|
||||
pages.push_back(page);
|
||||
value_start += values_per_page[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Generates pages from randomly generated data
|
||||
template <typename Type>
|
||||
static inline int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_page,
|
||||
std::vector<int16_t>& def_levels,
|
||||
std::vector<int16_t>& rep_levels,
|
||||
std::vector<typename Type::c_type>& values,
|
||||
std::vector<uint8_t>& buffer,
|
||||
std::vector<std::shared_ptr<Page>>& pages,
|
||||
Encoding::type encoding = Encoding::PLAIN) {
|
||||
int num_levels = levels_per_page * num_pages;
|
||||
int num_values = 0;
|
||||
uint32_t seed = 0;
|
||||
int16_t zero = 0;
|
||||
int16_t max_def_level = d->max_definition_level();
|
||||
int16_t max_rep_level = d->max_repetition_level();
|
||||
std::vector<int> values_per_page(num_pages, levels_per_page);
|
||||
// Create definition levels
|
||||
if (max_def_level > 0) {
|
||||
def_levels.resize(num_levels);
|
||||
random_numbers(num_levels, seed, zero, max_def_level, def_levels.data());
|
||||
for (int p = 0; p < num_pages; p++) {
|
||||
int num_values_per_page = 0;
|
||||
for (int i = 0; i < levels_per_page; i++) {
|
||||
if (def_levels[i + p * levels_per_page] == max_def_level) {
|
||||
num_values_per_page++;
|
||||
num_values++;
|
||||
}
|
||||
}
|
||||
values_per_page[p] = num_values_per_page;
|
||||
}
|
||||
} else {
|
||||
num_values = num_levels;
|
||||
}
|
||||
// Create repetition levels
|
||||
if (max_rep_level > 0) {
|
||||
rep_levels.resize(num_levels);
|
||||
random_numbers(num_levels, seed, zero, max_rep_level, rep_levels.data());
|
||||
}
|
||||
// Create values
|
||||
values.resize(num_values);
|
||||
if (encoding == Encoding::PLAIN) {
|
||||
InitValues<typename Type::c_type>(num_values, values, buffer);
|
||||
PaginatePlain<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
|
||||
levels_per_page, values_per_page, pages);
|
||||
} else if (encoding == Encoding::RLE_DICTIONARY ||
|
||||
encoding == Encoding::PLAIN_DICTIONARY) {
|
||||
// Calls InitValues and repeats the data
|
||||
InitDictValues<typename Type::c_type>(num_values, levels_per_page, values, buffer);
|
||||
PaginateDict<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
|
||||
levels_per_page, values_per_page, pages);
|
||||
}
|
||||
|
||||
return num_values;
|
||||
}
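
// Editor's note: an illustrative sketch (not part of the original utility) of a
// typical MakePages call for a plain-encoded INT32 column. The descriptor is
// assumed to be supplied by the surrounding test fixture.
inline void ExampleMakeInt32Pages(const ColumnDescriptor* descr) {
  std::vector<int16_t> def_levels;
  std::vector<int16_t> rep_levels;
  std::vector<int32_t> values;
  std::vector<uint8_t> buffer;
  std::vector<std::shared_ptr<Page>> pages;
  // Two pages of 100 levels each; the return value is the number of non-null
  // values that were generated across all pages.
  int num_values = MakePages<Int32Type>(descr, /*num_pages=*/2,
                                        /*levels_per_page=*/100, def_levels,
                                        rep_levels, values, buffer, pages);
  (void)num_values;
}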

// ----------------------------------------------------------------------
// Test data generation

template <>
void inline InitValues<bool>(int num_values, std::vector<bool>& values,
                             std::vector<uint8_t>& buffer) {
  values = {};
  ::arrow::random_is_valid(num_values, 0.5, &values,
                           static_cast<int>(::arrow::random_seed()));
}

template <>
inline void InitValues<ByteArray>(int num_values, std::vector<ByteArray>& values,
                                  std::vector<uint8_t>& buffer) {
  int max_byte_array_len = 12;
  int num_bytes = static_cast<int>(max_byte_array_len + sizeof(uint32_t));
  size_t nbytes = num_values * num_bytes;
  buffer.resize(nbytes);
  random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len);
}

inline void InitWideByteArrayValues(int num_values, std::vector<ByteArray>& values,
                                    std::vector<uint8_t>& buffer, int min_len,
                                    int max_len) {
  int num_bytes = static_cast<int>(max_len + sizeof(uint32_t));
  size_t nbytes = num_values * num_bytes;
  buffer.resize(nbytes);
  random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len);
}

template <>
inline void InitValues<FLBA>(int num_values, std::vector<FLBA>& values,
                             std::vector<uint8_t>& buffer) {
  size_t nbytes = num_values * FLBA_LENGTH;
  buffer.resize(nbytes);
  random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data());
}

template <>
inline void InitValues<Int96>(int num_values, std::vector<Int96>& values,
                              std::vector<uint8_t>& buffer) {
  random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
                       std::numeric_limits<int32_t>::max(), values.data());
}

inline std::string TestColumnName(int i) {
  std::stringstream col_name;
  col_name << "column_" << i;
  return col_name.str();
}

// This class lives here because of its dependency on the InitValues specializations.
template <typename TestType>
class PrimitiveTypedTest : public ::testing::Test {
 public:
  using c_type = typename TestType::c_type;

  void SetUpSchema(Repetition::type repetition, int num_columns = 1) {
    std::vector<schema::NodePtr> fields;

    for (int i = 0; i < num_columns; ++i) {
      std::string name = TestColumnName(i);
      fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num,
                                                   ConvertedType::NONE, FLBA_LENGTH));
    }
    node_ = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
    schema_.Init(node_);
  }

  void GenerateData(int64_t num_values);
  void SetupValuesOut(int64_t num_values);
  void SyncValuesOut();

 protected:
  schema::NodePtr node_;
  SchemaDescriptor schema_;

  // Input buffers
  std::vector<c_type> values_;

  std::vector<int16_t> def_levels_;

  std::vector<uint8_t> buffer_;
  // Pointer to the values, needed as we cannot use std::vector<bool>::data()
  c_type* values_ptr_;
  std::vector<uint8_t> bool_buffer_;

  // Output buffers
  std::vector<c_type> values_out_;
  std::vector<uint8_t> bool_buffer_out_;
  c_type* values_out_ptr_;
};

template <typename TestType>
inline void PrimitiveTypedTest<TestType>::SyncValuesOut() {}

template <>
inline void PrimitiveTypedTest<BooleanType>::SyncValuesOut() {
  std::vector<uint8_t>::const_iterator source_iterator = bool_buffer_out_.begin();
  std::vector<c_type>::iterator destination_iterator = values_out_.begin();
  while (source_iterator != bool_buffer_out_.end()) {
    *destination_iterator++ = *source_iterator++ != 0;
  }
}

template <typename TestType>
inline void PrimitiveTypedTest<TestType>::SetupValuesOut(int64_t num_values) {
  values_out_.clear();
  values_out_.resize(num_values);
  values_out_ptr_ = values_out_.data();
}

template <>
inline void PrimitiveTypedTest<BooleanType>::SetupValuesOut(int64_t num_values) {
  values_out_.clear();
  values_out_.resize(num_values);

  bool_buffer_out_.clear();
  bool_buffer_out_.resize(num_values);
  // Write once to all values so we can copy it without getting Valgrind errors
  // about uninitialised values.
  std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true);
  values_out_ptr_ = reinterpret_cast<bool*>(bool_buffer_out_.data());
}

template <typename TestType>
inline void PrimitiveTypedTest<TestType>::GenerateData(int64_t num_values) {
  def_levels_.resize(num_values);
  values_.resize(num_values);

  InitValues<c_type>(static_cast<int>(num_values), values_, buffer_);
  values_ptr_ = values_.data();

  std::fill(def_levels_.begin(), def_levels_.end(), 1);
}

template <>
inline void PrimitiveTypedTest<BooleanType>::GenerateData(int64_t num_values) {
  def_levels_.resize(num_values);
  values_.resize(num_values);

  InitValues<c_type>(static_cast<int>(num_values), values_, buffer_);
  bool_buffer_.resize(num_values);
  std::copy(values_.begin(), values_.end(), bool_buffer_.begin());
  values_ptr_ = reinterpret_cast<bool*>(bool_buffer_.data());

  std::fill(def_levels_.begin(), def_levels_.end(), 1);
}

}  // namespace test
}  // namespace parquet
88
.venv/Lib/site-packages/pyarrow/include/parquet/type_fwd.h
Normal file
@ -0,0 +1,88 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

namespace parquet {

/// \brief Feature selection when writing Parquet files
///
/// `ParquetVersion::type` governs which data types are allowed and how they
/// are represented. For example, uint32_t data will be written differently
/// depending on this value (as INT64 for PARQUET_1_0, as UINT32 for other
/// versions).
///
/// However, some features - such as compression algorithms, encryption,
/// or the improved "v2" data page format - must be enabled separately in
/// ArrowWriterProperties.
struct ParquetVersion {
  enum type : int {
    /// Enable only pre-2.2 Parquet format features when writing
    ///
    /// This setting is useful for maximum compatibility with legacy readers.
    /// Note that logical types may still be emitted, as long as they have a
    /// corresponding converted type.
    PARQUET_1_0,

    /// DEPRECATED: Enable Parquet format 2.6 features
    ///
    /// This misleadingly named enum value is roughly similar to PARQUET_2_6.
    PARQUET_2_0 ARROW_DEPRECATED_ENUM_VALUE("use PARQUET_2_4 or PARQUET_2_6 "
                                            "for fine-grained feature selection"),

    /// Enable Parquet format 2.4 and earlier features when writing
    ///
    /// This enables UINT32 as well as logical types which don't have
    /// a corresponding converted type.
    ///
    /// Note: Parquet format 2.4.0 was released in October 2017.
    PARQUET_2_4,

    /// Enable Parquet format 2.6 and earlier features when writing
    ///
    /// This enables the NANOS time unit in addition to the PARQUET_2_4
    /// features.
    ///
    /// Note: Parquet format 2.6.0 was released in September 2018.
    PARQUET_2_6,

    /// Enable latest Parquet format 2.x features
    ///
    /// This value is equal to the greatest 2.x version supported by
    /// this library.
    PARQUET_2_LATEST = PARQUET_2_6
  };
};
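
// Editor's note (illustrative sketch, not part of the original header): the
// version is normally selected through the writer properties builder that is
// forward-declared below and defined in parquet/properties.h, e.g.:
//
//   parquet::WriterProperties::Builder builder;
//   builder.version(parquet::ParquetVersion::PARQUET_2_6);
//   std::shared_ptr<parquet::WriterProperties> props = builder.build();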

class FileMetaData;
class SchemaDescriptor;

class ReaderProperties;
class ArrowReaderProperties;

class WriterProperties;
class WriterPropertiesBuilder;
class ArrowWriterProperties;
class ArrowWriterPropertiesBuilder;

namespace arrow {

class FileWriter;
class FileReader;

}  // namespace arrow
}  // namespace parquet
758
.venv/Lib/site-packages/pyarrow/include/parquet/types.h
Normal file
@ -0,0 +1,758 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <memory>
#include <sstream>
#include <string>

#include "arrow/util/string_view.h"

#include "parquet/platform.h"
#include "parquet/type_fwd.h"
#include "parquet/windows_fixup.h"  // for OPTIONAL

namespace arrow {
namespace util {

class Codec;

}  // namespace util
}  // namespace arrow

namespace parquet {

// ----------------------------------------------------------------------
// Metadata enums to match Thrift metadata
//
// The reason we maintain our own enums is to avoid transitive dependency on
// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
// public API. After building parquet-cpp, you should not need to include
// Thrift headers in your application. This means some boilerplate to convert
// between our types and Parquet's Thrift types.
//
// We can also add special values like NONE to distinguish between metadata
// values being set and not set. As an example consider ConvertedType and
// CompressionCodec

// Mirrors parquet::Type
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7,
    // Should always be last element.
    UNDEFINED = 8
  };
};

// Mirrors parquet::ConvertedType
struct ConvertedType {
  enum type {
    NONE,  // Not a real converted type, but means no converted type is specified
    UTF8,
    MAP,
    MAP_KEY_VALUE,
    LIST,
    ENUM,
    DECIMAL,
    DATE,
    TIME_MILLIS,
    TIME_MICROS,
    TIMESTAMP_MILLIS,
    TIMESTAMP_MICROS,
    UINT_8,
    UINT_16,
    UINT_32,
    UINT_64,
    INT_8,
    INT_16,
    INT_32,
    INT_64,
    JSON,
    BSON,
    INTERVAL,
    // DEPRECATED INVALID ConvertedType for all-null data.
    // Only useful for reading legacy files written out by interim Parquet C++ releases.
    // For writing, always emit LogicalType::Null instead.
    // See PARQUET-1990.
    NA = 25,
    UNDEFINED = 26  // Not a real converted type; should always be last element
  };
};

// forward declaration
namespace format {

class LogicalType;

}  // namespace format

// Mirrors parquet::FieldRepetitionType
struct Repetition {
  enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 };
};

// Reference:
// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
// format/converter/ParquetMetadataConverter.java
// Sort order for page and column statistics. Types are associated with sort
// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
// aggregated using a sort order. As of parquet-format version 2.3.1, the
// order used to aggregate stats is always SIGNED and is not stored in the
// Parquet file. These stats are discarded for types that need unsigned.
// See PARQUET-686.
struct SortOrder {
  enum type { SIGNED, UNSIGNED, UNKNOWN };
};

namespace schema {

struct DecimalMetadata {
  bool isset;
  int32_t scale;
  int32_t precision;
};

}  // namespace schema

/// \brief Implementation of parquet.thrift LogicalType types.
class PARQUET_EXPORT LogicalType {
 public:
  struct Type {
    enum type {
      UNDEFINED = 0,  // Not a real logical type
      STRING = 1,
      MAP,
      LIST,
      ENUM,
      DECIMAL,
      DATE,
      TIME,
      TIMESTAMP,
      INTERVAL,
      INT,
      NIL,  // Thrift NullType: annotates data that is always null
      JSON,
      BSON,
      UUID,
      NONE  // Not a real logical type; should always be last element
    };
  };

  struct TimeUnit {
    enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
  };

  /// \brief If possible, return a logical type equivalent to the given legacy
  /// converted type (and decimal metadata if applicable).
  static std::shared_ptr<const LogicalType> FromConvertedType(
      const parquet::ConvertedType::type converted_type,
      const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
                                                                           -1});

  /// \brief Return the logical type represented by the Thrift intermediary object.
  static std::shared_ptr<const LogicalType> FromThrift(
      const parquet::format::LogicalType& thrift_logical_type);

  /// \brief Return the explicitly requested logical type.
  static std::shared_ptr<const LogicalType> String();
  static std::shared_ptr<const LogicalType> Map();
  static std::shared_ptr<const LogicalType> List();
  static std::shared_ptr<const LogicalType> Enum();
  static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
  static std::shared_ptr<const LogicalType> Date();
  static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
                                                 LogicalType::TimeUnit::unit time_unit);

  /// \brief Create a Timestamp logical type
  /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
  /// \param[in] time_unit the resolution of the timestamp
  /// \param[in] is_from_converted_type if true, the timestamp was generated
  /// by translating a legacy converted type of TIMESTAMP_MILLIS or
  /// TIMESTAMP_MICROS. Default is false.
  /// \param[in] force_set_converted_type if true, always set the
  /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
  /// metadata. Default is false
  static std::shared_ptr<const LogicalType> Timestamp(
      bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
      bool is_from_converted_type = false, bool force_set_converted_type = false);

  static std::shared_ptr<const LogicalType> Interval();
  static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);

  /// \brief Create a logical type for data that's always null
  ///
  /// Any physical type can be annotated with this logical type.
  static std::shared_ptr<const LogicalType> Null();

  static std::shared_ptr<const LogicalType> JSON();
  static std::shared_ptr<const LogicalType> BSON();
  static std::shared_ptr<const LogicalType> UUID();

  /// \brief Create a placeholder for when no logical type is specified
  static std::shared_ptr<const LogicalType> None();

  /// \brief Return true if this logical type is consistent with the given underlying
  /// physical type.
  bool is_applicable(parquet::Type::type primitive_type,
                     int32_t primitive_length = -1) const;

  /// \brief Return true if this logical type is equivalent to the given legacy converted
  /// type (and decimal metadata if applicable).
  bool is_compatible(parquet::ConvertedType::type converted_type,
                     parquet::schema::DecimalMetadata converted_decimal_metadata = {
                         false, -1, -1}) const;

  /// \brief If possible, return the legacy converted type (and decimal metadata if
  /// applicable) equivalent to this logical type.
  parquet::ConvertedType::type ToConvertedType(
      parquet::schema::DecimalMetadata* out_decimal_metadata) const;

  /// \brief Return a printable representation of this logical type.
  std::string ToString() const;

  /// \brief Return a JSON representation of this logical type.
  std::string ToJSON() const;

  /// \brief Return a serializable Thrift object for this logical type.
  parquet::format::LogicalType ToThrift() const;

  /// \brief Return true if the given logical type is equivalent to this logical type.
  bool Equals(const LogicalType& other) const;

  /// \brief Return the enumerated type of this logical type.
  LogicalType::Type::type type() const;

  /// \brief Return the appropriate sort order for this logical type.
  SortOrder::type sort_order() const;

  // Type checks ...
  bool is_string() const;
  bool is_map() const;
  bool is_list() const;
  bool is_enum() const;
  bool is_decimal() const;
  bool is_date() const;
  bool is_time() const;
  bool is_timestamp() const;
  bool is_interval() const;
  bool is_int() const;
  bool is_null() const;
  bool is_JSON() const;
  bool is_BSON() const;
  bool is_UUID() const;
  bool is_none() const;
  /// \brief Return true if this logical type is of a known type.
  bool is_valid() const;
  bool is_invalid() const;
  /// \brief Return true if this logical type is suitable for a schema GroupNode.
  bool is_nested() const;
  bool is_nonnested() const;
  /// \brief Return true if this logical type is included in the Thrift output for its
  /// node.
  bool is_serialized() const;

  LogicalType(const LogicalType&) = delete;
  LogicalType& operator=(const LogicalType&) = delete;
  virtual ~LogicalType() noexcept;

 protected:
  LogicalType();

  class Impl;
  std::unique_ptr<const Impl> impl_;
};
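
// Editor's note: an illustrative sketch (not part of the original header) of
// the factory methods declared above.
inline void LogicalTypeExample() {
  // A UTC-adjusted millisecond timestamp; it is compatible with the legacy
  // TIMESTAMP_MILLIS converted type and reports type() == Type::TIMESTAMP.
  auto ts = LogicalType::Timestamp(/*is_adjusted_to_utc=*/true,
                                   LogicalType::TimeUnit::MILLIS);
  // A decimal annotation with precision 10 and scale 2.
  auto dec = LogicalType::Decimal(/*precision=*/10, /*scale=*/2);
  (void)ts;
  (void)dec;
}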

/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
class PARQUET_EXPORT StringLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  StringLogicalType() = default;
};

/// \brief Allowed for group nodes only.
class PARQUET_EXPORT MapLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  MapLogicalType() = default;
};

/// \brief Allowed for group nodes only.
class PARQUET_EXPORT ListLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  ListLogicalType() = default;
};

/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
class PARQUET_EXPORT EnumLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  EnumLogicalType() = default;
};

/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
/// depending on the precision.
class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
  int32_t precision() const;
  int32_t scale() const;

 private:
  DecimalLogicalType() = default;
};

/// \brief Allowed for physical type INT32.
class PARQUET_EXPORT DateLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  DateLogicalType() = default;
};

/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
class PARQUET_EXPORT TimeLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
                                                 LogicalType::TimeUnit::unit time_unit);
  bool is_adjusted_to_utc() const;
  LogicalType::TimeUnit::unit time_unit() const;

 private:
  TimeLogicalType() = default;
};

/// \brief Allowed for physical type INT64.
class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
                                                 LogicalType::TimeUnit::unit time_unit,
                                                 bool is_from_converted_type = false,
                                                 bool force_set_converted_type = false);
  bool is_adjusted_to_utc() const;
  LogicalType::TimeUnit::unit time_unit() const;

  /// \brief If true, will not set LogicalType in Thrift metadata
  bool is_from_converted_type() const;

  /// \brief If true, will set ConvertedType for micros and millis
  /// resolution in legacy ConvertedType Thrift metadata
  bool force_set_converted_type() const;

 private:
  TimestampLogicalType() = default;
};

/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  IntervalLogicalType() = default;
};

/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
/// (for bit width 64).
class PARQUET_EXPORT IntLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
  int bit_width() const;
  bool is_signed() const;

 private:
  IntLogicalType() = default;
};

/// \brief Allowed for any physical type.
class PARQUET_EXPORT NullLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  NullLogicalType() = default;
};

/// \brief Allowed for physical type BYTE_ARRAY.
class PARQUET_EXPORT JSONLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  JSONLogicalType() = default;
};

/// \brief Allowed for physical type BYTE_ARRAY.
class PARQUET_EXPORT BSONLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  BSONLogicalType() = default;
};

/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
/// must encode raw UUID bytes.
class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  UUIDLogicalType() = default;
};

/// \brief Allowed for any physical type.
class PARQUET_EXPORT NoLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  NoLogicalType() = default;
};

// Internal API, for unrecognized logical types
class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
 public:
  static std::shared_ptr<const LogicalType> Make();

 private:
  UndefinedLogicalType() = default;
};

// Data encodings. Mirrors parquet::Encoding
struct Encoding {
  enum type {
    PLAIN = 0,
    PLAIN_DICTIONARY = 2,
    RLE = 3,
    BIT_PACKED = 4,
    DELTA_BINARY_PACKED = 5,
    DELTA_LENGTH_BYTE_ARRAY = 6,
    DELTA_BYTE_ARRAY = 7,
    RLE_DICTIONARY = 8,
    BYTE_STREAM_SPLIT = 9,
    // Should always be last element (except UNKNOWN)
    UNDEFINED = 10,
    UNKNOWN = 999
  };
};

// Exposed data encodings. It is the encoding of the data read from the file,
// rather than the encoding of the data in the file. E.g., the data encoded as
// RLE_DICTIONARY in the file can be read as dictionary indices by RLE
// decoding, in which case the data read from the file is DICTIONARY encoded.
enum class ExposedEncoding {
  NO_ENCODING = 0,  // data is not encoded, i.e. already decoded during reading
  DICTIONARY = 1
};

/// \brief Return true if Parquet supports indicated compression type
PARQUET_EXPORT
bool IsCodecSupported(Compression::type codec);

PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec);

PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);

struct ParquetCipher {
  enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
};

struct AadMetadata {
  std::string aad_prefix;
  std::string aad_file_unique;
  bool supply_aad_prefix;
};

struct EncryptionAlgorithm {
  ParquetCipher::type algorithm;
  AadMetadata aad;
};

// parquet::PageType
struct PageType {
  enum type {
    DATA_PAGE,
    INDEX_PAGE,
    DICTIONARY_PAGE,
    DATA_PAGE_V2,
    // Should always be last element
    UNDEFINED
  };
};

class ColumnOrder {
 public:
  enum type { UNDEFINED, TYPE_DEFINED_ORDER };
  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
  // Default to Type Defined Order
  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
  ColumnOrder::type get_order() { return column_order_; }

  static ColumnOrder undefined_;
  static ColumnOrder type_defined_;

 private:
  ColumnOrder::type column_order_;
};

// ----------------------------------------------------------------------

struct ByteArray {
  ByteArray() : len(0), ptr(NULLPTR) {}
  ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}

  ByteArray(::arrow::util::string_view view)  // NOLINT implicit conversion
      : ByteArray(static_cast<uint32_t>(view.size()),
                  reinterpret_cast<const uint8_t*>(view.data())) {}
  uint32_t len;
  const uint8_t* ptr;
};

inline bool operator==(const ByteArray& left, const ByteArray& right) {
  return left.len == right.len &&
         (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
}

inline bool operator!=(const ByteArray& left, const ByteArray& right) {
  return !(left == right);
}
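
// Editor's note: an illustrative sketch (not part of the original header).
// ByteArray is a non-owning view, so the pointed-to bytes must outlive it; the
// implicit string_view constructor above makes string literals convenient.
inline bool ByteArrayExample() {
  ByteArray a(::arrow::util::string_view("parquet"));
  ByteArray b(a.len, a.ptr);  // same bytes, spelled out explicitly
  return a == b;              // true: equal length and equal contents
}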

struct FixedLenByteArray {
  FixedLenByteArray() : ptr(NULLPTR) {}
  explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
  const uint8_t* ptr;
};

using FLBA = FixedLenByteArray;

// Julian day at unix epoch.
//
// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
// the Julian day count starting from noon Universal time, with Julian day
// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
// calendar).
constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);
constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);

MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
STRUCT_END(Int96, 12);

inline bool operator==(const Int96& left, const Int96& right) {
  return std::equal(left.value, left.value + 3, right.value);
}

inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }

static inline std::string ByteArrayToString(const ByteArray& a) {
  return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
}

static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
  std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
}

struct DecodedInt96 {
  uint64_t days_since_epoch;
  uint64_t nanoseconds;
};

static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
  // We do the computations in the unsigned domain to avoid undefined behaviour
  // on overflow.
  DecodedInt96 result;
  result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
  result.nanoseconds = 0;

  memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
  return result;
}

static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
  const auto decoded = DecodeInt96Timestamp(i96);
  return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
                              decoded.nanoseconds);
}

static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
  const auto decoded = DecodeInt96Timestamp(i96);
  uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
  return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
                              microseconds);
}

static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
  const auto decoded = DecodeInt96Timestamp(i96);
  uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
  return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
                              milliseconds);
}

static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
  const auto decoded = DecodeInt96Timestamp(i96);
  uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
  return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
}
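
// Editor's note: an illustrative sketch (not part of the original header) of
// the Int96 layout the helpers above assume: value[0..1] hold a little-endian
// nanoseconds-within-day count and value[2] holds the Julian day number.
inline int64_t Int96Example() {
  Int96 i96;
  i96.value[2] = static_cast<uint32_t>(kJulianToUnixEpochDays);  // 1970-01-01
  Int96SetNanoSeconds(i96, INT64_C(1000000000));  // one second past midnight
  // days_since_epoch == 0 and nanoseconds == 1e9, so this returns 1.
  return Int96GetSeconds(i96);
}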

static inline std::string Int96ToString(const Int96& a) {
  std::ostringstream result;
  std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
  return result.str();
}

static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
  std::ostringstream result;
  std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
  return result.str();
}

template <Type::type TYPE>
struct type_traits {};

template <>
struct type_traits<Type::BOOLEAN> {
  using value_type = bool;

  static constexpr int value_byte_size = 1;
  static constexpr const char* printf_code = "d";
};

template <>
struct type_traits<Type::INT32> {
  using value_type = int32_t;

  static constexpr int value_byte_size = 4;
  static constexpr const char* printf_code = "d";
};

template <>
struct type_traits<Type::INT64> {
  using value_type = int64_t;

  static constexpr int value_byte_size = 8;
  // Select "ld" when long is 8 bytes (64 bits); the original comparison was
  // against 64, which can never hold since sizeof counts bytes.
  static constexpr const char* printf_code =
      (sizeof(long) == 8) ? "ld" : "lld";  // NOLINT: runtime/int
};

template <>
struct type_traits<Type::INT96> {
  using value_type = Int96;

  static constexpr int value_byte_size = 12;
  static constexpr const char* printf_code = "s";
};

template <>
struct type_traits<Type::FLOAT> {
  using value_type = float;

  static constexpr int value_byte_size = 4;
  static constexpr const char* printf_code = "f";
};

template <>
struct type_traits<Type::DOUBLE> {
  using value_type = double;

  static constexpr int value_byte_size = 8;
  static constexpr const char* printf_code = "lf";
};

template <>
struct type_traits<Type::BYTE_ARRAY> {
  using value_type = ByteArray;

  static constexpr int value_byte_size = sizeof(ByteArray);
  static constexpr const char* printf_code = "s";
};

template <>
struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
  using value_type = FixedLenByteArray;

  static constexpr int value_byte_size = sizeof(FixedLenByteArray);
  static constexpr const char* printf_code = "s";
};

template <Type::type TYPE>
struct PhysicalType {
  using c_type = typename type_traits<TYPE>::value_type;
  static constexpr Type::type type_num = TYPE;
};

using BooleanType = PhysicalType<Type::BOOLEAN>;
using Int32Type = PhysicalType<Type::INT32>;
using Int64Type = PhysicalType<Type::INT64>;
using Int96Type = PhysicalType<Type::INT96>;
using FloatType = PhysicalType<Type::FLOAT>;
using DoubleType = PhysicalType<Type::DOUBLE>;
using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;

template <typename Type>
inline std::string format_fwf(int width) {
  std::stringstream ss;
  ss << "%-" << width << type_traits<Type::type_num>::printf_code;
  return ss.str();
}
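
// Editor's note (illustrative, not part of the original header): format_fwf
// builds a left-aligned fixed-width printf specifier from the traits above:
//
//   format_fwf<Int32Type>(10);   // "%-10d"
//   format_fwf<DoubleType>(12);  // "%-12lf"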

PARQUET_EXPORT std::string EncodingToString(Encoding::type t);

PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);

PARQUET_EXPORT std::string TypeToString(Type::type t);

PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
                                           ::arrow::util::string_view val);

PARQUET_EXPORT int GetTypeByteSize(Type::type t);

PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);

PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
                                            Type::type primitive);

PARQUET_EXPORT SortOrder::type GetSortOrder(
    const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);

}  // namespace parquet
@ -0,0 +1,21 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/util/windows_compatibility.h"
#include "parquet/windows_fixup.h"
@ -0,0 +1,29 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This header needs to be included multiple times.

#include "arrow/util/windows_fixup.h"

#ifdef _WIN32

// parquet.thrift's OPTIONAL RepetitionType conflicts with a Windows #define
#ifdef OPTIONAL
#undef OPTIONAL
#endif

#endif  // _WIN32