first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "parquet/exception.h"

View File

@@ -0,0 +1,35 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
// Column reader API
#include "parquet/column_reader.h"
#include "parquet/column_scanner.h"
#include "parquet/exception.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/printer.h"
#include "parquet/properties.h"
#include "parquet/statistics.h"
// Schemas
#include "parquet/api/schema.h"
// IO
#include "parquet/api/io.h"

View File

@@ -0,0 +1,21 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
// Schemas
#include "parquet/schema.h"

View File

@@ -0,0 +1,25 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "parquet/api/io.h"
#include "parquet/api/schema.h"
#include "parquet/column_writer.h"
#include "parquet/exception.h"
#include "parquet/file_writer.h"
#include "parquet/statistics.h"

View File

@@ -0,0 +1,344 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
// N.B. we don't include async_generator.h as it's relatively heavy
#include <functional>
#include <memory>
#include <vector>
#include "parquet/file_reader.h"
#include "parquet/platform.h"
#include "parquet/properties.h"
namespace arrow {
class ChunkedArray;
class KeyValueMetadata;
class RecordBatchReader;
struct Scalar;
class Schema;
class Table;
class RecordBatch;
} // namespace arrow
namespace parquet {
class FileMetaData;
class SchemaDescriptor;
namespace arrow {
class ColumnChunkReader;
class ColumnReader;
struct SchemaManifest;
class RowGroupReader;
/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
///
/// This interface caters to different use cases and thus provides several
/// APIs. In its simplest form, it serves a user who wants to read the whole
/// Parquet file at once with the `FileReader::ReadTable` method.
///
/// More advanced users who want to implement parallelism on top of each
/// single Parquet file should do this at the RowGroup level. For this, they can
/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
/// RowGroup as a table.
///
/// In the most advanced situation, where a consumer wants to independently read
/// RowGroups in parallel and consume each column individually, they can call
/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
/// instance.
///
/// The parquet format supports an optional integer field_id which can be assigned
/// to a field. Arrow will convert these field IDs to a metadata key named
/// PARQUET:field_id on the appropriate field.
// TODO(wesm): nested data does not always make sense with this user
// interface unless you are only reading a single leaf node from a branch of
// a table. For example:
//
// repeated group data {
// optional group record {
// optional int32 val1;
// optional byte_array val2;
// optional bool val3;
// }
// optional int32 val4;
// }
//
// In the Parquet file, there are 4 leaf nodes:
//
// * data.record.val1
// * data.record.val2
// * data.record.val3
// * data.val4
//
// When materializing this data in an Arrow array, we would have:
//
// data: list<struct<
// record: struct<
// val1: int32,
// val2: string (= list<uint8>),
// val3: bool,
// >,
// val4: int32
// >>
//
// However, in the Parquet format, each leaf node has its own repetition and
// definition levels describing the structure of the intermediate nodes in
// this array structure. Thus, we will need to scan the leaf data for a group
// of leaf nodes part of the same type tree to create a single result Arrow
// nested array structure.
//
// This is additionally complicated by "chunky" repeated fields or very large
// byte arrays.
class PARQUET_EXPORT FileReader {
public:
/// Factory function to create a FileReader from a ParquetFileReader and properties
static ::arrow::Status Make(::arrow::MemoryPool* pool,
std::unique_ptr<ParquetFileReader> reader,
const ArrowReaderProperties& properties,
std::unique_ptr<FileReader>* out);
/// Factory function to create a FileReader from a ParquetFileReader
static ::arrow::Status Make(::arrow::MemoryPool* pool,
std::unique_ptr<ParquetFileReader> reader,
std::unique_ptr<FileReader>* out);
// Since the distribution of columns amongst a Parquet file's row groups may
// be uneven (the number of values in each column chunk can be different), we
// provide a column-oriented read interface. The ColumnReader hides the
// details of paging through the file's row groups and yielding
// fully-materialized arrow::Array instances
//
// Returns error status if the column of interest is not flat.
virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;
/// \brief Return arrow schema for all the columns.
virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
/// \brief Read column as a whole into a chunked array.
///
/// The indicated column index is relative to the schema
virtual ::arrow::Status ReadColumn(int i,
std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
// NOTE: Experimental API
// Reads a specific top level schema field into an Array
// The index i refers to the index of the top level schema field, which may
// be nested or flat, e.g.:
//
// 0 foo.bar
//     foo.bar.baz
//     foo.qux
// 1 foo2
// 2 foo3
//
// i=0 will read the entire foo struct, i=1 the foo2 primitive column, etc.
virtual ::arrow::Status ReadSchemaField(
int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
/// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
///
/// Note that the ordering in row_group_indices matters. FileReaders must outlive
/// their RecordBatchReaders.
///
/// \returns error Status if row_group_indices contains an invalid index
virtual ::arrow::Status GetRecordBatchReader(
const std::vector<int>& row_group_indices,
std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
std::shared_ptr<::arrow::RecordBatchReader>* out);
/// \brief Return a RecordBatchReader of row groups selected from
/// row_group_indices, whose columns are selected by column_indices.
///
/// Note that the ordering in row_group_indices and column_indices
/// matter. FileReaders must outlive their RecordBatchReaders.
///
/// \returns error Status if either row_group_indices or column_indices
/// contains an invalid index
virtual ::arrow::Status GetRecordBatchReader(
const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
/// \brief Return a generator of record batches.
///
/// The FileReader must outlive the generator, so this requires that you pass in a
/// shared_ptr.
///
/// \returns error Result if either row_group_indices or column_indices contains an
/// invalid index
virtual ::arrow::Result<
std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
const std::vector<int> row_group_indices,
const std::vector<int> column_indices,
::arrow::internal::Executor* cpu_executor = NULLPTR,
int64_t rows_to_readahead = 0) = 0;
::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
const std::vector<int>& column_indices,
std::shared_ptr<::arrow::RecordBatchReader>* out);
/// Read all columns into a Table
virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
/// \brief Read the given columns into a Table
///
/// The indicated column indices are relative to the schema
virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
std::shared_ptr<::arrow::Table>* out) = 0;
virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
std::shared_ptr<::arrow::Table>* out) = 0;
virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
const std::vector<int>& column_indices,
std::shared_ptr<::arrow::Table>* out) = 0;
virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
std::shared_ptr<::arrow::Table>* out) = 0;
/// \brief Scan file contents with one thread, return number of rows
virtual ::arrow::Status ScanContents(std::vector<int> columns,
const int32_t column_batch_size,
int64_t* num_rows) = 0;
/// \brief Return a reader for the RowGroup; this object must not outlive the
/// FileReader.
virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
/// \brief The number of row groups in the file
virtual int num_row_groups() const = 0;
virtual ParquetFileReader* parquet_reader() const = 0;
/// Set whether to use multiple threads during reads of multiple columns.
/// By default only one thread is used.
virtual void set_use_threads(bool use_threads) = 0;
/// Set number of records to read per batch for the RecordBatchReader.
virtual void set_batch_size(int64_t batch_size) = 0;
virtual const ArrowReaderProperties& properties() const = 0;
virtual const SchemaManifest& manifest() const = 0;
virtual ~FileReader() = default;
};
class RowGroupReader {
public:
virtual ~RowGroupReader() = default;
virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
std::shared_ptr<::arrow::Table>* out) = 0;
virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
private:
struct Iterator;
};
class ColumnChunkReader {
public:
virtual ~ColumnChunkReader() = default;
virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};
// At this point, the column reader is a stream iterator. It only knows how to
// read the next batch of values for a particular column from the file until it
// runs out.
//
// We also do not expose any internal Parquet details, such as row groups. This
// might change in the future.
class PARQUET_EXPORT ColumnReader {
public:
virtual ~ColumnReader() = default;
// Scan the next array of the indicated size. The actual size of the
// returned array may be less than the passed size, depending on how much data is
// available in the file.
//
// When all the data in the file has been exhausted, the result is set to
// nullptr.
//
// Returns Status::OK on a successful read, including if you have exhausted
// the data available in the file.
virtual ::arrow::Status NextBatch(int64_t batch_size,
std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};
/// \brief Experimental helper class for bindings (like Python) that struggle
/// with either std::move or C++ exceptions
class PARQUET_EXPORT FileReaderBuilder {
public:
FileReaderBuilder();
/// Create FileReaderBuilder from Arrow file and optional properties / metadata
::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
const ReaderProperties& properties = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
ParquetFileReader* raw_reader() { return raw_reader_.get(); }
/// Set Arrow MemoryPool for memory allocation
FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
/// Set Arrow reader properties
FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
/// Build FileReader instance
::arrow::Status Build(std::unique_ptr<FileReader>* out);
private:
::arrow::MemoryPool* pool_;
ArrowReaderProperties properties_;
std::unique_ptr<ParquetFileReader> raw_reader_;
};
/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
///
/// @{
/// \brief Build FileReader from Arrow file and MemoryPool
///
/// Advanced settings are supported through the FileReaderBuilder class.
PARQUET_EXPORT
::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>,
::arrow::MemoryPool* allocator,
std::unique_ptr<FileReader>* reader);
/// @}
PARQUET_EXPORT
::arrow::Status StatisticsAsScalars(const Statistics& statistics,
std::shared_ptr<::arrow::Scalar>* min,
std::shared_ptr<::arrow::Scalar>* max);
namespace internal {
PARQUET_EXPORT
::arrow::Status FuzzReader(const uint8_t* data, int64_t size);
} // namespace internal
} // namespace arrow
} // namespace parquet
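
A minimal usage sketch for the reader API above, not part of the commit: it opens an on-disk file (the path and the parquet/arrow/... include layout are assumptions), builds a FileReader through the OpenFile factory, then either reads the whole file into an ::arrow::Table or streams record batches from row group 0.

#include <memory>
#include <string>

#include "arrow/io/file.h"
#include "arrow/record_batch.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"

// Read the entire file into a single arrow::Table.
::arrow::Status ReadWholeFile(const std::string& path,
                              std::shared_ptr<::arrow::Table>* table) {
  ARROW_ASSIGN_OR_RAISE(auto input, ::arrow::io::ReadableFile::Open(path));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(
      std::move(input), ::arrow::default_memory_pool(), &reader));
  return reader->ReadTable(table);
}

// Stream record batches from row group 0 only. Per the comment above, the
// FileReader must outlive its RecordBatchReader.
::arrow::Status ReadFirstRowGroup(parquet::arrow::FileReader* reader) {
  std::shared_ptr<::arrow::RecordBatchReader> batches;
  ARROW_RETURN_NOT_OK(reader->GetRecordBatchReader({0}, &batches));
  std::shared_ptr<::arrow::RecordBatch> batch;
  while (true) {
    ARROW_RETURN_NOT_OK(batches->ReadNext(&batch));
    if (batch == nullptr) break;  // end of stream
    // ... consume batch ...
  }
  return ::arrow::Status::OK();
}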

View File

@@ -0,0 +1,184 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cassert>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "parquet/level_conversion.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
namespace parquet {
class ArrowReaderProperties;
class ArrowWriterProperties;
class WriterProperties;
namespace arrow {
/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
/// schema into a Parquet schema.
///
/// @{
PARQUET_EXPORT
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
const WriterProperties& properties,
const ArrowWriterProperties& arrow_properties,
schema::NodePtr* out);
PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
const WriterProperties& properties,
const ArrowWriterProperties& arrow_properties,
std::shared_ptr<SchemaDescriptor>* out);
PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
const WriterProperties& properties,
std::shared_ptr<SchemaDescriptor>* out);
/// @}
/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
/// schema into an Arrow schema.
///
/// @{
PARQUET_EXPORT
::arrow::Status FromParquetSchema(
const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
std::shared_ptr<::arrow::Schema>* out);
PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
const ArrowReaderProperties& properties,
std::shared_ptr<::arrow::Schema>* out);
PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
std::shared_ptr<::arrow::Schema>* out);
/// @}
/// \brief Bridge between an arrow::Field and parquet column indices.
struct PARQUET_EXPORT SchemaField {
std::shared_ptr<::arrow::Field> field;
std::vector<SchemaField> children;
// Only set for leaf nodes
int column_index = -1;
parquet::internal::LevelInfo level_info;
bool is_leaf() const { return column_index != -1; }
};
/// \brief Bridge between a parquet Schema and an arrow Schema.
///
/// Expose parquet columns as a tree structure. Useful to traverse and link
/// between arrow's Schema and parquet's Schema.
struct PARQUET_EXPORT SchemaManifest {
static ::arrow::Status Make(
const SchemaDescriptor* schema,
const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
const ArrowReaderProperties& properties, SchemaManifest* manifest);
const SchemaDescriptor* descr;
std::shared_ptr<::arrow::Schema> origin_schema;
std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
std::vector<SchemaField> schema_fields;
std::unordered_map<int, const SchemaField*> column_index_to_field;
std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
auto it = column_index_to_field.find(column_index);
if (it == column_index_to_field.end()) {
return ::arrow::Status::KeyError("Column index ", column_index,
" not found in schema manifest, may be malformed");
}
*out = it->second;
return ::arrow::Status::OK();
}
const SchemaField* GetParent(const SchemaField* field) const {
// Returns nullptr also if not found
auto it = child_to_parent.find(field);
if (it == child_to_parent.end()) {
return NULLPTR;
}
return it->second;
}
/// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
/// correspond to the column root (first node below the parquet schema's root group) of
/// each leaf referenced in column_indices.
///
/// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
/// the roots are `a` and `i` (return=[0,2]).
///
/// root
/// -- a  <------
/// -- -- b  |  |
/// -- -- -- c  |
/// -- -- -- d  |
/// -- -- -- -- e
/// -- f
/// -- -- g
/// -- -- -- h
/// -- i  <---
/// -- -- j  |
/// -- -- -- k
::arrow::Result<std::vector<int>> GetFieldIndices(
const std::vector<int>& column_indices) const {
const schema::GroupNode* group = descr->group_node();
std::unordered_set<int> already_added;
std::vector<int> out;
for (int column_idx : column_indices) {
if (column_idx < 0 || column_idx >= descr->num_columns()) {
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
}
auto field_node = descr->GetColumnRoot(column_idx);
auto field_idx = group->FieldIndex(*field_node);
if (field_idx == -1) {
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
}
if (already_added.insert(field_idx).second) {
out.push_back(field_idx);
}
}
return out;
}
};
} // namespace arrow
} // namespace parquet
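
A short sketch of the conversion helpers above (the include paths and the default-constructed ArrowReaderProperties are assumptions): derive an Arrow schema from a Parquet SchemaDescriptor, and use SchemaManifest to map leaf column indices to their top-level field indices, as in the tree example in GetFieldIndices.

#include <memory>
#include <vector>

#include "parquet/arrow/schema.h"
#include "parquet/properties.h"

// Parquet schema -> Arrow schema, without key-value metadata.
::arrow::Status GetArrowSchema(const parquet::SchemaDescriptor* descr,
                               std::shared_ptr<::arrow::Schema>* out) {
  return parquet::arrow::FromParquetSchema(descr, out);
}

// Leaf column indices -> top-level field indices (e.g. [0,1,3] -> [0,2] in
// the tree example above).
::arrow::Result<std::vector<int>> ColumnRoots(
    const parquet::SchemaDescriptor* descr, const std::vector<int>& columns) {
  parquet::arrow::SchemaManifest manifest;
  ARROW_RETURN_NOT_OK(parquet::arrow::SchemaManifest::Make(
      descr, /*metadata=*/nullptr, parquet::ArrowReaderProperties(), &manifest));
  return manifest.GetFieldIndices(columns);
}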

View File

@@ -0,0 +1,507 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstring>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>
#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/decimal.h"
#include "parquet/column_reader.h"
namespace parquet {
using internal::RecordReader;
namespace arrow {
using ::arrow::Array;
using ::arrow::ChunkedArray;
using ::arrow::Status;
template <int32_t PRECISION>
struct DecimalWithPrecisionAndScale {
static_assert(PRECISION >= 1 && PRECISION <= 38, "Invalid precision value");
using type = ::arrow::Decimal128Type;
static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id;
static constexpr int32_t precision = PRECISION;
static constexpr int32_t scale = PRECISION - 1;
};
template <int32_t PRECISION>
struct Decimal256WithPrecisionAndScale {
static_assert(PRECISION >= 1 && PRECISION <= 76, "Invalid precision value");
using type = ::arrow::Decimal256Type;
static constexpr ::arrow::Type::type type_id = ::arrow::Decimal256Type::type_id;
static constexpr int32_t precision = PRECISION;
static constexpr int32_t scale = PRECISION - 1;
};
template <class ArrowType>
::arrow::enable_if_floating_point<ArrowType, Status> NonNullArray(
size_t size, std::shared_ptr<Array>* out) {
using c_type = typename ArrowType::c_type;
std::vector<c_type> values;
::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1), &values);
::arrow::NumericBuilder<ArrowType> builder;
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
return builder.Finish(out);
}
template <class ArrowType>
::arrow::enable_if_integer<ArrowType, Status> NonNullArray(size_t size,
std::shared_ptr<Array>* out) {
std::vector<typename ArrowType::c_type> values;
::arrow::randint(size, 0, 64, &values);
// Passing data type so this will work with TimestampType too
::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
::arrow::default_memory_pool());
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
return builder.Finish(out);
}
template <class ArrowType>
::arrow::enable_if_date<ArrowType, Status> NonNullArray(size_t size,
std::shared_ptr<Array>* out) {
std::vector<typename ArrowType::c_type> values;
::arrow::randint(size, 0, 24, &values);
for (size_t i = 0; i < size; i++) {
values[i] *= 86400000;
}
// Passing data type so this will work with TimestampType too
::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
::arrow::default_memory_pool());
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
return builder.Finish(out);
}
template <class ArrowType>
::arrow::enable_if_base_binary<ArrowType, Status> NonNullArray(
size_t size, std::shared_ptr<Array>* out) {
using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
BuilderType builder;
for (size_t i = 0; i < size; i++) {
RETURN_NOT_OK(builder.Append("test-string"));
}
return builder.Finish(out);
}
template <typename ArrowType>
::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
size_t size, std::shared_ptr<Array>* out) {
using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
// set byte_width to the length of "fixed": 5
// TODO: find a way to generate test data with more diversity.
BuilderType builder(::arrow::fixed_size_binary(5));
for (size_t i = 0; i < size; i++) {
RETURN_NOT_OK(builder.Append("fixed"));
}
return builder.Finish(out);
}
static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
auto gen = ::arrow::random::RandomArrayGenerator(seed);
std::shared_ptr<Array> decimals;
int32_t byte_width = 0;
if (precision <= ::arrow::Decimal128Type::kMaxPrecision) {
decimals = gen.Decimal128(::arrow::decimal128(precision, 0), n);
byte_width = ::arrow::Decimal128Type::kByteWidth;
} else {
decimals = gen.Decimal256(::arrow::decimal256(precision, 0), n);
byte_width = ::arrow::Decimal256Type::kByteWidth;
}
std::memcpy(out, decimals->data()->GetValues<uint8_t>(1, 0), byte_width * n);
}
template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<
std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>
NonNullArray(size_t size, std::shared_ptr<Array>* out) {
constexpr int32_t kDecimalPrecision = precision;
constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;
const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
::arrow::Decimal128Builder builder(type);
const int32_t byte_width =
static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();
constexpr int32_t seed = 0;
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data());
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
return builder.Finish(out);
}
template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<
std::is_same<ArrowType, Decimal256WithPrecisionAndScale<precision>>::value, Status>
NonNullArray(size_t size, std::shared_ptr<Array>* out) {
constexpr int32_t kDecimalPrecision = precision;
constexpr int32_t kDecimalScale = Decimal256WithPrecisionAndScale<precision>::scale;
const auto type = ::arrow::decimal256(kDecimalPrecision, kDecimalScale);
::arrow::Decimal256Builder builder(type);
const int32_t byte_width =
static_cast<const ::arrow::Decimal256Type&>(*type).byte_width();
constexpr int32_t seed = 0;
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data());
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
return builder.Finish(out);
}
template <class ArrowType>
::arrow::enable_if_boolean<ArrowType, Status> NonNullArray(size_t size,
std::shared_ptr<Array>* out) {
std::vector<uint8_t> values;
::arrow::randint(size, 0, 1, &values);
::arrow::BooleanBuilder builder;
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
return builder.Finish(out);
}
// This helper function only supports (size/2) nulls.
template <typename ArrowType>
::arrow::enable_if_floating_point<ArrowType, Status> NullableArray(
size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
using c_type = typename ArrowType::c_type;
std::vector<c_type> values;
::arrow::random_real(size, seed, static_cast<c_type>(-1e10), static_cast<c_type>(1e10),
&values);
std::vector<uint8_t> valid_bytes(size, 1);
for (size_t i = 0; i < num_nulls; i++) {
valid_bytes[i * 2] = 0;
}
::arrow::NumericBuilder<ArrowType> builder;
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
return builder.Finish(out);
}
// This helper function only supports (size/2) nulls.
template <typename ArrowType>
::arrow::enable_if_integer<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
uint32_t seed,
std::shared_ptr<Array>* out) {
std::vector<typename ArrowType::c_type> values;
// Seed is random in Arrow right now
(void)seed;
::arrow::randint(size, 0, 64, &values);
std::vector<uint8_t> valid_bytes(size, 1);
for (size_t i = 0; i < num_nulls; i++) {
valid_bytes[i * 2] = 0;
}
// Passing data type so this will work with TimestampType too
::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
::arrow::default_memory_pool());
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
return builder.Finish(out);
}
template <typename ArrowType>
::arrow::enable_if_date<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
uint32_t seed,
std::shared_ptr<Array>* out) {
std::vector<typename ArrowType::c_type> values;
// Seed is random in Arrow right now
(void)seed;
::arrow::randint(size, 0, 24, &values);
for (size_t i = 0; i < size; i++) {
values[i] *= 86400000;
}
std::vector<uint8_t> valid_bytes(size, 1);
for (size_t i = 0; i < num_nulls; i++) {
valid_bytes[i * 2] = 0;
}
// Passing data type so this will work with TimestampType too
::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
::arrow::default_memory_pool());
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
return builder.Finish(out);
}
// So far, this helper function only supports (size/2) nulls.
template <typename ArrowType>
::arrow::enable_if_base_binary<ArrowType, Status> NullableArray(
size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
std::vector<uint8_t> valid_bytes(size, 1);
for (size_t i = 0; i < num_nulls; i++) {
valid_bytes[i * 2] = 0;
}
using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
BuilderType builder;
const int kBufferSize = 10;
uint8_t buffer[kBufferSize];
for (size_t i = 0; i < size; i++) {
if (!valid_bytes[i]) {
RETURN_NOT_OK(builder.AppendNull());
} else {
::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
if (ArrowType::is_utf8) {
// Trivially force data to be valid UTF8 by making it all ASCII
for (auto& byte : buffer) {
byte &= 0x7f;
}
}
RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
}
}
return builder.Finish(out);
}
// So far, this helper function only supports (size/2) nulls,
// the same as NullableArray<String|Binary>(..)
template <typename ArrowType>
::arrow::enable_if_fixed_size_binary<ArrowType, Status> NullableArray(
size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
std::vector<uint8_t> valid_bytes(size, 1);
for (size_t i = 0; i < num_nulls; i++) {
valid_bytes[i * 2] = 0;
}
using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
const int byte_width = 10;
BuilderType builder(::arrow::fixed_size_binary(byte_width));
const int kBufferSize = byte_width;
uint8_t buffer[kBufferSize];
for (size_t i = 0; i < size; i++) {
if (!valid_bytes[i]) {
RETURN_NOT_OK(builder.AppendNull());
} else {
::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
RETURN_NOT_OK(builder.Append(buffer));
}
}
return builder.Finish(out);
}
template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<
std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>
NullableArray(size_t size, size_t num_nulls, uint32_t seed,
std::shared_ptr<::arrow::Array>* out) {
std::vector<uint8_t> valid_bytes(size, '\1');
for (size_t i = 0; i < num_nulls; ++i) {
valid_bytes[i * 2] = '\0';
}
constexpr int32_t kDecimalPrecision = precision;
constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;
const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
const int32_t byte_width =
static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
random_decimals(size, seed, precision, out_buf->mutable_data());
::arrow::Decimal128Builder builder(type);
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
return builder.Finish(out);
}
template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<
std::is_same<ArrowType, Decimal256WithPrecisionAndScale<precision>>::value, Status>
NullableArray(size_t size, size_t num_nulls, uint32_t seed,
std::shared_ptr<::arrow::Array>* out) {
std::vector<uint8_t> valid_bytes(size, '\1');
for (size_t i = 0; i < num_nulls; ++i) {
valid_bytes[i * 2] = '\0';
}
constexpr int32_t kDecimalPrecision = precision;
constexpr int32_t kDecimalScale = Decimal256WithPrecisionAndScale<precision>::scale;
const auto type = ::arrow::decimal256(kDecimalPrecision, kDecimalScale);
const int32_t byte_width =
static_cast<const ::arrow::Decimal256Type&>(*type).byte_width();
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
random_decimals(size, seed, precision, out_buf->mutable_data());
::arrow::Decimal256Builder builder(type);
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
return builder.Finish(out);
}
// So far, this helper function only supports (size/2) nulls.
template <class ArrowType>
::arrow::enable_if_boolean<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
uint32_t seed,
std::shared_ptr<Array>* out) {
std::vector<uint8_t> values;
// Seed is random in Arrow right now
(void)seed;
::arrow::randint(size, 0, 1, &values);
std::vector<uint8_t> valid_bytes(size, 1);
for (size_t i = 0; i < num_nulls; i++) {
valid_bytes[i * 2] = 0;
}
::arrow::BooleanBuilder builder;
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
return builder.Finish(out);
}
/// Wrap an Array into a ListArray by splitting it up into `size` lists.
///
/// This helper function only supports (size/2) nulls.
Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
int64_t null_count, const std::string& item_name,
bool nullable_values, std::shared_ptr<::arrow::ListArray>* out) {
// We always include an empty list
int64_t non_null_entries = size - null_count - 1;
int64_t length_per_entry = values->length() / non_null_entries;
auto offsets = AllocateBuffer();
RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t)));
int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
auto null_bitmap = AllocateBuffer();
int64_t bitmap_size = ::arrow::bit_util::BytesForBits(size);
RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
memset(null_bitmap_ptr, 0, bitmap_size);
int32_t current_offset = 0;
for (int64_t i = 0; i < size; i++) {
offsets_ptr[i] = current_offset;
if (!(((i % 2) == 0) && ((i / 2) < null_count))) {
// Non-null list (list with index 1 is always empty).
::arrow::bit_util::SetBit(null_bitmap_ptr, i);
if (i != 1) {
current_offset += static_cast<int32_t>(length_per_entry);
}
}
}
offsets_ptr[size] = static_cast<int32_t>(values->length());
auto value_field = ::arrow::field(item_name, values->type(), nullable_values);
*out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets,
values, null_bitmap, null_count);
return Status::OK();
}
// Make an array containing only empty lists, with a null values array
Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
// Allocate an offsets buffer containing only zeroes
const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, ::arrow::AllocateBuffer(offsets_nbytes));
memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
auto value_field =
::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
auto list_type = ::arrow::list(value_field);
std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
nullptr /* values */};
auto child_data =
::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));
std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
std::move(offsets_buffer)};
auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
array_data->child_data.push_back(child_data);
*out_array = ::arrow::MakeArray(array_data);
return Status::OK();
}
std::shared_ptr<::arrow::Table> MakeSimpleTable(
const std::shared_ptr<ChunkedArray>& values, bool nullable) {
auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)});
return ::arrow::Table::Make(schema, {values});
}
std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values,
bool nullable) {
auto carr = std::make_shared<::arrow::ChunkedArray>(values);
return MakeSimpleTable(carr, nullable);
}
template <typename T>
void ExpectArray(T* expected, Array* result) {
auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
for (int i = 0; i < result->length(); i++) {
EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
}
}
template <typename ArrowType>
void ExpectArrayT(void* expected, Array* result) {
::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
for (int64_t i = 0; i < result->length(); i++) {
EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
reinterpret_cast<const typename ArrowType::c_type*>(
p_array->values()->data())[i]);
}
}
template <>
void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
::arrow::BooleanBuilder builder;
ARROW_EXPECT_OK(
builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length()));
std::shared_ptr<Array> expected_array;
ARROW_EXPECT_OK(builder.Finish(&expected_array));
EXPECT_TRUE(result->Equals(*expected_array));
}
} // namespace arrow
} // namespace parquet
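
A hypothetical gtest-style snippet using the helpers above, as it might appear inside a test in the parquet::arrow namespace (ASSERT_OK and ASSERT_EQ come from the gtest utilities this file already includes): build 100 random non-null doubles and wrap them in a one-column table.

std::shared_ptr<Array> values;
ASSERT_OK(NonNullArray<::arrow::DoubleType>(/*size=*/100, &values));
std::shared_ptr<::arrow::Table> table = MakeSimpleTable(values, /*nullable=*/false);
ASSERT_EQ(1, table->num_columns());
ASSERT_EQ(100, table->num_rows());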

View File

@@ -0,0 +1,109 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include "parquet/platform.h"
#include "parquet/properties.h"
namespace arrow {
class Array;
class ChunkedArray;
class Schema;
class Table;
} // namespace arrow
namespace parquet {
class FileMetaData;
class ParquetFileWriter;
namespace arrow {
/// \brief Iterative FileWriter class
///
/// Start a new RowGroup or Chunk with NewRowGroup, then write the whole
/// column chunk, column by column, with WriteColumnChunk.
///
/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
/// value is a nonnegative integer, then it will be used as the field_id in the parquet
/// file.
class PARQUET_EXPORT FileWriter {
public:
static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
std::shared_ptr<::arrow::Schema> schema,
std::shared_ptr<ArrowWriterProperties> arrow_properties,
std::unique_ptr<FileWriter>* out);
static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
std::shared_ptr<::arrow::io::OutputStream> sink,
std::shared_ptr<WriterProperties> properties,
std::unique_ptr<FileWriter>* writer);
static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
std::shared_ptr<::arrow::io::OutputStream> sink,
std::shared_ptr<WriterProperties> properties,
std::shared_ptr<ArrowWriterProperties> arrow_properties,
std::unique_ptr<FileWriter>* writer);
virtual std::shared_ptr<::arrow::Schema> schema() const = 0;
/// \brief Write a Table to Parquet.
virtual ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size) = 0;
virtual ::arrow::Status NewRowGroup(int64_t chunk_size) = 0;
virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;
/// \brief Write ColumnChunk in row group using slice of a ChunkedArray
virtual ::arrow::Status WriteColumnChunk(
const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
int64_t size) = 0;
virtual ::arrow::Status WriteColumnChunk(
const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
virtual ::arrow::Status Close() = 0;
virtual ~FileWriter();
virtual MemoryPool* memory_pool() const = 0;
virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
};
/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
PARQUET_EXPORT
::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);
/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
PARQUET_EXPORT
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);
/// \brief Write a Table to Parquet.
::arrow::Status PARQUET_EXPORT
WriteTable(const ::arrow::Table& table, MemoryPool* pool,
std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
std::shared_ptr<WriterProperties> properties = default_writer_properties(),
std::shared_ptr<ArrowWriterProperties> arrow_properties =
default_arrow_writer_properties());
} // namespace arrow
} // namespace parquet
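
A minimal sketch of the two write paths declared above (the output path, chunk size, and include layout are assumptions): the one-shot WriteTable helper, and the iterative FileWriter path that opens a row group and writes one column chunk per column.

#include <memory>

#include "arrow/io/file.h"
#include "arrow/table.h"
#include "parquet/arrow/writer.h"

// One-shot: row groups of up to 64Ki rows, default properties.
::arrow::Status WriteOneShot(const ::arrow::Table& table) {
  ARROW_ASSIGN_OR_RAISE(auto sink,
                        ::arrow::io::FileOutputStream::Open("/tmp/out.parquet"));
  return parquet::arrow::WriteTable(table, ::arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/64 * 1024);
}

// Iterative: one row group, written column chunk by column chunk.
::arrow::Status WriteIteratively(const ::arrow::Table& table) {
  ARROW_ASSIGN_OR_RAISE(auto sink,
                        ::arrow::io::FileOutputStream::Open("/tmp/out.parquet"));
  std::unique_ptr<parquet::arrow::FileWriter> writer;
  ARROW_RETURN_NOT_OK(parquet::arrow::FileWriter::Open(
      *table.schema(), ::arrow::default_memory_pool(), sink,
      parquet::default_writer_properties(), &writer));
  ARROW_RETURN_NOT_OK(writer->NewRowGroup(table.num_rows()));
  for (int i = 0; i < table.num_columns(); ++i) {
    ARROW_RETURN_NOT_OK(writer->WriteColumnChunk(table.column(i)));
  }
  return writer->Close();
}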

View File

@@ -0,0 +1,247 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cmath>
#include <cstdint>
#include <memory>
#include "arrow/util/bit_util.h"
#include "arrow/util/logging.h"
#include "parquet/hasher.h"
#include "parquet/platform.h"
#include "parquet/types.h"
namespace parquet {
// A Bloom filter is a compact structure to indicate whether an item is not in a set or
// probably in a set. The Bloom filter usually consists of a bit set that represents a
// set of elements, a hash strategy and a Bloom filter algorithm.
class PARQUET_EXPORT BloomFilter {
public:
// Maximum Bloom filter size; it is set to the HDFS default block size of 128 MB.
// This value will be reconsidered when implementing Bloom filter producer.
static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
/// Determine whether an element exists in the set or not.
///
/// @param hash the hash of the value to check.
/// @return false if the value is definitely not in the set; true means the
/// value is PROBABLY in the set.
virtual bool FindHash(uint64_t hash) const = 0;
/// Insert an element into the set represented by the Bloom filter bitset.
/// @param hash the hash of the value to insert into the Bloom filter.
virtual void InsertHash(uint64_t hash) = 0;
/// Write this Bloom filter to an output stream. A Bloom filter structure should
/// include bitset length, hash strategy, algorithm, and bitset.
///
/// @param sink the output stream to write to
virtual void WriteTo(ArrowOutputStream* sink) const = 0;
/// Get the number of bytes in the bitset.
virtual uint32_t GetBitsetSize() const = 0;
/// Compute hash for 32 bits value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(int32_t value) const = 0;
/// Compute hash for 64 bits value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(int64_t value) const = 0;
/// Compute hash for float value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(float value) const = 0;
/// Compute hash for double value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(double value) const = 0;
/// Compute hash for Int96 value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(const Int96* value) const = 0;
/// Compute hash for ByteArray value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(const ByteArray* value) const = 0;
/// Compute hash for fixed byte array value by using its plain encoding result.
///
/// @param value the value address.
/// @param len the value length.
/// @return hash result.
virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
virtual ~BloomFilter() {}
protected:
// Hash strategy available for Bloom filter.
enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
// Bloom filter algorithm.
enum class Algorithm : uint32_t { BLOCK = 0 };
};
// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
// Putze et al.'s "Cache-, Hash- and Space-Efficient Bloom Filters". The basic idea is
// to hash the item to a tiny Bloom filter whose size fits a single cache line or smaller.
//
// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
public:
/// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
BlockSplitBloomFilter();
/// Initialize the BlockSplitBloomFilter. num_bytes should be within
/// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes]; it will be clamped
/// to the nearest bound if it is out of range, and also rounded up to a power of 2.
///
/// @param num_bytes The number of bytes to store Bloom filter bitset.
void Init(uint32_t num_bytes);
/// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
/// bitset because the given bitset may not satisfy the 32-byte alignment requirement
/// which may lead to segfault when performing SIMD instructions. It is the caller's
/// responsibility to free the bitset passed in. This is used when reconstructing
/// a Bloom filter from a parquet file.
///
/// @param bitset The given bitset to initialize the Bloom filter.
/// @param num_bytes The number of bytes of given bitset.
void Init(const uint8_t* bitset, uint32_t num_bytes);
// Minimum Bloom filter size; it is set to 32 bytes to fit a tiny Bloom filter.
static constexpr uint32_t kMinimumBloomFilterBytes = 32;
/// Calculate optimal size according to the number of distinct values and false
/// positive probability.
///
/// @param ndv The number of distinct values.
/// @param fpp The false positive probability.
/// @return the optimal number of bits; it is always between
/// kMinimumBloomFilterBytes << 3 and kMaximumBloomFilterBytes << 3, and
/// always a power of 2
static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
DCHECK(fpp > 0.0 && fpp < 1.0);
const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
uint32_t num_bits;
// Handle overflow.
if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
} else {
num_bits = static_cast<uint32_t>(m);
}
// Round up to lower bound
if (num_bits < kMinimumBloomFilterBytes << 3) {
num_bits = kMinimumBloomFilterBytes << 3;
}
// Get next power of 2 if bits is not power of 2.
if ((num_bits & (num_bits - 1)) != 0) {
num_bits = static_cast<uint32_t>(::arrow::bit_util::NextPower2(num_bits));
}
// Round down to upper bound
if (num_bits > kMaximumBloomFilterBytes << 3) {
num_bits = kMaximumBloomFilterBytes << 3;
}
return num_bits;
}
bool FindHash(uint64_t hash) const override;
void InsertHash(uint64_t hash) override;
void WriteTo(ArrowOutputStream* sink) const override;
uint32_t GetBitsetSize() const override { return num_bytes_; }
uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
uint64_t Hash(float value) const override { return hasher_->Hash(value); }
uint64_t Hash(double value) const override { return hasher_->Hash(value); }
uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
uint64_t Hash(const FLBA* value, uint32_t len) const override {
return hasher_->Hash(value, len);
}
/// Deserialize the Bloom filter from an input stream. It is used when reconstructing
/// a Bloom filter from a Parquet file.
///
/// @param input_stream The input stream from which to construct the Bloom filter
/// @return The BlockSplitBloomFilter.
static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);
private:
// Bytes in a tiny Bloom filter block.
static constexpr int kBytesPerFilterBlock = 32;
// The number of bits to be set in each tiny Bloom filter
static constexpr int kBitsSetPerBlock = 8;
// A mask structure used to set bits in each tiny Bloom filter.
struct BlockMask {
uint32_t item[kBitsSetPerBlock];
};
// The block-based algorithm needs eight odd SALT values to calculate the eight
// bit indexes to set, one bit in each 32-bit word.
static constexpr uint32_t SALT[kBitsSetPerBlock] = {
0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
/// Set bits in mask array according to input key.
/// @param key the value to calculate mask values.
/// @param mask the mask array used to set bits inside a block
void SetMask(uint32_t key, BlockMask& mask) const;
// Memory pool to allocate aligned buffer for bitset
::arrow::MemoryPool* pool_;
// The underlying buffer of bitset.
std::shared_ptr<Buffer> data_;
// The number of bytes of Bloom filter bitset.
uint32_t num_bytes_;
// Hash strategy used in this Bloom filter.
HashStrategy hash_strategy_;
// Algorithm used in this Bloom filter.
Algorithm algorithm_;
// The hash pointer points to actual hash class used.
std::unique_ptr<Hasher> hasher_;
};
} // namespace parquet
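
A short usage sketch for the filter above (the ndv and fpp values are arbitrary assumptions): size the bitset for roughly 1000 distinct values at a 1% false-positive rate, insert one value's hash, then probe.

#include <cstdint>

#include "parquet/bloom_filter.h"

bool ProbablyContains(int64_t candidate) {
  parquet::BlockSplitBloomFilter filter;
  // OptimalNumOfBits returns bits; Init takes bytes.
  filter.Init(parquet::BlockSplitBloomFilter::OptimalNumOfBits(
                  /*ndv=*/1000, /*fpp=*/0.01) / 8);
  filter.InsertHash(filter.Hash(static_cast<int64_t>(42)));
  // false: definitely absent; true: probably present.
  return filter.FindHash(filter.Hash(candidate));
}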

View File

@@ -0,0 +1,160 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This module defines an abstract interface for iterating through pages in a
// Parquet column chunk within a row group. It could be extended in the future
// to iterate through all data pages in all chunks in a file.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include "parquet/statistics.h"
#include "parquet/types.h"
namespace parquet {
// TODO: Parallel processing is not yet safe because of memory-ownership
// semantics (the PageReader may or may not own the memory referenced by a
// page)
//
// TODO(wesm): In the future Parquet implementations may store the crc code
// in format::PageHeader. parquet-mr currently does not, so we also skip it
// here, both on the read and write path
class Page {
public:
Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
: buffer_(buffer), type_(type) {}
PageType::type type() const { return type_; }
std::shared_ptr<Buffer> buffer() const { return buffer_; }
// @returns: a pointer to the page's data
const uint8_t* data() const { return buffer_->data(); }
// @returns: the total size in bytes of the page's data buffer
int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
private:
std::shared_ptr<Buffer> buffer_;
PageType::type type_;
};
/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
class DataPage : public Page {
public:
int32_t num_values() const { return num_values_; }
Encoding::type encoding() const { return encoding_; }
int64_t uncompressed_size() const { return uncompressed_size_; }
const EncodedStatistics& statistics() const { return statistics_; }
virtual ~DataPage() = default;
protected:
DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
Encoding::type encoding, int64_t uncompressed_size,
const EncodedStatistics& statistics = EncodedStatistics())
: Page(buffer, type),
num_values_(num_values),
encoding_(encoding),
uncompressed_size_(uncompressed_size),
statistics_(statistics) {}
int32_t num_values_;
Encoding::type encoding_;
int64_t uncompressed_size_;
EncodedStatistics statistics_;
};
class DataPageV1 : public DataPage {
public:
DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
Encoding::type encoding, Encoding::type definition_level_encoding,
Encoding::type repetition_level_encoding, int64_t uncompressed_size,
const EncodedStatistics& statistics = EncodedStatistics())
: DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
statistics),
definition_level_encoding_(definition_level_encoding),
repetition_level_encoding_(repetition_level_encoding) {}
Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
private:
Encoding::type definition_level_encoding_;
Encoding::type repetition_level_encoding_;
};
class DataPageV2 : public DataPage {
public:
DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
int32_t num_rows, Encoding::type encoding,
int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
int64_t uncompressed_size, bool is_compressed = false,
const EncodedStatistics& statistics = EncodedStatistics())
: DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
statistics),
num_nulls_(num_nulls),
num_rows_(num_rows),
definition_levels_byte_length_(definition_levels_byte_length),
repetition_levels_byte_length_(repetition_levels_byte_length),
is_compressed_(is_compressed) {}
int32_t num_nulls() const { return num_nulls_; }
int32_t num_rows() const { return num_rows_; }
int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
bool is_compressed() const { return is_compressed_; }
private:
int32_t num_nulls_;
int32_t num_rows_;
int32_t definition_levels_byte_length_;
int32_t repetition_levels_byte_length_;
bool is_compressed_;
};
class DictionaryPage : public Page {
public:
DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
Encoding::type encoding, bool is_sorted = false)
: Page(buffer, PageType::DICTIONARY_PAGE),
num_values_(num_values),
encoding_(encoding),
is_sorted_(is_sorted) {}
int32_t num_values() const { return num_values_; }
Encoding::type encoding() const { return encoding_; }
bool is_sorted() const { return is_sorted_; }
private:
int32_t num_values_;
Encoding::type encoding_;
bool is_sorted_;
};
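// Usage sketch (illustrative only, not part of the API): pages are normally
// produced by a PageReader (see column_reader.h). Given a hypothetical
// std::shared_ptr<Page> `page`, downcast according to type() to reach the
// subclass-specific attributes:
//
//   if (page->type() == PageType::DICTIONARY_PAGE) {
//     auto* dict_page = static_cast<const DictionaryPage*>(page.get());
//     // dict_page->num_values(), dict_page->is_sorted(), ...
//   } else if (page->type() == PageType::DATA_PAGE_V2) {
//     auto* data_page = static_cast<const DataPageV2*>(page.get());
//     // data_page->num_rows(), data_page->num_nulls(), ...
//   }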
} // namespace parquet

View File

@ -0,0 +1,376 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>
#include "parquet/exception.h"
#include "parquet/level_conversion.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
#include "parquet/types.h"
namespace arrow {
class Array;
class ChunkedArray;
namespace bit_util {
class BitReader;
} // namespace bit_util
namespace util {
class RleDecoder;
} // namespace util
} // namespace arrow
namespace parquet {
class Decryptor;
class Page;
// 16 MB is the default maximum page header size
static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
// 16 KB is the default expected page header size
static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
class PARQUET_EXPORT LevelDecoder {
public:
LevelDecoder();
~LevelDecoder();
// Initialize the LevelDecoder state with new data
// and return the number of bytes consumed
int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
const uint8_t* data, int32_t data_size);
void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
const uint8_t* data);
// Decodes a batch of levels into an array and returns the number of levels decoded
int Decode(int batch_size, int16_t* levels);
private:
int bit_width_;
int num_values_remaining_;
Encoding::type encoding_;
std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_;
std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_;
int16_t max_level_;
};
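// Usage sketch (assumptions: `data`/`data_size` point at RLE-encoded
// definition levels of a V1 data page with max_definition_level == 1 and
// 100 buffered values):
//
//   LevelDecoder decoder;
//   int bytes_consumed = decoder.SetData(Encoding::RLE, /*max_level=*/1,
//                                        /*num_buffered_values=*/100,
//                                        data, data_size);
//   std::vector<int16_t> def_levels(100);
//   int levels_decoded = decoder.Decode(/*batch_size=*/100, def_levels.data());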
struct CryptoContext {
CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal,
std::shared_ptr<Decryptor> meta, std::shared_ptr<Decryptor> data)
: start_decrypt_with_dictionary_page(start_with_dictionary_page),
row_group_ordinal(rg_ordinal),
column_ordinal(col_ordinal),
meta_decryptor(std::move(meta)),
data_decryptor(std::move(data)) {}
CryptoContext() {}
bool start_decrypt_with_dictionary_page = false;
int16_t row_group_ordinal = -1;
int16_t column_ordinal = -1;
std::shared_ptr<Decryptor> meta_decryptor;
std::shared_ptr<Decryptor> data_decryptor;
};
// Abstract page iterator interface. This way, we can feed column pages to the
// ColumnReader through whatever mechanism we choose
class PARQUET_EXPORT PageReader {
public:
virtual ~PageReader() = default;
static std::unique_ptr<PageReader> Open(
std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
const CryptoContext* ctx = NULLPTR);
  // @returns: shared_ptr<Page>(nullptr) on end of stream, otherwise a
  // std::shared_ptr<Page> containing the next Page
virtual std::shared_ptr<Page> NextPage() = 0;
virtual void set_max_page_header_size(uint32_t size) = 0;
};
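// Usage sketch (assumptions: `stream` is an open ArrowInputStream positioned
// at a column chunk, and `total_num_rows` comes from the column chunk
// metadata):
//
//   std::unique_ptr<PageReader> pager =
//       PageReader::Open(stream, total_num_rows, Compression::UNCOMPRESSED);
//   while (std::shared_ptr<Page> page = pager->NextPage()) {
//     // Inspect page->type(), page->size(), ...
//   }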
class PARQUET_EXPORT ColumnReader {
public:
virtual ~ColumnReader() = default;
static std::shared_ptr<ColumnReader> Make(
const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
// Returns true if there are still values in this column.
virtual bool HasNext() = 0;
virtual Type::type type() const = 0;
virtual const ColumnDescriptor* descr() const = 0;
// Get the encoding that can be exposed by this reader. If it returns
// dictionary encoding, then ReadBatchWithDictionary can be used to read data.
//
// \note API EXPERIMENTAL
virtual ExposedEncoding GetExposedEncoding() = 0;
protected:
friend class RowGroupReader;
// Set the encoding that can be exposed by this reader.
//
// \note API EXPERIMENTAL
virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
};
// API to read values from a single column. This is the main client-facing API.
template <typename DType>
class TypedColumnReader : public ColumnReader {
public:
typedef typename DType::c_type T;
// Read a batch of repetition levels, definition levels, and values from the
// column.
//
  // Since null values are not stored in the values buffer, the number of values read
// may be less than the number of repetition and definition levels. With
// nested data this is almost certainly true.
//
// Set def_levels or rep_levels to nullptr if you want to skip reading them.
// This is only safe if you know through some other source that there are no
// undefined values.
//
// To fully exhaust a row group, you must read batches until the number of
// values read reaches the number of stored values according to the metadata.
//
// This API is the same for both V1 and V2 of the DataPage
//
// @returns: actual number of levels read (see values_read for number of values read)
virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
T* values, int64_t* values_read) = 0;
/// Read a batch of repetition levels, definition levels, and values from the
/// column and leave spaces for null entries on the lowest level in the values
/// buffer.
///
  /// In comparison to ReadBatch, the length of the repetition and definition
  /// levels is the same as the number of values read when max_definition_level == 1.
/// In the case of max_definition_level > 1, the repetition and definition
/// levels are larger than the values but the values include the null entries
/// with definition_level == (max_definition_level - 1).
///
/// To fully exhaust a row group, you must read batches until the number of
/// values read reaches the number of stored values according to the metadata.
///
/// @param batch_size the number of levels to read
/// @param[out] def_levels The Parquet definition levels, output has
/// the length levels_read.
/// @param[out] rep_levels The Parquet repetition levels, output has
/// the length levels_read.
/// @param[out] values The values in the lowest nested level including
/// spacing for nulls on the lowest levels; output has the length
/// values_read.
/// @param[out] valid_bits Memory allocated for a bitmap that indicates if
/// the row is null or on the maximum definition level. For performance
/// reasons the underlying buffer should be able to store 1 bit more than
/// required. If this requires an additional byte, this byte is only read
/// but never written to.
/// @param valid_bits_offset The offset in bits of the valid_bits where the
/// first relevant bit resides.
/// @param[out] levels_read The number of repetition/definition levels that were read.
/// @param[out] values_read The number of values read, this includes all
/// non-null entries as well as all null-entries on the lowest level
/// (i.e. definition_level == max_definition_level - 1)
/// @param[out] null_count The number of nulls on the lowest levels.
/// (i.e. (values_read - null_count) is total number of non-null entries)
///
/// \deprecated Since 4.0.0
ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.")
virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels,
int16_t* rep_levels, T* values, uint8_t* valid_bits,
int64_t valid_bits_offset, int64_t* levels_read,
int64_t* values_read, int64_t* null_count) = 0;
// Skip reading levels
// Returns the number of levels skipped
virtual int64_t Skip(int64_t num_rows_to_skip) = 0;
// Read a batch of repetition levels, definition levels, and indices from the
// column. And read the dictionary if a dictionary page is encountered during
// reading pages. This API is similar to ReadBatch(), with ability to read
// dictionary and indices. It is only valid to call this method when the reader can
// expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns
// DICTIONARY).
//
// The dictionary is read along with the data page. When there's no data page,
// the dictionary won't be returned.
//
// @param batch_size The batch size to read
// @param[out] def_levels The Parquet definition levels.
// @param[out] rep_levels The Parquet repetition levels.
// @param[out] indices The dictionary indices.
// @param[out] indices_read The number of indices read.
// @param[out] dict The pointer to dictionary values. It will return nullptr if
// there's no data page. Each column chunk only has one dictionary page. The dictionary
// is owned by the reader, so the caller is responsible for copying the dictionary
// values before the reader gets destroyed.
// @param[out] dict_len The dictionary length. It will return 0 if there's no data
// page.
  // @returns: actual number of levels read (see indices_read for number of
  // indices read)
//
// \note API EXPERIMENTAL
virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
int16_t* rep_levels, int32_t* indices,
int64_t* indices_read, const T** dict,
int32_t* dict_len) = 0;
};
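// Usage sketch (assumptions: `reader` is a std::shared_ptr<ColumnReader> for
// an INT32 column, e.g. obtained from RowGroupReader::Column(i)):
//
//   auto* int32_reader = static_cast<TypedColumnReader<Int32Type>*>(reader.get());
//   std::vector<int16_t> def_levels(1024), rep_levels(1024);
//   std::vector<int32_t> values(1024);
//   while (int32_reader->HasNext()) {
//     int64_t values_read = 0;
//     int64_t levels_read = int32_reader->ReadBatch(
//         1024, def_levels.data(), rep_levels.data(), values.data(), &values_read);
//     // values[0..values_read) holds the decoded non-null values.
//   }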
namespace internal {
/// \brief Stateful column reader that delimits semantic records for both flat
/// and nested columns
///
/// \note API EXPERIMENTAL
/// \since 1.3.0
class RecordReader {
public:
static std::shared_ptr<RecordReader> Make(
const ColumnDescriptor* descr, LevelInfo leaf_info,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
const bool read_dictionary = false);
virtual ~RecordReader() = default;
/// \brief Attempt to read indicated number of records from column chunk
/// \return number of records read
virtual int64_t ReadRecords(int64_t num_records) = 0;
/// \brief Pre-allocate space for data. Results in better flat read performance
virtual void Reserve(int64_t num_values) = 0;
/// \brief Clear consumed values and repetition/definition levels as the
/// result of calling ReadRecords
virtual void Reset() = 0;
/// \brief Transfer filled values buffer to caller. A new one will be
/// allocated in subsequent ReadRecords calls
virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
/// \brief Transfer filled validity bitmap buffer to caller. A new one will
/// be allocated in subsequent ReadRecords calls
virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;
/// \brief Return true if the record reader has more internal data yet to
/// process
virtual bool HasMoreData() const = 0;
/// \brief Advance record reader to the next row group
/// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;
virtual void DebugPrintState() = 0;
/// \brief Decoded definition levels
int16_t* def_levels() const {
return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
}
/// \brief Decoded repetition levels
int16_t* rep_levels() const {
return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
}
/// \brief Decoded values, including nulls, if any
uint8_t* values() const { return values_->mutable_data(); }
/// \brief Number of values written including nulls (if any)
int64_t values_written() const { return values_written_; }
/// \brief Number of definition / repetition levels (from those that have
/// been decoded) that have been consumed inside the reader.
int64_t levels_position() const { return levels_position_; }
/// \brief Number of definition / repetition levels that have been written
/// internally in the reader
int64_t levels_written() const { return levels_written_; }
/// \brief Number of nulls in the leaf
int64_t null_count() const { return null_count_; }
/// \brief True if the leaf values are nullable
bool nullable_values() const { return nullable_values_; }
/// \brief True if reading directly as Arrow dictionary-encoded
bool read_dictionary() const { return read_dictionary_; }
protected:
bool nullable_values_;
bool at_record_start_;
int64_t records_read_;
int64_t values_written_;
int64_t values_capacity_;
int64_t null_count_;
int64_t levels_written_;
int64_t levels_position_;
int64_t levels_capacity_;
std::shared_ptr<::arrow::ResizableBuffer> values_;
  // If false, the values buffer is not allocated (we read directly into builder
  // classes).
bool uses_values_;
std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;
bool read_dictionary_ = false;
};
class BinaryRecordReader : virtual public RecordReader {
public:
virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
};
/// \brief Read records directly to dictionary-encoded Arrow form (int32
/// indices). Only valid for BYTE_ARRAY columns
class DictionaryRecordReader : virtual public RecordReader {
public:
virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
};
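// Usage sketch (API experimental; assumptions: `descr` and `leaf_info` come
// from the file schema, and `pager` from RowGroupReader::GetColumnPageReader):
//
//   auto record_reader = RecordReader::Make(descr, leaf_info);
//   record_reader->SetPageReader(std::move(pager));
//   while (record_reader->HasMoreData()) {
//     int64_t records_read = record_reader->ReadRecords(/*num_records=*/1000);
//     if (records_read == 0) break;
//     // Consume values()/def_levels()/rep_levels(), then release buffers:
//     auto values = record_reader->ReleaseValues();
//     record_reader->Reset();
//   }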
} // namespace internal
using BoolReader = TypedColumnReader<BooleanType>;
using Int32Reader = TypedColumnReader<Int32Type>;
using Int64Reader = TypedColumnReader<Int64Type>;
using Int96Reader = TypedColumnReader<Int96Type>;
using FloatReader = TypedColumnReader<FloatType>;
using DoubleReader = TypedColumnReader<DoubleType>;
using ByteArrayReader = TypedColumnReader<ByteArrayType>;
using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;
} // namespace parquet

View File

@ -0,0 +1,262 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <stdio.h>
#include <cstdint>
#include <memory>
#include <ostream>
#include <string>
#include <utility>
#include <vector>
#include "parquet/column_reader.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
#include "parquet/types.h"
namespace parquet {
static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;
class PARQUET_EXPORT Scanner {
public:
explicit Scanner(std::shared_ptr<ColumnReader> reader,
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
: batch_size_(batch_size),
level_offset_(0),
levels_buffered_(0),
value_buffer_(AllocateBuffer(pool)),
value_offset_(0),
values_buffered_(0),
reader_(std::move(reader)) {
def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
}
virtual ~Scanner() {}
static std::shared_ptr<Scanner> Make(
std::shared_ptr<ColumnReader> col_reader,
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
const ColumnDescriptor* descr() const { return reader_->descr(); }
int64_t batch_size() const { return batch_size_; }
void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }
protected:
int64_t batch_size_;
std::vector<int16_t> def_levels_;
std::vector<int16_t> rep_levels_;
int level_offset_;
int levels_buffered_;
std::shared_ptr<ResizableBuffer> value_buffer_;
int value_offset_;
int64_t values_buffered_;
std::shared_ptr<ColumnReader> reader_;
};
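// Usage sketch (assumptions: `col_reader` is a std::shared_ptr<ColumnReader>
// from an open file, and <iostream> is available at the call site):
//
//   std::shared_ptr<Scanner> scanner = Scanner::Make(col_reader);
//   while (scanner->HasNext()) {
//     scanner->PrintNext(std::cout, /*width=*/17);
//     std::cout << "\n";
//   }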
template <typename DType>
class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
public:
typedef typename DType::c_type T;
explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
: Scanner(std::move(reader), batch_size, pool) {
typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
int value_byte_size = type_traits<DType::type_num>::value_byte_size;
PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
}
virtual ~TypedScanner() {}
bool NextLevels(int16_t* def_level, int16_t* rep_level) {
if (level_offset_ == levels_buffered_) {
levels_buffered_ = static_cast<int>(
typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
rep_levels_.data(), values_, &values_buffered_));
value_offset_ = 0;
level_offset_ = 0;
if (!levels_buffered_) {
return false;
}
}
*def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
*rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
level_offset_++;
return true;
}
bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
if (level_offset_ == levels_buffered_) {
if (!HasNext()) {
// Out of data pages
return false;
}
}
NextLevels(def_level, rep_level);
*is_null = *def_level < descr()->max_definition_level();
if (*is_null) {
return true;
}
if (value_offset_ == values_buffered_) {
throw ParquetException("Value was non-null, but has not been buffered");
}
*val = values_[value_offset_++];
return true;
}
// Returns true if there is a next value
bool NextValue(T* val, bool* is_null) {
if (level_offset_ == levels_buffered_) {
if (!HasNext()) {
// Out of data pages
return false;
}
}
int16_t def_level = -1;
int16_t rep_level = -1;
NextLevels(&def_level, &rep_level);
*is_null = def_level < descr()->max_definition_level();
if (*is_null) {
return true;
}
if (value_offset_ == values_buffered_) {
throw ParquetException("Value was non-null, but has not been buffered");
}
*val = values_[value_offset_++];
return true;
}
virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
T val{};
int16_t def_level = -1;
int16_t rep_level = -1;
bool is_null = false;
char buffer[80];
if (!Next(&val, &def_level, &rep_level, &is_null)) {
throw ParquetException("No more values buffered");
}
if (with_levels) {
out << " D:" << def_level << " R:" << rep_level << " ";
if (!is_null) {
out << "V:";
}
}
if (is_null) {
std::string null_fmt = format_fwf<ByteArrayType>(width);
snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
} else {
FormatValue(&val, buffer, sizeof(buffer), width);
}
out << buffer;
}
private:
// The ownership of this object is expressed through the reader_ variable in the base
TypedColumnReader<DType>* typed_reader_;
inline void FormatValue(void* val, char* buffer, int bufsize, int width);
T* values_;
};
template <typename DType>
inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
int width) {
std::string fmt = format_fwf<DType>(width);
snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
}
template <>
inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
int width) {
std::string fmt = format_fwf<Int96Type>(width);
std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}
template <>
inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
int width) {
std::string fmt = format_fwf<ByteArrayType>(width);
std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}
template <>
inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
int width) {
std::string fmt = format_fwf<FLBAType>(width);
std::string result = FixedLenByteArrayToString(
*reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}
typedef TypedScanner<BooleanType> BoolScanner;
typedef TypedScanner<Int32Type> Int32Scanner;
typedef TypedScanner<Int64Type> Int64Scanner;
typedef TypedScanner<Int96Type> Int96Scanner;
typedef TypedScanner<FloatType> FloatScanner;
typedef TypedScanner<DoubleType> DoubleScanner;
typedef TypedScanner<ByteArrayType> ByteArrayScanner;
typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;
template <typename RType>
int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
uint8_t* values, int64_t* values_buffered,
parquet::ColumnReader* reader) {
typedef typename RType::T Type;
auto typed_reader = static_cast<RType*>(reader);
auto vals = reinterpret_cast<Type*>(&values[0]);
return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
values_buffered);
}
int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
int16_t* rep_levels, uint8_t* values,
int64_t* values_buffered,
parquet::ColumnReader* reader);
} // namespace parquet

View File

@ -0,0 +1,270 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <memory>
#include "parquet/exception.h"
#include "parquet/platform.h"
#include "parquet/types.h"
namespace arrow {
class Array;
namespace bit_util {
class BitWriter;
} // namespace bit_util
namespace util {
class RleEncoder;
} // namespace util
} // namespace arrow
namespace parquet {
struct ArrowWriteContext;
class ColumnDescriptor;
class DataPage;
class DictionaryPage;
class ColumnChunkMetaDataBuilder;
class Encryptor;
class WriterProperties;
class PARQUET_EXPORT LevelEncoder {
public:
LevelEncoder();
~LevelEncoder();
static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
int num_buffered_values);
// Initialize the LevelEncoder.
void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
uint8_t* data, int data_size);
// Encodes a batch of levels from an array and returns the number of levels encoded
int Encode(int batch_size, const int16_t* levels);
int32_t len() {
if (encoding_ != Encoding::RLE) {
throw ParquetException("Only implemented for RLE encoding");
}
return rle_length_;
}
private:
int bit_width_;
int rle_length_;
Encoding::type encoding_;
std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_;
std::unique_ptr<::arrow::bit_util::BitWriter> bit_packed_encoder_;
};
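// Usage sketch, mirroring LevelDecoder in column_reader.h (assumptions:
// `levels` holds `num_levels` definition levels with max_definition_level == 1):
//
//   LevelEncoder encoder;
//   int max_size =
//       LevelEncoder::MaxBufferSize(Encoding::RLE, /*max_level=*/1, num_levels);
//   std::vector<uint8_t> buffer(max_size);
//   encoder.Init(Encoding::RLE, /*max_level=*/1, num_levels,
//                buffer.data(), max_size);
//   int levels_encoded = encoder.Encode(num_levels, levels.data());
//   int32_t rle_bytes = encoder.len();  // RLE-encoded byte length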
class PARQUET_EXPORT PageWriter {
public:
virtual ~PageWriter() {}
static std::unique_ptr<PageWriter> Open(
std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
int compression_level, ColumnChunkMetaDataBuilder* metadata,
int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
bool buffered_row_group = false,
std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
std::shared_ptr<Encryptor> data_encryptor = NULLPTR);
  // The ColumnWriter decides whether dictionary encoding was used and whether
  // the dictionary encoding fell back to plain encoding upon reaching the
  // dictionary page size limit
virtual void Close(bool has_dictionary, bool fallback) = 0;
// Return the number of uncompressed bytes written (including header size)
virtual int64_t WriteDataPage(const DataPage& page) = 0;
// Return the number of uncompressed bytes written (including header size)
virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;
virtual bool has_compressor() = 0;
virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
};
static constexpr int WRITE_BATCH_SIZE = 1000;
class PARQUET_EXPORT ColumnWriter {
public:
virtual ~ColumnWriter() = default;
static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
std::unique_ptr<PageWriter>,
const WriterProperties* properties);
/// \brief Closes the ColumnWriter, commits any buffered values to pages.
/// \return Total size of the column in bytes
virtual int64_t Close() = 0;
/// \brief The physical Parquet type of the column
virtual Type::type type() const = 0;
/// \brief The schema for the column
virtual const ColumnDescriptor* descr() const = 0;
/// \brief The number of rows written so far
virtual int64_t rows_written() const = 0;
/// \brief The total size of the compressed pages + page headers. Some values
/// might be still buffered and not written to a page yet
virtual int64_t total_compressed_bytes() const = 0;
/// \brief The total number of bytes written as serialized data and
/// dictionary pages to the ColumnChunk so far
virtual int64_t total_bytes_written() const = 0;
/// \brief The file-level writer properties
virtual const WriterProperties* properties() = 0;
/// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
/// error status if the array data type is not compatible with the concrete
/// writer type.
///
  /// leaf_array is always of a primitive (possibly dictionary-encoded) type.
  /// leaf_field_nullable indicates whether the leaf array is considered nullable
/// according to its schema in a Table or its parent array.
virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
int64_t num_levels, const ::arrow::Array& leaf_array,
ArrowWriteContext* ctx,
bool leaf_field_nullable) = 0;
};
// API to write values to a single column. This is the main client facing API.
template <typename DType>
class TypedColumnWriter : public ColumnWriter {
public:
using T = typename DType::c_type;
// Write a batch of repetition levels, definition levels, and values to the
// column.
// `num_values` is the number of logical leaf values.
// `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
// (resp. max repetition level) is 0.
// If not null, each of `def_levels` and `rep_levels` must have at least
// `num_values`.
//
// The number of physical values written (taken from `values`) is returned.
  // It can be smaller than `num_values` if there are some undefined (null) values.
virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
const int16_t* rep_levels, const T* values) = 0;
/// Write a batch of repetition levels, definition levels, and values to the
/// column.
///
  /// In comparison to WriteBatch, the length of the repetition and definition
  /// levels is the same as the number of values written when max_definition_level == 1.
/// In the case of max_definition_level > 1, the repetition and definition
/// levels are larger than the values but the values include the null entries
  /// with definition_level == (max_definition_level - 1). Thus the parameters of
  /// this function must distinguish whether an input has length num_values or the
  /// _number of rows in the lowest nesting level_.
///
  /// If the innermost schema node is required, the _number of rows
  /// in the lowest nesting level_ is equal to the number of non-null values. If the
  /// innermost schema node is optional, the _number of rows in the lowest nesting level_
/// also includes all values with definition_level == (max_definition_level - 1).
///
/// @param num_values number of levels to write.
/// @param def_levels The Parquet definition levels, length is num_values
/// @param rep_levels The Parquet repetition levels, length is num_values
/// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
/// level. The length is number of rows in the lowest nesting level.
/// @param valid_bits_offset The offset in bits of the valid_bits where the
/// first relevant bit resides.
/// @param values The values in the lowest nested level including
/// spacing for nulls on the lowest levels; input has the length
/// of the number of rows on the lowest nesting level.
virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
const int16_t* rep_levels, const uint8_t* valid_bits,
int64_t valid_bits_offset, const T* values) = 0;
// Estimated size of the values that are not written to a page yet
virtual int64_t EstimatedBufferedValueBytes() const = 0;
};
using BoolWriter = TypedColumnWriter<BooleanType>;
using Int32Writer = TypedColumnWriter<Int32Type>;
using Int64Writer = TypedColumnWriter<Int64Type>;
using Int96Writer = TypedColumnWriter<Int96Type>;
using FloatWriter = TypedColumnWriter<FloatType>;
using DoubleWriter = TypedColumnWriter<DoubleType>;
using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
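// Usage sketch (assumptions: `writer` is a ColumnWriter* for a required INT64
// column, e.g. obtained from RowGroupWriter::NextColumn()):
//
//   auto* int64_writer = static_cast<Int64Writer*>(writer);
//   std::vector<int64_t> values = {1, 2, 3};
//   // def_levels/rep_levels may be null since max def/rep levels are 0:
//   int64_writer->WriteBatch(static_cast<int64_t>(values.size()),
//                            /*def_levels=*/nullptr, /*rep_levels=*/nullptr,
//                            values.data());
//   int64_t total_column_bytes = int64_writer->Close();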
namespace internal {
/**
* Timestamp conversion constants
*/
constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
(*impala_timestamp).value[2] = (uint32_t)julian_days;
int64_t last_day_units = time % UnitPerDay;
auto last_day_nanos = last_day_units * NanosecondsPerUnit;
// impala_timestamp will be unaligned every other entry so do memcpy instead
// of assign and reinterpret cast to avoid undefined behavior.
std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
}
constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
impala_timestamp);
}
constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);
inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
Int96* impala_timestamp) {
ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
milliseconds, impala_timestamp);
}
constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);
inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
Int96* impala_timestamp) {
ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
microseconds, impala_timestamp);
}
constexpr int64_t kNanosecondsInNanos = INT64_C(1);
inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
Int96* impala_timestamp) {
ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
nanoseconds, impala_timestamp);
}
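// Worked example: 0 seconds since the Unix epoch (1970-01-01) lands on
// Julian day 2440588 with zero nanoseconds within the day:
//
//   Int96 ts;
//   SecondsToImpalaTimestamp(/*seconds=*/0, &ts);
//   // ts.value[2] == 2440588; the first 8 bytes hold 0 nanoseconds.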
} // namespace internal
} // namespace parquet

View File

@ -0,0 +1,460 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>
#include "arrow/util/spaced.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
#include "parquet/types.h"
namespace arrow {
class Array;
class ArrayBuilder;
class BinaryArray;
class BinaryBuilder;
class BooleanBuilder;
class Int32Type;
class Int64Type;
class FloatType;
class DoubleType;
class FixedSizeBinaryType;
template <typename T>
class NumericBuilder;
class FixedSizeBinaryBuilder;
template <typename T>
class Dictionary32Builder;
} // namespace arrow
namespace parquet {
template <typename DType>
class TypedEncoder;
using BooleanEncoder = TypedEncoder<BooleanType>;
using Int32Encoder = TypedEncoder<Int32Type>;
using Int64Encoder = TypedEncoder<Int64Type>;
using Int96Encoder = TypedEncoder<Int96Type>;
using FloatEncoder = TypedEncoder<FloatType>;
using DoubleEncoder = TypedEncoder<DoubleType>;
using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
using FLBAEncoder = TypedEncoder<FLBAType>;
template <typename DType>
class TypedDecoder;
class BooleanDecoder;
using Int32Decoder = TypedDecoder<Int32Type>;
using Int64Decoder = TypedDecoder<Int64Type>;
using Int96Decoder = TypedDecoder<Int96Type>;
using FloatDecoder = TypedDecoder<FloatType>;
using DoubleDecoder = TypedDecoder<DoubleType>;
using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
class FLBADecoder;
template <typename T>
struct EncodingTraits;
template <>
struct EncodingTraits<BooleanType> {
using Encoder = BooleanEncoder;
using Decoder = BooleanDecoder;
using ArrowType = ::arrow::BooleanType;
using Accumulator = ::arrow::BooleanBuilder;
struct DictAccumulator {};
};
template <>
struct EncodingTraits<Int32Type> {
using Encoder = Int32Encoder;
using Decoder = Int32Decoder;
using ArrowType = ::arrow::Int32Type;
using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
};
template <>
struct EncodingTraits<Int64Type> {
using Encoder = Int64Encoder;
using Decoder = Int64Decoder;
using ArrowType = ::arrow::Int64Type;
using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
};
template <>
struct EncodingTraits<Int96Type> {
using Encoder = Int96Encoder;
using Decoder = Int96Decoder;
struct Accumulator {};
struct DictAccumulator {};
};
template <>
struct EncodingTraits<FloatType> {
using Encoder = FloatEncoder;
using Decoder = FloatDecoder;
using ArrowType = ::arrow::FloatType;
using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
};
template <>
struct EncodingTraits<DoubleType> {
using Encoder = DoubleEncoder;
using Decoder = DoubleDecoder;
using ArrowType = ::arrow::DoubleType;
using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
};
template <>
struct EncodingTraits<ByteArrayType> {
using Encoder = ByteArrayEncoder;
using Decoder = ByteArrayDecoder;
/// \brief Internal helper class for decoding BYTE_ARRAY data where we can
/// overflow the capacity of a single arrow::BinaryArray
struct Accumulator {
std::unique_ptr<::arrow::BinaryBuilder> builder;
std::vector<std::shared_ptr<::arrow::Array>> chunks;
};
using ArrowType = ::arrow::BinaryType;
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
};
template <>
struct EncodingTraits<FLBAType> {
using Encoder = FLBAEncoder;
using Decoder = FLBADecoder;
using ArrowType = ::arrow::FixedSizeBinaryType;
using Accumulator = ::arrow::FixedSizeBinaryBuilder;
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
};
class ColumnDescriptor;
// Untyped base for all encoders
class Encoder {
public:
virtual ~Encoder() = default;
virtual int64_t EstimatedDataEncodedSize() = 0;
virtual std::shared_ptr<Buffer> FlushValues() = 0;
virtual Encoding::type encoding() const = 0;
virtual void Put(const ::arrow::Array& values) = 0;
virtual MemoryPool* memory_pool() const = 0;
};
// Base class for value encoders. Since encoders may or may not have state (e.g.,
// dictionary encoding) we use a class instance to maintain any state.
//
// Encode interfaces are internal, subject to change without deprecation.
template <typename DType>
class TypedEncoder : virtual public Encoder {
public:
typedef typename DType::c_type T;
using Encoder::Put;
virtual void Put(const T* src, int num_values) = 0;
virtual void Put(const std::vector<T>& src, int num_values = -1);
virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
int64_t valid_bits_offset) = 0;
};
template <typename DType>
void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
if (num_values == -1) {
num_values = static_cast<int>(src.size());
}
Put(src.data(), num_values);
}
template <>
inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
// NOTE(wesm): This stub is here only to satisfy the compiler; it is
// overridden later with the actual implementation
}
// Base class for dictionary encoders
template <typename DType>
class DictEncoder : virtual public TypedEncoder<DType> {
public:
/// Writes out any buffered indices to buffer preceded by the bit width of this data.
/// Returns the number of bytes written.
/// If the supplied buffer is not big enough, returns -1.
/// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
/// to size buffer.
virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
virtual int dict_encoded_size() = 0;
virtual int bit_width() const = 0;
/// Writes out the encoded dictionary to buffer. buffer must be preallocated to
/// dict_encoded_size() bytes.
virtual void WriteDict(uint8_t* buffer) = 0;
virtual int num_entries() const = 0;
/// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
/// assumed (without any boundschecking) that the indices reference
/// pre-existing dictionary values
/// \param[in] indices the dictionary index values. Only Int32Array currently
/// supported
virtual void PutIndices(const ::arrow::Array& indices) = 0;
/// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
/// separately. Currently throws exception if the current dictionary memo is
/// non-empty
/// \param[in] values the dictionary values. Only valid for certain
/// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
virtual void PutDictionary(const ::arrow::Array& values) = 0;
};
// ----------------------------------------------------------------------
// Value decoding
class Decoder {
public:
virtual ~Decoder() = default;
// Sets the data for a new page. This will be called multiple times on the same
// decoder and should reset all internal state.
virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
// Returns the number of values left (for the last call to SetData()). This is
// the number of values left in this page.
virtual int values_left() const = 0;
virtual Encoding::type encoding() const = 0;
};
template <typename DType>
class TypedDecoder : virtual public Decoder {
public:
using T = typename DType::c_type;
/// \brief Decode values into a buffer
///
/// Subclasses may override the more specialized Decode methods below.
///
/// \param[in] buffer destination for decoded values
/// \param[in] max_values maximum number of values to decode
/// \return The number of values decoded. Should be identical to max_values except
/// at the end of the current data page.
virtual int Decode(T* buffer, int max_values) = 0;
/// \brief Decode the values in this data page but leave spaces for null entries.
///
/// \param[in] buffer destination for decoded values
/// \param[in] num_values size of the def_levels and buffer arrays including the number
/// of null slots
/// \param[in] null_count number of null slots
/// \param[in] valid_bits bitmap data indicating position of valid slots
/// \param[in] valid_bits_offset offset into valid_bits
/// \return The number of values decoded, including nulls.
virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
const uint8_t* valid_bits, int64_t valid_bits_offset) {
if (null_count > 0) {
int values_to_read = num_values - null_count;
int values_read = Decode(buffer, values_to_read);
if (values_read != values_to_read) {
throw ParquetException("Number of values / definition_levels read did not match");
}
return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count,
valid_bits, valid_bits_offset);
} else {
return Decode(buffer, num_values);
}
}
/// \brief Decode into an ArrayBuilder or other accumulator
///
/// This function assumes the definition levels were already decoded
/// as a validity bitmap in the given `valid_bits`. `null_count`
/// is the number of 0s in `valid_bits`.
/// As a space optimization, it is allowed for `valid_bits` to be null
/// if `null_count` is zero.
///
/// \return number of values decoded
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
typename EncodingTraits<DType>::Accumulator* out) = 0;
/// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
///
/// \return number of values decoded
int DecodeArrowNonNull(int num_values,
typename EncodingTraits<DType>::Accumulator* out) {
return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
}
/// \brief Decode into a DictionaryBuilder
///
/// This function assumes the definition levels were already decoded
/// as a validity bitmap in the given `valid_bits`. `null_count`
/// is the number of 0s in `valid_bits`.
/// As a space optimization, it is allowed for `valid_bits` to be null
/// if `null_count` is zero.
///
/// \return number of values decoded
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
/// \brief Decode into a DictionaryBuilder ignoring nulls
///
/// \return number of values decoded
int DecodeArrowNonNull(int num_values,
typename EncodingTraits<DType>::DictAccumulator* builder) {
return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
}
};
template <typename DType>
class DictDecoder : virtual public TypedDecoder<DType> {
public:
using T = typename DType::c_type;
virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
/// \brief Insert dictionary values into the Arrow dictionary builder's memo,
/// but do not append any indices
virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
/// \brief Decode only dictionary indices and append to dictionary
/// builder. The builder must have had the dictionary from this decoder
/// inserted already.
///
/// \warning Remember to reset the builder each time the dict decoder is initialized
/// with a new dictionary page
virtual int DecodeIndicesSpaced(int num_values, int null_count,
const uint8_t* valid_bits, int64_t valid_bits_offset,
::arrow::ArrayBuilder* builder) = 0;
/// \brief Decode only dictionary indices (no nulls)
///
/// \warning Remember to reset the builder each time the dict decoder is initialized
/// with a new dictionary page
virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
/// \brief Decode only dictionary indices (no nulls). Same as above
/// DecodeIndices but target is an array instead of a builder.
///
/// \note API EXPERIMENTAL
virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
/// \brief Get dictionary. The reader will call this API when it encounters a
/// new dictionary.
///
/// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
/// the decoder and is destroyed when the decoder is destroyed.
/// @param[out] dictionary_length The dictionary length.
///
/// \note API EXPERIMENTAL
virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
};
// ----------------------------------------------------------------------
// TypedEncoder specializations, traits, and factory functions
class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
public:
using TypedDecoder<BooleanType>::Decode;
virtual int Decode(uint8_t* buffer, int max_values) = 0;
};
class FLBADecoder : virtual public TypedDecoder<FLBAType> {
public:
using TypedDecoder<FLBAType>::DecodeSpaced;
// TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
// there is value in adding specialized read methods for
// FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
// then perhaps not
};
PARQUET_EXPORT
std::unique_ptr<Encoder> MakeEncoder(
Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
const ColumnDescriptor* descr = NULLPTR,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
template <typename DType>
std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
Encoding::type encoding, bool use_dictionary = false,
const ColumnDescriptor* descr = NULLPTR,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
using OutType = typename EncodingTraits<DType>::Encoder;
std::unique_ptr<Encoder> base =
MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
}
PARQUET_EXPORT
std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
const ColumnDescriptor* descr = NULLPTR);
namespace detail {
PARQUET_EXPORT
std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
const ColumnDescriptor* descr,
::arrow::MemoryPool* pool);
} // namespace detail
template <typename DType>
std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
const ColumnDescriptor* descr = NULLPTR,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
using OutType = DictDecoder<DType>;
auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
}
template <typename DType>
std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
using OutType = typename EncodingTraits<DType>::Decoder;
std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
}
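// Roundtrip sketch with the factories above (PLAIN encoding; no column
// descriptor is needed for INT32):
//
//   auto encoder = MakeTypedEncoder<Int32Type>(Encoding::PLAIN);
//   std::vector<int32_t> values = {1, 2, 3};
//   encoder->Put(values.data(), static_cast<int>(values.size()));
//   std::shared_ptr<Buffer> buf = encoder->FlushValues();
//
//   auto decoder = MakeTypedDecoder<Int32Type>(Encoding::PLAIN);
//   decoder->SetData(/*num_values=*/3, buf->data(), static_cast<int>(buf->size()));
//   std::vector<int32_t> out(3);
//   int num_decoded = decoder->Decode(out.data(), /*max_values=*/3);  // == 3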
} // namespace parquet

View File

@ -0,0 +1,135 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "parquet/encryption/encryption.h"
#include "parquet/encryption/file_key_wrapper.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
ParquetCipher::AES_GCM_V1;
static constexpr bool kDefaultPlaintextFooter = false;
static constexpr bool kDefaultDoubleWrapping = true;
static constexpr double kDefaultCacheLifetimeSeconds = 600; // 10 minutes
static constexpr bool kDefaultInternalKeyMaterial = true;
static constexpr bool kDefaultUniformEncryption = false;
static constexpr int32_t kDefaultDataKeyLengthBits = 128;
struct PARQUET_EXPORT EncryptionConfiguration {
explicit EncryptionConfiguration(const std::string& footer_key)
: footer_key(footer_key) {}
/// ID of the master key for footer encryption/signing
std::string footer_key;
/// List of columns to encrypt, with master key IDs (see HIVE-21848).
/// Format: "masterKeyID:colName,colName;masterKeyID:colName..."
/// Either
/// (1) column_keys must be set
/// or
/// (2) uniform_encryption must be set to true
/// If neither (1) nor (2) is set, or if both are set, an exception will be
/// thrown.
std::string column_keys;
/// Encrypt footer and all columns with the same encryption key.
bool uniform_encryption = kDefaultUniformEncryption;
/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1".
ParquetCipher::type encryption_algorithm = kDefaultEncryptionAlgorithm;
/// Write files with plaintext footer.
/// The default is false - files are written with encrypted footer.
bool plaintext_footer = kDefaultPlaintextFooter;
/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key
/// encryption keys (KEKs), which in turn are encrypted with master keys.
/// The default is true. If set to false, use single wrapping - where DEKs are
/// encrypted directly with master keys.
bool double_wrapping = kDefaultDoubleWrapping;
/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
/// objects).
/// The default is 600 (10 minutes).
double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
/// Store key material inside Parquet file footers; this mode doesn't produce
/// additional files. By default, true. If set to false, key material is stored in
/// separate files in the same folder, which enables key rotation for immutable
/// Parquet files.
bool internal_key_material = kDefaultInternalKeyMaterial;
/// Length of data encryption keys (DEKs), randomly generated by parquet key
/// management tools. Can be 128, 192 or 256 bits.
/// The default is 128 bits.
int32_t data_key_length_bits = kDefaultDataKeyLengthBits;
};
struct PARQUET_EXPORT DecryptionConfiguration {
/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
/// objects).
/// The default is 600 (10 minutes).
double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
};
/// This is a core class that translates the parameters of high-level encryption
/// (like the names of encrypted columns and of master keys) into the parameters
/// of low-level encryption (like key metadata and DEKs). It is a factory that
/// produces low-level FileEncryptionProperties and FileDecryptionProperties
/// objects from the high-level parameters.
class PARQUET_EXPORT CryptoFactory {
public:
/// A KmsClientFactory object must be registered via this method before calling
/// either of the GetFileEncryptionProperties()/GetFileDecryptionProperties() methods.
void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory);
std::shared_ptr<FileEncryptionProperties> GetFileEncryptionProperties(
const KmsConnectionConfig& kms_connection_config,
const EncryptionConfiguration& encryption_config);
/// The returned FileDecryptionProperties object uses the cache inside this
/// CryptoFactory object, so keep the CryptoFactory alive for as long as the
/// returned FileDecryptionProperties object is in use.
std::shared_ptr<FileDecryptionProperties> GetFileDecryptionProperties(
const KmsConnectionConfig& kms_connection_config,
const DecryptionConfiguration& decryption_config);
void RemoveCacheEntriesForToken(const std::string& access_token) {
key_toolkit_.RemoveCacheEntriesForToken(access_token);
}
void RemoveCacheEntriesForAllTokens() { key_toolkit_.RemoveCacheEntriesForAllTokens(); }
private:
ColumnPathToEncryptionPropertiesMap GetColumnEncryptionProperties(
int dek_length, const std::string& column_keys, FileKeyWrapper* key_wrapper);
/// Key utilities object for kms client initialization and cache control
KeyToolkit key_toolkit_;
};
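// Usage sketch (assumptions: `MyKmsClientFactory` is an application-provided
// KmsClientFactory implementation and `kms_config` is a populated
// KmsConnectionConfig):
//
//   CryptoFactory crypto_factory;
//   crypto_factory.RegisterKmsClientFactory(std::make_shared<MyKmsClientFactory>());
//   EncryptionConfiguration encryption_config("footer_master_key_id");
//   encryption_config.column_keys = "key1:colA,colB;key2:colC";
//   auto file_encryption_properties = crypto_factory.GetFileEncryptionProperties(
//       kms_config, encryption_config);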
} // namespace encryption
} // namespace parquet

View File

@ -0,0 +1,510 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <utility>
#include "parquet/exception.h"
#include "parquet/schema.h"
#include "parquet/types.h"
namespace parquet {
static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
ParquetCipher::AES_GCM_V1;
static constexpr int32_t kMaximalAadMetadataLength = 256;
static constexpr bool kDefaultEncryptedFooter = true;
static constexpr bool kDefaultCheckSignature = true;
static constexpr bool kDefaultAllowPlaintextFiles = false;
static constexpr int32_t kAadFileUniqueLength = 8;
class ColumnDecryptionProperties;
using ColumnPathToDecryptionPropertiesMap =
std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;
class ColumnEncryptionProperties;
using ColumnPathToEncryptionPropertiesMap =
std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
class PARQUET_EXPORT DecryptionKeyRetriever {
public:
virtual std::string GetKey(const std::string& key_metadata) = 0;
virtual ~DecryptionKeyRetriever() {}
};
/// Simple integer key retriever
class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
public:
void PutKey(uint32_t key_id, const std::string& key);
std::string GetKey(const std::string& key_metadata) override;
private:
std::map<uint32_t, std::string> key_map_;
};
// Simple string key retriever
class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
public:
void PutKey(const std::string& key_id, const std::string& key);
std::string GetKey(const std::string& key_metadata) override;
private:
std::map<std::string, std::string> key_map_;
};
class PARQUET_EXPORT HiddenColumnException : public ParquetException {
public:
explicit HiddenColumnException(const std::string& columnPath)
: ParquetException(columnPath.c_str()) {}
};
class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
public:
explicit KeyAccessDeniedException(const std::string& columnPath)
: ParquetException(columnPath.c_str()) {}
};
inline const uint8_t* str2bytes(const std::string& str) {
if (str.empty()) return NULLPTR;
char* cbytes = const_cast<char*>(str.c_str());
return reinterpret_cast<const uint8_t*>(cbytes);
}
class PARQUET_EXPORT ColumnEncryptionProperties {
public:
class PARQUET_EXPORT Builder {
public:
/// Convenience builder for encrypted columns.
explicit Builder(const std::string& name) : Builder(name, true) {}
/// Convenience builder for encrypted columns.
explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
: Builder(path->ToDotString(), true) {}
/// Set a column-specific key.
/// If key is not set on an encrypted column, the column will
/// be encrypted with the footer key.
/// The key length must be either 16, 24 or 32 bytes.
/// The key is cloned, and will be wiped out (array values set to 0) upon completion
/// of file writing.
/// Caller is responsible for wiping out the input key array.
Builder* key(std::string column_key);
/// Set key retrieval metadata.
/// Use either key_metadata() or key_id(), not both.
Builder* key_metadata(const std::string& key_metadata);
/// A convenience function to set key retrieval metadata using a string id;
/// key_id will be converted to metadata (UTF-8 array).
/// Use either key_metadata() or key_id(), not both.
Builder* key_id(const std::string& key_id);
std::shared_ptr<ColumnEncryptionProperties> build() {
return std::shared_ptr<ColumnEncryptionProperties>(
new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
}
private:
const std::string column_path_;
bool encrypted_;
std::string key_;
std::string key_metadata_;
Builder(const std::string path, bool encrypted)
: column_path_(path), encrypted_(encrypted) {}
};
std::string column_path() const { return column_path_; }
bool is_encrypted() const { return encrypted_; }
bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
std::string key() const { return key_; }
std::string key_metadata() const { return key_metadata_; }
/// Upon completion of file writing, the encryption key
/// will be wiped out.
void WipeOutEncryptionKey() { key_.clear(); }
bool is_utilized() {
if (key_.empty())
return false; // can re-use column properties without encryption keys
return utilized_;
}
/// ColumnEncryptionProperties object can be used for writing one file only.
/// Mark ColumnEncryptionProperties as utilized once it is used in
/// FileEncryptionProperties as the encryption key will be wiped out upon
/// completion of file writing.
void set_utilized() { utilized_ = true; }
std::shared_ptr<ColumnEncryptionProperties> DeepClone() {
std::string key_copy = key_;
return std::shared_ptr<ColumnEncryptionProperties>(new ColumnEncryptionProperties(
encrypted_, column_path_, key_copy, key_metadata_));
}
ColumnEncryptionProperties() = default;
ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
private:
const std::string column_path_;
bool encrypted_;
bool encrypted_with_footer_key_;
std::string key_;
std::string key_metadata_;
bool utilized_;
explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path,
const std::string& key,
const std::string& key_metadata);
};
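// Usage sketch (not from the original source): building the per-column
// properties declared above. The column name and the 16-byte key value are
// hypothetical placeholders.
inline std::shared_ptr<ColumnEncryptionProperties> ExampleColumnEncryptionProperties() {
  ColumnEncryptionProperties::Builder builder("sensitive.column");
  builder.key("0123456789012345")  // key length must be 16, 24 or 32 bytes
      ->key_id("kc1");             // key-retrieval metadata, as a string id
  return builder.build();
}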
class PARQUET_EXPORT ColumnDecryptionProperties {
public:
class PARQUET_EXPORT Builder {
public:
explicit Builder(const std::string& name) : column_path_(name) {}
explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
: Builder(path->ToDotString()) {}
/// Set an explicit column key. If applied on a file that contains
/// key metadata for this column, the metadata will be ignored and
/// the column will be decrypted with this key.
/// The key length must be either 16, 24 or 32 bytes.
Builder* key(const std::string& key);
std::shared_ptr<ColumnDecryptionProperties> build();
private:
const std::string column_path_;
std::string key_;
};
ColumnDecryptionProperties() = default;
ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;
std::string column_path() const { return column_path_; }
std::string key() const { return key_; }
bool is_utilized() { return utilized_; }
/// ColumnDecryptionProperties object can be used for reading one file only.
/// Mark ColumnDecryptionProperties as utilized once it is used in
/// FileDecryptionProperties as the encryption key will be wiped out upon
/// completion of file reading.
void set_utilized() { utilized_ = true; }
/// Upon completion of file reading, the encryption key
/// will be wiped out.
void WipeOutDecryptionKey();
std::shared_ptr<ColumnDecryptionProperties> DeepClone();
private:
const std::string column_path_;
std::string key_;
bool utilized_;
/// This class is only required for setting explicit column decryption keys -
/// to override key retriever (or to provide keys when key metadata and/or
/// key retriever are not available)
explicit ColumnDecryptionProperties(const std::string& column_path,
const std::string& key);
};
class PARQUET_EXPORT AADPrefixVerifier {
public:
/// Verifies the identity (AAD Prefix) of an individual file,
/// or of a file collection in a data set.
/// Throws an exception if an AAD prefix is wrong.
/// In a data set, AAD Prefixes should be collected,
/// and then checked for missing files.
virtual void Verify(const std::string& aad_prefix) = 0;
virtual ~AADPrefixVerifier() {}
};
class PARQUET_EXPORT FileDecryptionProperties {
public:
class PARQUET_EXPORT Builder {
public:
Builder() {
check_plaintext_footer_integrity_ = kDefaultCheckSignature;
plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
}
/// Set an explicit footer key. If applied on a file that contains
/// footer key metadata, the metadata will be ignored and the footer
/// will be decrypted/verified with this key.
/// If an explicit key is not set, the footer key will be fetched from
/// the key retriever.
/// With explicit keys or an AAD prefix, a new properties object must be
/// created for each encrypted file.
/// Explicit encryption keys (footer and column) are cloned.
/// Upon completion of file reading, the cloned encryption keys in the
/// properties will be wiped out (array values set to 0).
/// The caller is responsible for wiping out the input key array.
/// The footer key length must be either 16, 24 or 32 bytes.
Builder* footer_key(const std::string footer_key);
/// Set explicit column keys (decryption properties).
/// It's also possible to set a key retriever on this property object.
/// Upon file decryption, availability of explicit keys is checked before
/// invocation of the retriever callback.
/// If an explicit key is available for a footer or a column,
/// its key metadata will be ignored.
Builder* column_keys(
const ColumnPathToDecryptionPropertiesMap& column_decryption_properties);
/// Set a key retriever callback. It's also possible to
/// set explicit footer or column keys on this file property object.
/// Upon file decryption, availability of explicit keys is checked before
/// invocation of the retriever callback.
/// If an explicit key is available for a footer or a column,
/// its key metadata will be ignored.
Builder* key_retriever(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever);
/// Skip integrity verification of plaintext footers.
/// If not called, the integrity of plaintext footers will be checked at runtime,
/// and an exception will be thrown in the following situations:
/// - the footer signing key is not available
///   (not passed, or not found by the key retriever)
/// - the footer content and signature don't match
Builder* disable_footer_signature_verification() {
check_plaintext_footer_integrity_ = false;
return this;
}
/// Explicitly supply the file AAD prefix.
/// Required when a prefix is used for file encryption but is not stored
/// in the file.
/// If the AAD prefix is stored in the file, it will be compared to the
/// explicitly supplied value, and an exception will be thrown if they differ.
Builder* aad_prefix(const std::string& aad_prefix);
/// Set callback for verification of AAD Prefixes stored in file.
Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);
/// By default, reading plaintext (unencrypted) files is not allowed when
/// using a decryptor, in order to detect files that were not encrypted
/// by mistake.
/// However, the default behavior can be overridden by calling this method.
/// The caller should then use a different method to ensure encryption
/// of files with sensitive data.
Builder* plaintext_files_allowed() {
plaintext_files_allowed_ = true;
return this;
}
std::shared_ptr<FileDecryptionProperties> build() {
return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
}
private:
std::string footer_key_;
std::string aad_prefix_;
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
bool check_plaintext_footer_integrity_;
bool plaintext_files_allowed_;
};
std::string column_key(const std::string& column_path) const;
std::string footer_key() const { return footer_key_; }
std::string aad_prefix() const { return aad_prefix_; }
const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
return key_retriever_;
}
bool check_plaintext_footer_integrity() const {
return check_plaintext_footer_integrity_;
}
bool plaintext_files_allowed() const { return plaintext_files_allowed_; }
const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
return aad_prefix_verifier_;
}
/// Upon completion of file reading, the encryption keys in the properties
/// will be wiped out (array values set to 0).
void WipeOutDecryptionKeys();
bool is_utilized();
/// FileDecryptionProperties object can be used for reading one file only.
/// Mark FileDecryptionProperties as utilized once it is used to read a file as the
/// encryption keys will be wiped out upon completion of file reading.
void set_utilized() { utilized_ = true; }
/// A FileDecryptionProperties object can be used for reading one file only
/// (unless the object keeps only the key retriever callback, and no explicit
/// keys or AAD prefix).
/// At the end, the keys are wiped out in memory.
/// This method allows cloning identical properties for another file,
/// with an option to update the AAD prefix (if new_aad_prefix is empty,
/// the AAD prefix will be cloned too).
std::shared_ptr<FileDecryptionProperties> DeepClone(std::string new_aad_prefix = "");
private:
std::string footer_key_;
std::string aad_prefix_;
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
const std::string empty_string_ = "";
ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
bool check_plaintext_footer_integrity_;
bool plaintext_files_allowed_;
bool utilized_;
FileDecryptionProperties(
const std::string& footer_key,
std::shared_ptr<DecryptionKeyRetriever> key_retriever,
bool check_plaintext_footer_integrity, const std::string& aad_prefix,
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
bool plaintext_files_allowed);
};
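// Usage sketch (not from the original source): building decryption properties
// that rely on a StringKeyIdRetriever. The key ids and key bytes below are
// hypothetical placeholders.
inline std::shared_ptr<FileDecryptionProperties> ExampleFileDecryptionProperties() {
  auto retriever = std::make_shared<StringKeyIdRetriever>();
  retriever->PutKey("kf", "0123456789012345");   // footer key
  retriever->PutKey("kc1", "1234567890123450");  // column key
  FileDecryptionProperties::Builder builder;
  builder.key_retriever(retriever);
  return builder.build();
}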
class PARQUET_EXPORT FileEncryptionProperties {
public:
class PARQUET_EXPORT Builder {
public:
explicit Builder(const std::string& footer_key)
: parquet_cipher_(kDefaultEncryptionAlgorithm),
encrypted_footer_(kDefaultEncryptedFooter) {
footer_key_ = footer_key;
store_aad_prefix_in_file_ = false;
}
/// Create files with plaintext footer.
/// If not called, the files will be created with encrypted footer (default).
Builder* set_plaintext_footer() {
encrypted_footer_ = false;
return this;
}
/// Set encryption algorithm.
/// If not called, files will be encrypted with AES_GCM_V1 (default).
Builder* algorithm(ParquetCipher::type parquet_cipher) {
parquet_cipher_ = parquet_cipher;
return this;
}
/// Set key retrieval metadata (converted from String).
/// Use either footer_key_metadata or footer_key_id, not both.
Builder* footer_key_id(const std::string& key_id);
/// Set key retrieval metadata.
/// Use either footer_key_metadata or footer_key_id, not both.
Builder* footer_key_metadata(const std::string& footer_key_metadata);
/// Set the file AAD Prefix.
Builder* aad_prefix(const std::string& aad_prefix);
/// Skip storing AAD Prefix in file.
/// If not called, and if AAD Prefix is set, it will be stored.
Builder* disable_aad_prefix_storage();
/// Set the list of encrypted columns and their properties (keys etc).
/// If not called, all columns will be encrypted with the footer key.
/// If called, the file columns not in the list will be left unencrypted.
Builder* encrypted_columns(
const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
std::shared_ptr<FileEncryptionProperties> build() {
return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
}
private:
ParquetCipher::type parquet_cipher_;
bool encrypted_footer_;
std::string footer_key_;
std::string footer_key_metadata_;
std::string aad_prefix_;
bool store_aad_prefix_in_file_;
ColumnPathToEncryptionPropertiesMap encrypted_columns_;
};
bool encrypted_footer() const { return encrypted_footer_; }
EncryptionAlgorithm algorithm() const { return algorithm_; }
std::string footer_key() const { return footer_key_; }
std::string footer_key_metadata() const { return footer_key_metadata_; }
std::string file_aad() const { return file_aad_; }
std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
const std::string& column_path);
bool is_utilized() const { return utilized_; }
/// FileEncryptionProperties object can be used for writing one file only.
/// Mark FileEncryptionProperties as utilized once it is used to write a file as the
/// encryption keys will be wiped out upon completion of file writing.
void set_utilized() { utilized_ = true; }
/// Upon completion of file writing, the encryption keys
/// will be wiped out (array values set to 0).
void WipeOutEncryptionKeys();
/// A FileEncryptionProperties object can be used for writing one file only
/// (at the end, the keys are wiped out in memory).
/// This method allows cloning identical properties for another file,
/// with an option to update the AAD prefix (if new_aad_prefix is empty,
/// the AAD prefix will be cloned too).
std::shared_ptr<FileEncryptionProperties> DeepClone(std::string new_aad_prefix = "");
ColumnPathToEncryptionPropertiesMap encrypted_columns() const {
return encrypted_columns_;
}
private:
EncryptionAlgorithm algorithm_;
std::string footer_key_;
std::string footer_key_metadata_;
bool encrypted_footer_;
std::string file_aad_;
std::string aad_prefix_;
bool utilized_;
bool store_aad_prefix_in_file_;
ColumnPathToEncryptionPropertiesMap encrypted_columns_;
FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key,
const std::string& footer_key_metadata, bool encrypted_footer,
const std::string& aad_prefix, bool store_aad_prefix_in_file,
const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
};
} // namespace parquet
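// Usage sketch (not from the original source): assembling file-level
// encryption properties with one encrypted column. The column name, key ids
// and 16-byte keys are hypothetical placeholders.
namespace parquet {
inline std::shared_ptr<FileEncryptionProperties> ExampleFileEncryptionProperties() {
  ColumnEncryptionProperties::Builder column_builder("sensitive.column");
  column_builder.key("1234567890123450")->key_id("kc1");
  ColumnPathToEncryptionPropertiesMap encrypted_columns;
  encrypted_columns["sensitive.column"] = column_builder.build();
  FileEncryptionProperties::Builder file_builder("0123456789012345");  // footer key
  file_builder.footer_key_id("kf")->encrypted_columns(encrypted_columns);
  return file_builder.build();
}
}  // namespace parquet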

View File

@ -0,0 +1,31 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License") = 0; you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
namespace parquet {
namespace encryption {
// Key material can be stored outside the Parquet file, for example in a separate small
// file in the same folder. This is important for "key rotation", when MEKs have to be
// changed (if compromised, or periodically, just in case) without modifying the Parquet
// files (which are often immutable).
// TODO: details will be implemented later
class FileKeyMaterialStore {};
} // namespace encryption
} // namespace parquet

View File

@ -0,0 +1,66 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/util/concurrent_map.h"
#include "parquet/encryption/encryption.h"
#include "parquet/encryption/key_material.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/key_toolkit_internal.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// This class will retrieve the key from "key metadata", following these steps:
// 1. Parse "key metadata" (see structure in KeyMetadata class).
// 2. Retrieve "key material", which can be stored inside or outside "key metadata".
//    Currently we don't support the case where "key material" is stored outside
//    "key metadata".
// 3. Unwrap the "data encryption key" from "key material". There are 2 modes:
//    3.1. single wrapping: the wrapped "data encryption key" is decrypted directly
//         with the "master encryption key".
//    3.2. double wrapping, in 2 steps:
//         3.2.1. the "key encryption key" is decrypted with the "master encryption key".
//         3.2.2. the "data encryption key" is decrypted with the above
//                "key encryption key".
class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever {
public:
/// key_toolkit and kms_connection_config are used to get a KmsClient from the
/// cache, or to create one if it is not in the cache yet.
/// cache_lifetime_seconds is the lifetime of a KmsClient in the cache.
FileKeyUnwrapper(KeyToolkit* key_toolkit,
const KmsConnectionConfig& kms_connection_config,
double cache_lifetime_seconds);
std::string GetKey(const std::string& key_metadata) override;
private:
internal::KeyWithMasterId GetDataEncryptionKey(const KeyMaterial& key_material);
std::shared_ptr<KmsClient> GetKmsClientFromConfigOrKeyMaterial(
const KeyMaterial& key_material);
/// A map of Key Encryption Key (KEK) ID -> KEK bytes, for the current token
std::shared_ptr<::arrow::util::ConcurrentMap<std::string, std::string>> kek_per_kek_id_;
KeyToolkit* key_toolkit_;
KmsConnectionConfig kms_connection_config_;
const double cache_entry_lifetime_seconds_;
};
} // namespace encryption
} // namespace parquet
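// Usage sketch (not from the original source): plugging FileKeyUnwrapper into
// FileDecryptionProperties as the key retriever. The toolkit, KMS
// configuration and cache lifetime are assumed to be provided by the caller.
namespace parquet {
namespace encryption {
inline std::shared_ptr<FileDecryptionProperties> ExampleDecryptionWithUnwrapper(
    KeyToolkit* key_toolkit, const KmsConnectionConfig& kms_connection_config) {
  auto unwrapper = std::make_shared<FileKeyUnwrapper>(
      key_toolkit, kms_connection_config, /*cache_lifetime_seconds=*/600.0);
  FileDecryptionProperties::Builder builder;
  builder.key_retriever(unwrapper);
  return builder.build();
}
}  // namespace encryption
}  // namespace parquet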

View File

@ -0,0 +1,82 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "arrow/util/concurrent_map.h"
#include "parquet/encryption/file_key_material_store.h"
#include "parquet/encryption/key_encryption_key.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// This class will generate "key metadata" from "data encryption key" and "master key",
// following these steps:
// 1. Wrap "data encryption key". There are 2 modes:
// 1.1. single wrapping: encrypt "data encryption key" directly with "master encryption
// key"
// 1.2. double wrapping: 2 steps:
// 1.2.1. "key encryption key" is randomized (see KeyEncryptionKey class)
// 1.2.2. "data encryption key" is encrypted with the above "key encryption key"
// 2. Create "key material" (see structure in KeyMaterial class)
// 3. Create "key metadata" with "key material" inside or a reference to outside "key
// material" (see structure in KeyMetadata class).
// We don't support the case where "key material" is stored outside "key metadata" yet.
class PARQUET_EXPORT FileKeyWrapper {
public:
static constexpr int kKeyEncryptionKeyLength = 16;
static constexpr int kKeyEncryptionKeyIdLength = 16;
/// key_toolkit and kms_connection_config are used to get a KmsClient from the
/// cache, or to create one if it is not in the cache yet.
/// cache_entry_lifetime_seconds is the lifetime of a KmsClient in the cache.
/// key_material_store is used to store "key material" outside the parquet file;
/// pass NULL if "key material" is stored inside the parquet file.
FileKeyWrapper(KeyToolkit* key_toolkit,
const KmsConnectionConfig& kms_connection_config,
std::shared_ptr<FileKeyMaterialStore> key_material_store,
double cache_entry_lifetime_seconds, bool double_wrapping);
/// Creates key_metadata field for a given data key, via wrapping the key with the
/// master key
std::string GetEncryptionKeyMetadata(const std::string& data_key,
const std::string& master_key_id,
bool is_footer_key);
private:
KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id);
/// A map of Master Encryption Key ID -> KeyEncryptionKey, for the current token
std::shared_ptr<::arrow::util::ConcurrentMap<std::string, KeyEncryptionKey>>
kek_per_master_key_id_;
std::shared_ptr<KmsClient> kms_client_;
KmsConnectionConfig kms_connection_config_;
std::shared_ptr<FileKeyMaterialStore> key_material_store_;
const double cache_entry_lifetime_seconds_;
const bool double_wrapping_;
};
} // namespace encryption
} // namespace parquet
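// Usage sketch (not from the original source): wrapping a data encryption key
// and obtaining the serialized "key metadata" for the file footer. The DEK
// bytes, master key id and cache lifetime are hypothetical placeholders.
namespace parquet {
namespace encryption {
inline std::string ExampleWrapFooterKey(
    KeyToolkit* key_toolkit, const KmsConnectionConfig& kms_connection_config) {
  FileKeyWrapper wrapper(key_toolkit, kms_connection_config,
                         /*key_material_store=*/nullptr,
                         /*cache_entry_lifetime_seconds=*/600.0,
                         /*double_wrapping=*/true);
  std::string data_key = "0123456789012345";  // a real DEK would be random bytes
  return wrapper.GetEncryptionKeyMetadata(data_key, /*master_key_id=*/"kf",
                                          /*is_footer_key=*/true);
}
}  // namespace encryption
}  // namespace parquet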

View File

@ -0,0 +1,59 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <vector>
#include "arrow/util/base64.h"
namespace parquet {
namespace encryption {
// In the double wrapping mode, each "data encryption key" (DEK) is encrypted with a
// "key encryption key" (KEK), that in turn is encrypted with a "master encryption key"
// (MEK). In a writer process, a random KEK is generated for each MEK ID, and cached in
// a <MEK-ID : KEK> map. This allows interacting with a KMS server only once for each
// MEK, in order to wrap its KEK. "Data encryption key" (DEK) wrapping is performed
// locally, and does not involve an interaction with a KMS server.
class KeyEncryptionKey {
public:
KeyEncryptionKey(std::string kek_bytes, std::string kek_id,
std::string encoded_wrapped_kek)
: kek_bytes_(std::move(kek_bytes)),
kek_id_(std::move(kek_id)),
encoded_kek_id_(::arrow::util::base64_encode(kek_id_)),
encoded_wrapped_kek_(std::move(encoded_wrapped_kek)) {}
const std::string& kek_bytes() const { return kek_bytes_; }
const std::string& kek_id() const { return kek_id_; }
const std::string& encoded_kek_id() const { return encoded_kek_id_; }
const std::string& encoded_wrapped_kek() const { return encoded_wrapped_kek_; }
private:
std::string kek_bytes_;
std::string kek_id_;
std::string encoded_kek_id_;
std::string encoded_wrapped_kek_;
};
} // namespace encryption
} // namespace parquet
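// Usage sketch (not from the original source): constructing a KeyEncryptionKey
// for the cache described above. All byte strings are hypothetical
// placeholders; in practice the KEK bytes and id are generated randomly, and
// the wrapped KEK comes back from the KMS server.
namespace parquet {
namespace encryption {
inline KeyEncryptionKey ExampleKeyEncryptionKey() {
  std::string kek_bytes = "0123456789012345";  // 16 random bytes in practice
  std::string kek_id = "kek-0001";
  std::string encoded_wrapped_kek = "<KEK wrapped by the MEK, base64-encoded>";
  KeyEncryptionKey kek(kek_bytes, kek_id, encoded_wrapped_kek);
  // kek.encoded_kek_id() now holds base64_encode(kek_id).
  return kek;
}
}  // namespace encryption
}  // namespace parquet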

View File

@ -0,0 +1,131 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "parquet/platform.h"
namespace arrow {
namespace json {
namespace internal {
class ObjectParser;
} // namespace internal
} // namespace json
} // namespace arrow
namespace parquet {
namespace encryption {
// KeyMaterial class represents the "key material", keeping the information that allows
// readers to recover an encryption key (see description of the KeyMetadata class). The
// keytools package (PARQUET-1373) implements the "envelope encryption" pattern, in a
// "single wrapping" or "double wrapping" mode. In the single wrapping mode, the key
// material is generated by encrypting the "data encryption key" (DEK) by a "master key".
// In the double wrapping mode, the key material is generated by encrypting the DEK by a
// "key encryption key" (KEK), that in turn is encrypted by a "master key".
//
// Key material is kept in a flat json object, with the following fields:
// 1. "keyMaterialType" - a String, with the type of key material. In the current
// version, only one value is allowed - "PKMT1" (stands
// for "parquet key management tools, version 1"). For external key material storage,
// this field is written in both "key metadata" and "key material" jsons. For internal
// key material storage, this field is written only once in the common json.
// 2. "isFooterKey" - a boolean. If true, means that the material belongs to a file footer
// key, and keeps additional information (such as
// KMS instance ID and URL). If false, means that the material belongs to a column
// key.
// 3. "kmsInstanceID" - a String, with the KMS Instance ID. Written only in footer key
// material.
// 4. "kmsInstanceURL" - a String, with the KMS Instance URL. Written only in footer key
// material.
// 5. "masterKeyID" - a String, with the ID of the master key used to generate the
// material.
// 6. "wrappedDEK" - a String, with the wrapped DEK (base64 encoding).
// 7. "doubleWrapping" - a boolean. If true, the material was generated in double
//    wrapping mode; if false, in single wrapping mode.
// 8. "keyEncryptionKeyID" - a String, with the ID of the KEK used to generate the
// material. Written only in double wrapping mode.
// 9. "wrappedKEK" - a String, with the wrapped KEK (base64 encoding). Written only in
// double wrapping mode.
class PARQUET_EXPORT KeyMaterial {
public:
// these fields are defined in a specification and should never be changed
static constexpr const char kKeyMaterialTypeField[] = "keyMaterialType";
static constexpr const char kKeyMaterialType1[] = "PKMT1";
static constexpr const char kFooterKeyIdInFile[] = "footerKey";
static constexpr const char kColumnKeyIdInFilePrefix[] = "columnKey";
static constexpr const char kIsFooterKeyField[] = "isFooterKey";
static constexpr const char kDoubleWrappingField[] = "doubleWrapping";
static constexpr const char kKmsInstanceIdField[] = "kmsInstanceID";
static constexpr const char kKmsInstanceUrlField[] = "kmsInstanceURL";
static constexpr const char kMasterKeyIdField[] = "masterKeyID";
static constexpr const char kWrappedDataEncryptionKeyField[] = "wrappedDEK";
static constexpr const char kKeyEncryptionKeyIdField[] = "keyEncryptionKeyID";
static constexpr const char kWrappedKeyEncryptionKeyField[] = "wrappedKEK";
public:
KeyMaterial() = default;
static KeyMaterial Parse(const std::string& key_material_string);
static KeyMaterial Parse(
const ::arrow::json::internal::ObjectParser* key_material_json);
/// This method returns a json string that will be stored either inside a parquet file
/// or in a key material store outside the parquet file.
static std::string SerializeToJson(bool is_footer_key,
const std::string& kms_instance_id,
const std::string& kms_instance_url,
const std::string& master_key_id,
bool is_double_wrapped, const std::string& kek_id,
const std::string& encoded_wrapped_kek,
const std::string& encoded_wrapped_dek,
bool is_internal_storage);
bool is_footer_key() const { return is_footer_key_; }
bool is_double_wrapped() const { return is_double_wrapped_; }
const std::string& master_key_id() const { return master_key_id_; }
const std::string& wrapped_dek() const { return encoded_wrapped_dek_; }
const std::string& kek_id() const { return kek_id_; }
const std::string& wrapped_kek() const { return encoded_wrapped_kek_; }
const std::string& kms_instance_id() const { return kms_instance_id_; }
const std::string& kms_instance_url() const { return kms_instance_url_; }
private:
KeyMaterial(bool is_footer_key, const std::string& kms_instance_id,
const std::string& kms_instance_url, const std::string& master_key_id,
bool is_double_wrapped, const std::string& kek_id,
const std::string& encoded_wrapped_kek,
const std::string& encoded_wrapped_dek);
bool is_footer_key_;
std::string kms_instance_id_;
std::string kms_instance_url_;
std::string master_key_id_;
bool is_double_wrapped_;
std::string kek_id_;
std::string encoded_wrapped_kek_;
std::string encoded_wrapped_dek_;
};
} // namespace encryption
} // namespace parquet
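// Usage sketch (not from the original source): a serialize/parse round trip
// through the json structure described above. Every field value below is a
// hypothetical placeholder.
namespace parquet {
namespace encryption {
inline KeyMaterial ExampleKeyMaterialRoundTrip() {
  std::string json = KeyMaterial::SerializeToJson(
      /*is_footer_key=*/true, /*kms_instance_id=*/"DEFAULT",
      /*kms_instance_url=*/"DEFAULT", /*master_key_id=*/"kf",
      /*is_double_wrapped=*/true, /*kek_id=*/"kek-0001",
      /*encoded_wrapped_kek=*/"<wrapped KEK, base64>",
      /*encoded_wrapped_dek=*/"<wrapped DEK, base64>",
      /*is_internal_storage=*/true);
  return KeyMaterial::Parse(json);
}
}  // namespace encryption
}  // namespace parquet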

View File

@ -0,0 +1,94 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "arrow/util/variant.h"
#include "parquet/encryption/key_material.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// Parquet encryption specification defines "key metadata" as an arbitrary byte array,
// generated by file writers for each encryption key, and passed to the low level API for
// storage in the file footer. The "key metadata" field is made available to file readers
// to enable recovery of the key. This interface can be utilized for implementation
// of any key management scheme.
//
// The keytools package (PARQUET-1373) implements one approach, of many possible, to key
// management and to generation of the "key metadata" fields. This approach, based on the
// "envelope encryption" pattern, allows integration with KMS servers. It keeps the actual
// material, required to recover a key, in a "key material" object (see the KeyMaterial
// class for details). This class is implemented to support version 1 of the parquet key
// management tools specification.
//
// KeyMetadata writes (and reads) the "key metadata" field as a flat json object,
// with the following fields:
// 1. "keyMaterialType" - a String, with the type of key material.
// 2. "internalStorage" - a boolean. If true, means that "key material" is kept inside the
// "key metadata" field. If false, "key material" is kept externally (outside Parquet
// files) - in this case, "key metadata" keeps a reference to the external "key material".
// 3. "keyReference" - a String, with the reference to the external "key material".
// Written only if internalStorage is false.
//
// If internalStorage is true, "key material" is a part of "key metadata", and the json
// keeps additional fields, described in the KeyMaterial class.
class PARQUET_EXPORT KeyMetadata {
public:
static constexpr const char kKeyMaterialInternalStorageField[] = "internalStorage";
static constexpr const char kKeyReferenceField[] = "keyReference";
/// key_metadata_bytes is the key metadata field stored in the parquet file,
/// in the serialized json object format.
static KeyMetadata Parse(const std::string& key_metadata_bytes);
static std::string CreateSerializedForExternalMaterial(
const std::string& key_reference);
bool key_material_stored_internally() const { return is_internal_storage_; }
const KeyMaterial& key_material() const {
if (!is_internal_storage_) {
throw ParquetException("key material is stored externally.");
}
return ::arrow::util::get<KeyMaterial>(key_material_or_reference_);
}
const std::string& key_reference() const {
if (is_internal_storage_) {
throw ParquetException("key material is stored internally.");
}
return ::arrow::util::get<std::string>(key_material_or_reference_);
}
private:
explicit KeyMetadata(const KeyMaterial& key_material);
explicit KeyMetadata(const std::string& key_reference);
bool is_internal_storage_;
/// If is_internal_storage_ is true, a KeyMaterial is set;
/// otherwise, a string referencing an outside "key material" is set.
::arrow::util::Variant<KeyMaterial, std::string> key_material_or_reference_;
};
} // namespace encryption
} // namespace parquet
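// Usage sketch (not from the original source): parsing a "key metadata" field
// and branching on where the key material lives, per the description above.
namespace parquet {
namespace encryption {
inline std::string ExampleMasterKeyId(const std::string& key_metadata_bytes) {
  KeyMetadata key_metadata = KeyMetadata::Parse(key_metadata_bytes);
  if (key_metadata.key_material_stored_internally()) {
    return key_metadata.key_material().master_key_id();
  }
  // External storage: only a reference to the "key material" is available here.
  return key_metadata.key_reference();
}
}  // namespace encryption
}  // namespace parquet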

View File

@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include "parquet/encryption/key_encryption_key.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/encryption/two_level_cache_with_expiration.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// KeyToolkit is a utility that keeps various tools for key management (such as key
// rotation, kms client instantiation, cache control, etc), plus a number of auxiliary
// classes for internal use.
class PARQUET_EXPORT KeyToolkit {
public:
/// KMS client two level cache: token -> KMSInstanceId -> KmsClient
TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>>& kms_client_cache_per_token() {
return kms_client_cache_;
}
/// Key encryption key two level cache for wrapping: token -> MasterEncryptionKeyId ->
/// KeyEncryptionKey
TwoLevelCacheWithExpiration<KeyEncryptionKey>& kek_write_cache_per_token() {
return key_encryption_key_write_cache_;
}
/// Key encryption key two level cache for unwrapping: token -> KeyEncryptionKeyId ->
/// KeyEncryptionKeyBytes
TwoLevelCacheWithExpiration<std::string>& kek_read_cache_per_token() {
return key_encryption_key_read_cache_;
}
std::shared_ptr<KmsClient> GetKmsClient(
const KmsConnectionConfig& kms_connection_config,
double cache_entry_lifetime_seconds);
/// Flush any caches that are tied to the (compromised) access_token
void RemoveCacheEntriesForToken(const std::string& access_token);
void RemoveCacheEntriesForAllTokens();
void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory) {
if (kms_client_factory_ != NULL) {
throw ParquetException("KMS client factory has already been registered.");
}
kms_client_factory_ = kms_client_factory;
}
private:
TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>> kms_client_cache_;
TwoLevelCacheWithExpiration<KeyEncryptionKey> key_encryption_key_write_cache_;
TwoLevelCacheWithExpiration<std::string> key_encryption_key_read_cache_;
std::shared_ptr<KmsClientFactory> kms_client_factory_;
};
} // namespace encryption
} // namespace parquet
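// Usage sketch (not from the original source): registering a factory once and
// then obtaining (possibly cached) KMS clients through the toolkit. The
// factory instance and cache lifetime are assumed to come from the caller.
namespace parquet {
namespace encryption {
inline std::shared_ptr<KmsClient> ExampleGetKmsClient(
    KeyToolkit* toolkit, std::shared_ptr<KmsClientFactory> factory,
    const KmsConnectionConfig& kms_connection_config) {
  toolkit->RegisterKmsClientFactory(factory);  // throws if one is already registered
  return toolkit->GetKmsClient(kms_connection_config,
                               /*cache_entry_lifetime_seconds=*/600.0);
}
}  // namespace encryption
}  // namespace parquet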

View File

@ -0,0 +1,95 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "arrow/util/mutex.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
/// This class wraps the key access token of a KMS server. If your token changes
/// over time, you should keep a reference to the KeyAccessToken object and call
/// its Refresh() method every time you have a new token.
class PARQUET_EXPORT KeyAccessToken {
public:
KeyAccessToken() = default;
explicit KeyAccessToken(const std::string value) : value_(value) {}
void Refresh(const std::string& new_value) {
auto lock = mutex_.Lock();
value_ = new_value;
}
const std::string& value() const {
auto lock = mutex_.Lock();
return value_;
}
private:
std::string value_;
mutable ::arrow::util::Mutex mutex_;
};
struct PARQUET_EXPORT KmsConnectionConfig {
std::string kms_instance_id;
std::string kms_instance_url;
/// If the access token is changed in the future, you should keep a reference to
/// this object and call Refresh() on it whenever there is a new access token.
std::shared_ptr<KeyAccessToken> refreshable_key_access_token;
std::unordered_map<std::string, std::string> custom_kms_conf;
KmsConnectionConfig();
const std::string& key_access_token() const {
if (refreshable_key_access_token == NULL ||
refreshable_key_access_token->value().empty()) {
throw ParquetException("key access token is not set!");
}
return refreshable_key_access_token->value();
}
void SetDefaultIfEmpty();
};
class PARQUET_EXPORT KmsClient {
public:
static constexpr const char kKmsInstanceIdDefault[] = "DEFAULT";
static constexpr const char kKmsInstanceUrlDefault[] = "DEFAULT";
static constexpr const char kKeyAccessTokenDefault[] = "DEFAULT";
/// Wraps a key - encrypts it with the master key, encodes the result
/// and potentially adds a KMS-specific metadata.
virtual std::string WrapKey(const std::string& key_bytes,
const std::string& master_key_identifier) = 0;
/// Decrypts (unwraps) a key with the master key.
virtual std::string UnwrapKey(const std::string& wrapped_key,
const std::string& master_key_identifier) = 0;
virtual ~KmsClient() {}
};
} // namespace encryption
} // namespace parquet
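// Usage sketch (not from the original source): configuring a KMS connection
// and refreshing the shared access token later, as the comments above
// describe. Instance ids, urls and token values are hypothetical placeholders.
namespace parquet {
namespace encryption {
inline KmsConnectionConfig ExampleKmsConnectionConfig() {
  KmsConnectionConfig config;
  config.kms_instance_id = "DEFAULT";
  config.kms_instance_url = "DEFAULT";
  config.refreshable_key_access_token =
      std::make_shared<KeyAccessToken>("initial-token");
  // Later, when the token rotates, refresh it through the same shared object:
  config.refreshable_key_access_token->Refresh("rotated-token");
  return config;
}
}  // namespace encryption
}  // namespace parquet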

View File

@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
class PARQUET_EXPORT KmsClientFactory {
public:
explicit KmsClientFactory(bool wrap_locally = false) : wrap_locally_(wrap_locally) {}
virtual ~KmsClientFactory() = default;
virtual std::shared_ptr<KmsClient> CreateKmsClient(
const KmsConnectionConfig& kms_connection_config) = 0;
protected:
bool wrap_locally_;
};
} // namespace encryption
} // namespace parquet

View File

@ -0,0 +1,96 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <unordered_map>
#include <vector>
#include "arrow/util/concurrent_map.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
/// This class supports the local wrapping mode: master keys are fetched from the KMS
/// server and used to encrypt other keys (data encryption keys or key encryption keys).
class PARQUET_EXPORT LocalWrapKmsClient : public KmsClient {
public:
static constexpr const char kLocalWrapNoKeyVersion[] = "NO_VERSION";
explicit LocalWrapKmsClient(const KmsConnectionConfig& kms_connection_config);
std::string WrapKey(const std::string& key_bytes,
const std::string& master_key_identifier) override;
std::string UnwrapKey(const std::string& wrapped_key,
const std::string& master_key_identifier) override;
protected:
/// Get master key from the remote KMS server.
/// Note: this function might be called by multiple threads
virtual std::string GetMasterKeyFromServer(
const std::string& master_key_identifier) = 0;
private:
/// KMS systems wrap keys by encrypting them with master keys, and attaching
/// additional information (such as the version number of the master key) to the
/// result of the encryption. The master key version is required for key rotation.
/// Currently, the local wrapping mode does not support key rotation (because not
/// all KMS systems allow fetching a master key by its ID and version number).
/// Still, the local wrapping mode adds a placeholder for the master key version,
/// which will enable support for key rotation in this mode in the future, with
/// appropriate KMS systems. This will also enable backward compatibility, where
/// future readers will be able to extract the master key version from files
/// written by the current code.
///
/// LocalKeyWrap class writes (and reads) the "key wrap" as a flat json with the
/// following fields:
/// 1. "masterKeyVersion" - a String, with the master key version. In the current
/// version, only one value is allowed - "NO_VERSION".
/// 2. "encryptedKey" - a String, with the key encrypted by the master key
/// (base64-encoded).
class LocalKeyWrap {
public:
static constexpr const char kLocalWrapKeyVersionField[] = "masterKeyVersion";
static constexpr const char kLocalWrapEncryptedKeyField[] = "encryptedKey";
LocalKeyWrap(std::string master_key_version, std::string encrypted_encoded_key);
static std::string CreateSerialized(const std::string& encrypted_encoded_key);
static LocalKeyWrap Parse(const std::string& wrapped_key);
const std::string& master_key_version() const { return master_key_version_; }
const std::string& encrypted_encoded_key() const { return encrypted_encoded_key_; }
private:
std::string encrypted_encoded_key_;
std::string master_key_version_;
};
std::string GetKeyFromServer(const std::string& key_identifier);
protected:
KmsConnectionConfig kms_connection_config_;
::arrow::util::ConcurrentMap<std::string, std::string> master_key_cache_;
};
} // namespace encryption
} // namespace parquet
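// Usage sketch (not from the original source): the minimal subclass a
// local-wrapping deployment would provide. FetchKeyFromMyService is a
// hypothetical helper standing in for a deployment-specific key service call.
namespace parquet {
namespace encryption {
class ExampleLocalWrapKmsClient : public LocalWrapKmsClient {
 public:
  explicit ExampleLocalWrapKmsClient(const KmsConnectionConfig& kms_connection_config)
      : LocalWrapKmsClient(kms_connection_config) {}

 protected:
  // May be called from multiple threads, as noted above; the base class keeps
  // fetched keys in master_key_cache_.
  std::string GetMasterKeyFromServer(
      const std::string& master_key_identifier) override {
    return FetchKeyFromMyService(master_key_identifier);
  }

 private:
  static std::string FetchKeyFromMyService(const std::string& master_key_identifier);
};
}  // namespace encryption
}  // namespace parquet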

View File

@ -0,0 +1,118 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This module defines constants and helper classes used by the Parquet
// encryption tests, including fixed test keys and utilities for writing and
// reading encrypted files.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <gtest/gtest.h>
#include "arrow/util/io_util.h"
#include "parquet/encryption/encryption.h"
#include "parquet/test_util.h"
namespace parquet {
class ParquetFileReader;
namespace encryption {
namespace test {
using ::arrow::internal::TemporaryDir;
constexpr int kFixedLength = 10;
const char kFooterEncryptionKey[] = "0123456789012345";  // 16 bytes = 128 bits
const char kColumnEncryptionKey1[] = "1234567890123450";
const char kColumnEncryptionKey2[] = "1234567890123451";
const char kFileName[] = "tester";
// Get the path of a file inside the parquet test data directory
std::string data_file(const char* file);
// A temporary directory that contains the encrypted files generated in the tests.
extern std::unique_ptr<TemporaryDir> temp_dir;
inline ::arrow::Result<std::unique_ptr<TemporaryDir>> temp_data_dir() {
return TemporaryDir::Make("parquet-encryption-test-");
}
const char kDoubleFieldName[] = "double_field";
const char kFloatFieldName[] = "float_field";
const char kBooleanFieldName[] = "boolean_field";
const char kInt32FieldName[] = "int32_field";
const char kInt64FieldName[] = "int64_field";
const char kInt96FieldName[] = "int96_field";
const char kByteArrayFieldName[] = "ba_field";
const char kFixedLenByteArrayFieldName[] = "flba_field";
const char kFooterMasterKey[] = "0123456789112345";
const char kFooterMasterKeyId[] = "kf";
const char* const kColumnMasterKeys[] = {"1234567890123450", "1234567890123451",
"1234567890123452", "1234567890123453",
"1234567890123454", "1234567890123455"};
const char* const kColumnMasterKeyIds[] = {"kc1", "kc2", "kc3", "kc4", "kc5", "kc6"};
// The result of this function will be passed to TestOnlyInMemoryKmsClientFactory
// as the key mapping to look up.
std::unordered_map<std::string, std::string> BuildKeyMap(const char* const* column_ids,
const char* const* column_keys,
const char* footer_id,
const char* footer_key);
// The result of this function will be set into EncryptionConfiguration
// as column keys.
std::string BuildColumnKeyMapping();
// FileEncryptor and FileDecryptor are helper classes to write/read an encrypted
// parquet file corresponding to each pair of
// FileEncryptionProperties/FileDecryptionProperties.
// FileEncryptor writes the file with fixed data values, and FileDecryptor reads
// the file and verifies the correctness of the data values.
class FileEncryptor {
public:
FileEncryptor();
void EncryptFile(
std::string file,
std::shared_ptr<parquet::FileEncryptionProperties> encryption_configurations);
private:
std::shared_ptr<schema::GroupNode> SetupEncryptionSchema();
int num_rowgroups_ = 5;
int rows_per_rowgroup_ = 50;
std::shared_ptr<schema::GroupNode> schema_;
};
class FileDecryptor {
public:
void DecryptFile(std::string file_name,
std::shared_ptr<FileDecryptionProperties> file_decryption_properties);
private:
void CheckFile(parquet::ParquetFileReader* file_reader,
FileDecryptionProperties* file_decryption_properties);
};
} // namespace test
} // namespace encryption
} // namespace parquet
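// Usage sketch (not from the original source): how these helpers are meant to
// fit together in a test - encrypt a file with one set of properties, then
// read it back and verify. The output path is a hypothetical placeholder.
namespace parquet {
namespace encryption {
namespace test {
inline void ExampleEncryptDecryptRoundTrip(
    std::shared_ptr<FileEncryptionProperties> encryption_properties,
    std::shared_ptr<FileDecryptionProperties> decryption_properties) {
  FileEncryptor encryptor;
  encryptor.EncryptFile("/tmp/encrypted.parquet", encryption_properties);
  FileDecryptor decryptor;
  decryptor.DecryptFile("/tmp/encrypted.parquet", decryption_properties);
}
}  // namespace test
}  // namespace encryption
}  // namespace parquet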

View File

@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <unordered_map>
#include "arrow/util/base64.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/encryption/local_wrap_kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// This is a mock class, built for testing only. Don't use it as an example of
// LocalWrapKmsClient implementation.
class TestOnlyLocalWrapInMemoryKms : public LocalWrapKmsClient {
public:
explicit TestOnlyLocalWrapInMemoryKms(const KmsConnectionConfig& kms_connection_config);
static void InitializeMasterKeys(
const std::unordered_map<std::string, std::string>& master_keys_map);
protected:
std::string GetMasterKeyFromServer(const std::string& master_key_identifier) override;
private:
static std::unordered_map<std::string, std::string> master_key_map_;
};
// This is a mock class, built for testing only. Don't use it as an example of KmsClient
// implementation.
class TestOnlyInServerWrapKms : public KmsClient {
public:
static void InitializeMasterKeys(
const std::unordered_map<std::string, std::string>& master_keys_map);
std::string WrapKey(const std::string& key_bytes,
const std::string& master_key_identifier) override;
std::string UnwrapKey(const std::string& wrapped_key,
const std::string& master_key_identifier) override;
private:
std::string GetMasterKeyFromServer(const std::string& master_key_identifier);
static std::unordered_map<std::string, std::string> master_key_map_;
};
// This is a mock class, built for testing only. Don't use it as an example of
// KmsClientFactory implementation.
class TestOnlyInMemoryKmsClientFactory : public KmsClientFactory {
public:
TestOnlyInMemoryKmsClientFactory(
bool wrap_locally,
const std::unordered_map<std::string, std::string>& master_keys_map)
: KmsClientFactory(wrap_locally) {
TestOnlyLocalWrapInMemoryKms::InitializeMasterKeys(master_keys_map);
TestOnlyInServerWrapKms::InitializeMasterKeys(master_keys_map);
}
std::shared_ptr<KmsClient> CreateKmsClient(
const KmsConnectionConfig& kms_connection_config) override {
if (wrap_locally_) {
return std::make_shared<TestOnlyLocalWrapInMemoryKms>(kms_connection_config);
} else {
return std::make_shared<TestOnlyInServerWrapKms>();
}
}
};
} // namespace encryption
} // namespace parquet

View File

@ -0,0 +1,159 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <chrono>
#include <unordered_map>
#include "arrow/util/concurrent_map.h"
#include "arrow/util/mutex.h"
namespace parquet {
namespace encryption {
using ::arrow::util::ConcurrentMap;
namespace internal {
using TimePoint =
std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<double>>;
inline TimePoint CurrentTimePoint() { return std::chrono::system_clock::now(); }
template <typename E>
class ExpiringCacheEntry {
public:
ExpiringCacheEntry() = default;
ExpiringCacheEntry(E cached_item, double expiration_interval_seconds)
: expiration_timestamp_(CurrentTimePoint() +
std::chrono::duration<double>(expiration_interval_seconds)),
cached_item_(std::move(cached_item)) {}
bool IsExpired() const {
const auto now = CurrentTimePoint();
return (now > expiration_timestamp_);
}
E cached_item() { return cached_item_; }
private:
const TimePoint expiration_timestamp_;
E cached_item_;
};
// This class is to avoid the below warning when compiling KeyToolkit class with VS2015
// warning C4503: decorated name length exceeded, name was truncated
template <typename V>
class ExpiringCacheMapEntry {
public:
ExpiringCacheMapEntry() = default;
explicit ExpiringCacheMapEntry(
std::shared_ptr<ConcurrentMap<std::string, V>> cached_item,
double expiration_interval_seconds)
: map_cache_(cached_item, expiration_interval_seconds) {}
bool IsExpired() { return map_cache_.IsExpired(); }
std::shared_ptr<ConcurrentMap<std::string, V>> cached_item() {
return map_cache_.cached_item();
}
private:
// ConcurrentMap object may be accessed and modified at many places at the same time,
// from multiple threads, or even removed from cache.
ExpiringCacheEntry<std::shared_ptr<ConcurrentMap<std::string, V>>> map_cache_;
};
} // namespace internal
// Two-level cache with expiration of internal caches according to token lifetime.
// External cache is per token, internal is per string key.
// Wrapper class around:
// std::unordered_map<std::string,
// internal::ExpiringCacheEntry<std::unordered_map<std::string, V>>>
// This cache is safe to be shared between threads.
template <typename V>
class TwoLevelCacheWithExpiration {
public:
TwoLevelCacheWithExpiration() {
last_cache_cleanup_timestamp_ = internal::CurrentTimePoint();
}
std::shared_ptr<ConcurrentMap<std::string, V>> GetOrCreateInternalCache(
const std::string& access_token, double cache_entry_lifetime_seconds) {
auto lock = mutex_.Lock();
auto external_cache_entry = cache_.find(access_token);
if (external_cache_entry == cache_.end() ||
external_cache_entry->second.IsExpired()) {
cache_.insert({access_token, internal::ExpiringCacheMapEntry<V>(
std::shared_ptr<ConcurrentMap<std::string, V>>(
new ConcurrentMap<std::string, V>()),
cache_entry_lifetime_seconds)});
}
return cache_[access_token].cached_item();
}
void CheckCacheForExpiredTokens(double cache_cleanup_period_seconds) {
auto lock = mutex_.Lock();
const auto now = internal::CurrentTimePoint();
if (now > (last_cache_cleanup_timestamp_ +
std::chrono::duration<double>(cache_cleanup_period_seconds))) {
RemoveExpiredEntriesNoMutex();
last_cache_cleanup_timestamp_ =
now + std::chrono::duration<double>(cache_cleanup_period_seconds);
}
}
void RemoveExpiredEntriesFromCache() {
auto lock = mutex_.Lock();
RemoveExpiredEntriesNoMutex();
}
void Remove(const std::string& access_token) {
auto lock = mutex_.Lock();
cache_.erase(access_token);
}
void Clear() {
auto lock = mutex_.Lock();
cache_.clear();
}
private:
void RemoveExpiredEntriesNoMutex() {
for (auto it = cache_.begin(); it != cache_.end();) {
if (it->second.IsExpired()) {
it = cache_.erase(it);
} else {
++it;
}
}
}
std::unordered_map<std::string, internal::ExpiringCacheMapEntry<V>> cache_;
internal::TimePoint last_cache_cleanup_timestamp_;
::arrow::util::Mutex mutex_;
};
} // namespace encryption
} // namespace parquet
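// Usage sketch (not from the original source): a per-token cache of string
// values with a 10-minute entry lifetime and a 1-minute cleanup period. The
// token and key values are hypothetical; Insert() is assumed to be provided
// by arrow::util::ConcurrentMap.
namespace parquet {
namespace encryption {
inline void ExampleTwoLevelCache() {
  TwoLevelCacheWithExpiration<std::string> cache;
  auto per_token_map = cache.GetOrCreateInternalCache(
      "access-token-A", /*cache_entry_lifetime_seconds=*/600.0);
  per_token_map->Insert("kek-0001", "kek-bytes");
  // Periodically drop whole per-token maps whose lifetime has elapsed:
  cache.CheckCacheForExpiredTokens(/*cache_cleanup_period_seconds=*/60.0);
  // If a token is compromised, remove its entries eagerly:
  cache.Remove("access-token-A");
}
}  // namespace encryption
}  // namespace parquet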

View File

@ -0,0 +1,158 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <exception>
#include <sstream>
#include <string>
#include <utility>
#include "arrow/type_fwd.h"
#include "arrow/util/string_builder.h"
#include "parquet/platform.h"
// PARQUET-1085
#if !defined(ARROW_UNUSED)
#define ARROW_UNUSED(x) UNUSED(x)
#endif
// Parquet exception to Arrow Status
#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
#define END_PARQUET_CATCH_EXCEPTIONS \
} \
catch (const ::parquet::ParquetStatusException& e) { \
return e.status(); \
} \
catch (const ::parquet::ParquetException& e) { \
return ::arrow::Status::IOError(e.what()); \
}
// clang-format off
#define PARQUET_CATCH_NOT_OK(s) \
BEGIN_PARQUET_CATCH_EXCEPTIONS \
(s); \
END_PARQUET_CATCH_EXCEPTIONS
// clang-format on
#define PARQUET_CATCH_AND_RETURN(s) \
BEGIN_PARQUET_CATCH_EXCEPTIONS \
return (s); \
END_PARQUET_CATCH_EXCEPTIONS
// Arrow Status to Parquet exception
#define PARQUET_IGNORE_NOT_OK(s) \
do { \
::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
ARROW_UNUSED(_s); \
} while (0)
#define PARQUET_THROW_NOT_OK(s) \
do { \
::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
if (!_s.ok()) { \
throw ::parquet::ParquetStatusException(std::move(_s)); \
} \
} while (0)
#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
auto status_name = (rexpr); \
PARQUET_THROW_NOT_OK(status_name.status()); \
lhs = std::move(status_name).ValueOrDie();
#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr) \
PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
lhs, rexpr);
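// Usage sketch for the bridging macros above (illustrative only; the helper
// function is hypothetical):
//
//   std::shared_ptr<::arrow::Buffer> ReadWholeFile(
//       ::arrow::io::RandomAccessFile* file) {
//     PARQUET_ASSIGN_OR_THROW(int64_t size, file->GetSize());
//     PARQUET_ASSIGN_OR_THROW(auto buffer, file->ReadAt(0, size));
//     return buffer;
//   }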
namespace parquet {
class ParquetException : public std::exception {
public:
PARQUET_NORETURN static void EofException(const std::string& msg = "") {
static std::string prefix = "Unexpected end of stream";
if (msg.empty()) {
throw ParquetException(prefix);
}
throw ParquetException(prefix, ": ", msg);
}
PARQUET_NORETURN static void NYI(const std::string& msg = "") {
throw ParquetException("Not yet implemented: ", msg, ".");
}
template <typename... Args>
explicit ParquetException(Args&&... args)
: msg_(::arrow::util::StringBuilder(std::forward<Args>(args)...)) {}
explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}
explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}
ParquetException(const ParquetException&) = default;
ParquetException& operator=(const ParquetException&) = default;
ParquetException(ParquetException&&) = default;
ParquetException& operator=(ParquetException&&) = default;
const char* what() const noexcept override { return msg_.c_str(); }
private:
std::string msg_;
};
// Support printing a ParquetException.
// This is needed for clang-on-MSVC, where operator<< is not defined for
// std::exception.
PARQUET_EXPORT
std::ostream& operator<<(std::ostream& os, const ParquetException& exception);
class ParquetStatusException : public ParquetException {
public:
explicit ParquetStatusException(::arrow::Status status)
: ParquetException(status.ToString()), status_(std::move(status)) {}
const ::arrow::Status& status() const { return status_; }
private:
::arrow::Status status_;
};
// This class exists for the purpose of detecting an invalid or corrupted file.
class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
public:
ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
default;
template <typename Arg,
typename std::enable_if<
!std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
int>::type = 0,
typename... Args>
explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
: ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
std::forward<Args>(args)...)) {}
};
template <typename StatusReturnBlock>
void ThrowNotOk(StatusReturnBlock&& b) {
PARQUET_THROW_NOT_OK(b());
}
} // namespace parquet


@ -0,0 +1,188 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/io/caching.h"
#include "arrow/util/type_fwd.h"
#include "parquet/metadata.h" // IWYU pragma: keep
#include "parquet/platform.h"
#include "parquet/properties.h"
namespace parquet {
class ColumnReader;
class FileMetaData;
class PageReader;
class RowGroupMetaData;
class PARQUET_EXPORT RowGroupReader {
public:
// Forward declare a virtual class 'Contents' to aid dependency injection and more
// easily create test fixtures
// An implementation of the Contents class is defined in the .cc file
struct Contents {
virtual ~Contents() {}
virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
virtual const RowGroupMetaData* metadata() const = 0;
virtual const ReaderProperties* properties() const = 0;
};
explicit RowGroupReader(std::unique_ptr<Contents> contents);
// Returns the rowgroup metadata
const RowGroupMetaData* metadata() const;
// Construct a ColumnReader for the indicated row group-relative
// column. Ownership is shared with the RowGroupReader.
std::shared_ptr<ColumnReader> Column(int i);
// Construct a ColumnReader, trying to enable exposed encoding.
//
// For dictionary encoding, currently we only support column chunks that are fully
// dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
// If a column chunk uses dictionary encoding but then falls back to plain encoding, the
// encoding will not be exposed.
//
// The returned column reader provides an API GetExposedEncoding() for the
// users to check the exposed encoding and determine how to read the batches.
//
// \note API EXPERIMENTAL
std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
int i, ExposedEncoding encoding_to_expose);
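  //
  // Usage sketch (illustrative only):
  //
  //   auto col = row_group->ColumnWithExposeEncoding(
  //       /*i=*/0, ExposedEncoding::DICTIONARY);
  //   if (col->GetExposedEncoding() == ExposedEncoding::DICTIONARY) {
  //     // Read dictionary and indices through the typed reader instead of
  //     // materialized values.
  //   }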
std::unique_ptr<PageReader> GetColumnPageReader(int i);
private:
// Holds a pointer to an instance of Contents implementation
std::unique_ptr<Contents> contents_;
};
class PARQUET_EXPORT ParquetFileReader {
public:
// Declare a virtual class 'Contents' to aid dependency injection and more
// easily create test fixtures
// An implementation of the Contents class is defined in the .cc file
struct PARQUET_EXPORT Contents {
static std::unique_ptr<Contents> Open(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
virtual ~Contents() = default;
// Perform any cleanup associated with the file contents
virtual void Close() = 0;
virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
virtual std::shared_ptr<FileMetaData> metadata() const = 0;
};
ParquetFileReader();
~ParquetFileReader();
// Create a file reader instance from an Arrow file object. Thread-safety is
// the responsibility of the file implementation
static std::unique_ptr<ParquetFileReader> Open(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
// API Convenience to open a serialized Parquet file on disk, using Arrow IO
// interfaces.
static std::unique_ptr<ParquetFileReader> OpenFile(
const std::string& path, bool memory_map = true,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
// Asynchronously open a file reader from an Arrow file object.
// Does not throw - all errors are reported through the Future.
static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
void Open(std::unique_ptr<Contents> contents);
void Close();
// The RowGroupReader is owned by the FileReader
std::shared_ptr<RowGroupReader> RowGroup(int i);
// Returns the file metadata. Only one instance is ever created
std::shared_ptr<FileMetaData> metadata() const;
/// Pre-buffer the specified column indices in all row groups.
///
/// Readers can optionally call this to cache the necessary slices
/// of the file in-memory before deserialization. Arrow readers can
/// automatically do this via an option. This is intended to
/// increase performance when reading from high-latency filesystems
/// (e.g. Amazon S3).
///
/// After calling this, creating readers for row groups/column
/// indices that were not buffered may fail. Creating multiple
  /// readers for a subset of the buffered regions is
/// acceptable. This may be called again to buffer a different set
/// of row groups/columns.
///
/// If memory usage is a concern, note that data will remain
/// buffered in memory until either \a PreBuffer() is called again,
/// or the reader itself is destructed. Reading - and buffering -
/// only one row group at a time may be useful.
///
/// This method may throw.
void PreBuffer(const std::vector<int>& row_groups,
const std::vector<int>& column_indices,
const ::arrow::io::IOContext& ctx,
const ::arrow::io::CacheOptions& options);
/// Wait for the specified row groups and column indices to be pre-buffered.
///
/// After the returned Future completes, reading the specified row
/// groups/columns will not block.
///
/// PreBuffer must be called first. This method does not throw.
::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
const std::vector<int>& column_indices) const;
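  // Pre-buffering sketch (illustrative only; a local file path is assumed):
  //
  //   auto reader = ParquetFileReader::OpenFile("data.parquet");
  //   ::arrow::io::IOContext ctx;
  //   auto cache_options = ::arrow::io::CacheOptions::Defaults();
  //   reader->PreBuffer(/*row_groups=*/{0}, /*column_indices=*/{0, 1},
  //                     ctx, cache_options);
  //   reader->WhenBuffered({0}, {0, 1}).Wait();  // later reads won't block
  //   auto row_group = reader->RowGroup(0);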
private:
// Holds a pointer to an instance of Contents implementation
std::unique_ptr<Contents> contents_;
};
// Read only Parquet file metadata
std::shared_ptr<FileMetaData> PARQUET_EXPORT
ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
/// \brief Scan all values in file. Useful for performance testing
/// \param[in] columns the column numbers to scan. If empty scans all
/// \param[in] column_batch_size number of values to read at a time when scanning column
/// \param[in] reader a ParquetFileReader instance
/// \return number of semantic rows in file
PARQUET_EXPORT
int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
ParquetFileReader* reader);
} // namespace parquet


@ -0,0 +1,234 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <utility>
#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/properties.h"
#include "parquet/schema.h"
namespace parquet {
class ColumnWriter;
// FIXME: copied from reader-internal.cc
static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
class PARQUET_EXPORT RowGroupWriter {
public:
// Forward declare a virtual class 'Contents' to aid dependency injection and more
// easily create test fixtures
// An implementation of the Contents class is defined in the .cc file
struct Contents {
virtual ~Contents() = default;
virtual int num_columns() const = 0;
virtual int64_t num_rows() const = 0;
// to be used only with ParquetFileWriter::AppendRowGroup
virtual ColumnWriter* NextColumn() = 0;
// to be used only with ParquetFileWriter::AppendBufferedRowGroup
virtual ColumnWriter* column(int i) = 0;
virtual int current_column() const = 0;
virtual void Close() = 0;
// total bytes written by the page writer
virtual int64_t total_bytes_written() const = 0;
// total bytes still compressed but not written
virtual int64_t total_compressed_bytes() const = 0;
};
explicit RowGroupWriter(std::unique_ptr<Contents> contents);
/// Construct a ColumnWriter for the indicated row group-relative column.
///
/// To be used only with ParquetFileWriter::AppendRowGroup
/// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
/// valid until the next call to NextColumn or Close. As the contents are
/// directly written to the sink, once a new column is started, the contents
/// of the previous one cannot be modified anymore.
ColumnWriter* NextColumn();
/// Index of currently written column. Equal to -1 if NextColumn()
/// has not been called yet.
int current_column();
void Close();
int num_columns() const;
/// Construct a ColumnWriter for the indicated row group column.
///
/// To be used only with ParquetFileWriter::AppendBufferedRowGroup
/// Ownership is solely within the RowGroupWriter. The ColumnWriter is
/// valid until Close. The contents are buffered in memory and written to sink
/// on Close
ColumnWriter* column(int i);
  /// Number of rows that shall be written as part of this RowGroup.
int64_t num_rows() const;
int64_t total_bytes_written() const;
int64_t total_compressed_bytes() const;
private:
// Holds a pointer to an instance of Contents implementation
std::unique_ptr<Contents> contents_;
};
PARQUET_EXPORT
void WriteFileMetaData(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);
PARQUET_EXPORT
void WriteMetaDataFile(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);
PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
ArrowOutputStream* sink,
const std::shared_ptr<Encryptor>& encryptor,
bool encrypt_footer);
PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink,
const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
bool encrypt_footer = false);
PARQUET_EXPORT
void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
::arrow::io::OutputStream* sink);
class PARQUET_EXPORT ParquetFileWriter {
public:
// Forward declare a virtual class 'Contents' to aid dependency injection and more
// easily create test fixtures
// An implementation of the Contents class is defined in the .cc file
struct Contents {
Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
std::shared_ptr<const KeyValueMetadata> key_value_metadata)
: schema_(), key_value_metadata_(std::move(key_value_metadata)) {
schema_.Init(std::move(schema));
}
virtual ~Contents() {}
// Perform any cleanup associated with the file contents
virtual void Close() = 0;
/// \note Deprecated since 1.3.0
RowGroupWriter* AppendRowGroup(int64_t num_rows);
virtual RowGroupWriter* AppendRowGroup() = 0;
virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
virtual int64_t num_rows() const = 0;
virtual int num_columns() const = 0;
virtual int num_row_groups() const = 0;
virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
return key_value_metadata_;
}
// Return const-pointer to make it clear that this object is not to be copied
const SchemaDescriptor* schema() const { return &schema_; }
SchemaDescriptor schema_;
/// This should be the only place this is stored. Everything else is a const reference
std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
std::shared_ptr<FileMetaData> file_metadata_;
};
ParquetFileWriter();
~ParquetFileWriter();
static std::unique_ptr<ParquetFileWriter> Open(
std::shared_ptr<::arrow::io::OutputStream> sink,
std::shared_ptr<schema::GroupNode> schema,
std::shared_ptr<WriterProperties> properties = default_writer_properties(),
std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
void Open(std::unique_ptr<Contents> contents);
void Close();
// Construct a RowGroupWriter for the indicated number of rows.
//
// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
// @param num_rows The number of rows that are stored in the new RowGroup
//
// \deprecated Since 1.3.0
RowGroupWriter* AppendRowGroup(int64_t num_rows);
/// Construct a RowGroupWriter with an arbitrary number of rows.
///
/// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
/// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
RowGroupWriter* AppendRowGroup();
/// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
/// Use this if you want to write a RowGroup based on a certain size
///
/// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
/// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
RowGroupWriter* AppendBufferedRowGroup();
/// Number of columns.
///
/// This number is fixed during the lifetime of the writer as it is determined via
/// the schema.
int num_columns() const;
  /// Number of rows in the RowGroups started so far.
  ///
  /// Changes when a new RowGroup is appended.
int64_t num_rows() const;
/// Number of started RowGroups.
int num_row_groups() const;
/// Configuration passed to the writer, e.g. the used Parquet format version.
const std::shared_ptr<WriterProperties>& properties() const;
/// Returns the file schema descriptor
const SchemaDescriptor* schema() const;
/// Returns a column descriptor in schema
const ColumnDescriptor* descr(int i) const;
/// Returns the file custom metadata
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
/// Returns the file metadata, only available after calling Close().
const std::shared_ptr<FileMetaData> metadata() const;
private:
// Holds a pointer to an instance of Contents implementation
std::unique_ptr<Contents> contents_;
std::shared_ptr<FileMetaData> file_metadata_;
};
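// Minimal write-path sketch (illustrative only; schema construction and
// typed-column writing are elided):
//
//   PARQUET_ASSIGN_OR_THROW(
//       auto sink, ::arrow::io::FileOutputStream::Open("/tmp/out.parquet"));
//   auto writer = ParquetFileWriter::Open(sink, schema);
//   RowGroupWriter* rg_writer = writer->AppendRowGroup();
//   ColumnWriter* column = rg_writer->NextColumn();
//   // ... cast `column` to the matching typed writer (e.g. Int64Writer)
//   // and call WriteBatch() ...
//   rg_writer->Close();
//   writer->Close();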
} // namespace parquet


@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include "parquet/types.h"
namespace parquet {
// Abstract class for hash
class Hasher {
public:
/// Compute hash for 32 bits value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(int32_t value) const = 0;
/// Compute hash for 64 bits value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(int64_t value) const = 0;
/// Compute hash for float value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(float value) const = 0;
/// Compute hash for double value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(double value) const = 0;
/// Compute hash for Int96 value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(const Int96* value) const = 0;
/// Compute hash for ByteArray value by using its plain encoding result.
///
/// @param value the value to hash.
/// @return hash result.
virtual uint64_t Hash(const ByteArray* value) const = 0;
/// Compute hash for fixed byte array value by using its plain encoding result.
///
/// @param value the value address.
  /// @param len the value length.
  /// @return hash result.
virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
virtual ~Hasher() = default;
};
} // namespace parquet


@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstdint>
#include "parquet/platform.h"
namespace parquet {
namespace internal {
/// Builds a bitmap where each set bit indicates the corresponding level is greater
/// than rhs.
uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
int16_t rhs);
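// Example (illustrative): levels = {0, 2, 1, 3} with rhs = 1 yields the
// bitmap 0b1010 in least-significant-bit-first order, since only
// levels[1] == 2 and levels[3] == 3 are greater than 1.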
struct MinMax {
int16_t min;
int16_t max;
};
MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
} // namespace internal
} // namespace parquet


@ -0,0 +1,65 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/util/bit_util.h"
#include "arrow/util/endian.h"
#include "parquet/level_comparison.h"
// Used to make sure ODR rule isn't violated.
#ifndef PARQUET_IMPL_NAMESPACE
#error "PARQUET_IMPL_NAMESPACE must be defined"
#endif
namespace parquet {
namespace internal {
namespace PARQUET_IMPL_NAMESPACE {
/// Builds a bitmap by applying predicate to the level vector provided.
///
/// \param[in] levels Rep or def level array.
/// \param[in] num_levels The number of levels to process (must be [0, 64])
/// \param[in] predicate The predicate to apply (must have the signature `bool
/// predicate(int16_t)`).
/// \returns The bitmap using least significant "bit" ordering.
///
template <typename Predicate>
inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
Predicate predicate) {
// Both clang and GCC can vectorize this automatically with SSE4/AVX2.
uint64_t mask = 0;
for (int x = 0; x < num_levels; x++) {
mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
}
return ::arrow::bit_util::ToLittleEndian(mask);
}
inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
for (int x = 0; x < num_levels; x++) {
out.min = std::min(levels[x], out.min);
out.max = std::max(levels[x], out.max);
}
return out;
}
inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
int16_t rhs) {
return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
}
} // namespace PARQUET_IMPL_NAMESPACE
} // namespace internal
} // namespace parquet


@ -0,0 +1,199 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include "arrow/util/endian.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
namespace parquet {
namespace internal {
struct PARQUET_EXPORT LevelInfo {
LevelInfo()
: null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
int32_t repeated_ancestor_definition_level)
: null_slot_usage(null_slots),
def_level(definition_level),
rep_level(repetition_level),
repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
bool operator==(const LevelInfo& b) const {
return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
rep_level == b.rep_level &&
repeated_ancestor_def_level == b.repeated_ancestor_def_level;
}
bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
// How many slots an undefined but present (i.e. null) element in
// parquet consumes when decoding to Arrow.
// "Slot" is used in the same context as the Arrow specification
// (i.e. a value holder).
  // This is only ever >1 for descendants of FixedSizeList.
int32_t null_slot_usage = 1;
// The definition level at which the value for the field
// is considered not null (definition levels greater than
// or equal to this value indicate a not-null
// value for the field). For list fields definition levels
// greater than or equal to this field indicate a present,
// possibly null, child value.
int16_t def_level = 0;
// The repetition level corresponding to this element
// or the closest repeated ancestor. Any repetition
// level less than this indicates either a new list OR
// an empty list (which is determined in conjunction
// with definition levels).
int16_t rep_level = 0;
// The definition level indicating the level at which the closest
// repeated ancestor is not empty. This is used to discriminate
// between a value less than |def_level| being null or excluded entirely.
// For instance if we have an arrow schema like:
  // list(struct(f0: int)). Then there are the following
// definition levels:
// 0 = null list
// 1 = present but empty list.
// 2 = a null value in the list
// 3 = a non null struct but null integer.
// 4 = a present integer.
// When reconstructing, the struct and integer arrays'
// repeated_ancestor_def_level would be 2. Any
// def_level < 2 indicates that there isn't a corresponding
// child value in the list.
// i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
// has the def levels [0, 1, 2, 3, 4]. The actual
// struct array is only of length 3: [not-set, set, set] and
// the int array is also of length 3: [N/A, null, 1].
//
int16_t repeated_ancestor_def_level = 0;
/// Increments levels according to the cardinality of node.
void Increment(const schema::Node& node) {
if (node.is_repeated()) {
IncrementRepeated();
return;
}
if (node.is_optional()) {
IncrementOptional();
return;
}
}
  /// Increments level for an optional node.
void IncrementOptional() { def_level++; }
/// Increments levels for the repeated node. Returns
/// the previous ancestor_list_def_level.
int16_t IncrementRepeated() {
int16_t last_repeated_ancestor = repeated_ancestor_def_level;
// Repeated fields add both a repetition and definition level. This is used
// to distinguish between an empty list and a list with an item in it.
++rep_level;
++def_level;
    // For levels >= repeated_ancestor_def_level it indicates the list was
    // non-null and had at least one element. This is important
    // for later decoding because we need to add a slot for these
    // values. For levels < current_def_level no slots are added
    // to arrays.
repeated_ancestor_def_level = def_level;
return last_repeated_ancestor;
}
friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
// This print method is to silence valgrind issues. What's printed
// is not important because all asserts happen directly on
// members.
os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
<< ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
if (levels.null_slot_usage > 1) {
os << ", null_slot_usage=" << levels.null_slot_usage;
}
os << "}";
return os;
}
};
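// Sketch of how the Increment* methods accumulate levels while walking a
// schema (illustrative only), e.g. for an optional list of optional int32:
//
//   LevelInfo info;            // {def=0, rep=0, repeated_ancestor_def=0}
//   info.IncrementOptional();  // optional list field   -> def=1
//   info.IncrementRepeated();  // repeated list element -> def=2, rep=1,
//                              //   repeated_ancestor_def=2
//   info.IncrementOptional();  // optional leaf value   -> def=3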
// Input/Output structure for reconstructed validity bitmaps.
struct PARQUET_EXPORT ValidityBitmapInputOutput {
// Input only.
// The maximum number of values_read expected (actual
// values read must be less than or equal to this value).
// If this number is exceeded methods will throw a
// ParquetException. Exceeding this limit indicates
// either a corrupt or incorrectly written file.
int64_t values_read_upper_bound = 0;
  // Output only. The number of values encountered (this is logically
  // the number of elements in the resulting Arrow array).
int64_t values_read = 0;
// Input/Output. The number of nulls encountered.
int64_t null_count = 0;
  // Output only. The validity bitmap to populate. May be null only
  // for DefRepLevelsToList (if all that is needed is list offsets).
uint8_t* valid_bits = NULLPTR;
// Input only, offset into valid_bits to start at.
int64_t valid_bits_offset = 0;
};
// Converts def_levels to validity bitmaps for non-list arrays and for structs
// that have at least one member that is not a list and has no list descendants.
// For lists, use DefRepLevelsToList; for structs where all descendants contain
// a list, use DefRepLevelsToBitmap.
void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
LevelInfo level_info,
ValidityBitmapInputOutput* output);
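// Example (illustrative, no repeated parent): def_levels = {3, 0, 2, 3} with
// level_info.def_level == 3 produces the validity bits {1, 0, 0, 1}
// (values_read = 4 and null_count increased by 2).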
// Reconstructs a validity bitmap and list offsets for list arrays based on
// def/rep levels. The first element of offsets will not be modified if rep_levels
// starts with a new list. The first element of offsets will be used when calculating
// the next offset. See documentation on DefLevelsToBitmap for when to use this
// method vs the other ones in this file for reconstruction.
//
// Offsets must be sized to 1 + values_read_upper_bound.
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
const int16_t* rep_levels, int64_t num_def_levels,
LevelInfo level_info,
ValidityBitmapInputOutput* output,
int32_t* offsets);
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
const int16_t* rep_levels, int64_t num_def_levels,
LevelInfo level_info,
ValidityBitmapInputOutput* output,
int64_t* offsets);
// Reconstructs a validity bitmap for a struct whose every member is a list or
// has a list descendant. See the documentation on DefLevelsToBitmap for more
// details on this method compared to the other ones defined above.
void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
const int16_t* rep_levels,
int64_t num_def_levels, LevelInfo level_info,
ValidityBitmapInputOutput* output);
// This is exposed to ensure we can properly test a software simulated pext function
// (i.e. it isn't hidden by runtime dispatch).
uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
} // namespace internal
} // namespace parquet


@ -0,0 +1,357 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "parquet/level_conversion.h"
#include <algorithm>
#include <cstdint>
#include <limits>
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_writer.h"
#include "arrow/util/logging.h"
#include "arrow/util/simd.h"
#include "parquet/exception.h"
#include "parquet/level_comparison.h"
namespace parquet {
namespace internal {
#ifndef PARQUET_IMPL_NAMESPACE
#error "PARQUET_IMPL_NAMESPACE must be defined"
#endif
namespace PARQUET_IMPL_NAMESPACE {
// clang-format off
/* Python code to generate lookup table:
kLookupBits = 5
count = 0
print('constexpr int kLookupBits = {};'.format(kLookupBits))
print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
print(' ', end = '')
for mask in range(1 << kLookupBits):
for data in range(1 << kLookupBits):
bit_value = 0
bit_len = 0
for i in range(kLookupBits):
if mask & (1 << i):
bit_value |= (((data >> i) & 1) << bit_len)
bit_len += 1
out = '0x{:02X},'.format(bit_value)
count += 1
if count % (1 << kLookupBits) == 1:
print(' {')
if count % 8 == 1:
print(' ', end = '')
if count % 8 == 0:
print(out, end = '\n')
else:
print(out, end = ' ')
if count % (1 << kLookupBits) == 0:
print(' },', end = '')
print('\n};')
*/
// clang-format on
constexpr int kLookupBits = 5;
constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
},
{
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
},
{
0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
},
{
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
},
{
0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
},
{
0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
},
{
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
},
{
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
},
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
},
{
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
},
{
0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
},
{
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
},
{
0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
},
{
0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
},
{
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
},
{
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
},
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
},
{
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
},
{
0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
},
{
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
},
{
0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
},
{
0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
},
{
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
},
{
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
},
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
},
{
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
},
{
0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
},
{
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
},
{
0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
},
{
0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
},
{
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
},
{
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
},
};
inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
// A software emulation of _pext_u64
// These checks should be inline and are likely to be common cases.
if (select_bitmap == ~uint64_t{0}) {
return bitmap;
} else if (select_bitmap == 0) {
return 0;
}
// Fallback to lookup table method
uint64_t bit_value = 0;
int bit_len = 0;
constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
while (select_bitmap != 0) {
const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
bit_value |= (value << bit_len);
bit_len += mask_len;
bitmap >>= kLookupBits;
select_bitmap >>= kLookupBits;
}
return bit_value;
}
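// Example (illustrative): bitmap = 0b1011 with select_bitmap = 0b0110 selects
// bits 1 and 2 of bitmap (values 1 and 0) and packs them into the low bits of
// the result, giving 0b01, matching the _pext_u64 contract.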
#ifdef ARROW_HAVE_BMI2
// Use _pext_u64 on 64-bit builds, _pext_u32 on 32-bit builds.
#if UINTPTR_MAX == 0xFFFFFFFF
using extract_bitmap_t = uint32_t;
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
extract_bitmap_t select_bitmap) {
return _pext_u32(bitmap, select_bitmap);
}
#else
using extract_bitmap_t = uint64_t;
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
extract_bitmap_t select_bitmap) {
return _pext_u64(bitmap, select_bitmap);
}
#endif
#else // !defined(ARROW_HAVE_BMI2)
// Use 64-bit pext emulation when BMI2 isn't available.
using extract_bitmap_t = uint64_t;
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
extract_bitmap_t select_bitmap) {
return ExtractBitsSoftware(bitmap, select_bitmap);
}
#endif
static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
template <bool has_repeated_parent>
int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
int64_t upper_bound_remaining, LevelInfo level_info,
::arrow::internal::FirstTimeBitmapWriter* writer) {
DCHECK_LE(batch_size, kExtractBitsSize);
// Greater than level_info.def_level - 1 implies >= the def_level
auto defined_bitmap = static_cast<extract_bitmap_t>(
internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));
if (has_repeated_parent) {
// Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
// repeated_ancestor_def_level
auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
int64_t selected_count = ::arrow::bit_util::PopCount(present_bitmap);
if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
throw ParquetException("Values read exceeded upper bound");
}
writer->AppendWord(selected_bits, selected_count);
return ::arrow::bit_util::PopCount(selected_bits);
} else {
if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
std::stringstream ss;
ss << "Values read exceeded upper bound";
throw ParquetException(ss.str());
}
writer->AppendWord(defined_bitmap, batch_size);
return ::arrow::bit_util::PopCount(defined_bitmap);
}
}
template <bool has_repeated_parent>
void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
LevelInfo level_info, ValidityBitmapInputOutput* output) {
::arrow::internal::FirstTimeBitmapWriter writer(
output->valid_bits,
/*start_offset=*/output->valid_bits_offset,
/*length=*/output->values_read_upper_bound);
int64_t set_count = 0;
output->values_read = 0;
int64_t values_read_remaining = output->values_read_upper_bound;
while (num_def_levels > kExtractBitsSize) {
set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
def_levels += kExtractBitsSize;
num_def_levels -= kExtractBitsSize;
values_read_remaining = output->values_read_upper_bound - writer.position();
}
set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
def_levels, num_def_levels, values_read_remaining, level_info, &writer);
output->values_read = writer.position();
output->null_count += output->values_read - set_count;
writer.Finish();
}
} // namespace PARQUET_IMPL_NAMESPACE
} // namespace internal
} // namespace parquet


@ -0,0 +1,489 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "parquet/platform.h"
#include "parquet/properties.h"
#include "parquet/schema.h"
#include "parquet/types.h"
namespace parquet {
class ColumnDescriptor;
class EncodedStatistics;
class Statistics;
class SchemaDescriptor;
class FileCryptoMetaData;
class InternalFileDecryptor;
class Decryptor;
class Encryptor;
class FooterSigningEncryptor;
namespace schema {
class ColumnPath;
} // namespace schema
using KeyValueMetadata = ::arrow::KeyValueMetadata;
class PARQUET_EXPORT ApplicationVersion {
public:
// Known Versions with Issues
static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
// Application that wrote the file. e.g. "IMPALA"
std::string application_;
// Build name
std::string build_;
// Version of the application that wrote the file, expressed as
// (<major>.<minor>.<patch>). Unmatched parts default to 0.
// "1.2.3" => {1, 2, 3}
// "1.2" => {1, 2, 0}
// "1.2-cdh5" => {1, 2, 0}
struct {
int major;
int minor;
int patch;
std::string unknown;
std::string pre_release;
std::string build_info;
} version;
ApplicationVersion() = default;
explicit ApplicationVersion(const std::string& created_by);
ApplicationVersion(std::string application, int major, int minor, int patch);
// Returns true if version is strictly less than other_version
bool VersionLt(const ApplicationVersion& other_version) const;
// Returns true if version is strictly equal with other_version
bool VersionEq(const ApplicationVersion& other_version) const;
// Checks if the Version has the correct statistics for a given column
bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
SortOrder::type sort_order = SortOrder::SIGNED) const;
};
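// Usage sketch (illustrative only; the created_by string follows the common
// "<application> version <version> (build <hash>)" convention):
//
//   ApplicationVersion v("parquet-cpp version 1.5.0");
//   bool older = v.VersionLt(ApplicationVersion("parquet-cpp", 1, 6, 0));
//   // older == true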
class PARQUET_EXPORT ColumnCryptoMetaData {
public:
static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
~ColumnCryptoMetaData();
bool Equals(const ColumnCryptoMetaData& other) const;
std::shared_ptr<schema::ColumnPath> path_in_schema() const;
bool encrypted_with_footer_key() const;
const std::string& key_metadata() const;
private:
explicit ColumnCryptoMetaData(const uint8_t* metadata);
class ColumnCryptoMetaDataImpl;
std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
};
/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
struct PageEncodingStats {
PageType::type page_type;
Encoding::type encoding;
int32_t count;
};
/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
class PARQUET_EXPORT ColumnChunkMetaData {
public:
// API convenience to get a MetaData accessor
static std::unique_ptr<ColumnChunkMetaData> Make(
const void* metadata, const ColumnDescriptor* descr,
const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
int16_t column_ordinal = -1,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
~ColumnChunkMetaData();
bool Equals(const ColumnChunkMetaData& other) const;
// column chunk
int64_t file_offset() const;
// parameter is only used when a dataset is spread across multiple files
const std::string& file_path() const;
// column metadata
bool is_metadata_set() const;
Type::type type() const;
int64_t num_values() const;
std::shared_ptr<schema::ColumnPath> path_in_schema() const;
bool is_stats_set() const;
std::shared_ptr<Statistics> statistics() const;
Compression::type compression() const;
// Indicate if the ColumnChunk compression is supported by the current
// compiled parquet library.
bool can_decompress() const;
const std::vector<Encoding::type>& encodings() const;
const std::vector<PageEncodingStats>& encoding_stats() const;
bool has_dictionary_page() const;
int64_t dictionary_page_offset() const;
int64_t data_page_offset() const;
bool has_index_page() const;
int64_t index_page_offset() const;
int64_t total_compressed_size() const;
int64_t total_uncompressed_size() const;
std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
private:
explicit ColumnChunkMetaData(
const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
// PIMPL Idiom
class ColumnChunkMetaDataImpl;
std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
};
/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
class PARQUET_EXPORT RowGroupMetaData {
public:
/// \brief Create a RowGroupMetaData from a serialized thrift message.
static std::unique_ptr<RowGroupMetaData> Make(
const void* metadata, const SchemaDescriptor* schema,
const ApplicationVersion* writer_version = NULLPTR,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
~RowGroupMetaData();
bool Equals(const RowGroupMetaData& other) const;
/// \brief The number of columns in this row group. The order must match the
/// parent's column ordering.
int num_columns() const;
/// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
///
  /// WARNING, the returned object references a memory location in its parent
/// (RowGroupMetaData) object. Hence, the parent must outlive the returned
/// object.
///
/// \param[in] index of the ColumnChunkMetaData to retrieve.
///
/// \throws ParquetException if the index is out of bound.
std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
/// \brief Number of rows in this row group.
int64_t num_rows() const;
/// \brief Total byte size of all the uncompressed column data in this row group.
int64_t total_byte_size() const;
/// \brief Total byte size of all the compressed (and potentially encrypted)
/// column data in this row group.
///
/// This information is optional and may be 0 if omitted.
int64_t total_compressed_size() const;
/// \brief Byte offset from beginning of file to first page (data or
/// dictionary) in this row group
///
/// The file_offset field that this method exposes is optional. This method
/// will return 0 if that field is not set to a meaningful value.
int64_t file_offset() const;
// Return const-pointer to make it clear that this object is not to be copied
const SchemaDescriptor* schema() const;
// Indicate if all of the RowGroup's ColumnChunks can be decompressed.
bool can_decompress() const;
private:
explicit RowGroupMetaData(
const void* metadata, const SchemaDescriptor* schema,
const ApplicationVersion* writer_version = NULLPTR,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
// PIMPL Idiom
class RowGroupMetaDataImpl;
std::unique_ptr<RowGroupMetaDataImpl> impl_;
};
class FileMetaDataBuilder;
/// \brief FileMetaData is a proxy around format::FileMetaData.
class PARQUET_EXPORT FileMetaData {
public:
/// \brief Create a FileMetaData from a serialized thrift message.
static std::shared_ptr<FileMetaData> Make(
const void* serialized_metadata, uint32_t* inout_metadata_len,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
~FileMetaData();
bool Equals(const FileMetaData& other) const;
/// \brief The number of top-level columns in the schema.
///
/// Parquet thrift definition requires that nested schema elements are
/// flattened. This method returns the number of columns in the un-flattened
/// version.
int num_columns() const;
/// \brief The number of flattened schema elements.
///
/// Parquet thrift definition requires that nested schema elements are
/// flattened. This method returns the total number of elements in the
/// flattened list.
int num_schema_elements() const;
/// \brief The total number of rows.
int64_t num_rows() const;
/// \brief The number of row groups in the file.
int num_row_groups() const;
/// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
///
  /// WARNING, the returned object references a memory location in its parent
/// (FileMetaData) object. Hence, the parent must outlive the returned object.
///
/// \param[in] index of the RowGroup to retrieve.
///
/// \throws ParquetException if the index is out of bound.
std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
/// \brief Return the "version" of the file
///
/// WARNING: The value returned by this method is unreliable as 1) the Parquet
/// file metadata stores the version as a single integer and 2) some producers
/// are known to always write a hardcoded value. Therefore, you cannot use
/// this value to know which features are used in the file.
ParquetVersion::type version() const;
/// \brief Return the application's user-agent string of the writer.
const std::string& created_by() const;
/// \brief Return the application's version of the writer.
const ApplicationVersion& writer_version() const;
/// \brief Size of the original thrift encoded metadata footer.
uint32_t size() const;
/// \brief Indicate if all of the FileMetadata's RowGroups can be decompressed.
///
/// This will return false if any of the RowGroup's page is compressed with a
/// compression format which is not compiled in the current parquet library.
bool can_decompress() const;
bool is_encryption_algorithm_set() const;
EncryptionAlgorithm encryption_algorithm() const;
const std::string& footer_signing_key_metadata() const;
/// \brief Verify signature of FileMetaData when file is encrypted but footer
/// is not encrypted (plaintext footer).
bool VerifySignature(const void* signature);
void WriteTo(::arrow::io::OutputStream* dst,
const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
/// \brief Return Thrift-serialized representation of the metadata as a
/// string
std::string SerializeToString() const;
// Return const-pointer to make it clear that this object is not to be copied
const SchemaDescriptor* schema() const;
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
  /// \brief Set the file path for all ColumnChunks in all RowGroups.
  ///
  /// Commonly used by systems (Dask, Spark) that generate a metadata-only
  /// parquet file. The path is usually relative to that index file.
///
/// \param[in] path to set.
void set_file_path(const std::string& path);
/// \brief Merge row groups from another metadata file into this one.
///
/// The schema of the input FileMetaData must be equal to the
/// schema of this object.
///
/// This is used by systems that create an aggregate metadata-only file by
/// concatenating the row groups of multiple files. The newly created
/// metadata file acts as an index of all available row groups.
///
/// \param[in] other FileMetaData to merge the row groups from.
///
/// \throws ParquetException if schemas are not equal.
void AppendRowGroups(const FileMetaData& other);
/// \brief Return a FileMetaData containing a subset of the row groups in this
/// FileMetaData.
std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
private:
friend FileMetaDataBuilder;
friend class SerializedFile;
explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
// PIMPL Idiom
FileMetaData();
class FileMetaDataImpl;
std::unique_ptr<FileMetaDataImpl> impl_;
};
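// A minimal usage sketch (hypothetical buffer/variable names): deserializing a
// Thrift footer into a FileMetaData and walking its row groups. Note that each
// RowGroupMetaData must not outlive its parent FileMetaData.
//
//   const void* footer = /* serialized Thrift footer bytes */;
//   uint32_t len = footer_len;  // in: buffer size, out: bytes consumed
//   std::shared_ptr<FileMetaData> md = FileMetaData::Make(footer, &len);
//   for (int i = 0; i < md->num_row_groups(); ++i) {
//     std::unique_ptr<RowGroupMetaData> rg = md->RowGroup(i);
//     // inspect rg while md is still alive
//   }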
class PARQUET_EXPORT FileCryptoMetaData {
public:
// API convenience to get a MetaData accessor
static std::shared_ptr<FileCryptoMetaData> Make(const uint8_t* serialized_metadata,
uint32_t* metadata_len);
~FileCryptoMetaData();
EncryptionAlgorithm encryption_algorithm() const;
const std::string& key_metadata() const;
void WriteTo(::arrow::io::OutputStream* dst) const;
private:
friend FileMetaDataBuilder;
FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
// PIMPL Idiom
FileCryptoMetaData();
class FileCryptoMetaDataImpl;
std::unique_ptr<FileCryptoMetaDataImpl> impl_;
};
// Builder API
class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
public:
// API convenience to get a MetaData builder
static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
void* contents);
~ColumnChunkMetaDataBuilder();
// column chunk
// Used when a dataset is spread across multiple files
void set_file_path(const std::string& path);
// column metadata
void SetStatistics(const EncodedStatistics& stats);
// get the column descriptor
const ColumnDescriptor* descr() const;
int64_t total_compressed_size() const;
// commit the metadata
void Finish(int64_t num_values, int64_t dictionary_page_offset,
int64_t index_page_offset, int64_t data_page_offset,
int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
bool dictionary_fallback,
const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
const std::map<Encoding::type, int32_t>& data_encoding_stats_,
const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
// The metadata contents, suitable for passing to ColumnChunkMetaData::Make
const void* contents() const;
// For writing metadata at end of column chunk
void WriteTo(::arrow::io::OutputStream* sink);
private:
explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
const ColumnDescriptor* column);
explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
const ColumnDescriptor* column, void* contents);
// PIMPL Idiom
class ColumnChunkMetaDataBuilderImpl;
std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
};
class PARQUET_EXPORT RowGroupMetaDataBuilder {
public:
// API convenience to get a MetaData builder
static std::unique_ptr<RowGroupMetaDataBuilder> Make(
std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
void* contents);
~RowGroupMetaDataBuilder();
ColumnChunkMetaDataBuilder* NextColumnChunk();
int num_columns();
int64_t num_rows();
int current_column() const;
void set_num_rows(int64_t num_rows);
// commit the metadata
void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
private:
explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
const SchemaDescriptor* schema_, void* contents);
// PIMPL Idiom
class RowGroupMetaDataBuilderImpl;
std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
};
class PARQUET_EXPORT FileMetaDataBuilder {
public:
// API convenience to get a MetaData builder
static std::unique_ptr<FileMetaDataBuilder> Make(
const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
~FileMetaDataBuilder();
// The prior RowGroupMetaDataBuilder (if any) is destroyed
RowGroupMetaDataBuilder* AppendRowGroup();
// Complete the Thrift structure
std::unique_ptr<FileMetaData> Finish();
// crypto metadata
std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
private:
explicit FileMetaDataBuilder(
const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
// PIMPL Idiom
class FileMetaDataBuilderImpl;
std::unique_ptr<FileMetaDataBuilderImpl> impl_;
};
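// A hedged sketch of the builder flow (the schema and props variables are
// assumed to exist; per-column-chunk details are elided): append a row group,
// record its row count, then finalize the Thrift structure.
//
//   auto builder = FileMetaDataBuilder::Make(schema, props);
//   RowGroupMetaDataBuilder* rg = builder->AppendRowGroup();
//   rg->set_num_rows(num_rows);
//   // ... finish each ColumnChunkMetaDataBuilder via rg->NextColumnChunk() ...
//   rg->Finish(total_bytes_written);
//   std::unique_ptr<FileMetaData> metadata = builder->Finish();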
PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
} // namespace parquet

View File

@ -0,0 +1,54 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
#pragma once
#include <cstdint>
#include "parquet/hasher.h"
#include "parquet/platform.h"
#include "parquet/types.h"
namespace parquet {
/// Source:
/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class)
class PARQUET_EXPORT MurmurHash3 : public Hasher {
public:
MurmurHash3() : seed_(DEFAULT_SEED) {}
uint64_t Hash(int32_t value) const override;
uint64_t Hash(int64_t value) const override;
uint64_t Hash(float value) const override;
uint64_t Hash(double value) const override;
uint64_t Hash(const Int96* value) const override;
uint64_t Hash(const ByteArray* value) const override;
uint64_t Hash(const FLBA* val, uint32_t len) const override;
private:
// Default seed for the hash, which comes from the Bloom filter in parquet-mr;
// it was generated by Java's System.nanoTime().
static constexpr int DEFAULT_SEED = 1361930890;
uint32_t seed_;
};
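// Usage is straightforward; a small sketch hashing a primitive value through
// the Hasher interface declared above:
//
//   MurmurHash3 hasher;
//   uint64_t h = hasher.Hash(static_cast<int64_t>(42));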
} // namespace parquet

View File

@ -0,0 +1,31 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef PARQUET_VERSION_H
#define PARQUET_VERSION_H
#define PARQUET_VERSION_MAJOR 8
#define PARQUET_VERSION_MINOR 0
#define PARQUET_VERSION_PATCH 0
#define PARQUET_SO_VERSION "800"
#define PARQUET_FULL_SO_VERSION "800.0.0"
// define the parquet created by version
#define CREATED_BY_VERSION "parquet-cpp-arrow version 8.0.0"
#endif // PARQUET_VERSION_H
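// These macros allow compile-time gating against the library version, e.g.
// (a sketch):
//
//   #if PARQUET_VERSION_MAJOR >= 8
//   // code paths that rely on parquet-cpp-arrow >= 8.0.0
//   #endif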

View File

@ -0,0 +1,28 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Often-used headers, for precompiling.
// If updating this header, please make sure you check compilation speed
// before checking in. Adding headers which are not used extremely often
// may incur a slowdown, since it makes the precompiled header heavier to load.
#include "parquet/encoding.h"
#include "parquet/exception.h"
#include "parquet/metadata.h"
#include "parquet/properties.h"
#include "parquet/schema.h"
#include "parquet/types.h"

View File

@ -0,0 +1,111 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/buffer.h" // IWYU pragma: export
#include "arrow/io/interfaces.h" // IWYU pragma: export
#include "arrow/status.h" // IWYU pragma: export
#include "arrow/type_fwd.h" // IWYU pragma: export
#include "arrow/util/macros.h" // IWYU pragma: export
#if defined(_WIN32) || defined(__CYGWIN__)
#if defined(_MSC_VER)
#pragma warning(push)
// Disable warning for STL types usage in DLL interface
// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
#pragma warning(disable : 4275 4251)
// Disable diamond inheritance warnings
#pragma warning(disable : 4250)
// Disable macro redefinition warnings
#pragma warning(disable : 4005)
// Disable extern before exported template warnings
#pragma warning(disable : 4910)
#else
#pragma GCC diagnostic ignored "-Wattributes"
#endif
#ifdef PARQUET_STATIC
#define PARQUET_EXPORT
#elif defined(PARQUET_EXPORTING)
#define PARQUET_EXPORT __declspec(dllexport)
#else
#define PARQUET_EXPORT __declspec(dllimport)
#endif
#define PARQUET_NO_EXPORT
#else // Not Windows
#ifndef PARQUET_EXPORT
#define PARQUET_EXPORT __attribute__((visibility("default")))
#endif
#ifndef PARQUET_NO_EXPORT
#define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif // Non-Windows
// This is a complicated topic, some reading on it:
// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
#if defined(_MSC_VER) || defined(__clang__)
#define PARQUET_TEMPLATE_CLASS_EXPORT
#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
#else
#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
#define PARQUET_TEMPLATE_EXPORT
#endif
#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN
#define PARQUET_NORETURN ARROW_NORETURN
#define PARQUET_DEPRECATED ARROW_DEPRECATED
// If ARROW_VALGRIND set when compiling unit tests, also define
// PARQUET_VALGRIND
#ifdef ARROW_VALGRIND
#define PARQUET_VALGRIND
#endif
namespace parquet {
using Buffer = ::arrow::Buffer;
using Codec = ::arrow::util::Codec;
using Compression = ::arrow::Compression;
using MemoryPool = ::arrow::MemoryPool;
using MutableBuffer = ::arrow::MutableBuffer;
using ResizableBuffer = ::arrow::ResizableBuffer;
using ArrowInputFile = ::arrow::io::RandomAccessFile;
using ArrowInputStream = ::arrow::io::InputStream;
using ArrowOutputStream = ::arrow::io::OutputStream;
constexpr int64_t kDefaultOutputStreamSize = 1024;
constexpr int16_t kNonPageOrdinal = static_cast<int16_t>(-1);
PARQUET_EXPORT
std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
PARQUET_EXPORT
std::shared_ptr<ResizableBuffer> AllocateBuffer(
::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
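// A small sketch of the two helpers above (sizes illustrative): an in-memory
// output stream backed by the default pool, and a resizable scratch buffer.
//
//   auto sink = CreateOutputStream();
//   std::shared_ptr<ResizableBuffer> scratch =
//       AllocateBuffer(::arrow::default_memory_pool(), /*size=*/1024);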
} // namespace parquet

View File

@ -0,0 +1,46 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <iosfwd>
#include <list>
#include "parquet/platform.h"
namespace parquet {
class ParquetFileReader;
class PARQUET_EXPORT ParquetFilePrinter {
private:
ParquetFileReader* fileReader;
public:
explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
~ParquetFilePrinter() {}
void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
bool print_values = false, bool format_dump = false,
bool print_key_value_metadata = false,
const char* filename = "No Name");
void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
const char* filename = "No Name");
};
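// A hedged usage sketch, assuming a reader obtained from
// ParquetFileReader::OpenFile() (declared in parquet/file_reader.h) and
// <iostream> for std::cout; an empty column list is illustrative only:
//
//   std::unique_ptr<ParquetFileReader> reader =
//       ParquetFileReader::OpenFile("example.parquet");
//   ParquetFilePrinter printer(reader.get());
//   printer.DebugPrint(std::cout, /*selected_columns=*/{}, /*print_values=*/false);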
} // namespace parquet

View File

@ -0,0 +1,837 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "arrow/io/caching.h"
#include "arrow/type.h"
#include "arrow/util/compression.h"
#include "parquet/encryption/encryption.h"
#include "parquet/exception.h"
#include "parquet/parquet_version.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
#include "parquet/type_fwd.h"
#include "parquet/types.h"
namespace parquet {
/// Controls serialization format of data pages. parquet-format v2.0.0
/// introduced a new data page metadata type, DataPageV2, and a new serialized
/// page structure (for example, encoded levels are no longer compressed). Prior to
/// the completion of PARQUET-457 in 2020, this library did not implement
/// DataPageV2 correctly, so if you use the V2 data page format, you may have
/// forward compatibility issues (older versions of the library will be unable
/// to read the files). Note that some Parquet implementations do not implement
/// DataPageV2 at all.
enum class ParquetDataPageVersion { V1, V2 };
/// Align the default buffer size to a small multiple of a page size.
constexpr int64_t kDefaultBufferSize = 4096 * 4;
class PARQUET_EXPORT ReaderProperties {
public:
explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
: pool_(pool) {}
MemoryPool* memory_pool() const { return pool_; }
std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
int64_t start, int64_t num_bytes);
/// Buffered stream reading allows the user to control the memory usage of
/// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls are
/// wrapped in a buffered reader that uses a fixed-size buffer (of size
/// `buffer_size()`) instead of reading the full extent of the ReadAt at once.
///
/// The primary purpose of this control knob is resource management, not
/// performance.
bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
void enable_buffered_stream() { buffered_stream_enabled_ = true; }
void disable_buffered_stream() { buffered_stream_enabled_ = false; }
int64_t buffer_size() const { return buffer_size_; }
void set_buffer_size(int64_t size) { buffer_size_ = size; }
void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
file_decryption_properties_ = std::move(decryption);
}
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
return file_decryption_properties_;
}
private:
MemoryPool* pool_;
int64_t buffer_size_ = kDefaultBufferSize;
bool buffered_stream_enabled_ = false;
std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
};
ReaderProperties PARQUET_EXPORT default_reader_properties();
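// A sketch of capping reader memory with a buffered stream (sizes are
// illustrative): every ReadAt then goes through a fixed 1 MiB buffer instead
// of allocating the full read extent.
//
//   ReaderProperties props;            // uses the default memory pool
//   props.enable_buffered_stream();
//   props.set_buffer_size(1 << 20);    // 1 MiB instead of the 16 KiB default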
static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
class PARQUET_EXPORT ColumnProperties {
public:
ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
Compression::type codec = DEFAULT_COMPRESSION_TYPE,
bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
: encoding_(encoding),
codec_(codec),
dictionary_enabled_(dictionary_enabled),
statistics_enabled_(statistics_enabled),
max_stats_size_(max_stats_size),
compression_level_(Codec::UseDefaultCompressionLevel()) {}
void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
void set_compression(Compression::type codec) { codec_ = codec; }
void set_dictionary_enabled(bool dictionary_enabled) {
dictionary_enabled_ = dictionary_enabled;
}
void set_statistics_enabled(bool statistics_enabled) {
statistics_enabled_ = statistics_enabled;
}
void set_max_statistics_size(size_t max_stats_size) {
max_stats_size_ = max_stats_size;
}
void set_compression_level(int compression_level) {
compression_level_ = compression_level;
}
Encoding::type encoding() const { return encoding_; }
Compression::type compression() const { return codec_; }
bool dictionary_enabled() const { return dictionary_enabled_; }
bool statistics_enabled() const { return statistics_enabled_; }
size_t max_statistics_size() const { return max_stats_size_; }
int compression_level() const { return compression_level_; }
private:
Encoding::type encoding_;
Compression::type codec_;
bool dictionary_enabled_;
bool statistics_enabled_;
size_t max_stats_size_;
int compression_level_;
};
class PARQUET_EXPORT WriterProperties {
public:
class Builder {
public:
Builder()
: pool_(::arrow::default_memory_pool()),
dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
pagesize_(kDefaultDataPageSize),
version_(ParquetVersion::PARQUET_1_0),
data_page_version_(ParquetDataPageVersion::V1),
created_by_(DEFAULT_CREATED_BY) {}
virtual ~Builder() {}
/// Specify the memory pool for the writer. Defaults to ::arrow::default_memory_pool().
Builder* memory_pool(MemoryPool* pool) {
pool_ = pool;
return this;
}
/// Enable dictionary encoding in general for all columns. Default enabled.
Builder* enable_dictionary() {
default_column_properties_.set_dictionary_enabled(true);
return this;
}
/// Disable dictionary encoding in general for all columns. Default enabled.
Builder* disable_dictionary() {
default_column_properties_.set_dictionary_enabled(false);
return this;
}
/// Enable dictionary encoding for column specified by `path`. Default enabled.
Builder* enable_dictionary(const std::string& path) {
dictionary_enabled_[path] = true;
return this;
}
/// Enable dictionary encoding for column specified by `path`. Default enabled.
Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
return this->enable_dictionary(path->ToDotString());
}
/// Disable dictionary encoding for column specified by `path`. Default enabled.
Builder* disable_dictionary(const std::string& path) {
dictionary_enabled_[path] = false;
return this;
}
/// Disable dictionary encoding for column specified by `path`. Default enabled.
Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
return this->disable_dictionary(path->ToDotString());
}
/// Specify the dictionary page size limit per row group. Default 1MB.
Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
dictionary_pagesize_limit_ = dictionary_psize_limit;
return this;
}
/// Specify the write batch size while writing batches of Arrow values into Parquet.
/// Default 1024.
Builder* write_batch_size(int64_t write_batch_size) {
write_batch_size_ = write_batch_size;
return this;
}
/// Specify the max row group length.
/// Default 64M.
Builder* max_row_group_length(int64_t max_row_group_length) {
max_row_group_length_ = max_row_group_length;
return this;
}
/// Specify the data page size.
/// Default 1MB.
Builder* data_pagesize(int64_t pg_size) {
pagesize_ = pg_size;
return this;
}
/// Specify the data page version.
/// Default V1.
Builder* data_page_version(ParquetDataPageVersion data_page_version) {
data_page_version_ = data_page_version;
return this;
}
/// Specify the Parquet file version.
/// Default PARQUET_1_0.
Builder* version(ParquetVersion::type version) {
version_ = version;
return this;
}
Builder* created_by(const std::string& created_by) {
created_by_ = created_by;
return this;
}
/// \brief Define the encoding that is used when we don't utilise dictionary encoding.
///
/// This applies either when dictionary encoding is disabled or when we fall
/// back because the dictionary grew too large.
Builder* encoding(Encoding::type encoding_type) {
if (encoding_type == Encoding::PLAIN_DICTIONARY ||
encoding_type == Encoding::RLE_DICTIONARY) {
throw ParquetException("Can't use dictionary encoding as fallback encoding");
}
default_column_properties_.set_encoding(encoding_type);
return this;
}
/// \brief Define the encoding that is used when we don't utilise dictionary encoding.
///
/// This applies either when dictionary encoding is disabled or when we fall
/// back because the dictionary grew too large.
Builder* encoding(const std::string& path, Encoding::type encoding_type) {
if (encoding_type == Encoding::PLAIN_DICTIONARY ||
encoding_type == Encoding::RLE_DICTIONARY) {
throw ParquetException("Can't use dictionary encoding as fallback encoding");
}
encodings_[path] = encoding_type;
return this;
}
/// \brief Define the encoding that is used when we don't utilise dictionary encoding.
///
/// This applies either when dictionary encoding is disabled or when we fall
/// back because the dictionary grew too large.
Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
Encoding::type encoding_type) {
return this->encoding(path->ToDotString(), encoding_type);
}
/// Specify compression codec in general for all columns.
/// Default UNCOMPRESSED.
Builder* compression(Compression::type codec) {
default_column_properties_.set_compression(codec);
return this;
}
/// Specify max statistics size to store min max value.
/// Default 4KB.
Builder* max_statistics_size(size_t max_stats_sz) {
default_column_properties_.set_max_statistics_size(max_stats_sz);
return this;
}
/// Specify compression codec for the column specified by `path`.
/// Default UNCOMPRESSED.
Builder* compression(const std::string& path, Compression::type codec) {
codecs_[path] = codec;
return this;
}
/// Specify compression codec for the column specified by `path`.
/// Default UNCOMPRESSED.
Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
Compression::type codec) {
return this->compression(path->ToDotString(), codec);
}
/// \brief Specify the default compression level for the compressor in
/// every column. In case a column does not have an explicitly specified
/// compression level, the default one would be used.
///
/// The provided compression level is compressor specific. The user would
/// have to familiarize oneself with the available levels for the selected
/// compressor. If the compressor does not allow for selecting different
/// compression levels, calling this function would not have any effect.
/// Parquet and Arrow do not validate the passed compression level. If no
/// level is selected by the user or if the special
/// std::numeric_limits<int>::min() value is passed, then Arrow selects the
/// compression level.
Builder* compression_level(int compression_level) {
default_column_properties_.set_compression_level(compression_level);
return this;
}
/// \brief Specify a compression level for the compressor for the column
/// described by path.
///
/// The provided compression level is compressor specific. The user would
/// have to familiarize oneself with the available levels for the selected
/// compressor. If the compressor does not allow for selecting different
/// compression levels, calling this function would not have any effect.
/// Parquet and Arrow do not validate the passed compression level. If no
/// level is selected by the user or if the special
/// std::numeric_limits<int>::min() value is passed, then Arrow selects the
/// compression level.
Builder* compression_level(const std::string& path, int compression_level) {
codecs_compression_level_[path] = compression_level;
return this;
}
/// \brief Specify a compression level for the compressor for the column
/// described by path.
///
/// The provided compression level is compressor specific. The user would
/// have to familiarize oneself with the available levels for the selected
/// compressor. If the compressor does not allow for selecting different
/// compression levels, calling this function would not have any effect.
/// Parquet and Arrow do not validate the passed compression level. If no
/// level is selected by the user or if the special
/// std::numeric_limits<int>::min() value is passed, then Arrow selects the
/// compression level.
Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
int compression_level) {
return this->compression_level(path->ToDotString(), compression_level);
}
/// Define the file encryption properties.
/// Default NULL.
Builder* encryption(
std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
file_encryption_properties_ = std::move(file_encryption_properties);
return this;
}
/// Enable statistics in general.
/// Default enabled.
Builder* enable_statistics() {
default_column_properties_.set_statistics_enabled(true);
return this;
}
/// Disable statistics in general.
/// Default enabled.
Builder* disable_statistics() {
default_column_properties_.set_statistics_enabled(false);
return this;
}
/// Enable statistics for the column specified by `path`.
/// Default enabled.
Builder* enable_statistics(const std::string& path) {
statistics_enabled_[path] = true;
return this;
}
/// Enable statistics for the column specified by `path`.
/// Default enabled.
Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
return this->enable_statistics(path->ToDotString());
}
/// Disable statistics for the column specified by `path`.
/// Default enabled.
Builder* disable_statistics(const std::string& path) {
statistics_enabled_[path] = false;
return this;
}
/// Disable statistics for the column specified by `path`.
/// Default enabled.
Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
return this->disable_statistics(path->ToDotString());
}
/// \brief Build the WriterProperties with the builder parameters.
/// \return The WriterProperties defined by the builder.
std::shared_ptr<WriterProperties> build() {
std::unordered_map<std::string, ColumnProperties> column_properties;
auto get = [&](const std::string& key) -> ColumnProperties& {
auto it = column_properties.find(key);
if (it == column_properties.end())
return column_properties[key] = default_column_properties_;
else
return it->second;
};
for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
for (const auto& item : codecs_) get(item.first).set_compression(item.second);
for (const auto& item : codecs_compression_level_)
get(item.first).set_compression_level(item.second);
for (const auto& item : dictionary_enabled_)
get(item.first).set_dictionary_enabled(item.second);
for (const auto& item : statistics_enabled_)
get(item.first).set_statistics_enabled(item.second);
return std::shared_ptr<WriterProperties>(new WriterProperties(
pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
pagesize_, version_, created_by_, std::move(file_encryption_properties_),
default_column_properties_, column_properties, data_page_version_));
}
private:
MemoryPool* pool_;
int64_t dictionary_pagesize_limit_;
int64_t write_batch_size_;
int64_t max_row_group_length_;
int64_t pagesize_;
ParquetVersion::type version_;
ParquetDataPageVersion data_page_version_;
std::string created_by_;
std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
// Settings used for each column unless overridden in any of the maps below
ColumnProperties default_column_properties_;
std::unordered_map<std::string, Encoding::type> encodings_;
std::unordered_map<std::string, Compression::type> codecs_;
std::unordered_map<std::string, int32_t> codecs_compression_level_;
std::unordered_map<std::string, bool> dictionary_enabled_;
std::unordered_map<std::string, bool> statistics_enabled_;
};
inline MemoryPool* memory_pool() const { return pool_; }
inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
inline int64_t write_batch_size() const { return write_batch_size_; }
inline int64_t max_row_group_length() const { return max_row_group_length_; }
inline int64_t data_pagesize() const { return pagesize_; }
inline ParquetDataPageVersion data_page_version() const {
return parquet_data_page_version_;
}
inline ParquetVersion::type version() const { return parquet_version_; }
inline std::string created_by() const { return parquet_created_by_; }
inline Encoding::type dictionary_index_encoding() const {
if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
return Encoding::PLAIN_DICTIONARY;
} else {
return Encoding::RLE_DICTIONARY;
}
}
inline Encoding::type dictionary_page_encoding() const {
if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
return Encoding::PLAIN_DICTIONARY;
} else {
return Encoding::PLAIN;
}
}
const ColumnProperties& column_properties(
const std::shared_ptr<schema::ColumnPath>& path) const {
auto it = column_properties_.find(path->ToDotString());
if (it != column_properties_.end()) return it->second;
return default_column_properties_;
}
Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
return column_properties(path).encoding();
}
Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
return column_properties(path).compression();
}
int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
return column_properties(path).compression_level();
}
bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
return column_properties(path).dictionary_enabled();
}
bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
return column_properties(path).statistics_enabled();
}
size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
return column_properties(path).max_statistics_size();
}
inline FileEncryptionProperties* file_encryption_properties() const {
return file_encryption_properties_.get();
}
std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
const std::string& path) const {
if (file_encryption_properties_) {
return file_encryption_properties_->column_encryption_properties(path);
} else {
return NULLPTR;
}
}
private:
explicit WriterProperties(
MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
const std::string& created_by,
std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
const ColumnProperties& default_column_properties,
const std::unordered_map<std::string, ColumnProperties>& column_properties,
ParquetDataPageVersion data_page_version)
: pool_(pool),
dictionary_pagesize_limit_(dictionary_pagesize_limit),
write_batch_size_(write_batch_size),
max_row_group_length_(max_row_group_length),
pagesize_(pagesize),
parquet_data_page_version_(data_page_version),
parquet_version_(version),
parquet_created_by_(created_by),
file_encryption_properties_(file_encryption_properties),
default_column_properties_(default_column_properties),
column_properties_(column_properties) {}
MemoryPool* pool_;
int64_t dictionary_pagesize_limit_;
int64_t write_batch_size_;
int64_t max_row_group_length_;
int64_t pagesize_;
ParquetDataPageVersion parquet_data_page_version_;
ParquetVersion::type parquet_version_;
std::string parquet_created_by_;
std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
ColumnProperties default_column_properties_;
std::unordered_map<std::string, ColumnProperties> column_properties_;
};
PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
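// A hedged sketch of configuring a writer via the Builder (note the builder
// methods return Builder*, so chaining uses ->; codec availability depends on
// how the library was built):
//
//   WriterProperties::Builder builder;
//   builder.compression(Compression::SNAPPY)
//       ->enable_dictionary()
//       ->max_row_group_length(1 << 20);
//   std::shared_ptr<WriterProperties> props = builder.build();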
// ----------------------------------------------------------------------
// Properties specific to Apache Arrow columnar read and write
static constexpr bool kArrowDefaultUseThreads = false;
// Default number of rows to read when using ::arrow::RecordBatchReader
static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
/// EXPERIMENTAL: Properties for configuring FileReader behavior.
class PARQUET_EXPORT ArrowReaderProperties {
public:
explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
: use_threads_(use_threads),
read_dict_indices_(),
batch_size_(kArrowDefaultBatchSize),
pre_buffer_(false),
cache_options_(::arrow::io::CacheOptions::Defaults()),
coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}
void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
bool use_threads() const { return use_threads_; }
void set_read_dictionary(int column_index, bool read_dict) {
if (read_dict) {
read_dict_indices_.insert(column_index);
} else {
read_dict_indices_.erase(column_index);
}
}
bool read_dictionary(int column_index) const {
if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) {
return true;
} else {
return false;
}
}
void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
int64_t batch_size() const { return batch_size_; }
/// Enable read coalescing.
///
/// When enabled, the Arrow reader will pre-buffer necessary regions
/// of the file in-memory. This is intended to improve performance on
/// high-latency filesystems (e.g. Amazon S3).
void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
bool pre_buffer() const { return pre_buffer_; }
/// Set options for read coalescing. This can be used to tune the
/// implementation for characteristics of different filesystems.
void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
/// Set execution context for read coalescing.
void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
const ::arrow::io::IOContext& io_context() const { return io_context_; }
/// Set timestamp unit to use for deprecated INT96-encoded timestamps
/// (default is NANO).
void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
coerce_int96_timestamp_unit_ = unit;
}
::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
return coerce_int96_timestamp_unit_;
}
private:
bool use_threads_;
std::unordered_set<int> read_dict_indices_;
int64_t batch_size_;
bool pre_buffer_;
::arrow::io::IOContext io_context_;
::arrow::io::CacheOptions cache_options_;
::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
};
/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
PARQUET_EXPORT
ArrowReaderProperties default_arrow_reader_properties();
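// A small sketch (values illustrative): enabling parallel column decoding,
// smaller record batches, and read coalescing for high-latency filesystems.
//
//   ArrowReaderProperties props(/*use_threads=*/true);
//   props.set_batch_size(32 * 1024);
//   props.set_pre_buffer(true);  // pre-buffer file regions, e.g. for S3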
class PARQUET_EXPORT ArrowWriterProperties {
public:
enum EngineVersion {
V1, // Supports only nested lists.
V2 // Full support for all nesting combinations
};
class Builder {
public:
Builder()
: write_timestamps_as_int96_(false),
coerce_timestamps_enabled_(false),
coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
truncated_timestamps_allowed_(false),
store_schema_(false),
// TODO: At some point we should flip this.
compliant_nested_types_(false),
engine_version_(V2) {}
virtual ~Builder() = default;
Builder* disable_deprecated_int96_timestamps() {
write_timestamps_as_int96_ = false;
return this;
}
Builder* enable_deprecated_int96_timestamps() {
write_timestamps_as_int96_ = true;
return this;
}
Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
coerce_timestamps_enabled_ = true;
coerce_timestamps_unit_ = unit;
return this;
}
Builder* allow_truncated_timestamps() {
truncated_timestamps_allowed_ = true;
return this;
}
Builder* disallow_truncated_timestamps() {
truncated_timestamps_allowed_ = false;
return this;
}
/// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
/// to enable certain read options (like "read_dictionary") to be set
/// automatically
Builder* store_schema() {
store_schema_ = true;
return this;
}
Builder* enable_compliant_nested_types() {
compliant_nested_types_ = true;
return this;
}
Builder* disable_compliant_nested_types() {
compliant_nested_types_ = false;
return this;
}
Builder* set_engine_version(EngineVersion version) {
engine_version_ = version;
return this;
}
std::shared_ptr<ArrowWriterProperties> build() {
return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
engine_version_));
}
private:
bool write_timestamps_as_int96_;
bool coerce_timestamps_enabled_;
::arrow::TimeUnit::type coerce_timestamps_unit_;
bool truncated_timestamps_allowed_;
bool store_schema_;
bool compliant_nested_types_;
EngineVersion engine_version_;
};
bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
::arrow::TimeUnit::type coerce_timestamps_unit() const {
return coerce_timestamps_unit_;
}
bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
bool store_schema() const { return store_schema_; }
/// \brief Enable nested type naming according to the parquet specification.
///
/// Older versions of Arrow wrote out field names for nested lists based on the
/// name of the field. According to the Parquet specification they should
/// always be "element".
bool compliant_nested_types() const { return compliant_nested_types_; }
/// \brief The underlying engine version to use when writing Arrow data.
///
/// V2 is currently the latest; V1 is considered deprecated but is left in
/// place in case bugs are detected in V2.
EngineVersion engine_version() const { return engine_version_; }
private:
explicit ArrowWriterProperties(bool write_nanos_as_int96,
bool coerce_timestamps_enabled,
::arrow::TimeUnit::type coerce_timestamps_unit,
bool truncated_timestamps_allowed, bool store_schema,
bool compliant_nested_types,
EngineVersion engine_version)
: write_timestamps_as_int96_(write_nanos_as_int96),
coerce_timestamps_enabled_(coerce_timestamps_enabled),
coerce_timestamps_unit_(coerce_timestamps_unit),
truncated_timestamps_allowed_(truncated_timestamps_allowed),
store_schema_(store_schema),
compliant_nested_types_(compliant_nested_types),
engine_version_(engine_version) {}
const bool write_timestamps_as_int96_;
const bool coerce_timestamps_enabled_;
const ::arrow::TimeUnit::type coerce_timestamps_unit_;
const bool truncated_timestamps_allowed_;
const bool store_schema_;
const bool compliant_nested_types_;
const EngineVersion engine_version_;
};
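// A hedged sketch of the Builder (methods return Builder*, hence ->): store
// the Arrow schema for round-tripping and coerce timestamps to milliseconds,
// tolerating truncation.
//
//   ArrowWriterProperties::Builder builder;
//   builder.store_schema()
//       ->coerce_timestamps(::arrow::TimeUnit::MILLI)
//       ->allow_truncated_timestamps();
//   std::shared_ptr<ArrowWriterProperties> arrow_props = builder.build();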
/// \brief State object used for writing Arrow data directly to a Parquet
/// column chunk. API possibly not stable
struct ArrowWriteContext {
ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
: memory_pool(memory_pool),
properties(properties),
data_buffer(AllocateBuffer(memory_pool)),
def_levels_buffer(AllocateBuffer(memory_pool)) {}
template <typename T>
::arrow::Status GetScratchData(const int64_t num_values, T** out) {
ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
*out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
return ::arrow::Status::OK();
}
MemoryPool* memory_pool;
const ArrowWriterProperties* properties;
// Buffer used for storing the data of an array converted to the physical type
// as expected by parquet-cpp.
std::shared_ptr<ResizableBuffer> data_buffer;
// We use the shared ownership of this buffer
std::shared_ptr<ResizableBuffer> def_levels_buffer;
};
PARQUET_EXPORT
std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
} // namespace parquet

View File

@ -0,0 +1,492 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This module contains the logical parquet-cpp types (independent of Thrift
// structures), schema nodes, and related type tools
#pragma once
#include <cstdint>
#include <memory>
#include <ostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "parquet/platform.h"
#include "parquet/types.h"
#include "parquet/windows_fixup.h" // for OPTIONAL
namespace parquet {
class SchemaDescriptor;
namespace schema {
class Node;
// List encodings: using the terminology from Impala to define different styles
// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
// the converted type named in the Parquet metadata is ConvertedType::LIST we
// use that terminology here. It also helps distinguish from the *_ARRAY
// primitive types.
//
// One-level encoding: Only allows required lists with required cells
// repeated value_type name
//
// Two-level encoding: Enables optional lists with only required cells
// <required/optional> group list
// repeated value_type item
//
// Three-level encoding: Enables optional lists with optional cells
// <required/optional> group bag
// repeated group list
// <required/optional> value_type item
//
// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
// the non-repeated nodes set to required.
//
// The "official" encoding recommended in the Parquet spec is the 3-level, and
// we use that as the default when creating list types. For semantic completeness
// we allow the other two. Since all types of encodings will occur "in the
// wild" we need to be able to interpret the associated definition levels in
// the context of the actual encoding used in the file.
//
// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
// SchemaElement, which could make things challenging if we are trying to infer
// that a sequence of nodes semantically represents an array according to one
// of these encodings (versus a struct containing an array). We should refuse
// the temptation to guess, as they say.
struct ListEncoding {
enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
};
class PARQUET_EXPORT ColumnPath {
public:
ColumnPath() : path_() {}
explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
static std::shared_ptr<ColumnPath> FromNode(const Node& node);
std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
std::string ToDotString() const;
const std::vector<std::string>& ToDotVector() const;
protected:
std::vector<std::string> path_;
};
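// A short sketch of path manipulation (we are inside namespace parquet::schema
// here, so ColumnPath is unqualified):
//
//   std::shared_ptr<ColumnPath> path = ColumnPath::FromDotString("a.b.c");
//   std::shared_ptr<ColumnPath> leaf = path->extend("d");
//   std::string dotted = leaf->ToDotString();  // "a.b.c.d"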
// Base class for logical schema types. A type has a name, repetition level,
// and optionally a logical type (ConvertedType in Parquet metadata parlance)
class PARQUET_EXPORT Node {
public:
enum type { PRIMITIVE, GROUP };
virtual ~Node() {}
bool is_primitive() const { return type_ == Node::PRIMITIVE; }
bool is_group() const { return type_ == Node::GROUP; }
bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
bool is_required() const { return repetition_ == Repetition::REQUIRED; }
virtual bool Equals(const Node* other) const = 0;
const std::string& name() const { return name_; }
Node::type node_type() const { return type_; }
Repetition::type repetition() const { return repetition_; }
ConvertedType::type converted_type() const { return converted_type_; }
const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
/// \brief The field_id value for the serialized SchemaElement. If the
/// field_id is less than 0 (e.g. -1), it will not be set when serialized to
/// Thrift.
int field_id() const { return field_id_; }
const Node* parent() const { return parent_; }
const std::shared_ptr<ColumnPath> path() const;
virtual void ToParquet(void* element) const = 0;
// Node::Visitor abstract class for walking schemas with the visitor pattern
class Visitor {
public:
virtual ~Visitor() {}
virtual void Visit(Node* node) = 0;
};
class ConstVisitor {
public:
virtual ~ConstVisitor() {}
virtual void Visit(const Node* node) = 0;
};
virtual void Visit(Visitor* visitor) = 0;
virtual void VisitConst(ConstVisitor* visitor) const = 0;
protected:
friend class GroupNode;
Node(Node::type type, const std::string& name, Repetition::type repetition,
ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
: type_(type),
name_(name),
repetition_(repetition),
converted_type_(converted_type),
field_id_(field_id),
parent_(NULLPTR) {}
Node(Node::type type, const std::string& name, Repetition::type repetition,
std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
: type_(type),
name_(name),
repetition_(repetition),
logical_type_(std::move(logical_type)),
field_id_(field_id),
parent_(NULLPTR) {}
Node::type type_;
std::string name_;
Repetition::type repetition_;
ConvertedType::type converted_type_;
std::shared_ptr<const LogicalType> logical_type_;
int field_id_;
// Nodes should not be shared, they have a single parent.
const Node* parent_;
bool EqualsInternal(const Node* other) const;
void SetParent(const Node* p_parent);
private:
PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
};
// Save our breath all over the place with these typedefs
typedef std::shared_ptr<Node> NodePtr;
typedef std::vector<NodePtr> NodeVector;
// A type that is one of the primitive Parquet storage types. In addition to
// the other type metadata (name, repetition level, logical type), also has the
// physical storage type and their type-specific metadata (byte width, decimal
// parameters)
class PARQUET_EXPORT PrimitiveNode : public Node {
public:
static std::unique_ptr<Node> FromParquet(const void* opaque_element);
// A field_id -1 (or any negative value) will be serialized as null in Thrift
static inline NodePtr Make(const std::string& name, Repetition::type repetition,
Type::type type,
ConvertedType::type converted_type = ConvertedType::NONE,
int length = -1, int precision = -1, int scale = -1,
int field_id = -1) {
return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
precision, scale, field_id));
}
// If no logical type, pass LogicalType::None() or nullptr
// A field_id -1 (or any negative value) will be serialized as null in Thrift
static inline NodePtr Make(const std::string& name, Repetition::type repetition,
std::shared_ptr<const LogicalType> logical_type,
Type::type primitive_type, int primitive_length = -1,
int field_id = -1) {
return NodePtr(new PrimitiveNode(name, repetition, logical_type, primitive_type,
primitive_length, field_id));
}
bool Equals(const Node* other) const override;
Type::type physical_type() const { return physical_type_; }
ColumnOrder column_order() const { return column_order_; }
void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
int32_t type_length() const { return type_length_; }
const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
void ToParquet(void* element) const override;
void Visit(Visitor* visitor) override;
void VisitConst(ConstVisitor* visitor) const override;
private:
PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
int precision = -1, int scale = -1, int field_id = -1);
PrimitiveNode(const std::string& name, Repetition::type repetition,
std::shared_ptr<const LogicalType> logical_type,
Type::type primitive_type, int primitive_length = -1, int field_id = -1);
Type::type physical_type_;
int32_t type_length_;
DecimalMetadata decimal_metadata_;
ColumnOrder column_order_;
// For FIXED_LEN_BYTE_ARRAY
void SetTypeLength(int32_t length) { type_length_ = length; }
bool EqualsInternal(const PrimitiveNode* other) const;
FRIEND_TEST(TestPrimitiveNode, Attrs);
FRIEND_TEST(TestPrimitiveNode, Equals);
FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
FRIEND_TEST(TestPrimitiveNode, FromParquet);
};
class PARQUET_EXPORT GroupNode : public Node {
public:
static std::unique_ptr<Node> FromParquet(const void* opaque_element,
NodeVector fields = {});
// A field_id -1 (or any negative value) will be serialized as null in Thrift
static inline NodePtr Make(const std::string& name, Repetition::type repetition,
const NodeVector& fields,
ConvertedType::type converted_type = ConvertedType::NONE,
int field_id = -1) {
return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
}
// If no logical type, pass nullptr
// A field_id -1 (or any negative value) will be serialized as null in Thrift
static inline NodePtr Make(const std::string& name, Repetition::type repetition,
const NodeVector& fields,
std::shared_ptr<const LogicalType> logical_type,
int field_id = -1) {
return NodePtr(new GroupNode(name, repetition, fields, logical_type, field_id));
}
bool Equals(const Node* other) const override;
NodePtr field(int i) const { return fields_[i]; }
// Get the index of a field by its name, or negative value if not found.
// If several fields share the same name, it is unspecified which one
// is returned.
int FieldIndex(const std::string& name) const;
// Get the index of a field by its node, or negative value if not found.
int FieldIndex(const Node& node) const;
int field_count() const { return static_cast<int>(fields_.size()); }
void ToParquet(void* element) const override;
void Visit(Visitor* visitor) override;
void VisitConst(ConstVisitor* visitor) const override;
/// \brief Return true if this node or any child node has REPEATED repetition
/// type
bool HasRepeatedFields() const;
private:
GroupNode(const std::string& name, Repetition::type repetition,
const NodeVector& fields,
ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
GroupNode(const std::string& name, Repetition::type repetition,
const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
int field_id = -1);
NodeVector fields_;
bool EqualsInternal(const GroupNode* other) const;
// Mapping from field name to field index
std::unordered_multimap<std::string, int> field_name_to_idx_;
FRIEND_TEST(TestGroupNode, Attrs);
FRIEND_TEST(TestGroupNode, Equals);
FRIEND_TEST(TestGroupNode, FieldIndex);
FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
};
// ----------------------------------------------------------------------
// Convenience primitive type factory functions
#define PRIMITIVE_FACTORY(FuncName, TYPE) \
static inline NodePtr FuncName(const std::string& name, \
Repetition::type repetition = Repetition::OPTIONAL, \
int field_id = -1) { \
return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \
/*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
}
PRIMITIVE_FACTORY(Boolean, BOOLEAN)
PRIMITIVE_FACTORY(Int32, INT32)
PRIMITIVE_FACTORY(Int64, INT64)
PRIMITIVE_FACTORY(Int96, INT96)
PRIMITIVE_FACTORY(Float, FLOAT)
PRIMITIVE_FACTORY(Double, DOUBLE)
PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
int indent_width = 2);
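// A sketch tying the factory functions together with the three-level list
// encoding described above (names are illustrative; <iostream> assumed for
// std::cout):
//
//   NodePtr item = Int64("item", Repetition::OPTIONAL);
//   NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {item});
//   NodePtr bag = GroupNode::Make("bag", Repetition::OPTIONAL, {list},
//                                 ConvertedType::LIST);
//   NodePtr root = GroupNode::Make("schema", Repetition::REQUIRED, {bag});
//   PrintSchema(root.get(), std::cout);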
} // namespace schema
// The ColumnDescriptor encapsulates information necessary to interpret
// primitive column data in the context of a particular schema. We have to
// examine the node structure of a column's path to the root in the schema tree
// to be able to reassemble the nested structure from the repetition and
// definition levels.
class PARQUET_EXPORT ColumnDescriptor {
public:
ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
int16_t max_repetition_level,
const SchemaDescriptor* schema_descr = NULLPTR);
bool Equals(const ColumnDescriptor& other) const;
int16_t max_definition_level() const { return max_definition_level_; }
int16_t max_repetition_level() const { return max_repetition_level_; }
Type::type physical_type() const { return primitive_node_->physical_type(); }
ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
const std::shared_ptr<const LogicalType>& logical_type() const {
return primitive_node_->logical_type();
}
ColumnOrder column_order() const { return primitive_node_->column_order(); }
SortOrder::type sort_order() const {
auto la = logical_type();
auto pt = physical_type();
return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
}
const std::string& name() const { return primitive_node_->name(); }
const std::shared_ptr<schema::ColumnPath> path() const;
const schema::NodePtr& schema_node() const { return node_; }
std::string ToString() const;
int type_length() const;
int type_precision() const;
int type_scale() const;
private:
schema::NodePtr node_;
const schema::PrimitiveNode* primitive_node_;
int16_t max_definition_level_;
int16_t max_repetition_level_;
};
// Container for the converted Parquet schema with computed information from
// the schema analysis needed for file reading
//
// * Column index to Node
// * Max repetition / definition levels for each primitive node
//
// The ColumnDescriptor objects produced by this class can be used to assist in
// the reconstruction of fully materialized data structures from the
// repetition-definition level encoding of nested data
//
// TODO(wesm): this object can be recomputed from a Schema
class PARQUET_EXPORT SchemaDescriptor {
public:
SchemaDescriptor() {}
~SchemaDescriptor() {}
// Analyze the schema
void Init(std::unique_ptr<schema::Node> schema);
void Init(schema::NodePtr schema);
const ColumnDescriptor* Column(int i) const;
// Get the index of a column by its dotstring path, or negative value if not found.
// If several columns share the same dotstring path, it is unspecified which one
// is returned.
int ColumnIndex(const std::string& node_path) const;
// Get the index of a column by its node, or negative value if not found.
int ColumnIndex(const schema::Node& node) const;
bool Equals(const SchemaDescriptor& other) const;
// The number of physical columns appearing in the file
int num_columns() const { return static_cast<int>(leaves_.size()); }
const schema::NodePtr& schema_root() const { return schema_; }
const schema::GroupNode* group_node() const { return group_node_; }
// Returns the root (child of the schema root) node of the leaf (column) node
const schema::Node* GetColumnRoot(int i) const;
const std::string& name() const { return group_node_->name(); }
std::string ToString() const;
void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
/// \brief Return column index corresponding to a particular
/// PrimitiveNode. Returns -1 if not found
int GetColumnIndex(const schema::PrimitiveNode& node) const;
/// \brief Return true if any field or their children have REPEATED repetition
/// type
bool HasRepeatedFields() const;
private:
friend class ColumnDescriptor;
// Root Node
schema::NodePtr schema_;
// Root node as a group node
const schema::GroupNode* group_node_;
void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
int16_t max_rep_level, const schema::NodePtr& base);
// Result of leaf node / tree analysis
std::vector<ColumnDescriptor> leaves_;
std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
// Mapping between leaf nodes and root group of leaf (first node
// below the schema's root group)
//
// For example, the leaf `a.b.c.d` would have a link back to `a`
//
// -- a <------
// -- -- b |
// -- -- -- c |
// -- -- -- -- d
std::unordered_map<int, schema::NodePtr> leaf_to_base_;
// Mapping from ColumnPath DotString to leaf index
std::unordered_multimap<std::string, int> leaf_to_idx_;
};
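// A short usage sketch (illustrative only): Init() runs the schema
// analysis, after which per-column descriptors and dotstring lookups are
// available. Assumes <iostream> for the printing shown here;
// `ExampleDescribeSchema` is a hypothetical name.
inline void ExampleDescribeSchema(const schema::NodePtr& root) {
  SchemaDescriptor descr;
  descr.Init(root);
  for (int i = 0; i < descr.num_columns(); ++i) {
    const ColumnDescriptor* col = descr.Column(i);
    std::cout << col->path()->ToDotString() << " def="
              << col->max_definition_level() << " rep="
              << col->max_repetition_level() << std::endl;
  }
}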
} // namespace parquet

View File

@ -0,0 +1,367 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include "parquet/platform.h"
#include "parquet/types.h"
namespace arrow {
class Array;
class BinaryArray;
} // namespace arrow
namespace parquet {
class ColumnDescriptor;
// ----------------------------------------------------------------------
// Value comparator interfaces
/// \brief Base class for value comparators. Generally used with
/// TypedComparator<T>
class PARQUET_EXPORT Comparator {
public:
virtual ~Comparator() {}
/// \brief Create a comparator explicitly from physical type and
/// sort order
/// \param[in] physical_type the physical type for the typed
/// comparator
/// \param[in] sort_order either SortOrder::SIGNED or
/// SortOrder::UNSIGNED
/// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
static std::shared_ptr<Comparator> Make(Type::type physical_type,
SortOrder::type sort_order,
int type_length = -1);
/// \brief Create typed comparator inferring default sort order from
/// ColumnDescriptor
/// \param[in] descr the Parquet column schema
static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
};
/// \brief Interface for comparison of physical types according to the
/// semantics of a particular logical type.
template <typename DType>
class TypedComparator : public Comparator {
public:
using T = typename DType::c_type;
/// \brief Scalar comparison of two elements, return true if first
/// is strictly less than the second
virtual bool Compare(const T& a, const T& b) = 0;
/// \brief Compute maximum and minimum elements in a batch of
/// elements without any nulls
virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;
/// \brief Compute minimum and maximum elements from an Arrow array. Only
/// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
/// / arrow::BinaryArray
virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;
/// \brief Compute maximum and minimum elements in a batch of
/// elements with accompanying bitmap indicating which elements are
/// included (bit set) and excluded (bit not set)
///
/// \param[in] values the sequence of values
/// \param[in] length the length of the sequence
/// \param[in] valid_bits a bitmap indicating which elements are
/// included (1) or excluded (0)
/// \param[in] valid_bits_offset the bit offset into the bitmap of
/// the first element in the sequence
virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
const uint8_t* valid_bits,
int64_t valid_bits_offset) = 0;
};
/// \brief Typed version of Comparator::Make
template <typename DType>
std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
SortOrder::type sort_order,
int type_length = -1) {
return std::static_pointer_cast<TypedComparator<DType>>(
Comparator::Make(physical_type, sort_order, type_length));
}
/// \brief Typed version of Comparator::Make
template <typename DType>
std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
}
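// Sketch of the typed comparator in use (illustrative): scalar comparison
// plus a batch min/max scan under the column's sort order. `descr` is
// assumed to describe an INT32 column.
inline void ExampleComparator(const ColumnDescriptor* descr) {
  std::shared_ptr<TypedComparator<Int32Type>> cmp = MakeComparator<Int32Type>(descr);
  const int32_t values[] = {5, -1, 42, 7};
  bool less = cmp->Compare(values[0], values[1]);  // false: 5 is not < -1
  std::pair<int32_t, int32_t> min_max = cmp->GetMinMax(values, /*length=*/4);
  // With SIGNED sort order: min_max.first == -1, min_max.second == 42.
  (void)less;
  (void)min_max;
}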
// ----------------------------------------------------------------------
/// \brief Structure representing encoded statistics to be written to
/// and read from Parquet serialized metadata
class PARQUET_EXPORT EncodedStatistics {
std::shared_ptr<std::string> max_, min_;
bool is_signed_ = false;
public:
EncodedStatistics()
: max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}
const std::string& max() const { return *max_; }
const std::string& min() const { return *min_; }
int64_t null_count = 0;
int64_t distinct_count = 0;
bool has_min = false;
bool has_max = false;
bool has_null_count = false;
bool has_distinct_count = false;
// From parquet-mr
// Rather than truncating, don't write stats larger than the max size at all. The
// rationale is that some engines may use the minimum value in the page as
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
void ApplyStatSizeLimits(size_t length) {
if (max_->length() > length) {
has_max = false;
}
if (min_->length() > length) {
has_min = false;
}
}
bool is_set() const {
return has_min || has_max || has_null_count || has_distinct_count;
}
bool is_signed() const { return is_signed_; }
void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
EncodedStatistics& set_max(const std::string& value) {
*max_ = value;
has_max = true;
return *this;
}
EncodedStatistics& set_min(const std::string& value) {
*min_ = value;
has_min = true;
return *this;
}
EncodedStatistics& set_null_count(int64_t value) {
null_count = value;
has_null_count = true;
return *this;
}
EncodedStatistics& set_distinct_count(int64_t value) {
distinct_count = value;
has_distinct_count = true;
return *this;
}
};
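// Sketch of the builder-style setters above. Note that ApplyStatSizeLimits
// drops an over-long min/max entirely instead of truncating it, for the
// reason given in the comment above.
inline EncodedStatistics ExampleEncodedStats() {
  EncodedStatistics stats;
  stats.set_min("apple").set_max("zebra").set_null_count(0);
  stats.ApplyStatSizeLimits(4);  // both 5-byte values exceed 4: has_min/has_max reset
  return stats;                  // is_set() still true via has_null_count
}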
/// \brief Base type for computing column statistics while writing a file
class PARQUET_EXPORT Statistics {
public:
virtual ~Statistics() {}
/// \brief Create a new statistics instance given a column schema
/// definition
/// \param[in] descr the column schema
/// \param[in] pool a memory pool to use for any memory allocations, optional
static std::shared_ptr<Statistics> Make(
const ColumnDescriptor* descr,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
/// \brief Create a new statistics instance given a column schema
/// definition and pre-existing state
/// \param[in] descr the column schema
/// \param[in] encoded_min the encoded minimum value
/// \param[in] encoded_max the encoded maximum value
/// \param[in] num_values total number of values
/// \param[in] null_count number of null values
/// \param[in] distinct_count number of distinct values
/// \param[in] has_min_max whether the min/max statistics are set
/// \param[in] has_null_count whether the null_count statistics are set
/// \param[in] has_distinct_count whether the distinct_count statistics are set
/// \param[in] pool a memory pool to use for any memory allocations, optional
static std::shared_ptr<Statistics> Make(
const ColumnDescriptor* descr, const std::string& encoded_min,
const std::string& encoded_max, int64_t num_values, int64_t null_count,
int64_t distinct_count, bool has_min_max, bool has_null_count,
bool has_distinct_count,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
/// \brief Return true if the count of null values is set
virtual bool HasNullCount() const = 0;
/// \brief The number of null values, may not be set
virtual int64_t null_count() const = 0;
/// \brief Return true if the count of distinct values is set
virtual bool HasDistinctCount() const = 0;
/// \brief The number of distinct values, may not be set
virtual int64_t distinct_count() const = 0;
/// \brief The total number of values in the column
virtual int64_t num_values() const = 0;
/// \brief Return true if the min and max statistics are set. Obtain
/// with TypedStatistics<T>::min and max
virtual bool HasMinMax() const = 0;
/// \brief Reset state of object to initial (no data observed) state
virtual void Reset() = 0;
/// \brief Plain-encoded minimum value
virtual std::string EncodeMin() const = 0;
/// \brief Plain-encoded maximum value
virtual std::string EncodeMax() const = 0;
/// \brief The finalized encoded form of the statistics for transport
virtual EncodedStatistics Encode() = 0;
/// \brief The physical type of the column schema
virtual Type::type physical_type() const = 0;
/// \brief The full type descriptor from the column schema
virtual const ColumnDescriptor* descr() const = 0;
/// \brief Check two Statistics for equality
virtual bool Equals(const Statistics& other) const = 0;
protected:
static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
const void* max, int64_t num_values,
int64_t null_count, int64_t distinct_count);
};
/// \brief A typed implementation of Statistics
template <typename DType>
class TypedStatistics : public Statistics {
public:
using T = typename DType::c_type;
/// \brief The current minimum value
virtual const T& min() const = 0;
/// \brief The current maximum value
virtual const T& max() const = 0;
/// \brief Update state with state of another Statistics object
virtual void Merge(const TypedStatistics<DType>& other) = 0;
/// \brief Batch statistics update
virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
/// \brief Batch statistics update with supplied validity bitmap
/// \param[in] values pointer to column values
/// \param[in] valid_bits Pointer to bitmap representing if values are non-null.
/// \param[in] valid_bits_offset Offset into valid_bits where the slice of
/// data begins.
/// \param[in] num_spaced_values The length of values in values/valid_bits to inspect
/// when calculating statistics. This can be smaller than
/// num_not_null+num_null as num_null can include nulls
/// from parents while num_spaced_values does not.
/// \param[in] num_not_null Number of values that are not null.
/// \param[in] num_null Number of values that are null.
virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
int64_t valid_bits_offset, int64_t num_spaced_values,
int64_t num_not_null, int64_t num_null) = 0;
/// \brief EXPERIMENTAL: Update statistics with an Arrow array without
/// conversion to a primitive Parquet C type. Only implemented for certain
/// Parquet type / Arrow type combinations like BYTE_ARRAY /
/// arrow::BinaryArray
///
/// If update_counts is true then the null_count and num_values will be updated
/// based on the null_count of values. Set to false if these are updated
/// elsewhere (e.g. when updating a dictionary where the counts are taken from
/// the indices and not the values)
virtual void Update(const ::arrow::Array& values, bool update_counts = true) = 0;
/// \brief Set min and max values to particular values
virtual void SetMinMax(const T& min, const T& max) = 0;
/// \brief Increments the null count directly
/// Use Update to extract the null count from data. Use this if you determine
/// the null count through some other means (e.g. dictionary arrays where the
/// null count is determined from the indices)
virtual void IncrementNullCount(int64_t n) = 0;
/// \brief Increments the number of values directly
/// The same note on IncrementNullCount applies here
virtual void IncrementNumValues(int64_t n) = 0;
};
using BoolStatistics = TypedStatistics<BooleanType>;
using Int32Statistics = TypedStatistics<Int32Type>;
using Int64Statistics = TypedStatistics<Int64Type>;
using FloatStatistics = TypedStatistics<FloatType>;
using DoubleStatistics = TypedStatistics<DoubleType>;
using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
using FLBAStatistics = TypedStatistics<FLBAType>;
/// \brief Typed version of Statistics::Make
template <typename DType>
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
const ColumnDescriptor* descr,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
}
/// \brief Create Statistics initialized to a particular state
/// \param[in] min the minimum value
/// \param[in] max the maximum value
/// \param[in] num_values number of values
/// \param[in] null_count number of null values
/// \param[in] distinct_count number of distinct values
template <typename DType>
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
const typename DType::c_type& max,
int64_t num_values,
int64_t null_count,
int64_t distinct_count) {
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
DType::type_num, &min, &max, num_values, null_count, distinct_count));
}
/// \brief Typed version of Statistics::Make
template <typename DType>
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
const ColumnDescriptor* descr, const std::string& encoded_min,
const std::string& encoded_max, int64_t num_values, int64_t null_count,
int64_t distinct_count, bool has_min_max, bool has_null_count,
bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
has_min_max, has_null_count, has_distinct_count, pool));
}
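// Illustrative sketch: accumulate statistics over a small batch, then
// encode them for serialized metadata. `descr` is assumed to describe a
// required INT64 column.
inline EncodedStatistics ExampleCollectStats(const ColumnDescriptor* descr) {
  std::shared_ptr<TypedStatistics<Int64Type>> stats =
      MakeStatistics<Int64Type>(descr);
  const int64_t values[] = {3, 1, 2};
  stats->Update(values, /*num_not_null=*/3, /*num_null=*/0);
  // After the update, HasMinMax() is true with min() == 1 and max() == 3.
  return stats->Encode();
}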
} // namespace parquet

View File

@ -0,0 +1,299 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <array>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include "arrow/util/optional.h"
#include "parquet/column_reader.h"
#include "parquet/file_reader.h"
#include "parquet/stream_writer.h"
namespace parquet {
/// \brief A class for reading Parquet files using an input stream type API.
///
/// The values given must be of the correct type i.e. the type must
/// match the file schema exactly otherwise a ParquetException will be
/// thrown.
///
/// The user must explicitly advance to the next row using the
/// EndRow() function or EndRow input manipulator.
///
/// Required and optional fields are supported:
/// - Required fields are read using operator>>(T)
/// - Optional fields are read with
/// operator>>(arrow::util::optional<T>)
///
/// Note that operator>>(arrow::util::optional<T>) can be used to read
/// required fields.
///
/// Similarly, operator>>(T) can be used to read optional fields.
/// However, if the value is not present then a ParquetException will
/// be raised.
///
/// Currently there is no support for repeated fields.
///
class PARQUET_EXPORT StreamReader {
public:
template <typename T>
using optional = ::arrow::util::optional<T>;
// N.B. Default constructed objects are not usable. This
// constructor is provided so that the object may be move
// assigned afterwards.
StreamReader() = default;
explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
~StreamReader() = default;
bool eof() const { return eof_; }
int current_column() const { return column_index_; }
int64_t current_row() const { return current_row_; }
int num_columns() const;
int64_t num_rows() const;
// Moving is possible.
StreamReader(StreamReader&&) = default;
StreamReader& operator=(StreamReader&&) = default;
// Copying is not allowed.
StreamReader(const StreamReader&) = delete;
StreamReader& operator=(const StreamReader&) = delete;
StreamReader& operator>>(bool& v);
StreamReader& operator>>(int8_t& v);
StreamReader& operator>>(uint8_t& v);
StreamReader& operator>>(int16_t& v);
StreamReader& operator>>(uint16_t& v);
StreamReader& operator>>(int32_t& v);
StreamReader& operator>>(uint32_t& v);
StreamReader& operator>>(int64_t& v);
StreamReader& operator>>(uint64_t& v);
StreamReader& operator>>(std::chrono::milliseconds& v);
StreamReader& operator>>(std::chrono::microseconds& v);
StreamReader& operator>>(float& v);
StreamReader& operator>>(double& v);
StreamReader& operator>>(char& v);
template <int N>
StreamReader& operator>>(char (&v)[N]) {
ReadFixedLength(v, N);
return *this;
}
template <std::size_t N>
StreamReader& operator>>(std::array<char, N>& v) {
ReadFixedLength(v.data(), static_cast<int>(N));
return *this;
}
// N.B. Cannot allow reading to an arbitrary char pointer as the
// length cannot be verified. Also it would shadow the
// char[N] input operator.
// StreamReader& operator>>(char * v);
StreamReader& operator>>(std::string& v);
// Input operators for optional fields.
StreamReader& operator>>(optional<bool>& v);
StreamReader& operator>>(optional<int8_t>& v);
StreamReader& operator>>(optional<uint8_t>& v);
StreamReader& operator>>(optional<int16_t>& v);
StreamReader& operator>>(optional<uint16_t>& v);
StreamReader& operator>>(optional<int32_t>& v);
StreamReader& operator>>(optional<uint32_t>& v);
StreamReader& operator>>(optional<int64_t>& v);
StreamReader& operator>>(optional<uint64_t>& v);
StreamReader& operator>>(optional<float>& v);
StreamReader& operator>>(optional<double>& v);
StreamReader& operator>>(optional<std::chrono::milliseconds>& v);
StreamReader& operator>>(optional<std::chrono::microseconds>& v);
StreamReader& operator>>(optional<char>& v);
StreamReader& operator>>(optional<std::string>& v);
template <std::size_t N>
StreamReader& operator>>(optional<std::array<char, N>>& v) {
CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
FixedLenByteArray flba;
if (ReadOptional(&flba)) {
v = std::array<char, N>{};
std::memcpy(v->data(), flba.ptr, N);
} else {
v.reset();
}
return *this;
}
/// \brief Terminate current row and advance to next one.
/// \throws ParquetException if all columns in the row were not
/// read or skipped.
void EndRow();
/// \brief Skip the data in the next columns.
/// If the number of columns exceeds the columns remaining on the
/// current row then skipping is terminated - it does _not_ continue
/// skipping columns on the next row.
/// Skipping columns still requires the use of EndRow() even if all
/// remaining columns were skipped.
/// \return Number of columns actually skipped.
int64_t SkipColumns(int64_t num_columns_to_skip);
/// \brief Skip the data in the next rows.
/// Skipping of rows is not allowed if reading of data for the
/// current row is not finished.
/// Skipping of rows will be terminated if the end of file is
/// reached.
/// \return Number of rows actually skipped.
int64_t SkipRows(int64_t num_rows_to_skip);
protected:
[[noreturn]] void ThrowReadFailedException(
const std::shared_ptr<schema::PrimitiveNode>& node);
template <typename ReaderType, typename T>
void Read(T* v) {
const auto& node = nodes_[column_index_];
auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
int16_t def_level;
int16_t rep_level;
int64_t values_read;
reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
if (values_read != 1) {
ThrowReadFailedException(node);
}
}
template <typename ReaderType, typename ReadType, typename T>
void Read(T* v) {
const auto& node = nodes_[column_index_];
auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
int16_t def_level;
int16_t rep_level;
ReadType tmp;
int64_t values_read;
reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
if (values_read == 1) {
*v = tmp;
} else {
ThrowReadFailedException(node);
}
}
template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
void ReadOptional(optional<T>* v) {
const auto& node = nodes_[column_index_];
auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
int16_t def_level;
int16_t rep_level;
ReadType tmp;
int64_t values_read;
reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
if (values_read == 1) {
*v = T(tmp);
} else if ((values_read == 0) && (def_level == 0)) {
v->reset();
} else {
ThrowReadFailedException(node);
}
}
void ReadFixedLength(char* ptr, int len);
void Read(ByteArray* v);
void Read(FixedLenByteArray* v);
bool ReadOptional(ByteArray* v);
bool ReadOptional(FixedLenByteArray* v);
void NextRowGroup();
void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
int length = 0);
void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);
void SetEof();
private:
std::unique_ptr<ParquetFileReader> file_reader_;
std::shared_ptr<FileMetaData> file_metadata_;
std::shared_ptr<RowGroupReader> row_group_reader_;
std::vector<std::shared_ptr<ColumnReader>> column_readers_;
std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;
bool eof_{true};
int row_group_index_{0};
int column_index_{0};
int64_t current_row_{0};
int64_t row_group_row_offset_{0};
static constexpr int64_t kBatchSizeOne = 1;
};
PARQUET_EXPORT
StreamReader& operator>>(StreamReader&, EndRowType);
} // namespace parquet
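// End-to-end usage sketch (not part of the header). The file name and its
// schema - a required int64 column followed by an optional string column -
// are assumptions for illustration; assumes "arrow/io/file.h" is included.
inline void ExampleStreamRead() {
  std::shared_ptr<::arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(infile,
                          ::arrow::io::ReadableFile::Open("example.parquet"));
  parquet::StreamReader stream{parquet::ParquetFileReader::Open(infile)};
  int64_t id;
  parquet::StreamReader::optional<std::string> name;
  while (!stream.eof()) {
    stream >> id >> name >> parquet::EndRow;  // name is empty when null
  }
}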

View File

@ -0,0 +1,243 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <array>
#include <chrono>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/util/optional.h"
#include "arrow/util/string_view.h"
#include "parquet/column_writer.h"
#include "parquet/file_writer.h"
namespace parquet {
/// \brief A class for writing Parquet files using an output stream type API.
///
/// The values given must be of the correct type i.e. the type must
/// match the file schema exactly otherwise a ParquetException will be
/// thrown.
///
/// The user must explicitly indicate the end of the row using the
/// EndRow() function or EndRow output manipulator.
///
/// A maximum row group size can be configured; the default size is
/// 512MB. Alternatively the row group size can be set to zero and the
/// user can create new row groups by calling the EndRowGroup()
/// function or using the EndRowGroup output manipulator.
///
/// Required and optional fields are supported:
/// - Required fields are written using operator<<(T)
/// - Optional fields are written using
/// operator<<(arrow::util::optional<T>).
///
/// Note that operator<<(T) can be used to write optional fields.
///
/// Similarly, operator<<(arrow::util::optional<T>) can be used to
/// write required fields. However, if the optional parameter does not
/// have a value (i.e. it is nullopt) then a ParquetException will be
/// raised.
///
/// Currently there is no support for repeated fields.
///
class PARQUET_EXPORT StreamWriter {
public:
template <typename T>
using optional = ::arrow::util::optional<T>;
// N.B. Default constructed objects are not usable. This
// constructor is provided so that the object may be move
// assigned afterwards.
StreamWriter() = default;
explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);
~StreamWriter() = default;
static void SetDefaultMaxRowGroupSize(int64_t max_size);
void SetMaxRowGroupSize(int64_t max_size);
int current_column() const { return column_index_; }
int64_t current_row() const { return current_row_; }
int num_columns() const;
// Moving is possible.
StreamWriter(StreamWriter&&) = default;
StreamWriter& operator=(StreamWriter&&) = default;
// Copying is not allowed.
StreamWriter(const StreamWriter&) = delete;
StreamWriter& operator=(const StreamWriter&) = delete;
/// \brief Output operators for required fields.
/// These can also be used for optional fields when a value must be set.
StreamWriter& operator<<(bool v);
StreamWriter& operator<<(int8_t v);
StreamWriter& operator<<(uint8_t v);
StreamWriter& operator<<(int16_t v);
StreamWriter& operator<<(uint16_t v);
StreamWriter& operator<<(int32_t v);
StreamWriter& operator<<(uint32_t v);
StreamWriter& operator<<(int64_t v);
StreamWriter& operator<<(uint64_t v);
StreamWriter& operator<<(const std::chrono::milliseconds& v);
StreamWriter& operator<<(const std::chrono::microseconds& v);
StreamWriter& operator<<(float v);
StreamWriter& operator<<(double v);
StreamWriter& operator<<(char v);
/// \brief Helper class to write fixed length strings.
/// This is useful as the standard string view (such as
/// arrow::util::string_view) is for variable length data.
struct PARQUET_EXPORT FixedStringView {
FixedStringView() = default;
explicit FixedStringView(const char* data_ptr);
FixedStringView(const char* data_ptr, std::size_t data_len);
const char* data{NULLPTR};
std::size_t size{0};
};
/// \brief Output operators for fixed length strings.
template <int N>
StreamWriter& operator<<(const char (&v)[N]) {
return WriteFixedLength(v, N);
}
template <std::size_t N>
StreamWriter& operator<<(const std::array<char, N>& v) {
return WriteFixedLength(v.data(), N);
}
StreamWriter& operator<<(FixedStringView v);
/// \brief Output operators for variable length strings.
StreamWriter& operator<<(const char* v);
StreamWriter& operator<<(const std::string& v);
StreamWriter& operator<<(::arrow::util::string_view v);
/// \brief Output operator for optional fields.
template <typename T>
StreamWriter& operator<<(const optional<T>& v) {
if (v) {
return operator<<(*v);
}
SkipOptionalColumn();
return *this;
}
/// \brief Skip the next N columns of optional data. If there are
/// fewer than N columns remaining then the excess columns are
/// ignored.
/// \throws ParquetException if there is an attempt to skip any
/// required column.
/// \return Number of columns actually skipped.
int64_t SkipColumns(int num_columns_to_skip);
/// \brief Terminate the current row and advance to next one.
/// \throws ParquetException if all columns in the row were not
/// written or skipped.
void EndRow();
/// \brief Terminate the current row group and create new one.
void EndRowGroup();
protected:
template <typename WriterType, typename T>
StreamWriter& Write(const T v) {
auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));
writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);
if (max_row_group_size_ > 0) {
row_group_size_ += writer->EstimatedBufferedValueBytes();
}
return *this;
}
StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len);
StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);
void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
int length = -1);
/// \brief Skip the next column which must be optional.
/// \throws ParquetException if the next column does not exist or is
/// not optional.
void SkipOptionalColumn();
void WriteNullValue(ColumnWriter* writer);
private:
using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;
struct null_deleter {
void operator()(void*) {}
};
int32_t column_index_{0};
int64_t current_row_{0};
int64_t row_group_size_{0};
int64_t max_row_group_size_{default_row_group_size_};
std::unique_ptr<ParquetFileWriter> file_writer_;
std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
std::vector<node_ptr_type> nodes_;
static constexpr int16_t kDefLevelZero = 0;
static constexpr int16_t kDefLevelOne = 1;
static constexpr int16_t kRepLevelZero = 0;
static constexpr int64_t kBatchSizeOne = 1;
static int64_t default_row_group_size_;
};
struct PARQUET_EXPORT EndRowType {};
constexpr EndRowType EndRow = {};
struct PARQUET_EXPORT EndRowGroupType {};
constexpr EndRowGroupType EndRowGroup = {};
PARQUET_EXPORT
StreamWriter& operator<<(StreamWriter&, EndRowType);
PARQUET_EXPORT
StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
} // namespace parquet
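// Companion usage sketch for the writer (illustrative). The schema is
// assumed to match the reader example above: a required int64 column and
// an optional string column; assumes "arrow/io/file.h" is included.
inline void ExampleStreamWrite(
    const std::shared_ptr<parquet::schema::GroupNode>& schema) {
  std::shared_ptr<::arrow::io::FileOutputStream> outfile;
  PARQUET_ASSIGN_OR_THROW(outfile,
                          ::arrow::io::FileOutputStream::Open("example.parquet"));
  parquet::StreamWriter os{parquet::ParquetFileWriter::Open(outfile, schema)};
  os << int64_t{1} << std::string("alice") << parquet::EndRow;
  os << int64_t{2} << parquet::StreamWriter::optional<std::string>{}  // null
     << parquet::EndRow;
  os.EndRowGroup();  // explicit; otherwise driven by the max row group size
}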

View File

@ -0,0 +1,715 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This module defines an abstract interface for iterating through pages in a
// Parquet column chunk within a row group. It could be extended in the future
// to iterate through all data pages in all chunks in a file.
#pragma once
#include <algorithm>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>
#include <gtest/gtest.h>
#include "arrow/io/memory.h"
#include "arrow/testing/util.h"
#include "parquet/column_page.h"
#include "parquet/column_reader.h"
#include "parquet/column_writer.h"
#include "parquet/encoding.h"
#include "parquet/platform.h"
namespace parquet {
static constexpr int FLBA_LENGTH = 12;
inline bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
}
namespace test {
typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type, FloatType,
DoubleType, ByteArrayType, FLBAType>
ParquetTypes;
class ParquetTestException : public parquet::ParquetException {
using ParquetException::ParquetException;
};
const char* get_data_dir();
std::string get_bad_data_dir();
std::string get_data_file(const std::string& filename, bool is_good = true);
template <typename T>
static inline void assert_vector_equal(const std::vector<T>& left,
const std::vector<T>& right) {
ASSERT_EQ(left.size(), right.size());
for (size_t i = 0; i < left.size(); ++i) {
ASSERT_EQ(left[i], right[i]) << i;
}
}
template <typename T>
static inline bool vector_equal(const std::vector<T>& left, const std::vector<T>& right) {
if (left.size() != right.size()) {
return false;
}
for (size_t i = 0; i < left.size(); ++i) {
if (left[i] != right[i]) {
std::cerr << "index " << i << " left was " << left[i] << " right was " << right[i]
<< std::endl;
return false;
}
}
return true;
}
template <typename T>
static std::vector<T> slice(const std::vector<T>& values, int start, int end) {
if (end < start) {
return std::vector<T>(0);
}
std::vector<T> out(end - start);
for (int i = start; i < end; ++i) {
out[i - start] = values[i];
}
return out;
}
void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out);
void random_bools(int n, double p, uint32_t seed, bool* out);
template <typename T>
inline void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) {
std::default_random_engine gen(seed);
std::uniform_int_distribution<T> d(min_value, max_value);
for (int i = 0; i < n; ++i) {
out[i] = d(gen);
}
}
template <>
inline void random_numbers(int n, uint32_t seed, float min_value, float max_value,
float* out) {
std::default_random_engine gen(seed);
std::uniform_real_distribution<float> d(min_value, max_value);
for (int i = 0; i < n; ++i) {
out[i] = d(gen);
}
}
template <>
inline void random_numbers(int n, uint32_t seed, double min_value, double max_value,
double* out) {
std::default_random_engine gen(seed);
std::uniform_real_distribution<double> d(min_value, max_value);
for (int i = 0; i < n; ++i) {
out[i] = d(gen);
}
}
void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value,
Int96* out);
void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out);
void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size,
int max_size);
void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size);
template <typename Type, typename Sequence>
std::shared_ptr<Buffer> EncodeValues(Encoding::type encoding, bool use_dictionary,
const Sequence& values, int length,
const ColumnDescriptor* descr) {
auto encoder = MakeTypedEncoder<Type>(encoding, use_dictionary, descr);
encoder->Put(values, length);
return encoder->FlushValues();
}
template <typename T>
static void InitValues(int num_values, std::vector<T>& values,
std::vector<uint8_t>& buffer) {
random_numbers(num_values, 0, std::numeric_limits<T>::min(),
std::numeric_limits<T>::max(), values.data());
}
template <typename T>
static void InitDictValues(int num_values, int num_dicts, std::vector<T>& values,
std::vector<uint8_t>& buffer) {
int repeat_factor = num_values / num_dicts;
InitValues<T>(num_dicts, values, buffer);
// add some repeated values
for (int j = 1; j < repeat_factor; ++j) {
for (int i = 0; i < num_dicts; ++i) {
std::memcpy(&values[num_dicts * j + i], &values[i], sizeof(T));
}
}
// Only num_dicts * repeat_factor values (possibly < num_values) have been
// written so far; fill in the remainder by repeating earlier values.
for (int i = num_dicts * repeat_factor; i < num_values; ++i) {
std::memcpy(&values[i], &values[i - num_dicts * repeat_factor], sizeof(T));
}
}
template <>
inline void InitDictValues<bool>(int num_values, int num_dicts, std::vector<bool>& values,
std::vector<uint8_t>& buffer) {
// No op for bool
}
class MockPageReader : public PageReader {
public:
explicit MockPageReader(const std::vector<std::shared_ptr<Page>>& pages)
: pages_(pages), page_index_(0) {}
std::shared_ptr<Page> NextPage() override {
if (page_index_ == static_cast<int>(pages_.size())) {
// EOS to consumer
return std::shared_ptr<Page>(nullptr);
}
return pages_[page_index_++];
}
// No-op
void set_max_page_header_size(uint32_t size) override {}
private:
std::vector<std::shared_ptr<Page>> pages_;
int page_index_;
};
// TODO(wesm): this is only used for testing for now. Refactor to form part of
// primary file write path
template <typename Type>
class DataPageBuilder {
public:
using c_type = typename Type::c_type;
// This class writes data and metadata to the passed inputs
explicit DataPageBuilder(ArrowOutputStream* sink)
: sink_(sink),
num_values_(0),
encoding_(Encoding::PLAIN),
definition_level_encoding_(Encoding::RLE),
repetition_level_encoding_(Encoding::RLE),
have_def_levels_(false),
have_rep_levels_(false),
have_values_(false) {}
void AppendDefLevels(const std::vector<int16_t>& levels, int16_t max_level,
Encoding::type encoding = Encoding::RLE) {
AppendLevels(levels, max_level, encoding);
num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
definition_level_encoding_ = encoding;
have_def_levels_ = true;
}
void AppendRepLevels(const std::vector<int16_t>& levels, int16_t max_level,
Encoding::type encoding = Encoding::RLE) {
AppendLevels(levels, max_level, encoding);
num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
repetition_level_encoding_ = encoding;
have_rep_levels_ = true;
}
void AppendValues(const ColumnDescriptor* d, const std::vector<c_type>& values,
Encoding::type encoding = Encoding::PLAIN) {
std::shared_ptr<Buffer> values_sink = EncodeValues<Type>(
encoding, false, values.data(), static_cast<int>(values.size()), d);
PARQUET_THROW_NOT_OK(sink_->Write(values_sink->data(), values_sink->size()));
num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
encoding_ = encoding;
have_values_ = true;
}
int32_t num_values() const { return num_values_; }
Encoding::type encoding() const { return encoding_; }
Encoding::type rep_level_encoding() const { return repetition_level_encoding_; }
Encoding::type def_level_encoding() const { return definition_level_encoding_; }
private:
ArrowOutputStream* sink_;
int32_t num_values_;
Encoding::type encoding_;
Encoding::type definition_level_encoding_;
Encoding::type repetition_level_encoding_;
bool have_def_levels_;
bool have_rep_levels_;
bool have_values_;
// Used internally for both repetition and definition levels
void AppendLevels(const std::vector<int16_t>& levels, int16_t max_level,
Encoding::type encoding) {
if (encoding != Encoding::RLE) {
ParquetException::NYI("only rle encoding currently implemented");
}
std::vector<uint8_t> encode_buffer(LevelEncoder::MaxBufferSize(
Encoding::RLE, max_level, static_cast<int>(levels.size())));
// We encode into separate memory from the output stream because the
// RLE-encoded bytes have to be preceded in the stream by their absolute
// size.
LevelEncoder encoder;
encoder.Init(encoding, max_level, static_cast<int>(levels.size()),
encode_buffer.data(), static_cast<int>(encode_buffer.size()));
encoder.Encode(static_cast<int>(levels.size()), levels.data());
int32_t rle_bytes = encoder.len();
PARQUET_THROW_NOT_OK(
sink_->Write(reinterpret_cast<const uint8_t*>(&rle_bytes), sizeof(int32_t)));
PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes));
}
};
template <>
inline void DataPageBuilder<BooleanType>::AppendValues(const ColumnDescriptor* d,
const std::vector<bool>& values,
Encoding::type encoding) {
if (encoding != Encoding::PLAIN) {
ParquetException::NYI("only plain encoding currently implemented");
}
auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN, false, d);
dynamic_cast<BooleanEncoder*>(encoder.get())
->Put(values, static_cast<int>(values.size()));
std::shared_ptr<Buffer> buffer = encoder->FlushValues();
PARQUET_THROW_NOT_OK(sink_->Write(buffer->data(), buffer->size()));
num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
encoding_ = encoding;
have_values_ = true;
}
template <typename Type>
static std::shared_ptr<DataPageV1> MakeDataPage(
const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
int num_vals, Encoding::type encoding, const uint8_t* indices, int indices_size,
const std::vector<int16_t>& def_levels, int16_t max_def_level,
const std::vector<int16_t>& rep_levels, int16_t max_rep_level) {
int num_values = 0;
auto page_stream = CreateOutputStream();
test::DataPageBuilder<Type> page_builder(page_stream.get());
if (!rep_levels.empty()) {
page_builder.AppendRepLevels(rep_levels, max_rep_level);
}
if (!def_levels.empty()) {
page_builder.AppendDefLevels(def_levels, max_def_level);
}
if (encoding == Encoding::PLAIN) {
page_builder.AppendValues(d, values, encoding);
num_values = std::max(page_builder.num_values(), num_vals);
} else { // DICTIONARY PAGES
PARQUET_THROW_NOT_OK(page_stream->Write(indices, indices_size));
num_values = std::max(page_builder.num_values(), num_vals);
}
PARQUET_ASSIGN_OR_THROW(auto buffer, page_stream->Finish());
return std::make_shared<DataPageV1>(buffer, num_values, encoding,
page_builder.def_level_encoding(),
page_builder.rep_level_encoding(), buffer->size());
}
template <typename TYPE>
class DictionaryPageBuilder {
public:
typedef typename TYPE::c_type TC;
static constexpr int TN = TYPE::type_num;
using SpecializedEncoder = typename EncodingTraits<TYPE>::Encoder;
// This class writes data and metadata to the passed inputs
explicit DictionaryPageBuilder(const ColumnDescriptor* d)
: num_dict_values_(0), have_values_(false) {
auto encoder = MakeTypedEncoder<TYPE>(Encoding::PLAIN, true, d);
dict_traits_ = dynamic_cast<DictEncoder<TYPE>*>(encoder.get());
encoder_.reset(dynamic_cast<SpecializedEncoder*>(encoder.release()));
}
~DictionaryPageBuilder() {}
std::shared_ptr<Buffer> AppendValues(const std::vector<TC>& values) {
int num_values = static_cast<int>(values.size());
// Dictionary encoding
encoder_->Put(values.data(), num_values);
num_dict_values_ = dict_traits_->num_entries();
have_values_ = true;
return encoder_->FlushValues();
}
std::shared_ptr<Buffer> WriteDict() {
std::shared_ptr<Buffer> dict_buffer =
AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size());
dict_traits_->WriteDict(dict_buffer->mutable_data());
return dict_buffer;
}
int32_t num_values() const { return num_dict_values_; }
private:
DictEncoder<TYPE>* dict_traits_;
std::unique_ptr<SpecializedEncoder> encoder_;
int32_t num_dict_values_;
bool have_values_;
};
template <>
inline DictionaryPageBuilder<BooleanType>::DictionaryPageBuilder(
const ColumnDescriptor* d) {
ParquetException::NYI("only plain encoding currently implemented for boolean");
}
template <>
inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::WriteDict() {
ParquetException::NYI("only plain encoding currently implemented for boolean");
return nullptr;
}
template <>
inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::AppendValues(
const std::vector<TC>& values) {
ParquetException::NYI("only plain encoding currently implemented for boolean");
return nullptr;
}
template <typename Type>
inline static std::shared_ptr<DictionaryPage> MakeDictPage(
const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
const std::vector<int>& values_per_page, Encoding::type encoding,
std::vector<std::shared_ptr<Buffer>>& rle_indices) {
test::DictionaryPageBuilder<Type> page_builder(d);
int num_pages = static_cast<int>(values_per_page.size());
int value_start = 0;
for (int i = 0; i < num_pages; i++) {
rle_indices.push_back(page_builder.AppendValues(
slice(values, value_start, value_start + values_per_page[i])));
value_start += values_per_page[i];
}
auto buffer = page_builder.WriteDict();
return std::make_shared<DictionaryPage>(buffer, page_builder.num_values(),
Encoding::PLAIN);
}
// Given def/rep levels and values create multiple dict pages
template <typename Type>
inline static void PaginateDict(const ColumnDescriptor* d,
const std::vector<typename Type::c_type>& values,
const std::vector<int16_t>& def_levels,
int16_t max_def_level,
const std::vector<int16_t>& rep_levels,
int16_t max_rep_level, int num_levels_per_page,
const std::vector<int>& values_per_page,
std::vector<std::shared_ptr<Page>>& pages,
Encoding::type encoding = Encoding::RLE_DICTIONARY) {
int num_pages = static_cast<int>(values_per_page.size());
std::vector<std::shared_ptr<Buffer>> rle_indices;
std::shared_ptr<DictionaryPage> dict_page =
MakeDictPage<Type>(d, values, values_per_page, encoding, rle_indices);
pages.push_back(dict_page);
int def_level_start = 0;
int def_level_end = 0;
int rep_level_start = 0;
int rep_level_end = 0;
for (int i = 0; i < num_pages; i++) {
if (max_def_level > 0) {
def_level_start = i * num_levels_per_page;
def_level_end = (i + 1) * num_levels_per_page;
}
if (max_rep_level > 0) {
rep_level_start = i * num_levels_per_page;
rep_level_end = (i + 1) * num_levels_per_page;
}
std::shared_ptr<DataPageV1> data_page = MakeDataPage<Int32Type>(
d, {}, values_per_page[i], encoding, rle_indices[i]->data(),
static_cast<int>(rle_indices[i]->size()),
slice(def_levels, def_level_start, def_level_end), max_def_level,
slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
pages.push_back(data_page);
}
}
// Given def/rep levels and values create multiple plain pages
template <typename Type>
static inline void PaginatePlain(const ColumnDescriptor* d,
const std::vector<typename Type::c_type>& values,
const std::vector<int16_t>& def_levels,
int16_t max_def_level,
const std::vector<int16_t>& rep_levels,
int16_t max_rep_level, int num_levels_per_page,
const std::vector<int>& values_per_page,
std::vector<std::shared_ptr<Page>>& pages,
Encoding::type encoding = Encoding::PLAIN) {
int num_pages = static_cast<int>(values_per_page.size());
int def_level_start = 0;
int def_level_end = 0;
int rep_level_start = 0;
int rep_level_end = 0;
int value_start = 0;
for (int i = 0; i < num_pages; i++) {
if (max_def_level > 0) {
def_level_start = i * num_levels_per_page;
def_level_end = (i + 1) * num_levels_per_page;
}
if (max_rep_level > 0) {
rep_level_start = i * num_levels_per_page;
rep_level_end = (i + 1) * num_levels_per_page;
}
std::shared_ptr<DataPage> page = MakeDataPage<Type>(
d, slice(values, value_start, value_start + values_per_page[i]),
values_per_page[i], encoding, nullptr, 0,
slice(def_levels, def_level_start, def_level_end), max_def_level,
slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
pages.push_back(page);
value_start += values_per_page[i];
}
}
// Generates pages from randomly generated data
template <typename Type>
static inline int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_page,
std::vector<int16_t>& def_levels,
std::vector<int16_t>& rep_levels,
std::vector<typename Type::c_type>& values,
std::vector<uint8_t>& buffer,
std::vector<std::shared_ptr<Page>>& pages,
Encoding::type encoding = Encoding::PLAIN) {
int num_levels = levels_per_page * num_pages;
int num_values = 0;
uint32_t seed = 0;
int16_t zero = 0;
int16_t max_def_level = d->max_definition_level();
int16_t max_rep_level = d->max_repetition_level();
std::vector<int> values_per_page(num_pages, levels_per_page);
// Create definition levels
if (max_def_level > 0) {
def_levels.resize(num_levels);
random_numbers(num_levels, seed, zero, max_def_level, def_levels.data());
for (int p = 0; p < num_pages; p++) {
int num_values_per_page = 0;
for (int i = 0; i < levels_per_page; i++) {
if (def_levels[i + p * levels_per_page] == max_def_level) {
num_values_per_page++;
num_values++;
}
}
values_per_page[p] = num_values_per_page;
}
} else {
num_values = num_levels;
}
// Create repetition levels
if (max_rep_level > 0) {
rep_levels.resize(num_levels);
random_numbers(num_levels, seed, zero, max_rep_level, rep_levels.data());
}
// Create values
values.resize(num_values);
if (encoding == Encoding::PLAIN) {
InitValues<typename Type::c_type>(num_values, values, buffer);
PaginatePlain<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
levels_per_page, values_per_page, pages);
} else if (encoding == Encoding::RLE_DICTIONARY ||
encoding == Encoding::PLAIN_DICTIONARY) {
// Calls InitValues and repeats the data
InitDictValues<typename Type::c_type>(num_values, levels_per_page, values, buffer);
PaginateDict<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
levels_per_page, values_per_page, pages);
}
return num_values;
}
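// Illustrative sketch tying the helpers above together: generate random
// pages for a column, then serve them through MockPageReader (e.g. as
// input to a column reader under test). `descr` is assumed to be a leaf
// INT32 ColumnDescriptor.
static inline void ExampleMockPages(const ColumnDescriptor* descr) {
  std::vector<int16_t> def_levels;
  std::vector<int16_t> rep_levels;
  std::vector<int32_t> values;
  std::vector<uint8_t> buffer;
  std::vector<std::shared_ptr<Page>> pages;
  MakePages<Int32Type>(descr, /*num_pages=*/2, /*levels_per_page=*/100,
                       def_levels, rep_levels, values, buffer, pages);
  auto pager = std::unique_ptr<PageReader>(new MockPageReader(pages));
  // `pager` can now be handed to e.g. ColumnReader::Make(descr, std::move(pager)).
}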
// ----------------------------------------------------------------------
// Test data generation
template <>
void inline InitValues<bool>(int num_values, std::vector<bool>& values,
std::vector<uint8_t>& buffer) {
values = {};
::arrow::random_is_valid(num_values, 0.5, &values,
static_cast<int>(::arrow::random_seed()));
}
template <>
inline void InitValues<ByteArray>(int num_values, std::vector<ByteArray>& values,
std::vector<uint8_t>& buffer) {
int max_byte_array_len = 12;
int num_bytes = static_cast<int>(max_byte_array_len + sizeof(uint32_t));
size_t nbytes = num_values * num_bytes;
buffer.resize(nbytes);
random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len);
}
inline void InitWideByteArrayValues(int num_values, std::vector<ByteArray>& values,
std::vector<uint8_t>& buffer, int min_len,
int max_len) {
int num_bytes = static_cast<int>(max_len + sizeof(uint32_t));
size_t nbytes = num_values * num_bytes;
buffer.resize(nbytes);
random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len);
}
template <>
inline void InitValues<FLBA>(int num_values, std::vector<FLBA>& values,
std::vector<uint8_t>& buffer) {
size_t nbytes = num_values * FLBA_LENGTH;
buffer.resize(nbytes);
random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data());
}
template <>
inline void InitValues<Int96>(int num_values, std::vector<Int96>& values,
std::vector<uint8_t>& buffer) {
random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
std::numeric_limits<int32_t>::max(), values.data());
}
inline std::string TestColumnName(int i) {
std::stringstream col_name;
col_name << "column_" << i;
return col_name.str();
}
// This class lives here because of its dependency on the InitValues specializations.
template <typename TestType>
class PrimitiveTypedTest : public ::testing::Test {
public:
using c_type = typename TestType::c_type;
void SetUpSchema(Repetition::type repetition, int num_columns = 1) {
std::vector<schema::NodePtr> fields;
for (int i = 0; i < num_columns; ++i) {
std::string name = TestColumnName(i);
fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num,
ConvertedType::NONE, FLBA_LENGTH));
}
node_ = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
schema_.Init(node_);
}
void GenerateData(int64_t num_values);
void SetupValuesOut(int64_t num_values);
void SyncValuesOut();
protected:
schema::NodePtr node_;
SchemaDescriptor schema_;
// Input buffers
std::vector<c_type> values_;
std::vector<int16_t> def_levels_;
std::vector<uint8_t> buffer_;
// Pointer to the values, needed as we cannot use std::vector<bool>::data()
c_type* values_ptr_;
std::vector<uint8_t> bool_buffer_;
// Output buffers
std::vector<c_type> values_out_;
std::vector<uint8_t> bool_buffer_out_;
c_type* values_out_ptr_;
};
template <typename TestType>
inline void PrimitiveTypedTest<TestType>::SyncValuesOut() {}
template <>
inline void PrimitiveTypedTest<BooleanType>::SyncValuesOut() {
std::vector<uint8_t>::const_iterator source_iterator = bool_buffer_out_.begin();
std::vector<c_type>::iterator destination_iterator = values_out_.begin();
while (source_iterator != bool_buffer_out_.end()) {
*destination_iterator++ = *source_iterator++ != 0;
}
}
template <typename TestType>
inline void PrimitiveTypedTest<TestType>::SetupValuesOut(int64_t num_values) {
values_out_.clear();
values_out_.resize(num_values);
values_out_ptr_ = values_out_.data();
}
template <>
inline void PrimitiveTypedTest<BooleanType>::SetupValuesOut(int64_t num_values) {
values_out_.clear();
values_out_.resize(num_values);
bool_buffer_out_.clear();
bool_buffer_out_.resize(num_values);
// Write once to all values so we can copy it without getting Valgrind errors
// about uninitialised values.
std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true);
values_out_ptr_ = reinterpret_cast<bool*>(bool_buffer_out_.data());
}
template <typename TestType>
inline void PrimitiveTypedTest<TestType>::GenerateData(int64_t num_values) {
def_levels_.resize(num_values);
values_.resize(num_values);
InitValues<c_type>(static_cast<int>(num_values), values_, buffer_);
values_ptr_ = values_.data();
std::fill(def_levels_.begin(), def_levels_.end(), 1);
}
template <>
inline void PrimitiveTypedTest<BooleanType>::GenerateData(int64_t num_values) {
def_levels_.resize(num_values);
values_.resize(num_values);
InitValues<c_type>(static_cast<int>(num_values), values_, buffer_);
bool_buffer_.resize(num_values);
std::copy(values_.begin(), values_.end(), bool_buffer_.begin());
values_ptr_ = reinterpret_cast<bool*>(bool_buffer_.data());
std::fill(def_levels_.begin(), def_levels_.end(), 1);
}
} // namespace test
} // namespace parquet

View File

@ -0,0 +1,88 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
namespace parquet {
/// \brief Feature selection when writing Parquet files
///
/// `ParquetVersion::type` governs which data types are allowed and how they
/// are represented. For example, uint32_t data will be written differently
/// depending on this value (as INT64 for PARQUET_1_0, as UINT32 for other
/// versions).
///
/// However, some features - such as compression algorithms, encryption,
/// or the improved "v2" data page format - must be enabled separately in
/// ArrowWriterProperties.
struct ParquetVersion {
enum type : int {
/// Enable only pre-2.2 Parquet format features when writing
///
/// This setting is useful for maximum compatibility with legacy readers.
/// Note that logical types may still be emitted, as long as they have a
/// corresponding converted type.
PARQUET_1_0,
/// DEPRECATED: Enable Parquet format 2.6 features
///
/// This misleadingly named enum value is roughly similar to PARQUET_2_6.
PARQUET_2_0 ARROW_DEPRECATED_ENUM_VALUE("use PARQUET_2_4 or PARQUET_2_6 "
"for fine-grained feature selection"),
/// Enable Parquet format 2.4 and earlier features when writing
///
/// This enables UINT32 as well as logical types which don't have
/// a corresponding converted type.
///
/// Note: Parquet format 2.4.0 was released in October 2017.
PARQUET_2_4,
/// Enable Parquet format 2.6 and earlier features when writing
///
/// This enables the NANOS time unit in addition to the PARQUET_2_4
/// features.
///
/// Note: Parquet format 2.6.0 was released in September 2018.
PARQUET_2_6,
/// Enable latest Parquet format 2.x features
///
/// This value is equal to the greatest 2.x version supported by
/// this library.
PARQUET_2_LATEST = PARQUET_2_6
};
};
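// Illustrative sketch of selecting a format version (commented because
// WriterProperties is only forward-declared below; the builder lives in
// parquet/properties.h):
//
//   parquet::WriterProperties::Builder builder;
//   builder.version(parquet::ParquetVersion::PARQUET_2_6);
//   std::shared_ptr<parquet::WriterProperties> props = builder.build();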
class FileMetaData;
class SchemaDescriptor;
class ReaderProperties;
class ArrowReaderProperties;
class WriterProperties;
class WriterPropertiesBuilder;
class ArrowWriterProperties;
class ArrowWriterPropertiesBuilder;
namespace arrow {
class FileWriter;
class FileReader;
} // namespace arrow
} // namespace parquet

View File

@ -0,0 +1,758 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>  // for std::ostream_iterator
#include <memory>
#include <sstream>
#include <string>
#include "arrow/util/string_view.h"
#include "parquet/platform.h"
#include "parquet/type_fwd.h"
#include "parquet/windows_fixup.h" // for OPTIONAL
namespace arrow {
namespace util {
class Codec;
} // namespace util
} // namespace arrow
namespace parquet {
// ----------------------------------------------------------------------
// Metadata enums to match Thrift metadata
//
// The reason we maintain our own enums is to avoid transitive dependency on
// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
// public API. After building parquet-cpp, you should not need to include
// Thrift headers in your application. This means some boilerplate to convert
// between our types and Parquet's Thrift types.
//
// We can also add special values like NONE to distinguish between metadata
// values being set and not set. As an example, consider ConvertedType and
// CompressionCodec.
// Mirrors parquet::Type
struct Type {
enum type {
BOOLEAN = 0,
INT32 = 1,
INT64 = 2,
INT96 = 3,
FLOAT = 4,
DOUBLE = 5,
BYTE_ARRAY = 6,
FIXED_LEN_BYTE_ARRAY = 7,
// Should always be last element.
UNDEFINED = 8
};
};
// Mirrors parquet::ConvertedType
struct ConvertedType {
enum type {
NONE, // Not a real converted type, but means no converted type is specified
UTF8,
MAP,
MAP_KEY_VALUE,
LIST,
ENUM,
DECIMAL,
DATE,
TIME_MILLIS,
TIME_MICROS,
TIMESTAMP_MILLIS,
TIMESTAMP_MICROS,
UINT_8,
UINT_16,
UINT_32,
UINT_64,
INT_8,
INT_16,
INT_32,
INT_64,
JSON,
BSON,
INTERVAL,
// DEPRECATED INVALID ConvertedType for all-null data.
// Only useful for reading legacy files written out by interim Parquet C++ releases.
// For writing, always emit LogicalType::Null instead.
// See PARQUET-1990.
NA = 25,
UNDEFINED = 26 // Not a real converted type; should always be last element
};
};
// forward declaration
namespace format {
class LogicalType;
}
// Mirrors parquet::FieldRepetitionType
struct Repetition {
enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 };
};
// Reference:
// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
// format/converter/ParquetMetadataConverter.java
// Sort order for page and column statistics. Types are associated with sort
// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
// aggregated using a sort order. As of parquet-format version 2.3.1, the
// order used to aggregate stats is always SIGNED and is not stored in the
// Parquet file. These stats are discarded for types that need unsigned.
// See PARQUET-686.
struct SortOrder {
enum type { SIGNED, UNSIGNED, UNKNOWN };
};
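// Example: a sketch of how the sort-order helpers declared near the end of
// this header behave, assuming the usual defaults:
//
//   parquet::DefaultSortOrder(parquet::Type::INT32);  // SortOrder::SIGNED
//   parquet::GetSortOrder(parquet::ConvertedType::UTF8,
//                         parquet::Type::BYTE_ARRAY);  // SortOrder::UNSIGNED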
namespace schema {
struct DecimalMetadata {
bool isset;
int32_t scale;
int32_t precision;
};
} // namespace schema
/// \brief Implementation of parquet.thrift LogicalType types.
class PARQUET_EXPORT LogicalType {
public:
struct Type {
enum type {
UNDEFINED = 0, // Not a real logical type
STRING = 1,
MAP,
LIST,
ENUM,
DECIMAL,
DATE,
TIME,
TIMESTAMP,
INTERVAL,
INT,
NIL, // Thrift NullType: annotates data that is always null
JSON,
BSON,
UUID,
NONE // Not a real logical type; should always be last element
};
};
struct TimeUnit {
enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
};
/// \brief If possible, return a logical type equivalent to the given legacy
/// converted type (and decimal metadata if applicable).
static std::shared_ptr<const LogicalType> FromConvertedType(
const parquet::ConvertedType::type converted_type,
const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
-1});
/// \brief Return the logical type represented by the Thrift intermediary object.
static std::shared_ptr<const LogicalType> FromThrift(
const parquet::format::LogicalType& thrift_logical_type);
/// \brief Return the explicitly requested logical type.
static std::shared_ptr<const LogicalType> String();
static std::shared_ptr<const LogicalType> Map();
static std::shared_ptr<const LogicalType> List();
static std::shared_ptr<const LogicalType> Enum();
static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
static std::shared_ptr<const LogicalType> Date();
static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
LogicalType::TimeUnit::unit time_unit);
/// \brief Create a Timestamp logical type
/// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
/// \param[in] time_unit the resolution of the timestamp
/// \param[in] is_from_converted_type if true, the timestamp was generated
/// by translating a legacy converted type of TIMESTAMP_MILLIS or
/// TIMESTAMP_MICROS. Default is false.
/// \param[in] force_set_converted_type if true, always set the
/// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
/// metadata. Default is false
static std::shared_ptr<const LogicalType> Timestamp(
bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
bool is_from_converted_type = false, bool force_set_converted_type = false);
static std::shared_ptr<const LogicalType> Interval();
static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
/// \brief Create a logical type for data that's always null
///
/// Any physical type can be annotated with this logical type.
static std::shared_ptr<const LogicalType> Null();
static std::shared_ptr<const LogicalType> JSON();
static std::shared_ptr<const LogicalType> BSON();
static std::shared_ptr<const LogicalType> UUID();
/// \brief Create a placeholder for when no logical type is specified
static std::shared_ptr<const LogicalType> None();
/// \brief Return true if this logical type is consistent with the given underlying
/// physical type.
bool is_applicable(parquet::Type::type primitive_type,
int32_t primitive_length = -1) const;
/// \brief Return true if this logical type is equivalent to the given legacy converted
/// type (and decimal metadata if applicable).
bool is_compatible(parquet::ConvertedType::type converted_type,
parquet::schema::DecimalMetadata converted_decimal_metadata = {
false, -1, -1}) const;
/// \brief If possible, return the legacy converted type (and decimal metadata if
/// applicable) equivalent to this logical type.
parquet::ConvertedType::type ToConvertedType(
parquet::schema::DecimalMetadata* out_decimal_metadata) const;
/// \brief Return a printable representation of this logical type.
std::string ToString() const;
/// \brief Return a JSON representation of this logical type.
std::string ToJSON() const;
/// \brief Return a serializable Thrift object for this logical type.
parquet::format::LogicalType ToThrift() const;
/// \brief Return true if the given logical type is equivalent to this logical type.
bool Equals(const LogicalType& other) const;
/// \brief Return the enumerated type of this logical type.
LogicalType::Type::type type() const;
/// \brief Return the appropriate sort order for this logical type.
SortOrder::type sort_order() const;
// Type checks ...
bool is_string() const;
bool is_map() const;
bool is_list() const;
bool is_enum() const;
bool is_decimal() const;
bool is_date() const;
bool is_time() const;
bool is_timestamp() const;
bool is_interval() const;
bool is_int() const;
bool is_null() const;
bool is_JSON() const;
bool is_BSON() const;
bool is_UUID() const;
bool is_none() const;
/// \brief Return true if this logical type is of a known type.
bool is_valid() const;
bool is_invalid() const;
/// \brief Return true if this logical type is suitable for a schema GroupNode.
bool is_nested() const;
bool is_nonnested() const;
/// \brief Return true if this logical type is included in the Thrift output for its
/// node.
bool is_serialized() const;
LogicalType(const LogicalType&) = delete;
LogicalType& operator=(const LogicalType&) = delete;
virtual ~LogicalType() noexcept;
protected:
LogicalType();
class Impl;
std::unique_ptr<const Impl> impl_;
};
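// Example: constructing and inspecting a logical type. A sketch; the exact
// ToString() output shown is illustrative only:
//
//   auto t = parquet::LogicalType::Timestamp(
//       /*is_adjusted_to_utc=*/true, parquet::LogicalType::TimeUnit::MICROS);
//   t->is_timestamp();                       // true
//   t->is_applicable(parquet::Type::INT64);  // true: timestamps use INT64
//   t->ToString();  // e.g. "Timestamp(isAdjustedToUTC=true, ...)"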
/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
class PARQUET_EXPORT StringLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
StringLogicalType() = default;
};
/// \brief Allowed for group nodes only.
class PARQUET_EXPORT MapLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
MapLogicalType() = default;
};
/// \brief Allowed for group nodes only.
class PARQUET_EXPORT ListLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
ListLogicalType() = default;
};
/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
class PARQUET_EXPORT EnumLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
EnumLogicalType() = default;
};
/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
/// depending on the precision.
class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
int32_t precision() const;
int32_t scale() const;
private:
DecimalLogicalType() = default;
};
/// \brief Allowed for physical type INT32.
class PARQUET_EXPORT DateLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
DateLogicalType() = default;
};
/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
class PARQUET_EXPORT TimeLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
LogicalType::TimeUnit::unit time_unit);
bool is_adjusted_to_utc() const;
LogicalType::TimeUnit::unit time_unit() const;
private:
TimeLogicalType() = default;
};
/// \brief Allowed for physical type INT64.
class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
LogicalType::TimeUnit::unit time_unit,
bool is_from_converted_type = false,
bool force_set_converted_type = false);
bool is_adjusted_to_utc() const;
LogicalType::TimeUnit::unit time_unit() const;
/// \brief If true, will not set LogicalType in Thrift metadata
bool is_from_converted_type() const;
/// \brief If true, will set ConvertedType for micros and millis
/// resolution in legacy ConvertedType Thrift metadata
bool force_set_converted_type() const;
private:
TimestampLogicalType() = default;
};
/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
IntervalLogicalType() = default;
};
/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
/// (for bit width 64).
class PARQUET_EXPORT IntLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
int bit_width() const;
bool is_signed() const;
private:
IntLogicalType() = default;
};
/// \brief Allowed for any physical type.
class PARQUET_EXPORT NullLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
NullLogicalType() = default;
};
/// \brief Allowed for physical type BYTE_ARRAY.
class PARQUET_EXPORT JSONLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
JSONLogicalType() = default;
};
/// \brief Allowed for physical type BYTE_ARRAY.
class PARQUET_EXPORT BSONLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
BSONLogicalType() = default;
};
/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
/// must encode raw UUID bytes.
class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
UUIDLogicalType() = default;
};
/// \brief Allowed for any physical type.
class PARQUET_EXPORT NoLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
NoLogicalType() = default;
};
// Internal API, for unrecognized logical types
class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
UndefinedLogicalType() = default;
};
// Data encodings. Mirrors parquet::Encoding
struct Encoding {
enum type {
PLAIN = 0,
PLAIN_DICTIONARY = 2,
RLE = 3,
BIT_PACKED = 4,
DELTA_BINARY_PACKED = 5,
DELTA_LENGTH_BYTE_ARRAY = 6,
DELTA_BYTE_ARRAY = 7,
RLE_DICTIONARY = 8,
BYTE_STREAM_SPLIT = 9,
// Should always be last element (except UNKNOWN)
UNDEFINED = 10,
UNKNOWN = 999
};
};
// Exposed data encodings: the encoding of the data as returned to the caller,
// rather than the encoding of the data stored in the file. E.g., data encoded
// as RLE_DICTIONARY in the file can be returned as raw dictionary indices
// after RLE decoding, in which case the exposed encoding is DICTIONARY.
enum class ExposedEncoding {
NO_ENCODING = 0, // data is not encoded, i.e. already decoded during reading
DICTIONARY = 1
};
/// \brief Return true if Parquet supports indicated compression type
PARQUET_EXPORT
bool IsCodecSupported(Compression::type codec);
PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec);
PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
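// Example: obtaining a (de)compressor. A sketch, assuming Snappy support was
// compiled into the Arrow build:
//
//   if (parquet::IsCodecSupported(::arrow::Compression::SNAPPY)) {
//     std::unique_ptr<::arrow::util::Codec> codec =
//         parquet::GetCodec(::arrow::Compression::SNAPPY);
//   }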
struct ParquetCipher {
enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
};
struct AadMetadata {
std::string aad_prefix;
std::string aad_file_unique;
bool supply_aad_prefix;
};
struct EncryptionAlgorithm {
ParquetCipher::type algorithm;
AadMetadata aad;
};
// parquet::PageType
struct PageType {
enum type {
DATA_PAGE,
INDEX_PAGE,
DICTIONARY_PAGE,
DATA_PAGE_V2,
// Should always be last element
UNDEFINED
};
};
class ColumnOrder {
public:
enum type { UNDEFINED, TYPE_DEFINED_ORDER };
explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
// Default to Type Defined Order
ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
ColumnOrder::type get_order() { return column_order_; }
static ColumnOrder undefined_;
static ColumnOrder type_defined_;
private:
ColumnOrder::type column_order_;
};
// ----------------------------------------------------------------------
struct ByteArray {
ByteArray() : len(0), ptr(NULLPTR) {}
ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion
: ByteArray(static_cast<uint32_t>(view.size()),
reinterpret_cast<const uint8_t*>(view.data())) {}
uint32_t len;
const uint8_t* ptr;
};
inline bool operator==(const ByteArray& left, const ByteArray& right) {
return left.len == right.len &&
(left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
}
inline bool operator!=(const ByteArray& left, const ByteArray& right) {
return !(left == right);
}
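// Example: ByteArray does not own its bytes; equality compares lengths and
// contents. A sketch:
//
//   ::arrow::util::string_view sv = "hello";
//   parquet::ByteArray a(sv);      // captures length and pointer implicitly
//   parquet::ByteArray b(5, a.ptr);
//   bool same = (a == b);          // true: same length and bytes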
struct FixedLenByteArray {
FixedLenByteArray() : ptr(NULLPTR) {}
explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
const uint8_t* ptr;
};
using FLBA = FixedLenByteArray;
// Julian day at unix epoch.
//
// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
// the Julian day count starting from noon Universal time, with Julian day
// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
// calendar).
constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);
constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
STRUCT_END(Int96, 12);
inline bool operator==(const Int96& left, const Int96& right) {
return std::equal(left.value, left.value + 3, right.value);
}
inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
static inline std::string ByteArrayToString(const ByteArray& a) {
return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
}
static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
}
struct DecodedInt96 {
uint64_t days_since_epoch;
uint64_t nanoseconds;
};
static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
// We do the computations in the unsigned domain to avoid undefined behaviour
// on overflow.
DecodedInt96 result;
result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
result.nanoseconds = 0;
std::memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
return result;
}
static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
const auto decoded = DecodeInt96Timestamp(i96);
return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
decoded.nanoseconds);
}
static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
const auto decoded = DecodeInt96Timestamp(i96);
uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
microseconds);
}
static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
const auto decoded = DecodeInt96Timestamp(i96);
uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
milliseconds);
}
static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
const auto decoded = DecodeInt96Timestamp(i96);
uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
}
static inline std::string Int96ToString(const Int96& a) {
std::ostringstream result;
std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
return result.str();
}
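// Example: round-tripping an Int96 timestamp. A sketch; per the helpers above,
// value[2] holds the Julian day and the first eight bytes hold nanoseconds
// within that day:
//
//   parquet::Int96 ts;
//   ts.value[2] = static_cast<uint32_t>(parquet::kJulianToUnixEpochDays);
//   parquet::Int96SetNanoSeconds(ts, INT64_C(1500000000));
//   parquet::Int96GetNanoSeconds(ts);   // 1500000000 (1.5 s after the epoch)
//   parquet::Int96GetMilliSeconds(ts);  // 1500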
static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
std::ostringstream result;
std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
return result.str();
}
template <Type::type TYPE>
struct type_traits {};
template <>
struct type_traits<Type::BOOLEAN> {
using value_type = bool;
static constexpr int value_byte_size = 1;
static constexpr const char* printf_code = "d";
};
template <>
struct type_traits<Type::INT32> {
using value_type = int32_t;
static constexpr int value_byte_size = 4;
static constexpr const char* printf_code = "d";
};
template <>
struct type_traits<Type::INT64> {
using value_type = int64_t;
static constexpr int value_byte_size = 8;
static constexpr const char* printf_code =
(sizeof(long) == sizeof(int64_t)) ? "ld" : "lld";  // NOLINT: runtime/int
};
template <>
struct type_traits<Type::INT96> {
using value_type = Int96;
static constexpr int value_byte_size = 12;
static constexpr const char* printf_code = "s";
};
template <>
struct type_traits<Type::FLOAT> {
using value_type = float;
static constexpr int value_byte_size = 4;
static constexpr const char* printf_code = "f";
};
template <>
struct type_traits<Type::DOUBLE> {
using value_type = double;
static constexpr int value_byte_size = 8;
static constexpr const char* printf_code = "lf";
};
template <>
struct type_traits<Type::BYTE_ARRAY> {
using value_type = ByteArray;
static constexpr int value_byte_size = sizeof(ByteArray);
static constexpr const char* printf_code = "s";
};
template <>
struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
using value_type = FixedLenByteArray;
static constexpr int value_byte_size = sizeof(FixedLenByteArray);
static constexpr const char* printf_code = "s";
};
template <Type::type TYPE>
struct PhysicalType {
using c_type = typename type_traits<TYPE>::value_type;
static constexpr Type::type type_num = TYPE;
};
using BooleanType = PhysicalType<Type::BOOLEAN>;
using Int32Type = PhysicalType<Type::INT32>;
using Int64Type = PhysicalType<Type::INT64>;
using Int96Type = PhysicalType<Type::INT96>;
using FloatType = PhysicalType<Type::FLOAT>;
using DoubleType = PhysicalType<Type::DOUBLE>;
using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
template <typename Type>
inline std::string format_fwf(int width) {
std::stringstream ss;
ss << "%-" << width << type_traits<Type::type_num>::printf_code;
return ss.str();
}
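// Example: format_fwf builds a left-aligned fixed-width printf format string
// from the type's printf_code. A sketch:
//
//   parquet::format_fwf<parquet::Int32Type>(10);  // "%-10d"
//   parquet::format_fwf<parquet::FloatType>(12);  // "%-12f"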
PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
PARQUET_EXPORT std::string TypeToString(Type::type t);
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
::arrow::util::string_view val);
PARQUET_EXPORT int GetTypeByteSize(Type::type t);
PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);
PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
Type::type primitive);
PARQUET_EXPORT SortOrder::type GetSortOrder(
const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
} // namespace parquet

View File

@ -0,0 +1,21 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/util/windows_compatibility.h"
#include "parquet/windows_fixup.h"

View File

@ -0,0 +1,29 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This header needs to be included multiple times.
#include "arrow/util/windows_fixup.h"
#ifdef _WIN32
// parquet.thrift's OPTIONAL RepetitionType conflicts with a Windows #define
#ifdef OPTIONAL
#undef OPTIONAL
#endif
#endif // _WIN32