mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-23 02:40:43 +00:00
161 lines
5.7 KiB
C++
161 lines
5.7 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
// This module defines an abstract interface for iterating through pages in a
|
|
// Parquet column chunk within a row group. It could be extended in the future
|
|
// to iterate through all data pages in all chunks in a file.
|
|
|
|
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include <memory>
|
|
#include <string>
|
|
|
|
#include "parquet/statistics.h"
|
|
#include "parquet/types.h"
|
|
|
|
namespace parquet {
|
|
|
|
// TODO: Parallel processing is not yet safe because of memory-ownership
|
|
// semantics (the PageReader may or may not own the memory referenced by a
|
|
// page)
|
|
//
|
|
// TODO(wesm): In the future Parquet implementations may store the crc code
|
|
// in format::PageHeader. parquet-mr currently does not, so we also skip it
|
|
// here, both on the read and write path
|
|
class Page {
|
|
public:
|
|
Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
|
|
: buffer_(buffer), type_(type) {}
|
|
|
|
PageType::type type() const { return type_; }
|
|
|
|
std::shared_ptr<Buffer> buffer() const { return buffer_; }
|
|
|
|
// @returns: a pointer to the page's data
|
|
const uint8_t* data() const { return buffer_->data(); }
|
|
|
|
// @returns: the total size in bytes of the page's data buffer
|
|
int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
|
|
|
|
private:
|
|
std::shared_ptr<Buffer> buffer_;
|
|
PageType::type type_;
|
|
};
|
|
|
|
/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
|
|
class DataPage : public Page {
|
|
public:
|
|
int32_t num_values() const { return num_values_; }
|
|
Encoding::type encoding() const { return encoding_; }
|
|
int64_t uncompressed_size() const { return uncompressed_size_; }
|
|
const EncodedStatistics& statistics() const { return statistics_; }
|
|
|
|
virtual ~DataPage() = default;
|
|
|
|
protected:
|
|
DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
|
|
Encoding::type encoding, int64_t uncompressed_size,
|
|
const EncodedStatistics& statistics = EncodedStatistics())
|
|
: Page(buffer, type),
|
|
num_values_(num_values),
|
|
encoding_(encoding),
|
|
uncompressed_size_(uncompressed_size),
|
|
statistics_(statistics) {}
|
|
|
|
int32_t num_values_;
|
|
Encoding::type encoding_;
|
|
int64_t uncompressed_size_;
|
|
EncodedStatistics statistics_;
|
|
};
|
|
|
|
class DataPageV1 : public DataPage {
|
|
public:
|
|
DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
|
|
Encoding::type encoding, Encoding::type definition_level_encoding,
|
|
Encoding::type repetition_level_encoding, int64_t uncompressed_size,
|
|
const EncodedStatistics& statistics = EncodedStatistics())
|
|
: DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
|
|
statistics),
|
|
definition_level_encoding_(definition_level_encoding),
|
|
repetition_level_encoding_(repetition_level_encoding) {}
|
|
|
|
Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
|
|
|
|
Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
|
|
|
|
private:
|
|
Encoding::type definition_level_encoding_;
|
|
Encoding::type repetition_level_encoding_;
|
|
};
|
|
|
|
class DataPageV2 : public DataPage {
|
|
public:
|
|
DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
|
|
int32_t num_rows, Encoding::type encoding,
|
|
int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
|
|
int64_t uncompressed_size, bool is_compressed = false,
|
|
const EncodedStatistics& statistics = EncodedStatistics())
|
|
: DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
|
|
statistics),
|
|
num_nulls_(num_nulls),
|
|
num_rows_(num_rows),
|
|
definition_levels_byte_length_(definition_levels_byte_length),
|
|
repetition_levels_byte_length_(repetition_levels_byte_length),
|
|
is_compressed_(is_compressed) {}
|
|
|
|
int32_t num_nulls() const { return num_nulls_; }
|
|
|
|
int32_t num_rows() const { return num_rows_; }
|
|
|
|
int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
|
|
|
|
int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
|
|
|
|
bool is_compressed() const { return is_compressed_; }
|
|
|
|
private:
|
|
int32_t num_nulls_;
|
|
int32_t num_rows_;
|
|
int32_t definition_levels_byte_length_;
|
|
int32_t repetition_levels_byte_length_;
|
|
bool is_compressed_;
|
|
};
|
|
|
|
class DictionaryPage : public Page {
|
|
public:
|
|
DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
|
|
Encoding::type encoding, bool is_sorted = false)
|
|
: Page(buffer, PageType::DICTIONARY_PAGE),
|
|
num_values_(num_values),
|
|
encoding_(encoding),
|
|
is_sorted_(is_sorted) {}
|
|
|
|
int32_t num_values() const { return num_values_; }
|
|
|
|
Encoding::type encoding() const { return encoding_; }
|
|
|
|
bool is_sorted() const { return is_sorted_; }
|
|
|
|
private:
|
|
int32_t num_values_;
|
|
Encoding::type encoding_;
|
|
bool is_sorted_;
|
|
};
|
|
|
|
} // namespace parquet
|