first commit

This commit is contained in:
Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

View File

@@ -0,0 +1,135 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "parquet/encryption/encryption.h"
#include "parquet/encryption/file_key_wrapper.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
/// Cipher assigned to EncryptionConfiguration::encryption_algorithm when the caller
/// does not choose one.
static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
    ParquetCipher::AES_GCM_V1;
/// Default for EncryptionConfiguration::plaintext_footer (footer is encrypted).
static constexpr bool kDefaultPlaintextFooter = false;

/// Default for EncryptionConfiguration::double_wrapping (DEKs wrapped with KEKs).
static constexpr bool kDefaultDoubleWrapping = true;

/// Default lifetime of cached entities, in seconds (10 minutes).
static constexpr double kDefaultCacheLifetimeSeconds = 600.0;

/// Default for EncryptionConfiguration::internal_key_material (key material is kept
/// inside the Parquet file footer).
static constexpr bool kDefaultInternalKeyMaterial = true;

/// Default for EncryptionConfiguration::uniform_encryption.
static constexpr bool kDefaultUniformEncryption = false;

/// Default length, in bits, of generated data encryption keys.
static constexpr int32_t kDefaultDataKeyLengthBits = 128;
/// High-level encryption settings consumed by
/// CryptoFactory::GetFileEncryptionProperties().
struct PARQUET_EXPORT EncryptionConfiguration {
  /// \param footer_key ID of the master key for footer encryption/signing.
  explicit EncryptionConfiguration(const std::string& footer_key)
      : footer_key(footer_key) {}

  /// ID of the master key for footer encryption/signing.
  std::string footer_key;

  /// Columns to encrypt, together with their master key IDs (see HIVE-21848).
  /// Format: "masterKeyID:colName,colName;masterKeyID:colName..."
  ///
  /// Exactly one of the following must hold:
  ///   (1) column_keys is set, or
  ///   (2) uniform_encryption is set to true.
  /// An exception is thrown if neither holds, or if both hold.
  std::string column_keys;

  /// Encrypt the footer and all columns with the same encryption key.
  bool uniform_encryption = kDefaultUniformEncryption;

  /// Parquet encryption algorithm: "AES_GCM_V1" (default) or "AES_GCM_CTR_V1".
  ParquetCipher::type encryption_algorithm = kDefaultEncryptionAlgorithm;

  /// Write files with a plaintext footer.
  /// Defaults to false, i.e. files are written with an encrypted footer.
  bool plaintext_footer = kDefaultPlaintextFooter;

  /// Double wrapping: data encryption keys (DEKs) are encrypted with key encryption
  /// keys (KEKs), which in turn are encrypted with master keys.
  /// Defaults to true. When false, single wrapping is used and DEKs are encrypted
  /// directly with master keys.
  bool double_wrapping = kDefaultDoubleWrapping;

  /// Lifetime, in seconds, of cached entities (key encryption keys, local wrapping
  /// keys, KMS client objects). Defaults to 600 (10 minutes).
  double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;

  /// Store key material inside Parquet file footers; this mode doesn't produce
  /// additional files. Defaults to true. When false, key material is stored in
  /// separate files in the same folder, which enables key rotation for immutable
  /// Parquet files.
  bool internal_key_material = kDefaultInternalKeyMaterial;

  /// Length of the data encryption keys (DEKs) randomly generated by the parquet key
  /// management tools. Can be 128, 192 or 256 bits; defaults to 128 bits.
  int32_t data_key_length_bits = kDefaultDataKeyLengthBits;
};
/// High-level decryption settings consumed by
/// CryptoFactory::GetFileDecryptionProperties().
struct PARQUET_EXPORT DecryptionConfiguration {
  /// Lifetime, in seconds, of cached entities (key encryption keys, local wrapping
  /// keys, KMS client objects). Defaults to 600 (10 minutes).
  double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
};
/// Core class of the high-level encryption API.
///
/// It translates high-level encryption parameters (names of encrypted columns, names
/// of master keys, etc.) into low-level ones (key metadata, DEK, etc.), acting as a
/// factory for the low-level FileEncryptionProperties and FileDecryptionProperties
/// objects.
class PARQUET_EXPORT CryptoFactory {
 public:
  /// Register the KmsClientFactory to use. This must be done before calling either
  /// GetFileEncryptionProperties() or GetFileDecryptionProperties().
  void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory);

  /// Build low-level file encryption properties from the given KMS connection and
  /// encryption configuration.
  std::shared_ptr<FileEncryptionProperties> GetFileEncryptionProperties(
      const KmsConnectionConfig& kms_connection_config,
      const EncryptionConfiguration& encryption_config);

  /// Build low-level file decryption properties from the given KMS connection and
  /// decryption configuration.
  ///
  /// The returned FileDecryptionProperties object uses the cache held inside this
  /// CryptoFactory, so keep this CryptoFactory alive for as long as the returned
  /// object is in use.
  std::shared_ptr<FileDecryptionProperties> GetFileDecryptionProperties(
      const KmsConnectionConfig& kms_connection_config,
      const DecryptionConfiguration& decryption_config);

  /// Forward to KeyToolkit::RemoveCacheEntriesForToken() for \p access_token.
  void RemoveCacheEntriesForToken(const std::string& access_token) {
    key_toolkit_.RemoveCacheEntriesForToken(access_token);
  }

  /// Forward to KeyToolkit::RemoveCacheEntriesForAllTokens().
  void RemoveCacheEntriesForAllTokens() {
    key_toolkit_.RemoveCacheEntriesForAllTokens();
  }

 private:
  /// Build the per-column encryption properties map from a column_keys string
  /// (format documented on EncryptionConfiguration::column_keys).
  ColumnPathToEncryptionPropertiesMap GetColumnEncryptionProperties(
      int dek_length, const std::string& column_keys, FileKeyWrapper* key_wrapper);

  /// Key utilities object for KMS client initialization and cache control.
  KeyToolkit key_toolkit_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,510 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <utility>
#include "parquet/exception.h"
#include "parquet/schema.h"
#include "parquet/types.h"
namespace parquet {
/// Cipher that FileEncryptionProperties::Builder starts with unless algorithm() is
/// called.
static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
    ParquetCipher::AES_GCM_V1;
/// Upper bound on the AAD metadata length.
// NOTE(review): unit is presumably bytes - confirm against the users of this constant.
static constexpr int32_t kMaximalAadMetadataLength = 256;

/// Initial value of FileEncryptionProperties::Builder's encrypted-footer flag.
static constexpr bool kDefaultEncryptedFooter = true;

/// Initial value of FileDecryptionProperties::Builder's plaintext-footer integrity
/// check flag.
static constexpr bool kDefaultCheckSignature = true;

/// Initial value of FileDecryptionProperties::Builder's "plaintext files allowed"
/// flag.
static constexpr bool kDefaultAllowPlaintextFiles = false;

/// Length of the file-unique part of the AAD.
// NOTE(review): unit is presumably bytes - confirm against the users of this constant.
static constexpr int32_t kAadFileUniqueLength = 8;
// Forward declarations; both classes are defined further down in this header.
class ColumnDecryptionProperties;
class ColumnEncryptionProperties;

/// Per-column decryption properties, keyed by a string column path.
using ColumnPathToDecryptionPropertiesMap =
    std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;

/// Per-column encryption properties, keyed by a string column path.
using ColumnPathToEncryptionPropertiesMap =
    std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
/// Interface for objects that supply a decryption key given its key metadata.
class PARQUET_EXPORT DecryptionKeyRetriever {
 public:
  /// Return the key that corresponds to \p key_metadata.
  virtual std::string GetKey(const std::string& key_metadata) = 0;

  virtual ~DecryptionKeyRetriever() = default;
};
/// Simple key retriever whose keys are registered under integer IDs.
class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
 public:
  /// Register \p key under the integer \p key_id.
  void PutKey(uint32_t key_id, const std::string& key);

  /// Return the key for \p key_metadata (implemented out of line).
  std::string GetKey(const std::string& key_metadata) override;

 private:
  /// Registered keys, indexed by integer key ID.
  std::map<uint32_t, std::string> key_map_;
};
/// Simple key retriever whose keys are registered under string IDs.
class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
 public:
  /// Register \p key under the string \p key_id.
  void PutKey(const std::string& key_id, const std::string& key);

  /// Return the key for \p key_metadata (implemented out of line).
  std::string GetKey(const std::string& key_metadata) override;

 private:
  /// Registered keys, indexed by string key ID.
  std::map<std::string, std::string> key_map_;
};
/// ParquetException subtype constructed from a column path; the path is forwarded to
/// the ParquetException constructor as a C string.
class PARQUET_EXPORT HiddenColumnException : public ParquetException {
 public:
  explicit HiddenColumnException(const std::string& columnPath)
      : ParquetException(columnPath.c_str()) {}
};
/// ParquetException subtype constructed from a column path; the path is forwarded to
/// the ParquetException constructor as a C string.
class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
 public:
  explicit KeyAccessDeniedException(const std::string& columnPath)
      : ParquetException(columnPath.c_str()) {}
};
/// View the contents of \p str as unsigned bytes.
///
/// \return NULLPTR when \p str is empty; otherwise a pointer to the string's own
/// character buffer (no copy is made, so the pointer is only usable while \p str is
/// alive and unmodified).
inline const uint8_t* str2bytes(const std::string& str) {
  if (str.empty()) {
    return NULLPTR;
  }
  return reinterpret_cast<const uint8_t*>(str.data());
}
class PARQUET_EXPORT ColumnEncryptionProperties {
public:
class PARQUET_EXPORT Builder {
public:
/// Convenience builder for encrypted columns.
explicit Builder(const std::string& name) : Builder(name, true) {}
/// Convenience builder for encrypted columns.
explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
: Builder(path->ToDotString(), true) {}
/// Set a column-specific key.
/// If key is not set on an encrypted column, the column will
/// be encrypted with the footer key.
/// keyBytes Key length must be either 16, 24 or 32 bytes.
/// The key is cloned, and will be wiped out (array values set to 0) upon completion
/// of file writing.
/// Caller is responsible for wiping out the input key array.
Builder* key(std::string column_key);
/// Set a key retrieval metadata.
/// use either key_metadata() or key_id(), not both
Builder* key_metadata(const std::string& key_metadata);
/// A convenience function to set key metadata using a string id.
/// Set a key retrieval metadata (converted from String).
/// use either key_metadata() or key_id(), not both
/// key_id will be converted to metadata (UTF-8 array).
Builder* key_id(const std::string& key_id);
std::shared_ptr<ColumnEncryptionProperties> build() {
return std::shared_ptr<ColumnEncryptionProperties>(
new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
}
private:
const std::string column_path_;
bool encrypted_;
std::string key_;
std::string key_metadata_;
Builder(const std::string path, bool encrypted)
: column_path_(path), encrypted_(encrypted) {}
};
std::string column_path() const { return column_path_; }
bool is_encrypted() const { return encrypted_; }
bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
std::string key() const { return key_; }
std::string key_metadata() const { return key_metadata_; }
/// Upon completion of file writing, the encryption key
/// will be wiped out.
void WipeOutEncryptionKey() { key_.clear(); }
bool is_utilized() {
if (key_.empty())
return false; // can re-use column properties without encryption keys
return utilized_;
}
/// ColumnEncryptionProperties object can be used for writing one file only.
/// Mark ColumnEncryptionProperties as utilized once it is used in
/// FileEncryptionProperties as the encryption key will be wiped out upon
/// completion of file writing.
void set_utilized() { utilized_ = true; }
std::shared_ptr<ColumnEncryptionProperties> DeepClone() {
std::string key_copy = key_;
return std::shared_ptr<ColumnEncryptionProperties>(new ColumnEncryptionProperties(
encrypted_, column_path_, key_copy, key_metadata_));
}
ColumnEncryptionProperties() = default;
ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
private:
const std::string column_path_;
bool encrypted_;
bool encrypted_with_footer_key_;
std::string key_;
std::string key_metadata_;
bool utilized_;
explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path,
const std::string& key,
const std::string& key_metadata);
};
class PARQUET_EXPORT ColumnDecryptionProperties {
public:
class PARQUET_EXPORT Builder {
public:
explicit Builder(const std::string& name) : column_path_(name) {}
explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
: Builder(path->ToDotString()) {}
/// Set an explicit column key. If applied on a file that contains
/// key metadata for this column the metadata will be ignored,
/// the column will be decrypted with this key.
/// key length must be either 16, 24 or 32 bytes.
Builder* key(const std::string& key);
std::shared_ptr<ColumnDecryptionProperties> build();
private:
const std::string column_path_;
std::string key_;
};
ColumnDecryptionProperties() = default;
ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;
std::string column_path() const { return column_path_; }
std::string key() const { return key_; }
bool is_utilized() { return utilized_; }
/// ColumnDecryptionProperties object can be used for reading one file only.
/// Mark ColumnDecryptionProperties as utilized once it is used in
/// FileDecryptionProperties as the encryption key will be wiped out upon
/// completion of file reading.
void set_utilized() { utilized_ = true; }
/// Upon completion of file reading, the encryption key
/// will be wiped out.
void WipeOutDecryptionKey();
std::shared_ptr<ColumnDecryptionProperties> DeepClone();
private:
const std::string column_path_;
std::string key_;
bool utilized_;
/// This class is only required for setting explicit column decryption keys -
/// to override key retriever (or to provide keys when key metadata and/or
/// key retriever are not available)
explicit ColumnDecryptionProperties(const std::string& column_path,
const std::string& key);
};
/// Callback interface for checking the AAD prefix stored in a file.
class PARQUET_EXPORT AADPrefixVerifier {
 public:
  /// Verify the identity (AAD prefix) of an individual file, or of a collection of
  /// files in a data set.
  ///
  /// Implementations throw an exception if the AAD prefix is wrong. For a data set,
  /// the AAD prefixes should be collected and then checked for missing files.
  virtual void Verify(const std::string& aad_prefix) = 0;

  virtual ~AADPrefixVerifier() = default;
};
/// Low-level settings used to decrypt a Parquet file: explicit footer/column keys, a
/// key retriever callback, AAD prefix handling and plaintext-related switches.
/// Instances are created via Builder.
class PARQUET_EXPORT FileDecryptionProperties {
 public:
  class PARQUET_EXPORT Builder {
   public:
    /// Start with footer signature verification at its default
    /// (kDefaultCheckSignature) and plaintext files handled per
    /// kDefaultAllowPlaintextFiles.
    Builder()
        : check_plaintext_footer_integrity_(kDefaultCheckSignature),
          plaintext_files_allowed_(kDefaultAllowPlaintextFiles) {}

    /// Set an explicit footer key.
    ///
    /// If applied on a file that contains footer key metadata, the metadata is
    /// ignored and the footer is decrypted/verified with this key. If no explicit
    /// key is set, the footer key is fetched from the key retriever.
    ///
    /// With explicit keys or an AAD prefix, a new properties object must be created
    /// for each encrypted file. Explicit encryption keys (footer and column) are
    /// cloned. Upon completion of file reading, the cloned encryption keys in the
    /// properties will be wiped out (array values set to 0). The caller is
    /// responsible for wiping out the input key array.
    ///
    /// \param footer_key Key length must be either 16, 24 or 32 bytes.
    Builder* footer_key(const std::string footer_key);

    /// Set explicit column keys (decryption properties).
    ///
    /// It's also possible to set a key retriever on this property object. Upon file
    /// decryption, availability of explicit keys is checked before invocation of
    /// the retriever callback. If an explicit key is available for a footer or a
    /// column, its key metadata will be ignored.
    Builder* column_keys(
        const ColumnPathToDecryptionPropertiesMap& column_decryption_properties);

    /// Set a key retriever callback.
    ///
    /// It's also possible to set explicit footer or column keys on this file
    /// property object. Upon file decryption, availability of explicit keys is
    /// checked before invocation of the retriever callback. If an explicit key is
    /// available for a footer or a column, its key metadata will be ignored.
    Builder* key_retriever(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever);

    /// Skip integrity verification of plaintext footers.
    ///
    /// If not called, integrity of plaintext footers will be checked at runtime, and
    /// an exception will be thrown in the following situations:
    /// - the footer signing key is not available
    ///   (not passed, or not found by the key retriever)
    /// - footer content and signature don't match
    Builder* disable_footer_signature_verification() {
      check_plaintext_footer_integrity_ = false;
      return this;
    }

    /// Explicitly supply the file AAD prefix.
    ///
    /// A must when a prefix is used for file encryption, but not stored in the file.
    /// If the AAD prefix is stored in the file, it will be compared to the explicitly
    /// supplied value and an exception will be thrown if they differ.
    Builder* aad_prefix(const std::string& aad_prefix);

    /// Set the callback for verification of AAD prefixes stored in the file.
    Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);

    /// Allow reading plaintext (unencrypted) files.
    ///
    /// By default, reading plaintext files is not allowed when using a decryptor, in
    /// order to detect files that were not encrypted by mistake. Calling this method
    /// overrides that default; the caller should then use a different method to
    /// ensure encryption of files with sensitive data.
    Builder* plaintext_files_allowed() {
      plaintext_files_allowed_ = true;
      return this;
    }

    /// Create the properties object from the values collected so far.
    std::shared_ptr<FileDecryptionProperties> build() {
      std::shared_ptr<FileDecryptionProperties> properties(new FileDecryptionProperties(
          footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
          aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
      return properties;
    }

   private:
    std::string footer_key_;
    std::string aad_prefix_;
    std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
    ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
    std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
    bool check_plaintext_footer_integrity_;
    bool plaintext_files_allowed_;
  };

  /// Explicit key for the column at \p column_path (implemented out of line).
  std::string column_key(const std::string& column_path) const;

  std::string footer_key() const { return footer_key_; }

  std::string aad_prefix() const { return aad_prefix_; }

  const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
    return key_retriever_;
  }

  bool check_plaintext_footer_integrity() const {
    return check_plaintext_footer_integrity_;
  }

  bool plaintext_files_allowed() const { return plaintext_files_allowed_; }

  const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
    return aad_prefix_verifier_;
  }

  /// Upon completion of file reading, the encryption keys in the properties
  /// will be wiped out (array values set to 0).
  void WipeOutDecryptionKeys();

  bool is_utilized();

  /// A FileDecryptionProperties object can be used for reading one file only.
  /// Mark it as utilized once it is used to read a file, as the encryption keys will
  /// be wiped out upon completion of file reading.
  void set_utilized() { utilized_ = true; }

  /// Clone identical properties for another file, with an option to update the
  /// aadPrefix (if newAadPrefix is null, aadPrefix will be cloned too).
  ///
  /// A FileDecryptionProperties object can be used for reading one file only (unless
  /// it keeps the keyRetrieval callback only, and no explicit keys or aadPrefix); at
  /// the end, keys are wiped out in the memory.
  std::shared_ptr<FileDecryptionProperties> DeepClone(std::string new_aad_prefix = "");

 private:
  std::string footer_key_;
  std::string aad_prefix_;
  std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
  const std::string empty_string_ = "";
  ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
  std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
  bool check_plaintext_footer_integrity_;
  bool plaintext_files_allowed_;
  bool utilized_;

  FileDecryptionProperties(
      const std::string& footer_key,
      std::shared_ptr<DecryptionKeyRetriever> key_retriever,
      bool check_plaintext_footer_integrity, const std::string& aad_prefix,
      std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
      const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
      bool plaintext_files_allowed);
};
/// Low-level settings used to encrypt a Parquet file: cipher, footer key and its
/// metadata, AAD prefix handling and the per-column encryption properties.
/// Instances are created via Builder.
class PARQUET_EXPORT FileEncryptionProperties {
 public:
  class PARQUET_EXPORT Builder {
   public:
    /// Start from \p footer_key with the default cipher
    /// (kDefaultEncryptionAlgorithm), the default footer mode
    /// (kDefaultEncryptedFooter) and AAD prefix storage switched off.
    explicit Builder(const std::string& footer_key)
        : parquet_cipher_(kDefaultEncryptionAlgorithm),
          encrypted_footer_(kDefaultEncryptedFooter),
          footer_key_(footer_key),
          store_aad_prefix_in_file_(false) {}

    /// Create files with plaintext footer.
    /// If not called, the files will be created with encrypted footer (default).
    Builder* set_plaintext_footer() {
      encrypted_footer_ = false;
      return this;
    }

    /// Set the encryption algorithm.
    /// If not called, files will be encrypted with AES_GCM_V1 (default).
    Builder* algorithm(ParquetCipher::type parquet_cipher) {
      parquet_cipher_ = parquet_cipher;
      return this;
    }

    /// Set a key retrieval metadata (converted from String).
    /// Use either footer_key_metadata or footer_key_id, not both.
    Builder* footer_key_id(const std::string& key_id);

    /// Set a key retrieval metadata.
    /// Use either footer_key_metadata or footer_key_id, not both.
    Builder* footer_key_metadata(const std::string& footer_key_metadata);

    /// Set the file AAD prefix.
    Builder* aad_prefix(const std::string& aad_prefix);

    /// Skip storing the AAD prefix in the file.
    /// If not called, and if an AAD prefix is set, it will be stored.
    Builder* disable_aad_prefix_storage();

    /// Set the list of encrypted columns and their properties (keys etc).
    /// If not called, all columns will be encrypted with the footer key.
    /// If called, the file columns not in the list will be left unencrypted.
    Builder* encrypted_columns(
        const ColumnPathToEncryptionPropertiesMap& encrypted_columns);

    /// Create the properties object from the values collected so far.
    std::shared_ptr<FileEncryptionProperties> build() {
      std::shared_ptr<FileEncryptionProperties> properties(new FileEncryptionProperties(
          parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
          aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
      return properties;
    }

   private:
    ParquetCipher::type parquet_cipher_;
    bool encrypted_footer_;
    std::string footer_key_;
    std::string footer_key_metadata_;
    std::string aad_prefix_;
    bool store_aad_prefix_in_file_;
    ColumnPathToEncryptionPropertiesMap encrypted_columns_;
  };

  bool encrypted_footer() const { return encrypted_footer_; }

  EncryptionAlgorithm algorithm() const { return algorithm_; }

  std::string footer_key() const { return footer_key_; }

  std::string footer_key_metadata() const { return footer_key_metadata_; }

  std::string file_aad() const { return file_aad_; }

  /// Encryption properties for the column at \p column_path (implemented out of
  /// line).
  std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
      const std::string& column_path);

  bool is_utilized() const { return utilized_; }

  /// A FileEncryptionProperties object can be used for writing one file only.
  /// Mark it as utilized once it is used to write a file, as the encryption keys will
  /// be wiped out upon completion of file writing.
  void set_utilized() { utilized_ = true; }

  /// Upon completion of file writing, the encryption keys
  /// will be wiped out (array values set to 0).
  void WipeOutEncryptionKeys();

  /// Clone identical properties for another file, with an option to update the
  /// aadPrefix (if newAadPrefix is null, aadPrefix will be cloned too).
  ///
  /// A FileEncryptionProperties object can be used for writing one file only (at the
  /// end, keys are wiped out in the memory).
  std::shared_ptr<FileEncryptionProperties> DeepClone(std::string new_aad_prefix = "");

  /// Copy of the per-column encryption properties map.
  ColumnPathToEncryptionPropertiesMap encrypted_columns() const {
    return encrypted_columns_;
  }

 private:
  EncryptionAlgorithm algorithm_;
  std::string footer_key_;
  std::string footer_key_metadata_;
  bool encrypted_footer_;
  std::string file_aad_;
  std::string aad_prefix_;
  bool utilized_;
  bool store_aad_prefix_in_file_;
  ColumnPathToEncryptionPropertiesMap encrypted_columns_;

  FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key,
                           const std::string& footer_key_metadata, bool encrypted_footer,
                           const std::string& aad_prefix, bool store_aad_prefix_in_file,
                           const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
};
} // namespace parquet

View File

@@ -0,0 +1,31 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
namespace parquet {
namespace encryption {
// Placeholder for a store that keeps key material outside the Parquet file, for
// example in a separate small file in the same folder. External storage matters for
// "key rotation", when MEKs have to be changed (if compromised; or periodically, just
// in case) without modifying the Parquet files, which are often immutable.
//
// TODO: details will be implemented later
class FileKeyMaterialStore {
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,66 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/util/concurrent_map.h"
#include "parquet/encryption/encryption.h"
#include "parquet/encryption/key_material.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/key_toolkit_internal.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// This class retrieves the key from "key metadata", following these steps:
// 1. Parse "key metadata" (see structure in KeyMetadata class).
// 2. Retrieve "key material", which can be stored inside or outside "key metadata".
//    Storing "key material" outside "key metadata" is not supported yet.
// 3. Unwrap the "data encryption key" from "key material". There are 2 modes:
//    3.1. single wrapping: decrypt the wrapped "data encryption key" directly with
//         the "master encryption key"
//    3.2. double wrapping: 2 steps:
//         3.2.1. the "key encryption key" is decrypted with the "master encryption
//                key"
//         3.2.2. the "data encryption key" is decrypted with the above "key
//                encryption key"
class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever {
 public:
  /// key_toolkit and kms_connection_config are used to get the KmsClient from the
  /// cache, or to create the KmsClient if it is not in the cache yet.
  /// cache_lifetime_seconds is the lifetime of the KmsClient in the cache.
  FileKeyUnwrapper(KeyToolkit* key_toolkit,
                   const KmsConnectionConfig& kms_connection_config,
                   double cache_lifetime_seconds);

  /// Return the key for \p key_metadata (implemented out of line; see the steps in
  /// the class comment).
  std::string GetKey(const std::string& key_metadata) override;

 private:
  internal::KeyWithMasterId GetDataEncryptionKey(const KeyMaterial& key_material);

  std::shared_ptr<KmsClient> GetKmsClientFromConfigOrKeyMaterial(
      const KeyMaterial& key_material);

  /// A map of Key Encryption Key (KEK) ID -> KEK bytes, for the current token
  std::shared_ptr<::arrow::util::ConcurrentMap<std::string, std::string>>
      kek_per_kek_id_;

  KeyToolkit* key_toolkit_;
  KmsConnectionConfig kms_connection_config_;
  const double cache_entry_lifetime_seconds_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,82 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "arrow/util/concurrent_map.h"
#include "parquet/encryption/file_key_material_store.h"
#include "parquet/encryption/key_encryption_key.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// This class generates "key metadata" from a "data encryption key" and a "master
// key", following these steps:
// 1. Wrap the "data encryption key". There are 2 modes:
//    1.1. single wrapping: encrypt the "data encryption key" directly with the
//         "master encryption key"
//    1.2. double wrapping: 2 steps:
//         1.2.1. a "key encryption key" is randomized (see KeyEncryptionKey class)
//         1.2.2. the "data encryption key" is encrypted with the above "key
//                encryption key"
// 2. Create "key material" (see structure in KeyMaterial class)
// 3. Create "key metadata" with the "key material" inside, or with a reference to
//    outside "key material" (see structure in KeyMetadata class).
//    Storing "key material" outside "key metadata" is not supported yet.
class PARQUET_EXPORT FileKeyWrapper {
 public:
  static constexpr int kKeyEncryptionKeyLength = 16;
  static constexpr int kKeyEncryptionKeyIdLength = 16;

  /// key_toolkit and kms_connection_config are used to get the KmsClient from the
  /// cache, or to create the KmsClient if it is not in the cache yet.
  /// cache_entry_lifetime_seconds is the lifetime of the KmsClient in the cache.
  /// key_material_store is used to store "key material" outside the parquet file;
  /// pass NULL if "key material" is stored inside the parquet file.
  FileKeyWrapper(KeyToolkit* key_toolkit,
                 const KmsConnectionConfig& kms_connection_config,
                 std::shared_ptr<FileKeyMaterialStore> key_material_store,
                 double cache_entry_lifetime_seconds, bool double_wrapping);

  /// Create the key_metadata field for a given data key, by wrapping the key with the
  /// master key.
  std::string GetEncryptionKeyMetadata(const std::string& data_key,
                                       const std::string& master_key_id,
                                       bool is_footer_key);

 private:
  KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id);

  /// A map of Master Encryption Key ID -> KeyEncryptionKey, for the current token
  std::shared_ptr<::arrow::util::ConcurrentMap<std::string, KeyEncryptionKey>>
      kek_per_master_key_id_;

  std::shared_ptr<KmsClient> kms_client_;
  KmsConnectionConfig kms_connection_config_;
  std::shared_ptr<FileKeyMaterialStore> key_material_store_;
  const double cache_entry_lifetime_seconds_;
  const bool double_wrapping_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,59 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <vector>
#include "arrow/util/base64.h"
namespace parquet {
namespace encryption {
// In the double wrapping mode, each "data encryption key" (DEK) is encrypted with a "key
// encryption key" (KEK), that in turn is encrypted with a "master encryption key" (MEK).
// In a writer process, a random KEK is generated for each MEK ID, and cached in a
// <MEK-ID : KEK> map. This makes it possible to interact with a KMS server only once for
// each MEK, in order to wrap its KEK. DEK wrapping is performed locally, and does not
// involve an interaction with a KMS server.
/// Value holder for one key encryption key (KEK): its raw bytes, its ID, the base64
/// encoding of that ID, and the already-encoded wrapped form of the KEK.
class KeyEncryptionKey {
public:
/// Takes ownership of the three strings. The base64 form of the KEK ID is computed
/// here, once, so that encoded_kek_id() is a plain accessor.
KeyEncryptionKey(std::string kek_bytes, std::string kek_id,
std::string encoded_wrapped_kek)
: kek_bytes_(std::move(kek_bytes)),
kek_id_(std::move(kek_id)),
// Reads the member kek_id_ (the parameter has been moved from at this point), so
// kek_id_ must stay declared before encoded_kek_id_ in the member list below.
encoded_kek_id_(::arrow::util::base64_encode(kek_id_)),
encoded_wrapped_kek_(std::move(encoded_wrapped_kek)) {}
/// The KEK bytes as passed to the constructor.
const std::string& kek_bytes() const { return kek_bytes_; }
/// The KEK ID as passed to the constructor.
const std::string& kek_id() const { return kek_id_; }
/// base64 encoding of kek_id().
const std::string& encoded_kek_id() const { return encoded_kek_id_; }
/// The wrapped KEK exactly as passed to the constructor (the caller supplies it
/// already encoded; no encoding is applied here).
const std::string& encoded_wrapped_kek() const { return encoded_wrapped_kek_; }
private:
// Declaration order matters: members are initialized in this order.
std::string kek_bytes_;
std::string kek_id_;
std::string encoded_kek_id_;
std::string encoded_wrapped_kek_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,131 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "parquet/platform.h"
namespace arrow {
namespace json {
namespace internal {
class ObjectParser;
} // namespace internal
} // namespace json
} // namespace arrow
namespace parquet {
namespace encryption {
// KeyMaterial class represents the "key material", keeping the information that allows
// readers to recover an encryption key (see description of the KeyMetadata class). The
// keytools package (PARQUET-1373) implements the "envelope encryption" pattern, in a
// "single wrapping" or "double wrapping" mode. In the single wrapping mode, the key
// material is generated by encrypting the "data encryption key" (DEK) by a "master key".
// In the double wrapping mode, the key material is generated by encrypting the DEK by a
// "key encryption key" (KEK), that in turn is encrypted by a "master key".
//
// Key material is kept in a flat json object, with the following fields:
// 1. "keyMaterialType" - a String, with the type of key material. In the current
// version, only one value is allowed - "PKMT1" (stands
// for "parquet key management tools, version 1"). For external key material storage,
// this field is written in both "key metadata" and "key material" jsons. For internal
// key material storage, this field is written only once in the common json.
// 2. "isFooterKey" - a boolean. If true, means that the material belongs to a file footer
// key, and keeps additional information (such as
// KMS instance ID and URL). If false, means that the material belongs to a column
// key.
// 3. "kmsInstanceID" - a String, with the KMS Instance ID. Written only in footer key
// material.
// 4. "kmsInstanceURL" - a String, with the KMS Instance URL. Written only in footer key
// material.
// 5. "masterKeyID" - a String, with the ID of the master key used to generate the
// material.
// 6. "wrappedDEK" - a String, with the wrapped DEK (base64 encoding).
// 7. "doubleWrapping" - a boolean. If true, means that the material was generated in
// double wrapping mode.
// If false - in single wrapping mode.
// 8. "keyEncryptionKeyID" - a String, with the ID of the KEK used to generate the
// material. Written only in double wrapping mode.
// 9. "wrappedKEK" - a String, with the wrapped KEK (base64 encoding). Written only in
// double wrapping mode.
class PARQUET_EXPORT KeyMaterial {
public:
// these fields are defined in a specification and should never be changed
static constexpr const char kKeyMaterialTypeField[] = "keyMaterialType";
static constexpr const char kKeyMaterialType1[] = "PKMT1";
static constexpr const char kFooterKeyIdInFile[] = "footerKey";
static constexpr const char kColumnKeyIdInFilePrefix[] = "columnKey";
static constexpr const char kIsFooterKeyField[] = "isFooterKey";
static constexpr const char kDoubleWrappingField[] = "doubleWrapping";
static constexpr const char kKmsInstanceIdField[] = "kmsInstanceID";
static constexpr const char kKmsInstanceUrlField[] = "kmsInstanceURL";
static constexpr const char kMasterKeyIdField[] = "masterKeyID";
static constexpr const char kWrappedDataEncryptionKeyField[] = "wrappedDEK";
static constexpr const char kKeyEncryptionKeyIdField[] = "keyEncryptionKeyID";
static constexpr const char kWrappedKeyEncryptionKeyField[] = "wrappedKEK";
public:
KeyMaterial() = default;
static KeyMaterial Parse(const std::string& key_material_string);
static KeyMaterial Parse(
const ::arrow::json::internal::ObjectParser* key_material_json);
/// This method returns a json string that will be stored either inside a parquet file
/// or in a key material store outside the parquet file.
static std::string SerializeToJson(bool is_footer_key,
const std::string& kms_instance_id,
const std::string& kms_instance_url,
const std::string& master_key_id,
bool is_double_wrapped, const std::string& kek_id,
const std::string& encoded_wrapped_kek,
const std::string& encoded_wrapped_dek,
bool is_internal_storage);
bool is_footer_key() const { return is_footer_key_; }
bool is_double_wrapped() const { return is_double_wrapped_; }
const std::string& master_key_id() const { return master_key_id_; }
const std::string& wrapped_dek() const { return encoded_wrapped_dek_; }
const std::string& kek_id() const { return kek_id_; }
const std::string& wrapped_kek() const { return encoded_wrapped_kek_; }
const std::string& kms_instance_id() const { return kms_instance_id_; }
const std::string& kms_instance_url() const { return kms_instance_url_; }
private:
KeyMaterial(bool is_footer_key, const std::string& kms_instance_id,
const std::string& kms_instance_url, const std::string& master_key_id,
bool is_double_wrapped, const std::string& kek_id,
const std::string& encoded_wrapped_kek,
const std::string& encoded_wrapped_dek);
bool is_footer_key_;
std::string kms_instance_id_;
std::string kms_instance_url_;
std::string master_key_id_;
bool is_double_wrapped_;
std::string kek_id_;
std::string encoded_wrapped_kek_;
std::string encoded_wrapped_dek_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,94 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "arrow/util/variant.h"
#include "parquet/encryption/key_material.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// Parquet encryption specification defines "key metadata" as an arbitrary byte array,
// generated by file writers for each encryption key, and passed to the low level API for
// storage in the file footer. The "key metadata" field is made available to file readers
// to enable recovery of the key. This interface can be utilized for implementation
// of any key management scheme.
//
// The keytools package (PARQUET-1373) implements one approach, of many possible, to key
// management and to generation of the "key metadata" fields. This approach, based on the
// "envelope encryption" pattern, allows integration with KMS servers. It keeps the actual
// material, required to recover a key, in a "key material" object (see the KeyMaterial
// class for details). This class is implemented to support version 1 of the parquet key
// management tools specification.
//
// KeyMetadata writes (and reads) the "key metadata" field as a flat json object,
// with the following fields:
// 1. "keyMaterialType" - a String, with the type of key material.
// 2. "internalStorage" - a boolean. If true, means that "key material" is kept inside the
// "key metadata" field. If false, "key material" is kept externally (outside Parquet
// files) - in this case, "key metadata" keeps a reference to the external "key material".
// 3. "keyReference" - a String, with the reference to the external "key material".
// Written only if internalStorage is false.
//
// If internalStorage is true, "key material" is a part of "key metadata", and the json
// keeps additional fields, described in the KeyMaterial class.
/// Parsed form of a "key metadata" field. Holds either the KeyMaterial itself
/// (internal storage) or a string reference to externally stored key material.
class PARQUET_EXPORT KeyMetadata {
public:
static constexpr const char kKeyMaterialInternalStorageField[] = "internalStorage";
static constexpr const char kKeyReferenceField[] = "keyReference";
/// key_metadata_bytes is the key metadata field stored in the parquet file,
/// in the serialized json object format.
static KeyMetadata Parse(const std::string& key_metadata_bytes);
/// Returns the serialized key metadata for key material that is stored externally
/// and identified by key_reference.
static std::string CreateSerializedForExternalMaterial(
const std::string& key_reference);
/// True when this object holds a KeyMaterial, false when it holds a key reference.
bool key_material_stored_internally() const { return is_internal_storage_; }
/// Returns the embedded key material.
/// Throws ParquetException if the key material is stored externally.
const KeyMaterial& key_material() const {
if (!is_internal_storage_) {
throw ParquetException("key material is stored externally.");
}
return ::arrow::util::get<KeyMaterial>(key_material_or_reference_);
}
/// Returns the reference to the external key material.
/// Throws ParquetException if the key material is stored internally.
const std::string& key_reference() const {
if (is_internal_storage_) {
throw ParquetException("key material is stored internally.");
}
return ::arrow::util::get<std::string>(key_material_or_reference_);
}
private:
// Construction is private; Parse() is the only factory declared in this class that
// returns a KeyMetadata.
explicit KeyMetadata(const KeyMaterial& key_material);
explicit KeyMetadata(const std::string& key_reference);
// Selects which alternative of key_material_or_reference_ the accessors read.
bool is_internal_storage_;
/// If is_internal_storage_ is true, KeyMaterial is set,
/// else a string referencing to an outside "key material" is set.
::arrow::util::Variant<KeyMaterial, std::string> key_material_or_reference_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include "parquet/encryption/key_encryption_key.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/encryption/two_level_cache_with_expiration.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// KeyToolkit is a utility that keeps various tools for key management (such as key
// rotation, kms client instantiation, cache control, etc), plus a number of auxiliary
// classes for internal use.
/// Owns the three per-token caches used for key management (KMS clients, KEKs for
/// wrapping, KEK bytes for unwrapping) and the registered KmsClientFactory.
class PARQUET_EXPORT KeyToolkit {
 public:
  /// KMS client two level cache: token -> KMSInstanceId -> KmsClient
  TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>>& kms_client_cache_per_token() {
    return kms_client_cache_;
  }

  /// Key encryption key two level cache for wrapping: token -> MasterEncryptionKeyId ->
  /// KeyEncryptionKey
  TwoLevelCacheWithExpiration<KeyEncryptionKey>& kek_write_cache_per_token() {
    return key_encryption_key_write_cache_;
  }

  /// Key encryption key two level cache for unwrapping: token -> KeyEncryptionKeyId ->
  /// KeyEncryptionKeyBytes
  TwoLevelCacheWithExpiration<std::string>& kek_read_cache_per_token() {
    return key_encryption_key_read_cache_;
  }

  /// Returns the KmsClient to use for kms_connection_config.
  // NOTE(review): the second parameter is named "_ms", while the cache API and the
  // other classes in this package express lifetimes in seconds. Presumably this value
  // is in seconds too - confirm against the implementation before relying on the name.
  std::shared_ptr<KmsClient> GetKmsClient(
      const KmsConnectionConfig& kms_connection_config, double cache_entry_lifetime_ms);

  /// Flush any caches that are tied to the (compromised) access_token
  void RemoveCacheEntriesForToken(const std::string& access_token);

  /// Flush the caches for every token.
  void RemoveCacheEntriesForAllTokens();

  /// Registers the factory used to create KMS clients.
  /// Only one registration is allowed per KeyToolkit: throws ParquetException if a
  /// factory has already been registered.
  void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory) {
    if (kms_client_factory_ != nullptr) {
      throw ParquetException("KMS client factory has already been registered.");
    }
    kms_client_factory_ = kms_client_factory;
  }

 private:
  TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>> kms_client_cache_;
  TwoLevelCacheWithExpiration<KeyEncryptionKey> key_encryption_key_write_cache_;
  TwoLevelCacheWithExpiration<std::string> key_encryption_key_read_cache_;
  // Null until RegisterKmsClientFactory() has been called.
  std::shared_ptr<KmsClientFactory> kms_client_factory_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,95 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "arrow/util/mutex.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
/// This class wraps the key access token of a KMS server. If your token changes over
/// time, you should keep the reference to the KeyAccessToken object and call Refresh()
/// method every time you have a new token.
class PARQUET_EXPORT KeyAccessToken {
 public:
  /// Creates a token with an empty value.
  KeyAccessToken() = default;

  /// Creates a token holding a copy of value.
  // Taken by const reference: the former by-value const parameter could not be moved
  // from, so every construction copied the string twice.
  explicit KeyAccessToken(const std::string& value) : value_(value) {}

  /// Replaces the stored token value with new_value, under the internal mutex.
  void Refresh(const std::string& new_value) {
    auto lock = mutex_.Lock();
    value_ = new_value;
  }

  /// Returns a reference to the stored token value.
  /// The mutex is held only while the reference is formed: it is released when this
  /// function returns, so a Refresh() running on another thread can modify the string
  /// while the caller is still reading through the reference. Callers that may race
  /// with Refresh() should copy the result immediately and must provide their own
  /// synchronization around that copy.
  const std::string& value() const {
    auto lock = mutex_.Lock();
    return value_;
  }

 private:
  std::string value_;
  // mutable so that the const accessor value() can lock it.
  mutable ::arrow::util::Mutex mutex_;
};
struct PARQUET_EXPORT KmsConnectionConfig {
std::string kms_instance_id;
std::string kms_instance_url;
/// If the access token is changed in the future, you should keep a reference to
/// this object and call Refresh() on it whenever there is a new access token.
std::shared_ptr<KeyAccessToken> refreshable_key_access_token;
std::unordered_map<std::string, std::string> custom_kms_conf;
KmsConnectionConfig();
const std::string& key_access_token() const {
if (refreshable_key_access_token == NULL ||
refreshable_key_access_token->value().empty()) {
throw ParquetException("key access token is not set!");
}
return refreshable_key_access_token->value();
}
void SetDefaultIfEmpty();
};
/// Abstract interface to a KMS: wraps and unwraps keys with a master key that is
/// identified by its ID.
class PARQUET_EXPORT KmsClient {
public:
// Placeholder strings; all three share the literal "DEFAULT".
static constexpr const char kKmsInstanceIdDefault[] = "DEFAULT";
static constexpr const char kKmsInstanceUrlDefault[] = "DEFAULT";
static constexpr const char kKeyAccessTokenDefault[] = "DEFAULT";
/// Wraps a key - encrypts it with the master key, encodes the result
/// and potentially adds a KMS-specific metadata.
virtual std::string WrapKey(const std::string& key_bytes,
const std::string& master_key_identifier) = 0;
/// Decrypts (unwraps) a key with the master key.
virtual std::string UnwrapKey(const std::string& wrapped_key,
const std::string& master_key_identifier) = 0;
// Virtual so that implementations can be destroyed through a KmsClient pointer.
virtual ~KmsClient() {}
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
/// Abstract factory for KmsClient instances.
class PARQUET_EXPORT KmsClientFactory {
public:
/// wrap_locally is only stored here; this base class never reads it. It is exposed to
/// subclasses through the protected member wrap_locally_.
explicit KmsClientFactory(bool wrap_locally = false) : wrap_locally_(wrap_locally) {}
virtual ~KmsClientFactory() = default;
/// Creates a KMS client for the given connection configuration.
virtual std::shared_ptr<KmsClient> CreateKmsClient(
const KmsConnectionConfig& kms_connection_config) = 0;
protected:
bool wrap_locally_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,96 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <unordered_map>
#include <vector>
#include "arrow/util/concurrent_map.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
/// This class supports local wrapping mode: master keys are fetched from the KMS
/// server and used to encrypt other keys (data encryption keys or key encryption keys).
class PARQUET_EXPORT LocalWrapKmsClient : public KmsClient {
public:
static constexpr const char kLocalWrapNoKeyVersion[] = "NO_VERSION";
explicit LocalWrapKmsClient(const KmsConnectionConfig& kms_connection_config);
// KmsClient interface.
std::string WrapKey(const std::string& key_bytes,
const std::string& master_key_identifier) override;
std::string UnwrapKey(const std::string& wrapped_key,
const std::string& master_key_identifier) override;
protected:
/// Get master key from the remote KMS server.
/// Note: this function might be called by multiple threads
virtual std::string GetMasterKeyFromServer(
const std::string& master_key_identifier) = 0;
private:
/// KMS systems wrap keys by encrypting them by master keys, and attaching additional
/// information (such as the version number of the master key) to the result of
/// encryption. The master key version is required in key rotation. Currently, the
/// local wrapping mode does not support key rotation (because not all KMS systems allow
/// to fetch a master key by its ID and version number). Still, the local wrapping mode
/// adds a placeholder for the master key version, that will enable support for key
/// rotation in this mode in the future, with appropriate KMS systems. This will also
/// enable backward compatibility, where future readers will be able to extract master
/// key version in the files written by the current code.
///
/// LocalKeyWrap class writes (and reads) the "key wrap" as a flat json with the
/// following fields:
/// 1. "masterKeyVersion" - a String, with the master key version. In the current
/// version, only one value is allowed - "NO_VERSION".
/// 2. "encryptedKey" - a String, with the key encrypted by the master key
/// (base64-encoded).
class LocalKeyWrap {
public:
static constexpr const char kLocalWrapKeyVersionField[] = "masterKeyVersion";
static constexpr const char kLocalWrapEncryptedKeyField[] = "encryptedKey";
LocalKeyWrap(std::string master_key_version, std::string encrypted_encoded_key);
/// Returns the serialized "key wrap" for the given encrypted, encoded key.
static std::string CreateSerialized(const std::string& encrypted_encoded_key);
/// Builds a LocalKeyWrap from its serialized form.
static LocalKeyWrap Parse(const std::string& wrapped_key);
const std::string& master_key_version() const { return master_key_version_; }
const std::string& encrypted_encoded_key() const { return encrypted_encoded_key_; }
private:
// NOTE(review): these members are declared in the opposite order to the constructor
// parameters. Members are initialized in declaration order, so the out-of-line
// constructor's initializer list should follow this order - confirm in the .cc file.
std::string encrypted_encoded_key_;
std::string master_key_version_;
};
std::string GetKeyFromServer(const std::string& key_identifier);
protected:
// Data members are protected, so subclasses can access them directly.
KmsConnectionConfig kms_connection_config_;
// Presumably master key identifier -> master key; the population logic is not
// visible in this header - confirm in the implementation.
::arrow::util::ConcurrentMap<std::string, std::string> master_key_cache_;
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,118 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This module declares helpers shared by the Parquet encryption tests: fixed keys,
// key IDs and column names, builders for key maps, and the FileEncryptor /
// FileDecryptor classes that write and read back encrypted files.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <gtest/gtest.h>
#include "arrow/util/io_util.h"
#include "parquet/encryption/encryption.h"
#include "parquet/test_util.h"
namespace parquet {
class ParquetFileReader;
namespace encryption {
namespace test {
using ::arrow::internal::TemporaryDir;
// NOTE(review): kFixedLength is presumably the length used for the fixed-length byte
// array test column - confirm against the test implementation.
constexpr int kFixedLength = 10;
// Fixed test keys. Each literal is 16 characters long, i.e. a 128-bit key.
const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16
const char kColumnEncryptionKey1[] = "1234567890123450";
const char kColumnEncryptionKey2[] = "1234567890123451";
const char kFileName[] = "tester";
// Get the path of file inside parquet test data directory
std::string data_file(const char* file);
// A temporary directory that contains the encrypted files generated in the tests.
// Only declared here (extern); the definition lives in another translation unit.
extern std::unique_ptr<TemporaryDir> temp_dir;
// Creates a new temporary directory using the prefix "parquet-encryption-test-".
// The outcome is returned as an ::arrow::Result, so callers must check it.
inline ::arrow::Result<std::unique_ptr<TemporaryDir>> temp_data_dir() {
return TemporaryDir::Make("parquet-encryption-test-");
}
// Field names, one per physical type named in the constant.
const char kDoubleFieldName[] = "double_field";
const char kFloatFieldName[] = "float_field";
const char kBooleanFieldName[] = "boolean_field";
const char kInt32FieldName[] = "int32_field";
const char kInt64FieldName[] = "int64_field";
const char kInt96FieldName[] = "int96_field";
const char kByteArrayFieldName[] = "ba_field";
const char kFixedLenByteArrayFieldName[] = "flba_field";
// Master key and master key ID for the footer.
const char kFooterMasterKey[] = "0123456789112345";
const char kFooterMasterKeyId[] = "kf";
// Six column master keys and their six IDs. The two arrays have the same length;
// presumably entry i of one belongs to entry i of the other - confirm in BuildKeyMap.
const char* const kColumnMasterKeys[] = {"1234567890123450", "1234567890123451",
"1234567890123452", "1234567890123453",
"1234567890123454", "1234567890123455"};
const char* const kColumnMasterKeyIds[] = {"kc1", "kc2", "kc3", "kc4", "kc5", "kc6"};
// Builds the key map from the given column IDs/keys and footer ID/key. The result is
// meant to be passed to TestOnlyInMemoryKmsClientFactory as the key mapping to look
// keys up in.
std::unordered_map<std::string, std::string> BuildKeyMap(const char* const* column_ids,
const char* const* column_keys,
const char* footer_id,
const char* footer_key);
// Builds the string that is meant to be set into EncryptionConfiguration as the
// column keys.
std::string BuildColumnKeyMapping();
// FileEncryptor and FileDecryptor are helper classes to write/read an encrypted parquet
// file corresponding to each pair of FileEncryptionProperties/FileDecryptionProperties.
// FileEncryptor writes the file with fixed data values and FileDecryptor reads the file
// and verifies the correctness of the data values.
class FileEncryptor {
public:
FileEncryptor();
// Writes an encrypted parquet file to `file` using the given encryption properties.
void EncryptFile(
std::string file,
std::shared_ptr<parquet::FileEncryptionProperties> encryption_configurations);
private:
// Returns the schema used for the written files.
std::shared_ptr<schema::GroupNode> SetupEncryptionSchema();
// Defaults for the layout of the written files.
int num_rowgroups_ = 5;
int rows_per_rowgroup_ = 50;
std::shared_ptr<schema::GroupNode> schema_;
};
// Reads back an encrypted parquet file with the given decryption properties (see the
// comment above FileEncryptor for how the two classes are paired).
class FileDecryptor {
public:
// Opens file_name with file_decryption_properties.
// NOTE(review): presumably delegates the value checks to CheckFile - confirm in the
// implementation.
void DecryptFile(std::string file_name,
std::shared_ptr<FileDecryptionProperties> file_decryption_properties);
private:
// Checks the contents read through file_reader.
void CheckFile(parquet::ParquetFileReader* file_reader,
FileDecryptionProperties* file_decryption_properties);
};
} // namespace test
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <unordered_map>
#include "arrow/util/base64.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/encryption/local_wrap_kms_client.h"
#include "parquet/platform.h"
namespace parquet {
namespace encryption {
// This is a mock class, built for testing only. Don't use it as an example of
// LocalWrapKmsClient implementation.
// Test double for the local wrapping mode: implements GetMasterKeyFromServer on top of
// LocalWrapKmsClient.
class TestOnlyLocalWrapInMemoryKms : public LocalWrapKmsClient {
public:
explicit TestOnlyLocalWrapInMemoryKms(const KmsConnectionConfig& kms_connection_config);
// Static: sets the master keys for the whole class, not for a single instance.
static void InitializeMasterKeys(
const std::unordered_map<std::string, std::string>& master_keys_map);
protected:
std::string GetMasterKeyFromServer(const std::string& master_key_identifier) override;
private:
// Shared by all instances (static); declared here and defined out of line.
static std::unordered_map<std::string, std::string> master_key_map_;
};
// This is a mock class, built for testing only. Don't use it as an example of KmsClient
// implementation.
// Test double that implements the KmsClient wrap/unwrap interface directly.
class TestOnlyInServerWrapKms : public KmsClient {
public:
// Static: sets the master keys for the whole class, not for a single instance.
static void InitializeMasterKeys(
const std::unordered_map<std::string, std::string>& master_keys_map);
std::string WrapKey(const std::string& key_bytes,
const std::string& master_key_identifier) override;
std::string UnwrapKey(const std::string& wrapped_key,
const std::string& master_key_identifier) override;
private:
// Not an override: KmsClient declares no such method. This is a private helper.
std::string GetMasterKeyFromServer(const std::string& master_key_identifier);
// Shared by all instances (static); declared here and defined out of line.
static std::unordered_map<std::string, std::string> master_key_map_;
};
// This is a mock class, built for testing only. Don't use it as an example of
// KmsClientFactory implementation.
class TestOnlyInMemoryKmsClientFactory : public KmsClientFactory {
public:
TestOnlyInMemoryKmsClientFactory(
bool wrap_locally,
const std::unordered_map<std::string, std::string>& master_keys_map)
: KmsClientFactory(wrap_locally) {
TestOnlyLocalWrapInMemoryKms::InitializeMasterKeys(master_keys_map);
TestOnlyInServerWrapKms::InitializeMasterKeys(master_keys_map);
}
std::shared_ptr<KmsClient> CreateKmsClient(
const KmsConnectionConfig& kms_connection_config) {
if (wrap_locally_) {
return std::make_shared<TestOnlyLocalWrapInMemoryKms>(kms_connection_config);
} else {
return std::make_shared<TestOnlyInServerWrapKms>();
}
}
};
} // namespace encryption
} // namespace parquet

View File

@@ -0,0 +1,159 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <chrono>
#include <unordered_map>
#include "arrow/util/concurrent_map.h"
#include "arrow/util/mutex.h"
namespace parquet {
namespace encryption {
using ::arrow::util::ConcurrentMap;
namespace internal {
// System-clock time point whose duration is expressed in (fractional) seconds.
using TimePoint =
    std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<double>>;

// Current system-clock time, converted to TimePoint.
inline TimePoint CurrentTimePoint() { return std::chrono::system_clock::now(); }

// A cached item together with the time point at which it expires.
template <typename E>
class ExpiringCacheEntry {
 public:
  // The expiration time point is value-initialized (the clock's epoch), so a
  // default-constructed entry reports IsExpired() == true whenever the current time
  // is later than the epoch.
  ExpiringCacheEntry() = default;

  // The entry expires expiration_interval_seconds after construction.
  ExpiringCacheEntry(E cached_item, double expiration_interval_seconds)
      : expiration_timestamp_(CurrentTimePoint() +
                              std::chrono::duration<double>(expiration_interval_seconds)),
        cached_item_(std::move(cached_item)) {}

  // True once the current time is strictly later than the expiration time point.
  bool IsExpired() const {
    const auto now = CurrentTimePoint();
    return (now > expiration_timestamp_);
  }

  // Returns a copy of the cached item. const-qualified because it only reads state,
  // so it can also be called on const entries, like IsExpired().
  E cached_item() const { return cached_item_; }

 private:
  // const: the expiration time is fixed at construction. This also makes the class
  // non-assignable.
  const TimePoint expiration_timestamp_;
  E cached_item_;
};
// This class is to avoid the below warning when compiling KeyToolkit class with VS2015
// warning C4503: decorated name length exceeded, name was truncated
// Thin wrapper around ExpiringCacheEntry<std::shared_ptr<ConcurrentMap<std::string, V>>>
// (see the comment above about the VS2015 decorated-name warning).
template <typename V>
class ExpiringCacheMapEntry {
 public:
  ExpiringCacheMapEntry() = default;

  // The shared_ptr is moved into the wrapped entry instead of being copied, which
  // avoids a reference-count increment and decrement.
  explicit ExpiringCacheMapEntry(
      std::shared_ptr<ConcurrentMap<std::string, V>> cached_item,
      double expiration_interval_seconds)
      : map_cache_(std::move(cached_item), expiration_interval_seconds) {}

  // const-qualified to match the const function it forwards to.
  bool IsExpired() const { return map_cache_.IsExpired(); }

  // Returns the shared map; callers share ownership with this entry.
  std::shared_ptr<ConcurrentMap<std::string, V>> cached_item() {
    return map_cache_.cached_item();
  }

 private:
  // ConcurrentMap object may be accessed and modified at many places at the same time,
  // from multiple threads, or even removed from cache.
  ExpiringCacheEntry<std::shared_ptr<ConcurrentMap<std::string, V>>> map_cache_;
};
} // namespace internal
// Two-level cache with expiration of internal caches according to token lifetime.
// External cache is per token, internal is per string key.
// Wrapper class around:
// std::unordered_map<std::string,
// internal::ExpiringCacheEntry<std::unordered_map<std::string, V>>>
// This cache is safe to be shared between threads.
template <typename V>
class TwoLevelCacheWithExpiration {
public:
TwoLevelCacheWithExpiration() {
last_cache_cleanup_timestamp_ = internal::CurrentTimePoint();
}
std::shared_ptr<ConcurrentMap<std::string, V>> GetOrCreateInternalCache(
const std::string& access_token, double cache_entry_lifetime_seconds) {
auto lock = mutex_.Lock();
auto external_cache_entry = cache_.find(access_token);
if (external_cache_entry == cache_.end() ||
external_cache_entry->second.IsExpired()) {
cache_.insert({access_token, internal::ExpiringCacheMapEntry<V>(
std::shared_ptr<ConcurrentMap<std::string, V>>(
new ConcurrentMap<std::string, V>()),
cache_entry_lifetime_seconds)});
}
return cache_[access_token].cached_item();
}
void CheckCacheForExpiredTokens(double cache_cleanup_period_seconds) {
auto lock = mutex_.Lock();
const auto now = internal::CurrentTimePoint();
if (now > (last_cache_cleanup_timestamp_ +
std::chrono::duration<double>(cache_cleanup_period_seconds))) {
RemoveExpiredEntriesNoMutex();
last_cache_cleanup_timestamp_ =
now + std::chrono::duration<double>(cache_cleanup_period_seconds);
}
}
void RemoveExpiredEntriesFromCache() {
auto lock = mutex_.Lock();
RemoveExpiredEntriesNoMutex();
}
void Remove(const std::string& access_token) {
auto lock = mutex_.Lock();
cache_.erase(access_token);
}
void Clear() {
auto lock = mutex_.Lock();
cache_.clear();
}
private:
void RemoveExpiredEntriesNoMutex() {
for (auto it = cache_.begin(); it != cache_.end();) {
if (it->second.IsExpired()) {
it = cache_.erase(it);
} else {
++it;
}
}
}
std::unordered_map<std::string, internal::ExpiringCacheMapEntry<V>> cache_;
internal::TimePoint last_cache_cleanup_timestamp_;
::arrow::util::Mutex mutex_;
};
} // namespace encryption
} // namespace parquet