mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-07-01 22:13:01 +00:00
first commit
This commit is contained in:
@ -0,0 +1,28 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/util/config.h" // IWYU pragma: export
|
||||
|
||||
#include "arrow/filesystem/filesystem.h" // IWYU pragma: export
|
||||
#include "arrow/filesystem/hdfs.h" // IWYU pragma: export
|
||||
#include "arrow/filesystem/localfs.h" // IWYU pragma: export
|
||||
#include "arrow/filesystem/mockfs.h" // IWYU pragma: export
|
||||
#ifdef ARROW_S3
|
||||
#include "arrow/filesystem/s3fs.h" // IWYU pragma: export
|
||||
#endif
|
@ -0,0 +1,541 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/filesystem/type_fwd.h"
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/compare.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
#include "arrow/util/windows_fixup.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
|
||||
// A system clock time point expressed as a 64-bit (or more) number of
|
||||
// nanoseconds since the epoch.
|
||||
using TimePoint =
|
||||
std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
|
||||
|
||||
ARROW_EXPORT std::string ToString(FileType);
|
||||
|
||||
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType);
|
||||
|
||||
static const int64_t kNoSize = -1;
|
||||
static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1));
|
||||
|
||||
/// \brief FileSystem entry info
|
||||
struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> {
|
||||
FileInfo() = default;
|
||||
FileInfo(FileInfo&&) = default;
|
||||
FileInfo& operator=(FileInfo&&) = default;
|
||||
FileInfo(const FileInfo&) = default;
|
||||
FileInfo& operator=(const FileInfo&) = default;
|
||||
|
||||
explicit FileInfo(std::string path, FileType type = FileType::Unknown)
|
||||
: path_(std::move(path)), type_(type) {}
|
||||
|
||||
/// The file type
|
||||
FileType type() const { return type_; }
|
||||
void set_type(FileType type) { type_ = type; }
|
||||
|
||||
/// The full file path in the filesystem
|
||||
const std::string& path() const { return path_; }
|
||||
void set_path(std::string path) { path_ = std::move(path); }
|
||||
|
||||
/// The file base name (component after the last directory separator)
|
||||
std::string base_name() const;
|
||||
|
||||
/// The directory base name (component before the file base name).
|
||||
std::string dir_name() const;
|
||||
|
||||
/// The size in bytes, if available
|
||||
///
|
||||
/// Only regular files are guaranteed to have a size.
|
||||
int64_t size() const { return size_; }
|
||||
void set_size(int64_t size) { size_ = size; }
|
||||
|
||||
/// The file extension (excluding the dot)
|
||||
std::string extension() const;
|
||||
|
||||
/// The time of last modification, if available
|
||||
TimePoint mtime() const { return mtime_; }
|
||||
void set_mtime(TimePoint mtime) { mtime_ = mtime; }
|
||||
|
||||
bool IsFile() const { return type_ == FileType::File; }
|
||||
bool IsDirectory() const { return type_ == FileType::Directory; }
|
||||
|
||||
bool Equals(const FileInfo& other) const {
|
||||
return type() == other.type() && path() == other.path() && size() == other.size() &&
|
||||
mtime() == other.mtime();
|
||||
}
|
||||
|
||||
std::string ToString() const;
|
||||
|
||||
/// Function object implementing less-than comparison and hashing by
|
||||
/// path, to support sorting infos, using them as keys, and other
|
||||
/// interactions with the STL.
|
||||
struct ByPath {
|
||||
bool operator()(const FileInfo& l, const FileInfo& r) const {
|
||||
return l.path() < r.path();
|
||||
}
|
||||
|
||||
size_t operator()(const FileInfo& i) const {
|
||||
return std::hash<std::string>{}(i.path());
|
||||
}
|
||||
};
|
||||
|
||||
protected:
|
||||
std::string path_;
|
||||
FileType type_ = FileType::Unknown;
|
||||
int64_t size_ = kNoSize;
|
||||
TimePoint mtime_ = kNoTime;
|
||||
};
|
||||
|
||||
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&);
|
||||
|
||||
/// \brief File selector for filesystem APIs
|
||||
struct ARROW_EXPORT FileSelector {
|
||||
/// The directory in which to select files.
|
||||
/// If the path exists but doesn't point to a directory, this should be an error.
|
||||
std::string base_dir;
|
||||
/// The behavior if `base_dir` isn't found in the filesystem. If false,
|
||||
/// an error is returned. If true, an empty selection is returned.
|
||||
bool allow_not_found;
|
||||
/// Whether to recurse into subdirectories.
|
||||
bool recursive;
|
||||
/// The maximum number of subdirectories to recurse into.
|
||||
int32_t max_recursion;
|
||||
|
||||
FileSelector() : allow_not_found(false), recursive(false), max_recursion(INT32_MAX) {}
|
||||
};
|
||||
|
||||
/// \brief FileSystem, path pair
|
||||
struct ARROW_EXPORT FileLocator {
|
||||
/// The filesystem half of the pair.
std::shared_ptr<FileSystem> filesystem;
|
||||
/// The path half of the pair.
/// NOTE(review): presumably a path inside `filesystem` -- confirm against CopyFiles.
std::string path;
|
||||
};
|
||||
|
||||
using FileInfoVector = std::vector<FileInfo>;
|
||||
using FileInfoGenerator = std::function<Future<FileInfoVector>()>;
|
||||
|
||||
} // namespace fs
|
||||
|
||||
template <>
|
||||
struct IterationTraits<fs::FileInfoVector> {
|
||||
static fs::FileInfoVector End() { return {}; }
|
||||
static bool IsEnd(const fs::FileInfoVector& val) { return val.empty(); }
|
||||
};
|
||||
|
||||
namespace fs {
|
||||
|
||||
/// \brief Abstract file system API
|
||||
class ARROW_EXPORT FileSystem : public std::enable_shared_from_this<FileSystem> {
|
||||
public:
|
||||
virtual ~FileSystem();
|
||||
|
||||
virtual std::string type_name() const = 0;
|
||||
|
||||
/// EXPERIMENTAL: The IOContext associated with this filesystem.
|
||||
const io::IOContext& io_context() const { return io_context_; }
|
||||
|
||||
/// Normalize path for the given filesystem
|
||||
///
|
||||
/// The default implementation of this method is a no-op, but subclasses
|
||||
/// may allow normalizing irregular path forms (such as Windows local paths).
|
||||
virtual Result<std::string> NormalizePath(std::string path);
|
||||
|
||||
virtual bool Equals(const FileSystem& other) const = 0;
|
||||
|
||||
virtual bool Equals(const std::shared_ptr<FileSystem>& other) const {
|
||||
return Equals(*other);
|
||||
}
|
||||
|
||||
/// Get info for the given target.
|
||||
///
|
||||
/// Any symlink is automatically dereferenced, recursively.
|
||||
/// A nonexistent or unreachable file returns an Ok status and
|
||||
/// has a FileType of value NotFound. An error status indicates
|
||||
/// a truly exceptional condition (low-level I/O error, etc.).
|
||||
virtual Result<FileInfo> GetFileInfo(const std::string& path) = 0;
|
||||
/// Same, for many targets at once.
|
||||
virtual Result<FileInfoVector> GetFileInfo(const std::vector<std::string>& paths);
|
||||
/// Same, according to a selector.
|
||||
///
|
||||
/// The selector's base directory will not be part of the results, even if
|
||||
/// it exists.
|
||||
/// If it doesn't exist, see `FileSelector::allow_not_found`.
|
||||
virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select) = 0;
|
||||
|
||||
/// Async version of GetFileInfo
|
||||
virtual Future<FileInfoVector> GetFileInfoAsync(const std::vector<std::string>& paths);
|
||||
|
||||
/// Streaming async version of GetFileInfo
|
||||
///
|
||||
/// The returned generator is not async-reentrant, i.e. you need to wait for
|
||||
/// the returned future to complete before calling the generator again.
|
||||
virtual FileInfoGenerator GetFileInfoGenerator(const FileSelector& select);
|
||||
|
||||
/// Create a directory and subdirectories.
|
||||
///
|
||||
/// This function succeeds if the directory already exists.
|
||||
virtual Status CreateDir(const std::string& path, bool recursive = true) = 0;
|
||||
|
||||
/// Delete a directory and its contents, recursively.
|
||||
virtual Status DeleteDir(const std::string& path) = 0;
|
||||
|
||||
/// Delete a directory's contents, recursively.
|
||||
///
|
||||
/// Like DeleteDir, but doesn't delete the directory itself.
|
||||
/// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents.
|
||||
virtual Status DeleteDirContents(const std::string& path,
|
||||
bool missing_dir_ok = false) = 0;
|
||||
|
||||
/// Async version of DeleteDirContents.
|
||||
virtual Future<> DeleteDirContentsAsync(const std::string& path,
|
||||
bool missing_dir_ok = false);
|
||||
|
||||
/// EXPERIMENTAL: Delete the root directory's contents, recursively.
|
||||
///
|
||||
/// Implementations may decide to raise an error if this operation is
|
||||
/// too dangerous.
|
||||
// NOTE: may decide to remove this if it's deemed not useful
|
||||
virtual Status DeleteRootDirContents() = 0;
|
||||
|
||||
/// Delete a file.
|
||||
virtual Status DeleteFile(const std::string& path) = 0;
|
||||
/// Delete many files.
|
||||
///
|
||||
/// The default implementation issues individual delete operations in sequence.
|
||||
virtual Status DeleteFiles(const std::vector<std::string>& paths);
|
||||
|
||||
/// Move / rename a file or directory.
|
||||
///
|
||||
/// If the destination exists:
|
||||
/// - if it is a non-empty directory, an error is returned
|
||||
/// - otherwise, if it has the same type as the source, it is replaced
|
||||
/// - otherwise, behavior is unspecified (implementation-dependent).
|
||||
virtual Status Move(const std::string& src, const std::string& dest) = 0;
|
||||
|
||||
/// Copy a file.
|
||||
///
|
||||
/// If the destination exists and is a directory, an error is returned.
|
||||
/// Otherwise, it is replaced.
|
||||
virtual Status CopyFile(const std::string& src, const std::string& dest) = 0;
|
||||
|
||||
/// Open an input stream for sequential reading.
|
||||
virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(
|
||||
const std::string& path) = 0;
|
||||
/// Open an input stream for sequential reading.
|
||||
///
|
||||
/// This overload assumes the given FileInfo validly represents the file's
|
||||
/// characteristics, and may optimize access depending on them (for example
|
||||
/// avoid querying the file size or its existence).
|
||||
virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info);
|
||||
|
||||
/// Open an input file for random access reading.
|
||||
virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const std::string& path) = 0;
|
||||
/// Open an input file for random access reading.
|
||||
///
|
||||
/// This overload assumes the given FileInfo validly represents the file's
|
||||
/// characteristics, and may optimize access depending on them (for example
|
||||
/// avoid querying the file size or its existence).
|
||||
virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const FileInfo& info);
|
||||
|
||||
/// Async version of OpenInputStream
|
||||
virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
|
||||
const std::string& path);
|
||||
/// Async version of OpenInputStream
|
||||
virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
|
||||
const FileInfo& info);
|
||||
|
||||
/// Async version of OpenInputFile
|
||||
virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
|
||||
const std::string& path);
|
||||
/// Async version of OpenInputFile
|
||||
virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
|
||||
const FileInfo& info);
|
||||
|
||||
/// Open an output stream for sequential writing.
|
||||
///
|
||||
/// If the target already exists, existing data is truncated.
|
||||
virtual Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(const std::string& path);
|
||||
|
||||
/// Open an output stream for appending.
|
||||
///
|
||||
/// If the target doesn't exist, a new empty file is created.
|
||||
///
|
||||
/// Note: some filesystem implementations do not support efficient appending
|
||||
/// to an existing file, in which case this method will return NotImplemented.
|
||||
/// Consider writing to multiple files (using e.g. the dataset layer) instead.
|
||||
virtual Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(const std::string& path);
|
||||
|
||||
protected:
|
||||
explicit FileSystem(const io::IOContext& io_context = io::default_io_context())
|
||||
: io_context_(io_context) {}
|
||||
|
||||
io::IOContext io_context_;
|
||||
// Whether metadata operations (such as GetFileInfo or OpenInputStream)
|
||||
// are cheap enough that the default async variants don't bother with
|
||||
// a thread pool.
|
||||
bool default_async_is_sync_ = true;
|
||||
};
|
||||
|
||||
/// \brief A FileSystem implementation that delegates to another
|
||||
/// implementation after prepending a fixed base path.
|
||||
///
|
||||
/// This is useful to expose a logical view of a subtree of a filesystem,
|
||||
/// for example a directory in a LocalFileSystem.
|
||||
/// This works on abstract paths, i.e. paths using forward slashes and
|
||||
/// a single root "/". Windows paths are not guaranteed to work.
|
||||
/// This makes no security guarantee. For example, symlinks may allow to
|
||||
/// "escape" the subtree and access other parts of the underlying filesystem.
|
||||
class ARROW_EXPORT SubTreeFileSystem : public FileSystem {
|
||||
public:
|
||||
// This constructor may abort if base_path is invalid.
|
||||
explicit SubTreeFileSystem(const std::string& base_path,
|
||||
std::shared_ptr<FileSystem> base_fs);
|
||||
~SubTreeFileSystem() override;
|
||||
|
||||
std::string type_name() const override { return "subtree"; }
|
||||
std::string base_path() const { return base_path_; }
|
||||
std::shared_ptr<FileSystem> base_fs() const { return base_fs_; }
|
||||
|
||||
Result<std::string> NormalizePath(std::string path) override;
|
||||
|
||||
bool Equals(const FileSystem& other) const override;
|
||||
|
||||
/// \cond FALSE
|
||||
using FileSystem::GetFileInfo;
|
||||
/// \endcond
|
||||
Result<FileInfo> GetFileInfo(const std::string& path) override;
|
||||
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
|
||||
|
||||
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
|
||||
|
||||
Status CreateDir(const std::string& path, bool recursive = true) override;
|
||||
|
||||
Status DeleteDir(const std::string& path) override;
|
||||
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
|
||||
Status DeleteRootDirContents() override;
|
||||
|
||||
Status DeleteFile(const std::string& path) override;
|
||||
|
||||
Status Move(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Status CopyFile(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const FileInfo& info) override;
|
||||
|
||||
Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
|
||||
const std::string& path) override;
|
||||
Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
|
||||
const FileInfo& info) override;
|
||||
Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
|
||||
const std::string& path) override;
|
||||
Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
|
||||
const FileInfo& info) override;
|
||||
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
|
||||
protected:
|
||||
SubTreeFileSystem() {}
|
||||
|
||||
const std::string base_path_;
|
||||
std::shared_ptr<FileSystem> base_fs_;
|
||||
|
||||
Result<std::string> PrependBase(const std::string& s) const;
|
||||
Result<std::string> PrependBaseNonEmpty(const std::string& s) const;
|
||||
Result<std::string> StripBase(const std::string& s) const;
|
||||
Status FixInfo(FileInfo* info) const;
|
||||
|
||||
static Result<std::string> NormalizeBasePath(
|
||||
std::string base_path, const std::shared_ptr<FileSystem>& base_fs);
|
||||
};
|
||||
|
||||
/// \brief A FileSystem implementation that delegates to another
|
||||
/// implementation but inserts latencies at various points.
|
||||
class ARROW_EXPORT SlowFileSystem : public FileSystem {
|
||||
public:
|
||||
/// Wrap `base_fs`, taking the inserted latencies from `latencies`.
SlowFileSystem(std::shared_ptr<FileSystem> base_fs,
|
||||
std::shared_ptr<io::LatencyGenerator> latencies);
|
||||
/// Wrap `base_fs`, specifying the latency by its average.
/// NOTE(review): the unit of `average_latency` is not visible here -- confirm.
SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency);
|
||||
/// Wrap `base_fs`, specifying the latency by its average plus a `seed`.
/// NOTE(review): presumably `seed` seeds the latency generation -- confirm.
SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency,
|
||||
int32_t seed);
|
||||
|
||||
std::string type_name() const override { return "slow"; }
|
||||
bool Equals(const FileSystem& other) const override;
|
||||
|
||||
using FileSystem::GetFileInfo;
|
||||
Result<FileInfo> GetFileInfo(const std::string& path) override;
|
||||
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
|
||||
|
||||
Status CreateDir(const std::string& path, bool recursive = true) override;
|
||||
|
||||
Status DeleteDir(const std::string& path) override;
|
||||
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
|
||||
Status DeleteRootDirContents() override;
|
||||
|
||||
Status DeleteFile(const std::string& path) override;
|
||||
|
||||
Status Move(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Status CopyFile(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const FileInfo& info) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
|
||||
protected:
|
||||
std::shared_ptr<FileSystem> base_fs_;
|
||||
std::shared_ptr<io::LatencyGenerator> latencies_;
|
||||
};
|
||||
|
||||
/// \defgroup filesystem-factories Functions for creating FileSystem instances
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Create a new FileSystem by URI
|
||||
///
|
||||
/// Recognized schemes are "file", "mock", "hdfs" and "s3".
|
||||
///
|
||||
/// \param[in] uri a URI-based path, ex: file:///some/local/path
|
||||
/// \param[out] out_path (optional) Path inside the filesystem.
|
||||
/// \return out_fs FileSystem instance.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
|
||||
std::string* out_path = NULLPTR);
|
||||
|
||||
/// \brief Create a new FileSystem by URI with a custom IO context
|
||||
///
|
||||
/// Recognized schemes are "file", "mock", "hdfs" and "s3".
|
||||
///
|
||||
/// \param[in] uri a URI-based path, ex: file:///some/local/path
|
||||
/// \param[in] io_context an IOContext which will be associated with the filesystem
|
||||
/// \param[out] out_path (optional) Path inside the filesystem.
|
||||
/// \return out_fs FileSystem instance.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
|
||||
const io::IOContext& io_context,
|
||||
std::string* out_path = NULLPTR);
|
||||
|
||||
/// \brief Create a new FileSystem by URI
|
||||
///
|
||||
/// Same as FileSystemFromUri, but in addition also recognize non-URIs
|
||||
/// and treat them as local filesystem paths. Only absolute local filesystem
|
||||
/// paths are allowed.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
|
||||
const std::string& uri, std::string* out_path = NULLPTR);
|
||||
|
||||
/// \brief Create a new FileSystem by URI with a custom IO context
|
||||
///
|
||||
/// Same as FileSystemFromUri, but in addition also recognize non-URIs
|
||||
/// and treat them as local filesystem paths. Only absolute local filesystem
|
||||
/// paths are allowed.
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
|
||||
const std::string& uri, const io::IOContext& io_context,
|
||||
std::string* out_path = NULLPTR);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Copy files, including from one FileSystem to another
|
||||
///
|
||||
/// If a source and destination are resident in the same FileSystem FileSystem::CopyFile
|
||||
/// will be used, otherwise the file will be opened as a stream in both FileSystems and
|
||||
/// chunks copied from the source to the destination. No directories will be created.
|
||||
ARROW_EXPORT
|
||||
Status CopyFiles(const std::vector<FileLocator>& sources,
|
||||
const std::vector<FileLocator>& destinations,
|
||||
const io::IOContext& io_context = io::default_io_context(),
|
||||
int64_t chunk_size = 1024 * 1024, bool use_threads = true);
|
||||
|
||||
/// \brief Copy selected files, including from one FileSystem to another
|
||||
///
|
||||
/// Directories will be created under the destination base directory as needed.
|
||||
ARROW_EXPORT
|
||||
Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs,
|
||||
const FileSelector& source_sel,
|
||||
const std::shared_ptr<FileSystem>& destination_fs,
|
||||
const std::string& destination_base_dir,
|
||||
const io::IOContext& io_context = io::default_io_context(),
|
||||
int64_t chunk_size = 1024 * 1024, bool use_threads = true);
|
||||
|
||||
struct FileSystemGlobalOptions {
|
||||
/// Path to a single PEM file holding all TLS CA certificates
|
||||
///
|
||||
/// If empty, the underlying TLS library's defaults will be used.
|
||||
std::string tls_ca_file_path;
|
||||
|
||||
/// Path to a directory holding TLS CA certificates in individual PEM files
|
||||
/// named along the OpenSSL "hashed" format.
|
||||
///
|
||||
/// If empty, the underlying TLS library's defaults will be used.
|
||||
std::string tls_ca_dir_path;
|
||||
};
|
||||
|
||||
/// EXPERIMENTAL: optional global initialization routine
|
||||
///
|
||||
/// This is for environments (such as manylinux) where the path
|
||||
/// to TLS CA certificates needs to be configured at runtime.
|
||||
ARROW_EXPORT
|
||||
Status Initialize(const FileSystemGlobalOptions& options);
|
||||
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
197
.venv/Lib/site-packages/pyarrow/include/arrow/filesystem/gcsfs.h
Normal file
197
.venv/Lib/site-packages/pyarrow/include/arrow/filesystem/gcsfs.h
Normal file
@ -0,0 +1,197 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/util/uri.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
|
||||
struct GcsCredentials;
|
||||
|
||||
/// Options for the GcsFileSystem implementation.
|
||||
struct ARROW_EXPORT GcsOptions {
|
||||
std::shared_ptr<GcsCredentials> credentials;
|
||||
|
||||
std::string endpoint_override;
|
||||
std::string scheme;
|
||||
/// \brief Location to use for creating buckets.
|
||||
std::string default_bucket_location;
|
||||
|
||||
/// \brief Default metadata for OpenOutputStream.
|
||||
///
|
||||
/// This will be ignored if non-empty metadata is passed to OpenOutputStream.
|
||||
std::shared_ptr<const KeyValueMetadata> default_metadata;
|
||||
|
||||
bool Equals(const GcsOptions& other) const;
|
||||
|
||||
/// \brief Initialize with Google Default Credentials
|
||||
///
|
||||
/// Create options configured to use [Application Default Credentials][aip/4110]. The
|
||||
/// details of this mechanism are too involved to describe here, but suffice it to say
|
||||
/// that applications can override any defaults using an environment variable
|
||||
/// (`GOOGLE_APPLICATION_CREDENTIALS`), and that the defaults work with most Google
|
||||
/// Cloud Platform deployment environments (GCE, GKE, Cloud Run, etc.), and that they have
|
||||
/// the same behavior as the `gcloud` CLI tool on your workstation.
|
||||
///
|
||||
/// \see https://cloud.google.com/docs/authentication
|
||||
///
|
||||
/// [aip/4110]: https://google.aip.dev/auth/4110
|
||||
static GcsOptions Defaults();
|
||||
|
||||
/// \brief Initialize with anonymous credentials
|
||||
static GcsOptions Anonymous();
|
||||
|
||||
/// \brief Initialize with access token
|
||||
///
|
||||
/// These credentials are useful when using an out-of-band mechanism to fetch access
|
||||
/// tokens. Note that access tokens are time limited, you will need to manually refresh
|
||||
/// the tokens created by the out-of-band mechanism.
|
||||
static GcsOptions FromAccessToken(const std::string& access_token,
|
||||
std::chrono::system_clock::time_point expiration);
|
||||
|
||||
/// \brief Initialize with service account impersonation
|
||||
///
|
||||
/// Service account impersonation allows one principal (a user or service account) to
|
||||
/// impersonate a service account. It requires that the calling principal has the
|
||||
/// necessary permissions *on* the service account.
|
||||
static GcsOptions FromImpersonatedServiceAccount(
|
||||
const GcsCredentials& base_credentials, const std::string& target_service_account);
|
||||
|
||||
/// Creates service account credentials from a JSON object in string form.
|
||||
///
|
||||
/// The @p json_object is expected to be in the format described by [aip/4112]. Such an
|
||||
/// object contains the identity of a service account, as well as a private key that can
|
||||
/// be used to sign tokens, showing the caller was holding the private key.
|
||||
///
|
||||
/// In GCP one can create several "keys" for each service account, and these keys are
|
||||
/// downloaded as a JSON "key file". The contents of such a file are in the format
|
||||
/// required by this function. Remember that key files and their contents should be
|
||||
/// treated as any other secret with security implications, think of them as passwords
|
||||
/// (because they are!), don't store them or output them where unauthorized persons may
|
||||
/// read them.
|
||||
///
|
||||
/// Most applications should probably use default credentials, maybe pointing them to a
|
||||
/// file with these contents. Using this function may be useful when the json object is
|
||||
/// obtained from a Cloud Secret Manager or a similar service.
|
||||
///
|
||||
/// [aip/4112]: https://google.aip.dev/auth/4112
|
||||
static GcsOptions FromServiceAccountCredentials(const std::string& json_object);
|
||||
|
||||
/// Initialize from URIs such as "gs://bucket/object".
|
||||
static Result<GcsOptions> FromUri(const arrow::internal::Uri& uri,
|
||||
std::string* out_path);
|
||||
static Result<GcsOptions> FromUri(const std::string& uri, std::string* out_path);
|
||||
};
|
||||
|
||||
/// \brief GCS-backed FileSystem implementation.
|
||||
///
|
||||
/// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object
|
||||
/// storage system for any amount of data. The main abstractions in GCS are buckets and
|
||||
/// objects. A bucket is a namespace for objects, buckets can store any number of objects,
|
||||
/// tens of millions and even billions is not uncommon. Each object contains a single
|
||||
/// blob of data, up to 5TiB in size. Buckets are typically configured to keep a single
|
||||
/// version of each object, but versioning can be enabled. Versioning is important because
|
||||
/// objects are immutable, once created one cannot append data to the object or modify the
|
||||
/// object data in any way.
|
||||
///
|
||||
/// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket
|
||||
/// named `foo` no other customer can create a bucket with the same name. Note that a
|
||||
/// principal (a user or service account) may only list the buckets they are entitled to,
|
||||
/// and then only within a project. It is not possible to list "all" the buckets.
|
||||
///
|
||||
/// Within each bucket, objects are in a flat namespace. GCS does not have folders or
|
||||
/// directories. However, following some conventions it is possible to emulate
|
||||
/// directories. To this end, this class:
|
||||
///
|
||||
/// - All buckets are treated as directories at the "root"
|
||||
/// - Creating a root directory results in a new bucket being created, this may be slower
|
||||
/// than most GCS operations.
|
||||
/// - The class creates marker objects for a directory, using a metadata attribute to
|
||||
/// annotate the file.
|
||||
/// - GCS can list all the objects with a given prefix, this is used to emulate listing
|
||||
/// of directories.
|
||||
/// - In object lists GCS can summarize all the objects with a common prefix as a single
|
||||
/// entry, this is used to emulate non-recursive lists. Note that GCS list time is
|
||||
/// proportional to the number of objects in the prefix. Listing recursively takes
|
||||
/// almost the same time as non-recursive lists.
|
||||
///
|
||||
class ARROW_EXPORT GcsFileSystem : public FileSystem {
 public:
  ~GcsFileSystem() override = default;

  /// Name identifying this filesystem type (defined out of line).
  std::string type_name() const override;

  /// Compare this filesystem with another FileSystem instance.
  bool Equals(const FileSystem& other) const override;

  /// \cond FALSE
  // The two overrides below would otherwise hide every other
  // FileSystem::GetFileInfo overload when calling through a GcsFileSystem;
  // re-expose them, as HadoopFileSystem, LocalFileSystem and MockFileSystem do.
  using FileSystem::GetFileInfo;
  /// \endcond
  /// Get information about the entry at a single path.
  Result<FileInfo> GetFileInfo(const std::string& path) override;
  /// Get information about all entries matched by a selector.
  Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;

  /// Create a directory. Note that, unlike in the other FileSystem
  /// implementations, `recursive` has no default value here.
  Status CreateDir(const std::string& path, bool recursive) override;

  Status DeleteDir(const std::string& path) override;

  Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;

  /// This is not implemented in GcsFileSystem, as it would be too dangerous.
  Status DeleteRootDirContents() override;

  Status DeleteFile(const std::string& path) override;

  Status Move(const std::string& src, const std::string& dest) override;

  Status CopyFile(const std::string& src, const std::string& dest) override;

  /// Open a sequential input stream, from a path or from a FileInfo.
  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
      const std::string& path) override;
  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;

  /// Open a random-access input file, from a path or from a FileInfo.
  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const std::string& path) override;
  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const FileInfo& info) override;

  /// Open an output stream. Note that `metadata` has no default value here.
  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) override;

  // Kept only to satisfy the FileSystem interface; marked deprecated because
  // appending is unsupported on this filesystem.
  ARROW_DEPRECATED(
      "Deprecated. "
      "OpenAppendStream is unsupported on the GCS FileSystem.")
  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) override;

  /// Create a GcsFileSystem instance from the given options.
  static std::shared_ptr<GcsFileSystem> Make(
      const GcsOptions& options, const io::IOContext& = io::default_io_context());

 private:
  // Construction is private; instances are obtained through Make().
  explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context);

  // Implementation details are hidden behind an opaque Impl class.
  class Impl;
  std::shared_ptr<Impl> impl_;
};
|
||||
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
113
.venv/Lib/site-packages/pyarrow/include/arrow/filesystem/hdfs.h
Normal file
113
.venv/Lib/site-packages/pyarrow/include/arrow/filesystem/hdfs.h
Normal file
@ -0,0 +1,113 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/io/hdfs.h"
|
||||
#include "arrow/util/uri.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
|
||||
/// Options for the HDFS implementation.
|
||||
struct ARROW_EXPORT HdfsOptions {
|
||||
HdfsOptions() = default;
|
||||
~HdfsOptions() = default;
|
||||
|
||||
/// Hdfs configuration options, contains host, port, driver
|
||||
io::HdfsConnectionConfig connection_config;
|
||||
|
||||
/// Used by Hdfs OpenWritable Interface.
|
||||
int32_t buffer_size = 0;
|
||||
int16_t replication = 3;
|
||||
int64_t default_block_size = 0;
|
||||
|
||||
void ConfigureEndPoint(std::string host, int port);
|
||||
void ConfigureReplication(int16_t replication);
|
||||
void ConfigureUser(std::string user_name);
|
||||
void ConfigureBufferSize(int32_t buffer_size);
|
||||
void ConfigureBlockSize(int64_t default_block_size);
|
||||
void ConfigureKerberosTicketCachePath(std::string path);
|
||||
void ConfigureExtraConf(std::string key, std::string val);
|
||||
|
||||
bool Equals(const HdfsOptions& other) const;
|
||||
|
||||
static Result<HdfsOptions> FromUri(const ::arrow::internal::Uri& uri);
|
||||
static Result<HdfsOptions> FromUri(const std::string& uri);
|
||||
};
|
||||
|
||||
/// HDFS-backed FileSystem implementation.
|
||||
///
|
||||
/// implementation notes:
|
||||
/// - This is a wrapper of arrow/io/hdfs, so we can use FileSystem API to handle hdfs.
|
||||
class ARROW_EXPORT HadoopFileSystem : public FileSystem {
|
||||
public:
|
||||
~HadoopFileSystem() override;
|
||||
|
||||
std::string type_name() const override { return "hdfs"; }
|
||||
HdfsOptions options() const;
|
||||
bool Equals(const FileSystem& other) const override;
|
||||
|
||||
/// \cond FALSE
|
||||
using FileSystem::GetFileInfo;
|
||||
/// \endcond
|
||||
Result<FileInfo> GetFileInfo(const std::string& path) override;
|
||||
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
|
||||
|
||||
Status CreateDir(const std::string& path, bool recursive = true) override;
|
||||
|
||||
Status DeleteDir(const std::string& path) override;
|
||||
|
||||
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
|
||||
|
||||
Status DeleteRootDirContents() override;
|
||||
|
||||
Status DeleteFile(const std::string& path) override;
|
||||
|
||||
Status Move(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Status CopyFile(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
|
||||
/// Create a HdfsFileSystem instance from the given options.
|
||||
static Result<std::shared_ptr<HadoopFileSystem>> Make(
|
||||
const HdfsOptions& options, const io::IOContext& = io::default_io_context());
|
||||
|
||||
protected:
|
||||
HadoopFileSystem(const HdfsOptions& options, const io::IOContext&);
|
||||
|
||||
class Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
@ -0,0 +1,113 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace internal {
|
||||
|
||||
class Uri;
|
||||
|
||||
}
|
||||
|
||||
namespace fs {
|
||||
|
||||
/// Options for the LocalFileSystem implementation.
|
||||
struct ARROW_EXPORT LocalFileSystemOptions {
|
||||
/// Whether OpenInputStream and OpenInputFile return a mmap'ed file,
|
||||
/// or a regular one.
|
||||
bool use_mmap = false;
|
||||
|
||||
/// \brief Initialize with defaults
|
||||
static LocalFileSystemOptions Defaults();
|
||||
|
||||
bool Equals(const LocalFileSystemOptions& other) const;
|
||||
|
||||
static Result<LocalFileSystemOptions> FromUri(const ::arrow::internal::Uri& uri,
|
||||
std::string* out_path);
|
||||
};
|
||||
|
||||
/// \brief A FileSystem implementation accessing files on the local machine.
|
||||
///
|
||||
/// This class handles only `/`-separated paths. If desired, conversion
|
||||
/// from Windows backslash-separated paths should be done by the caller.
|
||||
/// Details such as symlinks are abstracted away (symlinks are always
|
||||
/// followed, except when deleting an entry).
|
||||
class ARROW_EXPORT LocalFileSystem : public FileSystem {
|
||||
public:
|
||||
explicit LocalFileSystem(const io::IOContext& = io::default_io_context());
|
||||
explicit LocalFileSystem(const LocalFileSystemOptions&,
|
||||
const io::IOContext& = io::default_io_context());
|
||||
~LocalFileSystem() override;
|
||||
|
||||
std::string type_name() const override { return "local"; }
|
||||
|
||||
Result<std::string> NormalizePath(std::string path) override;
|
||||
|
||||
bool Equals(const FileSystem& other) const override;
|
||||
|
||||
LocalFileSystemOptions options() const { return options_; }
|
||||
|
||||
/// \cond FALSE
|
||||
using FileSystem::GetFileInfo;
|
||||
/// \endcond
|
||||
Result<FileInfo> GetFileInfo(const std::string& path) override;
|
||||
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
|
||||
|
||||
Status CreateDir(const std::string& path, bool recursive = true) override;
|
||||
|
||||
Status DeleteDir(const std::string& path) override;
|
||||
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
|
||||
Status DeleteRootDirContents() override;
|
||||
|
||||
Status DeleteFile(const std::string& path) override;
|
||||
|
||||
Status Move(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Status CopyFile(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
|
||||
protected:
|
||||
LocalFileSystemOptions options_;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Return whether the string is detected as a local absolute path.
|
||||
ARROW_EXPORT
|
||||
bool DetectAbsolutePath(const std::string& s);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
@ -0,0 +1,132 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/util/string_view.h"
|
||||
#include "arrow/util/windows_fixup.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
namespace internal {
|
||||
|
||||
struct MockDirInfo {
|
||||
std::string full_path;
|
||||
TimePoint mtime;
|
||||
|
||||
bool operator==(const MockDirInfo& other) const {
|
||||
return mtime == other.mtime && full_path == other.full_path;
|
||||
}
|
||||
|
||||
friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const MockDirInfo&);
|
||||
};
|
||||
|
||||
struct MockFileInfo {
|
||||
std::string full_path;
|
||||
TimePoint mtime;
|
||||
util::string_view data;
|
||||
|
||||
bool operator==(const MockFileInfo& other) const {
|
||||
return mtime == other.mtime && full_path == other.full_path && data == other.data;
|
||||
}
|
||||
|
||||
friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const MockFileInfo&);
|
||||
};
|
||||
|
||||
/// A mock FileSystem implementation that holds its contents in memory.
|
||||
///
|
||||
/// Useful for validating the FileSystem API, writing conformance suite,
|
||||
/// and bootstrapping FileSystem-based APIs.
|
||||
class ARROW_EXPORT MockFileSystem : public FileSystem {
|
||||
public:
|
||||
explicit MockFileSystem(TimePoint current_time,
|
||||
const io::IOContext& = io::default_io_context());
|
||||
~MockFileSystem() override;
|
||||
|
||||
std::string type_name() const override { return "mock"; }
|
||||
|
||||
bool Equals(const FileSystem& other) const override;
|
||||
|
||||
// XXX It's not very practical to have to explicitly declare inheritance
|
||||
// of default overrides.
|
||||
using FileSystem::GetFileInfo;
|
||||
Result<FileInfo> GetFileInfo(const std::string& path) override;
|
||||
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
|
||||
|
||||
Status CreateDir(const std::string& path, bool recursive = true) override;
|
||||
|
||||
Status DeleteDir(const std::string& path) override;
|
||||
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
|
||||
Status DeleteRootDirContents() override;
|
||||
|
||||
Status DeleteFile(const std::string& path) override;
|
||||
|
||||
Status Move(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Status CopyFile(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const std::string& path) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
|
||||
// Contents-dumping helpers to ease testing.
|
||||
// Output is lexicographically-ordered by full path.
|
||||
std::vector<MockDirInfo> AllDirs();
|
||||
std::vector<MockFileInfo> AllFiles();
|
||||
|
||||
// Create a File with a content from a string.
|
||||
Status CreateFile(const std::string& path, util::string_view content,
|
||||
bool recursive = true);
|
||||
|
||||
// Create a MockFileSystem out of (empty) FileInfo. The content of every
|
||||
// file is empty and of size 0. All directories will be created recursively.
|
||||
static Result<std::shared_ptr<FileSystem>> Make(TimePoint current_time,
|
||||
const std::vector<FileInfo>& infos);
|
||||
|
||||
class Impl;
|
||||
|
||||
protected:
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
// A MockFileSystem variant that overrides GetFileInfoGenerator and switches
// off the default_async_is_sync_ flag.
class ARROW_EXPORT MockAsyncFileSystem : public MockFileSystem {
 public:
  explicit MockAsyncFileSystem(TimePoint current_time,
                               const io::IOContext& io_context = io::default_io_context())
      : MockFileSystem(current_time, io_context) {
    // default_async_is_sync_ is not declared in this class (presumably
    // inherited from FileSystem -- confirm); it is cleared here.
    default_async_is_sync_ = false;
  }

  FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
@ -0,0 +1,133 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/optional.h"
|
||||
#include "arrow/util/string_view.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
namespace internal {
|
||||
|
||||
constexpr char kSep = '/';
|
||||
|
||||
// Computations on abstract paths (not local paths with system-dependent behaviour).
|
||||
// Abstract paths are typically used in URIs.
|
||||
|
||||
// Split an abstract path into its individual components.
|
||||
ARROW_EXPORT
|
||||
std::vector<std::string> SplitAbstractPath(const std::string& path, char sep = kSep);
|
||||
|
||||
// Return the extension of the file
|
||||
ARROW_EXPORT
|
||||
std::string GetAbstractPathExtension(const std::string& s);
|
||||
|
||||
// Return the parent directory and basename of an abstract path. Both values may be
|
||||
// empty.
|
||||
ARROW_EXPORT
|
||||
std::pair<std::string, std::string> GetAbstractPathParent(const std::string& s);
|
||||
|
||||
// Validate the components of an abstract path.
|
||||
ARROW_EXPORT
|
||||
Status ValidateAbstractPathParts(const std::vector<std::string>& parts);
|
||||
|
||||
// Append a non-empty stem to an abstract path.
|
||||
ARROW_EXPORT
|
||||
std::string ConcatAbstractPath(const std::string& base, const std::string& stem);
|
||||
|
||||
// Make path relative to base, if it starts with base. Otherwise error out.
|
||||
ARROW_EXPORT
|
||||
Result<std::string> MakeAbstractPathRelative(const std::string& base,
|
||||
const std::string& path);
|
||||
|
||||
// NOTE(review): the slash and ancestor helpers below carry no documentation
// in this header; their exact behavior should be confirmed against their
// definitions.

ARROW_EXPORT
std::string EnsureLeadingSlash(util::string_view s);

ARROW_EXPORT
util::string_view RemoveLeadingSlash(util::string_view s);

ARROW_EXPORT
std::string EnsureTrailingSlash(util::string_view s);

ARROW_EXPORT
util::string_view RemoveTrailingSlash(util::string_view s);

ARROW_EXPORT
bool IsAncestorOf(util::string_view ancestor, util::string_view descendant);

ARROW_EXPORT
util::optional<util::string_view> RemoveAncestor(util::string_view ancestor,
                                                 util::string_view descendant);

/// Return a vector of ancestors between a base path and a descendant.
/// For example,
///
/// AncestorsFromBasePath("a/b", "a/b/c/d/e") -> ["a/b/c", "a/b/c/d"]
ARROW_EXPORT
std::vector<std::string> AncestorsFromBasePath(util::string_view base_path,
                                               util::string_view descendant);

/// Given a vector of paths of directories which must be created, produce the minimal
/// subset for passing to CreateDir(recursive=true) by removing redundant parent
/// directories
ARROW_EXPORT
std::vector<std::string> MinimalCreateDirSet(std::vector<std::string> dirs);
|
||||
|
||||
// Join the components of an abstract path.
|
||||
template <class StringIt>
|
||||
std::string JoinAbstractPath(StringIt it, StringIt end, char sep = kSep) {
|
||||
std::string path;
|
||||
for (; it != end; ++it) {
|
||||
if (it->empty()) continue;
|
||||
|
||||
if (!path.empty()) {
|
||||
path += sep;
|
||||
}
|
||||
path += *it;
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
template <class StringRange>
|
||||
std::string JoinAbstractPath(const StringRange& range, char sep = kSep) {
|
||||
return JoinAbstractPath(range.begin(), range.end(), sep);
|
||||
}
|
||||
|
||||
/// Convert slashes to backslashes, on all platforms. Mostly useful for testing.
|
||||
ARROW_EXPORT
|
||||
std::string ToBackslashes(util::string_view s);
|
||||
|
||||
/// Ensure a local path is abstract, by converting backslashes to regular slashes
|
||||
/// on Windows. Return the path unchanged on other systems.
|
||||
ARROW_EXPORT
|
||||
std::string ToSlashes(util::string_view s);
|
||||
|
||||
ARROW_EXPORT
|
||||
bool IsEmptyPath(util::string_view s);
|
||||
|
||||
ARROW_EXPORT
|
||||
bool IsLikelyUri(util::string_view s);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
@ -0,0 +1,90 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "arrow/filesystem/s3fs.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/testing/gtest_util.h"
|
||||
#include "arrow/util/checked_cast.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
|
||||
// A minio test server, managed as a child process
|
||||
|
||||
class MinioTestServer {
|
||||
public:
|
||||
MinioTestServer();
|
||||
~MinioTestServer();
|
||||
|
||||
Status Start();
|
||||
|
||||
Status Stop();
|
||||
|
||||
std::string connect_string() const;
|
||||
|
||||
std::string access_key() const;
|
||||
|
||||
std::string secret_key() const;
|
||||
|
||||
private:
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
// A Minio "environment" that spawns Minio processes in advances, such as
|
||||
// to hide process launch latencies during testing.
|
||||
|
||||
class MinioTestEnvironment : public ::testing::Environment {
|
||||
public:
|
||||
MinioTestEnvironment();
|
||||
~MinioTestEnvironment();
|
||||
|
||||
void SetUp() override;
|
||||
|
||||
Result<std::shared_ptr<MinioTestServer>> GetOneServer();
|
||||
|
||||
protected:
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
// A global test "environment", to ensure that the S3 API is initialized before
|
||||
// running unit tests.
|
||||
|
||||
class S3Environment : public ::testing::Environment {
|
||||
public:
|
||||
void SetUp() override {
|
||||
// Change this to increase logging during tests
|
||||
S3GlobalOptions options;
|
||||
options.log_level = S3LogLevel::Fatal;
|
||||
ASSERT_OK(InitializeS3(options));
|
||||
}
|
||||
|
||||
void TearDown() override { ASSERT_OK(FinalizeS3()); }
|
||||
};
|
||||
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
319
.venv/Lib/site-packages/pyarrow/include/arrow/filesystem/s3fs.h
Normal file
319
.venv/Lib/site-packages/pyarrow/include/arrow/filesystem/s3fs.h
Normal file
@ -0,0 +1,319 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/uri.h"
|
||||
|
||||
namespace Aws {
|
||||
namespace Auth {
|
||||
|
||||
class AWSCredentialsProvider;
|
||||
class STSAssumeRoleCredentialsProvider;
|
||||
|
||||
} // namespace Auth
|
||||
namespace STS {
|
||||
class STSClient;
|
||||
}
|
||||
} // namespace Aws
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
|
||||
/// Options for using a proxy for S3
|
||||
struct ARROW_EXPORT S3ProxyOptions {
|
||||
std::string scheme;
|
||||
std::string host;
|
||||
int port = -1;
|
||||
std::string username;
|
||||
std::string password;
|
||||
|
||||
/// Initialize from URI such as http://username:password@host:port
|
||||
/// or http://host:port
|
||||
static Result<S3ProxyOptions> FromUri(const std::string& uri);
|
||||
static Result<S3ProxyOptions> FromUri(const ::arrow::internal::Uri& uri);
|
||||
|
||||
bool Equals(const S3ProxyOptions& other) const;
|
||||
};
|
||||
|
||||
/// Kind of credentials used for S3 access, stored in a single byte.
/// The explicit values spell out the numbering the enumerators already had.
enum class S3CredentialsKind : int8_t {
  /// Anonymous access (no credentials used)
  Anonymous = 0,
  /// Use default AWS credentials, configured through environment variables
  Default = 1,
  /// Use explicitly-provided access key pair
  Explicit = 2,
  /// Assume role through a role ARN
  Role = 3,
  /// Use web identity token to assume role, configured through environment variables
  WebIdentity = 4
};
|
||||
|
||||
/// Pure virtual class for describing custom S3 retry strategies
///
/// Implementations decide, from the details of a failed request, whether it
/// should be retried and how long to wait before doing so.
class S3RetryStrategy {
 public:
  virtual ~S3RetryStrategy() = default;

  /// Simple struct where each field corresponds to a field in Aws::Client::AWSError
  struct AWSErrorDetail {
    /// Corresponds to AWSError::GetErrorType()
    int error_type;
    /// Corresponds to AWSError::GetMessage()
    std::string message;
    /// Corresponds to AWSError::GetExceptionName()
    std::string exception_name;
    /// Corresponds to AWSError::ShouldRetry()
    bool should_retry;
  };

  /// Returns true if the S3 request resulting in the provided error should be retried.
  virtual bool ShouldRetry(const AWSErrorDetail& error, int64_t attempted_retries) = 0;

  /// Returns the time in milliseconds the S3 client should sleep for until retrying.
  virtual int64_t CalculateDelayBeforeNextRetry(const AWSErrorDetail& error,
                                                int64_t attempted_retries) = 0;
};
|
||||
|
||||
/// Options for the S3FileSystem implementation.
|
||||
struct ARROW_EXPORT S3Options {
|
||||
/// \brief AWS region to connect to.
|
||||
///
|
||||
/// If unset, the AWS SDK will choose a default value. The exact algorithm
|
||||
/// depends on the SDK version. Before 1.8, the default is hardcoded
|
||||
/// to "us-east-1". Since 1.8, several heuristics are used to determine
|
||||
/// the region (environment variables, configuration profile, EC2 metadata
|
||||
/// server).
|
||||
std::string region;
|
||||
|
||||
/// If non-empty, override region with a connect string such as "localhost:9000"
|
||||
// XXX perhaps instead take a URL like "http://localhost:9000"?
|
||||
std::string endpoint_override;
|
||||
/// S3 connection transport, default "https"
|
||||
std::string scheme = "https";
|
||||
|
||||
/// ARN of role to assume
|
||||
std::string role_arn;
|
||||
/// Optional identifier for an assumed role session.
|
||||
std::string session_name;
|
||||
/// Optional external identifier to pass to STS when assuming a role
|
||||
std::string external_id;
|
||||
/// Frequency (in seconds) to refresh temporary credentials from assumed role
|
||||
int load_frequency;
|
||||
|
||||
/// If connection is through a proxy, set options here
|
||||
S3ProxyOptions proxy_options;
|
||||
|
||||
/// AWS credentials provider
|
||||
std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider;
|
||||
|
||||
/// Type of credentials being used. Set along with credentials_provider.
|
||||
S3CredentialsKind credentials_kind = S3CredentialsKind::Default;
|
||||
|
||||
/// Whether OutputStream writes will be issued in the background, without blocking.
|
||||
bool background_writes = true;
|
||||
|
||||
/// \brief Default metadata for OpenOutputStream.
|
||||
///
|
||||
/// This will be ignored if non-empty metadata is passed to OpenOutputStream.
|
||||
std::shared_ptr<const KeyValueMetadata> default_metadata;
|
||||
|
||||
/// Optional retry strategy to determine which error types should be retried, and the
|
||||
/// delay between retries.
|
||||
std::shared_ptr<S3RetryStrategy> retry_strategy;
|
||||
|
||||
S3Options();
|
||||
|
||||
/// Configure with the default AWS credentials provider chain.
|
||||
void ConfigureDefaultCredentials();
|
||||
|
||||
/// Configure with anonymous credentials. This will only let you access public buckets.
|
||||
void ConfigureAnonymousCredentials();
|
||||
|
||||
/// Configure with explicit access and secret key.
|
||||
void ConfigureAccessKey(const std::string& access_key, const std::string& secret_key,
|
||||
const std::string& session_token = "");
|
||||
|
||||
/// Configure with credentials from an assumed role.
|
||||
void ConfigureAssumeRoleCredentials(
|
||||
const std::string& role_arn, const std::string& session_name = "",
|
||||
const std::string& external_id = "", int load_frequency = 900,
|
||||
const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);
|
||||
|
||||
/// Configure with credentials from role assumed using a web identity token
|
||||
void ConfigureAssumeRoleWithWebIdentityCredentials();
|
||||
|
||||
std::string GetAccessKey() const;
|
||||
std::string GetSecretKey() const;
|
||||
std::string GetSessionToken() const;
|
||||
|
||||
bool Equals(const S3Options& other) const;
|
||||
|
||||
/// \brief Initialize with default credentials provider chain
|
||||
///
|
||||
/// This is recommended if you use the standard AWS environment variables
|
||||
/// and/or configuration file.
|
||||
static S3Options Defaults();
|
||||
|
||||
/// \brief Initialize with anonymous credentials.
|
||||
///
|
||||
/// This will only let you access public buckets.
|
||||
static S3Options Anonymous();
|
||||
|
||||
/// \brief Initialize with explicit access and secret key.
|
||||
///
|
||||
/// Optionally, a session token may also be provided for temporary credentials
|
||||
/// (from STS).
|
||||
static S3Options FromAccessKey(const std::string& access_key,
|
||||
const std::string& secret_key,
|
||||
const std::string& session_token = "");
|
||||
|
||||
/// \brief Initialize from an assumed role.
|
||||
static S3Options FromAssumeRole(
|
||||
const std::string& role_arn, const std::string& session_name = "",
|
||||
const std::string& external_id = "", int load_frequency = 900,
|
||||
const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);
|
||||
|
||||
/// \brief Initialize from an assumed role with web-identity.
|
||||
/// Uses the AWS SDK which uses environment variables to
|
||||
/// generate temporary credentials.
|
||||
static S3Options FromAssumeRoleWithWebIdentity();
|
||||
|
||||
static Result<S3Options> FromUri(const ::arrow::internal::Uri& uri,
|
||||
std::string* out_path = NULLPTR);
|
||||
static Result<S3Options> FromUri(const std::string& uri,
|
||||
std::string* out_path = NULLPTR);
|
||||
};
|
||||
|
||||
/// S3-backed FileSystem implementation.
|
||||
///
|
||||
/// Some implementation notes:
|
||||
/// - buckets are special and the operations available on them may be limited
|
||||
/// or more expensive than desired.
|
||||
class ARROW_EXPORT S3FileSystem : public FileSystem {
|
||||
public:
|
||||
~S3FileSystem() override;
|
||||
|
||||
std::string type_name() const override { return "s3"; }
|
||||
|
||||
/// Return the original S3 options when constructing the filesystem
|
||||
S3Options options() const;
|
||||
/// Return the actual region this filesystem connects to
|
||||
std::string region() const;
|
||||
|
||||
bool Equals(const FileSystem& other) const override;
|
||||
|
||||
/// \cond FALSE
|
||||
using FileSystem::GetFileInfo;
|
||||
/// \endcond
|
||||
Result<FileInfo> GetFileInfo(const std::string& path) override;
|
||||
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
|
||||
|
||||
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
|
||||
|
||||
Status CreateDir(const std::string& path, bool recursive = true) override;
|
||||
|
||||
Status DeleteDir(const std::string& path) override;
|
||||
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
|
||||
Future<> DeleteDirContentsAsync(const std::string& path,
|
||||
bool missing_dir_ok = false) override;
|
||||
Status DeleteRootDirContents() override;
|
||||
|
||||
Status DeleteFile(const std::string& path) override;
|
||||
|
||||
Status Move(const std::string& src, const std::string& dest) override;
|
||||
|
||||
Status CopyFile(const std::string& src, const std::string& dest) override;
|
||||
|
||||
/// Create a sequential input stream for reading from a S3 object.
|
||||
///
|
||||
/// NOTE: Reads from the stream will be synchronous and unbuffered.
|
||||
/// You way want to wrap the stream in a BufferedInputStream or use
|
||||
/// a custom readahead strategy to avoid idle waits.
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
|
||||
const std::string& path) override;
|
||||
/// Create a sequential input stream for reading from a S3 object.
|
||||
///
|
||||
/// This override avoids a HEAD request by assuming the FileInfo
|
||||
/// contains correct information.
|
||||
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
|
||||
|
||||
/// Create a random access file for reading from a S3 object.
|
||||
///
|
||||
/// See OpenInputStream for performance notes.
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const std::string& path) override;
|
||||
/// Create a random access file for reading from a S3 object.
|
||||
///
|
||||
/// This override avoids a HEAD request by assuming the FileInfo
|
||||
/// contains correct information.
|
||||
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
|
||||
const FileInfo& info) override;
|
||||
|
||||
/// Create a sequential output stream for writing to a S3 object.
|
||||
///
|
||||
/// NOTE: Writes to the stream will be buffered. Depending on
|
||||
/// S3Options.background_writes, they can be synchronous or not.
|
||||
/// It is recommended to enable background_writes unless you prefer
|
||||
/// implementing your own background execution strategy.
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
|
||||
/// Create a S3FileSystem instance from the given options.
|
||||
static Result<std::shared_ptr<S3FileSystem>> Make(
|
||||
const S3Options& options, const io::IOContext& = io::default_io_context());
|
||||
|
||||
protected:
|
||||
explicit S3FileSystem(const S3Options& options, const io::IOContext&);
|
||||
|
||||
class Impl;
|
||||
std::shared_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
/// \brief Log levels selectable through S3GlobalOptions.
///
/// Enumerators take consecutive values, starting with Off = 0.
enum class S3LogLevel : int8_t {
  Off,
  Fatal,
  Error,
  Warn,
  Info,
  Debug,
  Trace
};
|
||||
|
||||
/// \brief Global options for the S3 APIs, as accepted by InitializeS3().
struct ARROW_EXPORT S3GlobalOptions {
  /// Requested log level. No default value is declared here, so it has to be
  /// set explicitly by whoever fills in the struct.
  S3LogLevel log_level;
};
|
||||
|
||||
/// Initialize the S3 APIs. It is required to call this function at least once
|
||||
/// before using S3FileSystem.
|
||||
ARROW_EXPORT
|
||||
Status InitializeS3(const S3GlobalOptions& options);
|
||||
|
||||
/// Ensure the S3 APIs are initialized, but only if not already done.
|
||||
/// If necessary, this will call InitializeS3() with some default options.
|
||||
ARROW_EXPORT
|
||||
Status EnsureS3Initialized();
|
||||
|
||||
/// Shutdown the S3 APIs.
|
||||
ARROW_EXPORT
|
||||
Status FinalizeS3();
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<std::string> ResolveS3BucketRegion(const std::string& bucket);
|
||||
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
@ -0,0 +1,248 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/filesystem/mockfs.h"
|
||||
#include "arrow/testing/visibility.h"
|
||||
#include "arrow/util/counting_semaphore.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
|
||||
// Time slack, expressed in seconds.
// NOTE(review): presumably the tolerance allowed when tests compare
// timestamps -- confirm at the use sites.
static constexpr double kTimeSlack = 2.0; // In seconds
|
||||
|
||||
// Build a FileInfo for `path` tagged with FileType::File.
static inline FileInfo File(std::string path) {
  FileInfo file_info(std::move(path), FileType::File);
  return file_info;
}
|
||||
|
||||
// Build a FileInfo for `path` tagged with FileType::Directory.
static inline FileInfo Dir(std::string path) {
  FileInfo dir_info(std::move(path), FileType::Directory);
  return dir_info;
}
|
||||
|
||||
// A subclass of MockFileSystem that blocks operations until an unlock method is
|
||||
// called.
|
||||
//
|
||||
// This is intended for testing fine-grained ordering of filesystem operations.
|
||||
//
|
||||
// N.B. Only OpenOutputStream supports gating at the moment but this is simply because
|
||||
// it is all that has been needed so far. Feel free to add support for more methods
|
||||
// as required.
|
||||
class ARROW_TESTING_EXPORT GatedMockFilesystem : public internal::MockFileSystem {
|
||||
public:
|
||||
GatedMockFilesystem(TimePoint current_time,
|
||||
const io::IOContext& = io::default_io_context());
|
||||
~GatedMockFilesystem() override;
|
||||
|
||||
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
|
||||
const std::string& path,
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
|
||||
|
||||
// Wait until at least num_waiters are waiting on OpenOutputStream
|
||||
Status WaitForOpenOutputStream(uint32_t num_waiters);
|
||||
// Unlock `num_waiters` individual calls to OpenOutputStream
|
||||
Status UnlockOpenOutputStream(uint32_t num_waiters);
|
||||
|
||||
private:
|
||||
util::CountingSemaphore open_output_sem_;
|
||||
};
|
||||
|
||||
// NOTE(review): only declared here; presumably creates `path` on `fs` with
// `data` as its contents -- confirm against the definition.
ARROW_TESTING_EXPORT
void CreateFile(FileSystem* fs, const std::string& path, const std::string& data);

// Sort a vector of FileInfo by lexicographic path order
ARROW_TESTING_EXPORT
void SortInfos(FileInfoVector* infos);

// NOTE(review): presumably drains `gen` into `out_infos` -- confirm against
// the definition.
ARROW_TESTING_EXPORT
void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos);

// AssertFileInfo overloads taking an already-obtained FileInfo. Each overload
// accepts the expected path and type, optionally followed by the expected
// mtime and/or size.
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type);

ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
                    TimePoint mtime);

ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
                    TimePoint mtime, int64_t size);

ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
                    int64_t size);

// AssertFileInfo overloads taking a FileSystem and a path instead of a
// FileInfo; the trailing parameters mirror the overloads above.
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type);

ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type,
                    TimePoint mtime);

ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type,
                    TimePoint mtime, int64_t size);

ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type, int64_t size);

// NOTE(review): presumably checks that the file at `path` on `fs` holds
// exactly `expected_data` -- confirm against the definition.
ARROW_TESTING_EXPORT
void AssertFileContents(FileSystem* fs, const std::string& path,
                        const std::string& expected_data);
|
||||
|
||||
// Assert that duration `d`, converted to floating-point seconds, lies within
// the inclusive range [min_secs, max_secs].
template <typename Duration>
void AssertDurationBetween(Duration d, double min_secs, double max_secs) {
  using FloatSeconds = std::chrono::duration<double>;
  const auto elapsed_secs = std::chrono::duration_cast<FloatSeconds>(d).count();
  ASSERT_GE(elapsed_secs, min_secs);
  ASSERT_LE(elapsed_secs, max_secs);
}
|
||||
|
||||
// Generic tests for FileSystem implementations.
|
||||
// To use this class, subclass both from it and ::testing::Test,
|
||||
// implement GetEmptyFileSystem(), and use GENERIC_FS_TEST_FUNCTIONS()
|
||||
// to define the various tests.
|
||||
class ARROW_TESTING_EXPORT GenericFileSystemTest {
|
||||
public:
|
||||
virtual ~GenericFileSystemTest();
|
||||
|
||||
void TestEmpty();
|
||||
void TestNormalizePath();
|
||||
void TestCreateDir();
|
||||
void TestDeleteDir();
|
||||
void TestDeleteDirContents();
|
||||
void TestDeleteRootDirContents();
|
||||
void TestDeleteFile();
|
||||
void TestDeleteFiles();
|
||||
void TestMoveFile();
|
||||
void TestMoveDir();
|
||||
void TestCopyFile();
|
||||
void TestGetFileInfo();
|
||||
void TestGetFileInfoVector();
|
||||
void TestGetFileInfoSelector();
|
||||
void TestGetFileInfoSelectorWithRecursion();
|
||||
void TestGetFileInfoAsync();
|
||||
void TestGetFileInfoGenerator();
|
||||
void TestOpenOutputStream();
|
||||
void TestOpenAppendStream();
|
||||
void TestOpenInputStream();
|
||||
void TestOpenInputStreamWithFileInfo();
|
||||
void TestOpenInputStreamAsync();
|
||||
void TestOpenInputFile();
|
||||
void TestOpenInputFileWithFileInfo();
|
||||
void TestOpenInputFileAsync();
|
||||
void TestSpecialChars();
|
||||
|
||||
protected:
|
||||
// This function should return the filesystem under test.
|
||||
virtual std::shared_ptr<FileSystem> GetEmptyFileSystem() = 0;
|
||||
|
||||
// Override the following functions to specify deviations from expected
|
||||
// filesystem semantics.
|
||||
// - Whether the filesystem may "implicitly" create intermediate directories
|
||||
virtual bool have_implicit_directories() const { return false; }
|
||||
// - Whether the filesystem may allow writing a file "over" a directory
|
||||
virtual bool allow_write_file_over_dir() const { return false; }
|
||||
// - Whether the filesystem allows reading a directory
|
||||
virtual bool allow_read_dir_as_file() const { return false; }
|
||||
// - Whether the filesystem allows moving a directory
|
||||
virtual bool allow_move_dir() const { return true; }
|
||||
// - Whether the filesystem allows moving a directory "over" a non-empty destination
|
||||
virtual bool allow_move_dir_over_non_empty_dir() const { return false; }
|
||||
// - Whether the filesystem allows appending to a file
|
||||
virtual bool allow_append_to_file() const { return true; }
|
||||
// - Whether the filesystem allows appending to a new (not existent yet) file
|
||||
virtual bool allow_append_to_new_file() const { return true; }
|
||||
// - Whether the filesystem supports directory modification times
|
||||
virtual bool have_directory_mtimes() const { return true; }
|
||||
// - Whether some directory tree deletion tests may fail randomly
|
||||
virtual bool have_flaky_directory_tree_deletion() const { return false; }
|
||||
// - Whether the filesystem stores some metadata alongside files
|
||||
virtual bool have_file_metadata() const { return false; }
|
||||
|
||||
void TestEmpty(FileSystem* fs);
|
||||
void TestNormalizePath(FileSystem* fs);
|
||||
void TestCreateDir(FileSystem* fs);
|
||||
void TestDeleteDir(FileSystem* fs);
|
||||
void TestDeleteDirContents(FileSystem* fs);
|
||||
void TestDeleteRootDirContents(FileSystem* fs);
|
||||
void TestDeleteFile(FileSystem* fs);
|
||||
void TestDeleteFiles(FileSystem* fs);
|
||||
void TestMoveFile(FileSystem* fs);
|
||||
void TestMoveDir(FileSystem* fs);
|
||||
void TestCopyFile(FileSystem* fs);
|
||||
void TestGetFileInfo(FileSystem* fs);
|
||||
void TestGetFileInfoVector(FileSystem* fs);
|
||||
void TestGetFileInfoSelector(FileSystem* fs);
|
||||
void TestGetFileInfoSelectorWithRecursion(FileSystem* fs);
|
||||
void TestGetFileInfoAsync(FileSystem* fs);
|
||||
void TestGetFileInfoGenerator(FileSystem* fs);
|
||||
void TestOpenOutputStream(FileSystem* fs);
|
||||
void TestOpenAppendStream(FileSystem* fs);
|
||||
void TestOpenInputStream(FileSystem* fs);
|
||||
void TestOpenInputStreamWithFileInfo(FileSystem* fs);
|
||||
void TestOpenInputStreamAsync(FileSystem* fs);
|
||||
void TestOpenInputFile(FileSystem* fs);
|
||||
void TestOpenInputFileWithFileInfo(FileSystem* fs);
|
||||
void TestOpenInputFileAsync(FileSystem* fs);
|
||||
void TestSpecialChars(FileSystem* fs);
|
||||
};
|
||||
|
||||
// Define, through TEST_MACRO, a test called NAME in TEST_CLASS whose body
// simply calls the fixture's Test##NAME() member.
#define GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NAME) \
  TEST_MACRO(TEST_CLASS, NAME) { this->Test##NAME(); }

// Expand GENERIC_FS_TEST_FUNCTION once for each generic filesystem test.
#define GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_MACRO, TEST_CLASS) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, Empty) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NormalizePath) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CreateDir) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDir) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDirContents) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteRootDirContents) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFile) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFiles) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveFile) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveDir) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFile) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfo) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoVector) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelector) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelectorWithRecursion) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoAsync) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoGenerator) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenOutputStream) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenAppendStream) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStream) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamWithFileInfo) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamAsync) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFile) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileWithFileInfo) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileAsync) \
  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, SpecialChars)

// Define all generic tests for a plain fixture, using TEST_F.
#define GENERIC_FS_TEST_FUNCTIONS(TEST_CLASS) \
  GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_F, TEST_CLASS)

// Define all generic tests for a typed fixture, using TYPED_TEST.
#define GENERIC_FS_TYPED_TEST_FUNCTIONS(TEST_CLASS) \
  GENERIC_FS_TEST_FUNCTIONS_MACROS(TYPED_TEST, TEST_CLASS)
|
||||
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
@ -0,0 +1,49 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace arrow {
|
||||
namespace fs {
|
||||
|
||||
/// \brief Type of a FileSystem entry
enum class FileType : int8_t {
  /// The entry was not found
  NotFound,
  /// The entry exists, but its type is unknown
  ///
  /// This can designate a special file such as a Unix socket or character
  /// device, or Windows NUL / CON / ...
  Unknown,
  /// The entry is a regular file
  File,
  /// The entry is a directory
  Directory
};
|
||||
|
||||
// Forward declarations of the value types (declared as structs).
struct FileInfo;
struct FileSelector;

// Forward declarations of the filesystem classes.
class FileSystem;
class SubTreeFileSystem;
class SlowFileSystem;
class LocalFileSystem;
class S3FileSystem;
|
||||
|
||||
} // namespace fs
|
||||
} // namespace arrow
|
Reference in New Issue
Block a user