first commit

This commit is contained in:
Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

View File

@ -0,0 +1,28 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/util/config.h" // IWYU pragma: export
#include "arrow/filesystem/filesystem.h" // IWYU pragma: export
#include "arrow/filesystem/hdfs.h" // IWYU pragma: export
#include "arrow/filesystem/localfs.h" // IWYU pragma: export
#include "arrow/filesystem/mockfs.h" // IWYU pragma: export
#ifdef ARROW_S3
#include "arrow/filesystem/s3fs.h" // IWYU pragma: export
#endif

View File

@ -0,0 +1,541 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <chrono>
#include <cstdint>
#include <functional>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/filesystem/type_fwd.h"
#include "arrow/io/interfaces.h"
#include "arrow/type_fwd.h"
#include "arrow/util/compare.h"
#include "arrow/util/macros.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
#include "arrow/util/windows_fixup.h"
namespace arrow {
namespace fs {
/// \brief A system clock time point expressed as a 64-bit (or more) number of
/// nanoseconds since the epoch.
using TimePoint =
std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
/// \brief Return a string form of the given FileType value.
ARROW_EXPORT std::string ToString(FileType);
/// \brief Write the given FileType value to an output stream.
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType);
/// Sentinel used as the initial value of FileInfo's size field
/// (i.e. no size has been set).
static const int64_t kNoSize = -1;
/// Sentinel used as the initial value of FileInfo's mtime field
/// (i.e. no modification time has been set). It is a time point whose
/// duration since the epoch is -1 tick.
static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1));
/// \brief Description of a single filesystem entry
///
/// Holds the entry's path and type together with an optional size and
/// modification time. The latter two start out as the kNoSize / kNoTime
/// sentinels until set explicitly.
struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> {
  FileInfo() = default;
  FileInfo(FileInfo&&) = default;
  FileInfo& operator=(FileInfo&&) = default;
  FileInfo(const FileInfo&) = default;
  FileInfo& operator=(const FileInfo&) = default;

  /// Build an info for `path` with the given type; size and mtime keep
  /// their sentinel defaults.
  explicit FileInfo(std::string path, FileType type = FileType::Unknown)
      : path_(std::move(path)), type_(type) {}

  /// The file type
  FileType type() const { return type_; }
  void set_type(FileType type) { type_ = type; }

  /// The full file path in the filesystem
  const std::string& path() const { return path_; }
  void set_path(std::string path) { path_ = std::move(path); }

  /// The file base name (component after the last directory separator)
  std::string base_name() const;

  /// The directory base name (component before the file base name).
  std::string dir_name() const;

  /// The size in bytes, if available
  ///
  /// Only regular files are guaranteed to have a size.
  int64_t size() const { return size_; }
  void set_size(int64_t size) { size_ = size; }

  /// The file extension (excluding the dot)
  std::string extension() const;

  /// The time of last modification, if available
  TimePoint mtime() const { return mtime_; }
  void set_mtime(TimePoint mtime) { mtime_ = mtime; }

  /// True when the entry's type is FileType::File.
  bool IsFile() const { return type() == FileType::File; }

  /// True when the entry's type is FileType::Directory.
  bool IsDirectory() const { return type() == FileType::Directory; }

  /// Field-wise equality: type, path, size and mtime must all match.
  bool Equals(const FileInfo& other) const {
    if (!(type() == other.type())) {
      return false;
    }
    if (!(path() == other.path())) {
      return false;
    }
    if (!(size() == other.size())) {
      return false;
    }
    return mtime() == other.mtime();
  }

  std::string ToString() const;

  /// Function object usable both as a strict-weak ordering and as a hasher,
  /// in each case looking only at the path. This lets FileInfo objects be
  /// sorted or used as keys in STL containers.
  struct ByPath {
    /// Ordering: compares the two paths lexicographically.
    bool operator()(const FileInfo& l, const FileInfo& r) const {
      const std::string& lhs_path = l.path();
      const std::string& rhs_path = r.path();
      return lhs_path < rhs_path;
    }

    /// Hashing: the hash of the path string.
    size_t operator()(const FileInfo& i) const {
      const std::hash<std::string> path_hasher{};
      return path_hasher(i.path());
    }
  };

 protected:
  std::string path_;
  FileType type_ = FileType::Unknown;
  int64_t size_ = kNoSize;
  TimePoint mtime_ = kNoTime;
};

/// \brief Write a FileInfo to an output stream.
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&);
/// \brief Selection criteria passed to filesystem listing APIs
struct ARROW_EXPORT FileSelector {
  /// The directory whose entries are selected.
  /// A path that exists but is not a directory should be treated as an error.
  std::string base_dir;

  /// What happens when `base_dir` is absent from the filesystem:
  /// false means an error is returned, true means the selection is empty.
  bool allow_not_found = false;

  /// Whether subdirectories are descended into.
  bool recursive = false;

  /// The maximum number of subdirectories to recurse into.
  int32_t max_recursion = INT32_MAX;

  // Deliberately user-provided rather than `= default`, so the type stays a
  // non-aggregate; the values come from the member initializers above.
  FileSelector() {}
};
/// \brief FileSystem, path pair
///
/// Bundles a filesystem handle with a path string. It is the element type of
/// the source and destination lists accepted by CopyFiles.
struct ARROW_EXPORT FileLocator {
/// The filesystem half of the pair.
std::shared_ptr<FileSystem> filesystem;
/// The path half of the pair.
std::string path;
};
/// A batch of FileInfo entries.
using FileInfoVector = std::vector<FileInfo>;
/// A callable that takes no arguments and returns a Future of a
/// FileInfoVector each time it is invoked.
using FileInfoGenerator = std::function<Future<FileInfoVector>()>;
} // namespace fs
/// IterationTraits specialization for FileInfoVector: an empty vector acts
/// as the end-of-iteration marker.
template <>
struct IterationTraits<fs::FileInfoVector> {
  /// Produce the end marker, i.e. an empty vector.
  static fs::FileInfoVector End() { return fs::FileInfoVector(); }

  /// A value is the end marker exactly when it holds no entries.
  static bool IsEnd(const fs::FileInfoVector& val) { return val.size() == 0; }
};
namespace fs {
/// \brief Abstract file system API
class ARROW_EXPORT FileSystem : public std::enable_shared_from_this<FileSystem> {
 public:
  virtual ~FileSystem();

  /// A name identifying the concrete implementation.
  virtual std::string type_name() const = 0;

  /// EXPERIMENTAL: The IOContext associated with this filesystem.
  const io::IOContext& io_context() const { return io_context_; }

  /// Normalize path for the given filesystem
  ///
  /// The default implementation of this method is a no-op, but subclasses
  /// may allow normalizing irregular path forms (such as Windows local paths).
  virtual Result<std::string> NormalizePath(std::string path);

  /// Compare this filesystem with another one; the criteria are defined by
  /// each implementation.
  virtual bool Equals(const FileSystem& other) const = 0;

  /// Pointer-taking convenience overload of Equals.
  ///
  /// A null `other` compares unequal. The pointer is tested before it is
  /// dereferenced, because dereferencing an empty shared_ptr is undefined
  /// behavior.
  virtual bool Equals(const std::shared_ptr<FileSystem>& other) const {
    return other && Equals(*other);
  }

  /// Get info for the given target.
  ///
  /// Any symlink is automatically dereferenced, recursively.
  /// A nonexistent or unreachable file returns an Ok status and
  /// has a FileType of value NotFound. An error status indicates
  /// a truly exceptional condition (low-level I/O error, etc.).
  virtual Result<FileInfo> GetFileInfo(const std::string& path) = 0;

  /// Same, for many targets at once.
  virtual Result<FileInfoVector> GetFileInfo(const std::vector<std::string>& paths);

  /// Same, according to a selector.
  ///
  /// The selector's base directory will not be part of the results, even if
  /// it exists.
  /// If it doesn't exist, see `FileSelector::allow_not_found`.
  virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select) = 0;

  /// Async version of GetFileInfo
  virtual Future<FileInfoVector> GetFileInfoAsync(const std::vector<std::string>& paths);

  /// Streaming async version of GetFileInfo
  ///
  /// The returned generator is not async-reentrant, i.e. you need to wait for
  /// the returned future to complete before calling the generator again.
  virtual FileInfoGenerator GetFileInfoGenerator(const FileSelector& select);

  /// Create a directory and subdirectories.
  ///
  /// This function succeeds if the directory already exists.
  virtual Status CreateDir(const std::string& path, bool recursive = true) = 0;

  /// Delete a directory and its contents, recursively.
  virtual Status DeleteDir(const std::string& path) = 0;

  /// Delete a directory's contents, recursively.
  ///
  /// Like DeleteDir, but doesn't delete the directory itself.
  /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents.
  virtual Status DeleteDirContents(const std::string& path,
                                   bool missing_dir_ok = false) = 0;

  /// Async version of DeleteDirContents.
  virtual Future<> DeleteDirContentsAsync(const std::string& path,
                                          bool missing_dir_ok = false);

  /// EXPERIMENTAL: Delete the root directory's contents, recursively.
  ///
  /// Implementations may decide to raise an error if this operation is
  /// too dangerous.
  // NOTE: may decide to remove this if it's deemed not useful
  virtual Status DeleteRootDirContents() = 0;

  /// Delete a file.
  virtual Status DeleteFile(const std::string& path) = 0;

  /// Delete many files.
  ///
  /// The default implementation issues individual delete operations in sequence.
  virtual Status DeleteFiles(const std::vector<std::string>& paths);

  /// Move / rename a file or directory.
  ///
  /// If the destination exists:
  /// - if it is a non-empty directory, an error is returned
  /// - otherwise, if it has the same type as the source, it is replaced
  /// - otherwise, behavior is unspecified (implementation-dependent).
  virtual Status Move(const std::string& src, const std::string& dest) = 0;

  /// Copy a file.
  ///
  /// If the destination exists and is a directory, an error is returned.
  /// Otherwise, it is replaced.
  virtual Status CopyFile(const std::string& src, const std::string& dest) = 0;

  /// Open an input stream for sequential reading.
  virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(
      const std::string& path) = 0;

  /// Open an input stream for sequential reading.
  ///
  /// This overload assumes the given FileInfo validly represents the file's
  /// characteristics, and may optimize access depending on them (for example
  /// avoid querying the file size or its existence).
  virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info);

  /// Open an input file for random access reading.
  virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const std::string& path) = 0;

  /// Open an input file for random access reading.
  ///
  /// This overload assumes the given FileInfo validly represents the file's
  /// characteristics, and may optimize access depending on them (for example
  /// avoid querying the file size or its existence).
  virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const FileInfo& info);

  /// Async version of OpenInputStream
  virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
      const std::string& path);

  /// Async version of OpenInputStream
  virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
      const FileInfo& info);

  /// Async version of OpenInputFile
  virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
      const std::string& path);

  /// Async version of OpenInputFile
  virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
      const FileInfo& info);

  /// Open an output stream for sequential writing.
  ///
  /// If the target already exists, existing data is truncated.
  virtual Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;

  /// Non-virtual overload of OpenOutputStream that takes no metadata argument.
  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(const std::string& path);

  /// Open an output stream for appending.
  ///
  /// If the target doesn't exist, a new empty file is created.
  ///
  /// Note: some filesystem implementations do not support efficient appending
  /// to an existing file, in which case this method will return NotImplemented.
  /// Consider writing to multiple files (using e.g. the dataset layer) instead.
  virtual Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;

  /// Non-virtual overload of OpenAppendStream that takes no metadata argument.
  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(const std::string& path);

 protected:
  /// Store a copy of the given IOContext; only subclasses can construct.
  explicit FileSystem(const io::IOContext& io_context = io::default_io_context())
      : io_context_(io_context) {}

  io::IOContext io_context_;
  // Whether metadata operations (such as GetFileInfo or OpenInputStream)
  // are cheap enough that the default async variants don't bother with
  // a thread pool.
  bool default_async_is_sync_ = true;
};
/// \brief A FileSystem implementation that delegates to another
/// implementation after prepending a fixed base path.
///
/// This is useful to expose a logical view of a subtree of a filesystem,
/// for example a directory in a LocalFileSystem.
/// This works on abstract paths, i.e. paths using forward slashes and
/// a single root "/". Windows paths are not guaranteed to work.
/// This makes no security guarantee. For example, symlinks may allow one to
/// "escape" the subtree and access other parts of the underlying filesystem.
class ARROW_EXPORT SubTreeFileSystem : public FileSystem {
public:
// This constructor may abort if base_path is invalid.
explicit SubTreeFileSystem(const std::string& base_path,
std::shared_ptr<FileSystem> base_fs);
~SubTreeFileSystem() override;
/// Always "subtree" for this implementation.
std::string type_name() const override { return "subtree"; }
/// The stored base path (returned by value).
std::string base_path() const { return base_path_; }
/// The filesystem that operations are delegated to.
std::shared_ptr<FileSystem> base_fs() const { return base_fs_; }
Result<std::string> NormalizePath(std::string path) override;
bool Equals(const FileSystem& other) const override;
/// \cond FALSE
// Re-expose the base-class GetFileInfo overloads, which the overrides
// declared below would otherwise hide.
using FileSystem::GetFileInfo;
/// \endcond
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive = true) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;
Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
const std::string& path) override;
Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
const FileInfo& info) override;
Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
const std::string& path) override;
Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
const FileInfo& info) override;
// The metadata argument defaults to an empty pointer on both stream openers,
// so they can be called with just a path through this type.
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
protected:
// Default constructor reserved for subclasses; leaves base_path_ empty and
// base_fs_ null.
SubTreeFileSystem() {}
// Declared const, so it can only be set at construction time.
const std::string base_path_;
std::shared_ptr<FileSystem> base_fs_;
// Path-translation helpers, defined out of line.
// NOTE(review): judging by their names these map between subtree-relative
// paths and paths in base_fs_ -- confirm against the implementation.
Result<std::string> PrependBase(const std::string& s) const;
Result<std::string> PrependBaseNonEmpty(const std::string& s) const;
Result<std::string> StripBase(const std::string& s) const;
Status FixInfo(FileInfo* info) const;
static Result<std::string> NormalizeBasePath(
std::string base_path, const std::shared_ptr<FileSystem>& base_fs);
};
/// \brief A FileSystem implementation that delegates to another
/// implementation but inserts latencies at various points.
class ARROW_EXPORT SlowFileSystem : public FileSystem {
public:
/// Wrap `base_fs`, using the supplied latency generator.
SlowFileSystem(std::shared_ptr<FileSystem> base_fs,
std::shared_ptr<io::LatencyGenerator> latencies);
/// Wrap `base_fs`, specifying only an average latency.
/// NOTE(review): the unit of `average_latency` is not visible in this
/// header -- presumably seconds; confirm against the implementation.
SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency);
/// Same as above, with an explicit seed.
/// NOTE(review): presumably seeds the random latency source so runs are
/// reproducible -- confirm.
SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency,
int32_t seed);
/// Always "slow" for this implementation.
std::string type_name() const override { return "slow"; }
bool Equals(const FileSystem& other) const override;
// Re-expose the base-class GetFileInfo overloads, which the overrides
// declared below would otherwise hide.
using FileSystem::GetFileInfo;
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive = true) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;
// The metadata argument defaults to an empty pointer on both stream openers,
// so they can be called with just a path through this type.
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
protected:
// The wrapped filesystem.
std::shared_ptr<FileSystem> base_fs_;
// The latency generator held by this filesystem.
std::shared_ptr<io::LatencyGenerator> latencies_;
};
/// \defgroup filesystem-factories Functions for creating FileSystem instances
///
/// @{
/// \brief Create a new FileSystem by URI
///
/// Recognized schemes are "file", "mock", "hdfs" and "s3fs".
///
/// \param[in] uri a URI-based path, ex: file:///some/local/path
/// \param[out] out_path (optional) Path inside the filesystem.
/// May be left as NULLPTR (the default) when the path is not needed.
/// \return the FileSystem instance, wrapped in a Result.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
std::string* out_path = NULLPTR);
/// \brief Create a new FileSystem by URI with a custom IO context
///
/// Recognized schemes are "file", "mock", "hdfs" and "s3fs".
///
/// \param[in] uri a URI-based path, ex: file:///some/local/path
/// \param[in] io_context an IOContext which will be associated with the filesystem
/// \param[out] out_path (optional) Path inside the filesystem.
/// May be left as NULLPTR (the default) when the path is not needed.
/// \return the FileSystem instance, wrapped in a Result.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
const io::IOContext& io_context,
std::string* out_path = NULLPTR);
/// \brief Create a new FileSystem by URI or local path
///
/// Same as FileSystemFromUri, but in addition also recognize non-URIs
/// and treat them as local filesystem paths. Only absolute local filesystem
/// paths are allowed.
///
/// \param[in] uri a URI, or an absolute local filesystem path
/// \param[out] out_path (optional) defaults to NULLPTR
/// \return the FileSystem instance, wrapped in a Result.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
const std::string& uri, std::string* out_path = NULLPTR);
/// \brief Create a new FileSystem by URI or local path with a custom IO context
///
/// Same as FileSystemFromUri, but in addition also recognize non-URIs
/// and treat them as local filesystem paths. Only absolute local filesystem
/// paths are allowed.
///
/// \param[in] uri a URI, or an absolute local filesystem path
/// \param[in] io_context the IOContext passed along for the filesystem
/// \param[out] out_path (optional) defaults to NULLPTR
/// \return the FileSystem instance, wrapped in a Result.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
const std::string& uri, const io::IOContext& io_context,
std::string* out_path = NULLPTR);
/// @}
/// \brief Copy files, including from one FileSystem to another
///
/// If a source and destination are resident in the same FileSystem FileSystem::CopyFile
/// will be used, otherwise the file will be opened as a stream in both FileSystems and
/// chunks copied from the source to the destination. No directories will be created.
///
/// \param[in] sources the files to copy from
/// \param[in] destinations the files to copy to
/// NOTE(review): presumably paired element-by-element with `sources` and
/// therefore of the same length -- confirm against the implementation.
/// \param[in] io_context the IOContext to use (defaults to the default IO context)
/// \param[in] chunk_size size of the copied chunks, 1024 * 1024 by default
/// NOTE(review): presumably expressed in bytes -- confirm.
/// \param[in] use_threads defaults to true
/// NOTE(review): presumably enables copying in parallel -- confirm.
ARROW_EXPORT
Status CopyFiles(const std::vector<FileLocator>& sources,
const std::vector<FileLocator>& destinations,
const io::IOContext& io_context = io::default_io_context(),
int64_t chunk_size = 1024 * 1024, bool use_threads = true);
/// \brief Copy selected files, including from one FileSystem to another
///
/// Directories will be created under the destination base directory as needed.
///
/// \param[in] source_fs the filesystem to copy from
/// \param[in] source_sel the selector choosing which entries of `source_fs` to copy
/// \param[in] destination_fs the filesystem to copy to
/// \param[in] destination_base_dir the directory in `destination_fs` under which
/// the copies are placed
/// \param[in] io_context the IOContext to use (defaults to the default IO context)
/// \param[in] chunk_size 1024 * 1024 by default
/// NOTE(review): presumably the size in bytes of each copied chunk -- confirm.
/// \param[in] use_threads defaults to true
/// NOTE(review): presumably enables copying in parallel -- confirm.
ARROW_EXPORT
Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs,
const FileSelector& source_sel,
const std::shared_ptr<FileSystem>& destination_fs,
const std::string& destination_base_dir,
const io::IOContext& io_context = io::default_io_context(),
int64_t chunk_size = 1024 * 1024, bool use_threads = true);
/// \brief Global filesystem settings, passed to Initialize()
///
/// Both fields default to an empty string.
struct FileSystemGlobalOptions {
/// Path to a single PEM file holding all TLS CA certificates
///
/// If empty, the underlying TLS library's defaults will be used.
std::string tls_ca_file_path;
/// Path to a directory holding TLS CA certificates in individual PEM files
/// named along the OpenSSL "hashed" format.
///
/// If empty, the underlying TLS library's defaults will be used.
std::string tls_ca_dir_path;
};
/// EXPERIMENTAL: optional global initialization routine
///
/// This is for environments (such as manylinux) where the path
/// to TLS CA certificates needs to be configured at runtime.
///
/// \param[in] options the TLS CA certificate locations to configure
/// \return a Status reporting the outcome
ARROW_EXPORT
Status Initialize(const FileSystemGlobalOptions& options);
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,197 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/util/uri.h"
namespace arrow {
namespace fs {
// Forward declaration only; the definition is not part of this header.
struct GcsCredentials;
/// Options for the GcsFileSystem implementation.
struct ARROW_EXPORT GcsOptions {
/// The credentials to use, held through the opaque GcsCredentials type.
std::shared_ptr<GcsCredentials> credentials;
/// NOTE(review): presumably replaces the default GCS endpoint when
/// non-empty -- confirm against the implementation.
std::string endpoint_override;
/// NOTE(review): presumably the URI scheme (e.g. "https") used to reach the
/// endpoint -- confirm against the implementation.
std::string scheme;
/// \brief Location to use for creating buckets.
std::string default_bucket_location;
/// \brief Default metadata for OpenOutputStream.
///
/// This will be ignored if non-empty metadata is passed to OpenOutputStream.
std::shared_ptr<const KeyValueMetadata> default_metadata;
/// Compare these options with another set of options.
bool Equals(const GcsOptions& other) const;
/// \brief Initialize with Google Default Credentials
///
/// Create options configured to use [Application Default Credentials][aip/4110]. The
/// details of this mechanism are too involved to describe here, but suffice it to say
/// that applications can override any defaults using an environment variable
/// (`GOOGLE_APPLICATION_CREDENTIALS`), and that the defaults work with most Google
/// Cloud Platform deployment environments (GCE, GKE, Cloud Run, etc.), and that they
/// have the same behavior as the `gcloud` CLI tool on your workstation.
///
/// \see https://cloud.google.com/docs/authentication
///
/// [aip/4110]: https://google.aip.dev/auth/4110
static GcsOptions Defaults();
/// \brief Initialize with anonymous credentials
static GcsOptions Anonymous();
/// \brief Initialize with access token
///
/// These credentials are useful when using an out-of-band mechanism to fetch access
/// tokens. Note that access tokens are time limited, you will need to manually refresh
/// the tokens created by the out-of-band mechanism.
///
/// \param[in] access_token the token string
/// \param[in] expiration the time point at which the token expires
static GcsOptions FromAccessToken(const std::string& access_token,
std::chrono::system_clock::time_point expiration);
/// \brief Initialize with service account impersonation
///
/// Service account impersonation allows one principal (a user or service account) to
/// impersonate a service account. It requires that the calling principal has the
/// necessary permissions *on* the service account.
///
/// \param[in] base_credentials the credentials of the calling principal
/// \param[in] target_service_account the service account to impersonate
static GcsOptions FromImpersonatedServiceAccount(
const GcsCredentials& base_credentials, const std::string& target_service_account);
/// Creates service account credentials from a JSON object in string form.
///
/// The @p json_object is expected to be in the format described by [aip/4112]. Such an
/// object contains the identity of a service account, as well as a private key that can
/// be used to sign tokens, showing the caller was holding the private key.
///
/// In GCP one can create several "keys" for each service account, and these keys are
/// downloaded as a JSON "key file". The contents of such a file are in the format
/// required by this function. Remember that key files and their contents should be
/// treated as any other secret with security implications, think of them as passwords
/// (because they are!), don't store them or output them where unauthorized persons may
/// read them.
///
/// Most applications should probably use default credentials, maybe pointing them to a
/// file with these contents. Using this function may be useful when the json object is
/// obtained from a Cloud Secret Manager or a similar service.
///
/// [aip/4112]: https://google.aip.dev/auth/4112
static GcsOptions FromServiceAccountCredentials(const std::string& json_object);
/// Initialize from URIs such as "gs://bucket/object".
///
/// NOTE(review): `out_path` presumably receives the path portion of the URI;
/// whether a null pointer is accepted is not visible here -- confirm.
static Result<GcsOptions> FromUri(const arrow::internal::Uri& uri,
std::string* out_path);
/// Same as above, taking the URI in string form.
static Result<GcsOptions> FromUri(const std::string& uri, std::string* out_path);
};
/// \brief GCS-backed FileSystem implementation.
///
/// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object
/// storage system for any amount of data. The main abstractions in GCS are buckets and
/// objects. A bucket is a namespace for objects, buckets can store any number of objects,
/// tens of millions and even billions is not uncommon. Each object contains a single
/// blob of data, up to 5TiB in size. Buckets are typically configured to keep a single
/// version of each object, but versioning can be enabled. Versioning is important because
/// objects are immutable, once created one cannot append data to the object or modify the
/// object data in any way.
///
/// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket
/// named `foo` no other customer can create a bucket with the same name. Note that a
/// principal (a user or service account) may only list the buckets they are entitled to,
/// and then only within a project. It is not possible to list "all" the buckets.
///
/// Within each bucket objects are in flat namespace. GCS does not have folders or
/// directories. However, following some conventions it is possible to emulate
/// directories. To this end, this class follows these conventions:
///
/// - All buckets are treated as directories at the "root"
/// - Creating a root directory results in a new bucket being created, this may be slower
///   than most GCS operations.
/// - The class creates marker objects for a directory, using a metadata attribute to
///   annotate the file.
/// - GCS can list all the objects with a given prefix, this is used to emulate listing
///   of directories.
/// - In object lists GCS can summarize all the objects with a common prefix as a single
///   entry, this is used to emulate non-recursive lists. Note that GCS list time is
///   proportional to the number of objects in the prefix. Listing recursively takes
///   almost the same time as non-recursive lists.
///
class ARROW_EXPORT GcsFileSystem : public FileSystem {
 public:
  ~GcsFileSystem() override = default;

  std::string type_name() const override;

  bool Equals(const FileSystem& other) const override;

  Result<FileInfo> GetFileInfo(const std::string& path) override;
  Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;

  // `recursive` defaults to true, as in the FileSystem base class and the
  // other implementations. Default arguments are resolved against the static
  // type, so without it a one-argument call through a GcsFileSystem would not
  // compile.
  Status CreateDir(const std::string& path, bool recursive = true) override;

  Status DeleteDir(const std::string& path) override;

  Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;

  /// This is not implemented in GcsFileSystem, as it would be too dangerous.
  Status DeleteRootDirContents() override;

  Status DeleteFile(const std::string& path) override;

  Status Move(const std::string& src, const std::string& dest) override;

  Status CopyFile(const std::string& src, const std::string& dest) override;

  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
      const std::string& path) override;
  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;

  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const std::string& path) override;
  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const FileInfo& info) override;

  // `metadata` defaults to an empty pointer, as in the other implementations.
  // This override hides the base class's path-only OpenOutputStream overload,
  // so the default is what keeps path-only calls available on this type.
  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;

  ARROW_DEPRECATED(
      "Deprecated. "
      "OpenAppendStream is unsupported on the GCS FileSystem.")
  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) override;

  /// Create a GcsFileSystem instance from the given options.
  static std::shared_ptr<GcsFileSystem> Make(
      const GcsOptions& options, const io::IOContext& = io::default_io_context());

 private:
  // Private: instances are obtained through Make().
  explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context);

  // Implementation class, defined out of line.
  class Impl;
  std::shared_ptr<Impl> impl_;
};
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,113 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/io/hdfs.h"
#include "arrow/util/uri.h"
namespace arrow {
namespace fs {
/// Options for the HDFS implementation.
struct ARROW_EXPORT HdfsOptions {
HdfsOptions() = default;
~HdfsOptions() = default;
/// Hdfs configuration options, contains host, port, driver
io::HdfsConnectionConfig connection_config;
/// Used by Hdfs OpenWritable Interface.
int32_t buffer_size = 0;
int16_t replication = 3;
int64_t default_block_size = 0;
void ConfigureEndPoint(std::string host, int port);
void ConfigureReplication(int16_t replication);
void ConfigureUser(std::string user_name);
void ConfigureBufferSize(int32_t buffer_size);
void ConfigureBlockSize(int64_t default_block_size);
void ConfigureKerberosTicketCachePath(std::string path);
void ConfigureExtraConf(std::string key, std::string val);
bool Equals(const HdfsOptions& other) const;
static Result<HdfsOptions> FromUri(const ::arrow::internal::Uri& uri);
static Result<HdfsOptions> FromUri(const std::string& uri);
};
/// HDFS-backed FileSystem implementation.
///
/// implementation notes:
/// - This is a wrapper of arrow/io/hdfs, so we can use FileSystem API to handle hdfs.
class ARROW_EXPORT HadoopFileSystem : public FileSystem {
public:
~HadoopFileSystem() override;
/// Always "hdfs" for this implementation.
std::string type_name() const override { return "hdfs"; }
/// The options of this filesystem, returned by value.
HdfsOptions options() const;
bool Equals(const FileSystem& other) const override;
/// \cond FALSE
// Re-expose the base-class GetFileInfo overloads, which the overrides
// declared below would otherwise hide.
using FileSystem::GetFileInfo;
/// \endcond
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive = true) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
// The metadata argument defaults to an empty pointer on both stream openers,
// so they can be called with just a path through this type.
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
/// Create a HadoopFileSystem instance from the given options.
static Result<std::shared_ptr<HadoopFileSystem>> Make(
const HdfsOptions& options, const io::IOContext& = io::default_io_context());
protected:
// Protected: instances are obtained through Make().
HadoopFileSystem(const HdfsOptions& options, const io::IOContext&);
// Implementation class, defined out of line.
class Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,113 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
namespace arrow {
namespace internal {
class Uri;
}
namespace fs {
/// Options for the LocalFileSystem implementation.
struct ARROW_EXPORT LocalFileSystemOptions {
/// Whether OpenInputStream and OpenInputFile return a mmap'ed file,
/// or a regular one.
bool use_mmap = false;
/// \brief Initialize with defaults
static LocalFileSystemOptions Defaults();
/// Compare these options with another LocalFileSystemOptions instance.
bool Equals(const LocalFileSystemOptions& other) const;
/// Build options from a parsed URI.
///
/// `out_path` is an output parameter; presumably it receives the local path
/// extracted from the URI -- confirm in the implementation.
static Result<LocalFileSystemOptions> FromUri(const ::arrow::internal::Uri& uri,
std::string* out_path);
};
/// \brief A FileSystem implementation accessing files on the local machine.
///
/// This class handles only `/`-separated paths. If desired, conversion
/// from Windows backslash-separated paths should be done by the caller.
/// Details such as symlinks are abstracted away (symlinks are always
/// followed, except when deleting an entry).
class ARROW_EXPORT LocalFileSystem : public FileSystem {
public:
/// Construct without explicit options.
explicit LocalFileSystem(const io::IOContext& = io::default_io_context());
/// Construct with the given options.
explicit LocalFileSystem(const LocalFileSystemOptions&,
const io::IOContext& = io::default_io_context());
~LocalFileSystem() override;
/// Filesystem type identifier; always "local".
std::string type_name() const override { return "local"; }
Result<std::string> NormalizePath(std::string path) override;
bool Equals(const FileSystem& other) const override;
/// Return a copy of the options held by this filesystem.
LocalFileSystemOptions options() const { return options_; }
/// \cond FALSE
// Keep the base-class GetFileInfo overloads visible next to the overrides below.
using FileSystem::GetFileInfo;
/// \endcond
// FileSystem interface overrides.
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive = true) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
protected:
// Options returned by options().
LocalFileSystemOptions options_;
};
namespace internal {
// Return whether the string is detected as a local absolute path.
// NOTE(review): the exact detection rules (e.g. Windows drive letters) are in
// the implementation file, not in this header.
ARROW_EXPORT
bool DetectAbsolutePath(const std::string& s);
} // namespace internal
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,132 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <iosfwd>
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/util/string_view.h"
#include "arrow/util/windows_fixup.h"
namespace arrow {
namespace fs {
namespace internal {
/// Directory entry of the mock filesystem (element type returned by
/// MockFileSystem::AllDirs()): a full path and a modification time.
struct MockDirInfo {
  std::string full_path;
  TimePoint mtime;

  /// Entries are equal when both the modification time and the path match.
  bool operator==(const MockDirInfo& other) const {
    // Same evaluation order as a short-circuit `&&`: mtime first, then path.
    if (!(mtime == other.mtime)) {
      return false;
    }
    return full_path == other.full_path;
  }

  /// Stream output operator; defined outside this header.
  friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const MockDirInfo&);
};
/// File entry of the mock filesystem (element type returned by
/// MockFileSystem::AllFiles()): a full path, a modification time and a view
/// of the file contents.
struct MockFileInfo {
  std::string full_path;
  TimePoint mtime;
  // Non-owning view of the contents.
  // NOTE(review): presumably points into storage owned by the filesystem, so it
  // must not outlive it -- confirm.
  util::string_view data;

  /// Entries are equal when modification time, path and contents all match.
  bool operator==(const MockFileInfo& other) const {
    // Same evaluation order as a short-circuit `&&` chain.
    if (!(mtime == other.mtime)) {
      return false;
    }
    if (!(full_path == other.full_path)) {
      return false;
    }
    return data == other.data;
  }

  /// Stream output operator; defined outside this header.
  friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const MockFileInfo&);
};
/// A mock FileSystem implementation that holds its contents in memory.
///
/// Useful for validating the FileSystem API, writing conformance suite,
/// and bootstrapping FileSystem-based APIs.
class ARROW_EXPORT MockFileSystem : public FileSystem {
public:
/// Construct a mock filesystem from a given time point.
/// NOTE(review): `current_time` is presumably the timestamp assigned to
/// entries created by this filesystem -- confirm in the implementation.
explicit MockFileSystem(TimePoint current_time,
const io::IOContext& = io::default_io_context());
~MockFileSystem() override;
/// Filesystem type identifier; always "mock".
std::string type_name() const override { return "mock"; }
bool Equals(const FileSystem& other) const override;
// XXX It's not very practical to have to explicitly declare inheritance
// of default overrides.
using FileSystem::GetFileInfo;
// FileSystem interface overrides.
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive = true) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
// Contents-dumping helpers to ease testing.
// Output is lexicographically-ordered by full path.
std::vector<MockDirInfo> AllDirs();
std::vector<MockFileInfo> AllFiles();
// Create a File with a content from a string.
// NOTE(review): `recursive` presumably controls whether missing parent
// directories are created -- confirm in the implementation.
Status CreateFile(const std::string& path, util::string_view content,
bool recursive = true);
// Create a MockFileSystem out of (empty) FileInfo. The content of every
// file is empty and of size 0. All directories will be created recursively.
static Result<std::shared_ptr<FileSystem>> Make(TimePoint current_time,
const std::vector<FileInfo>& infos);
// Opaque implementation class, defined outside this header.
class Impl;
protected:
std::unique_ptr<Impl> impl_;
};
/// A MockFileSystem variant that overrides GetFileInfoGenerator() and clears
/// the inherited `default_async_is_sync_` flag on construction.
class ARROW_EXPORT MockAsyncFileSystem : public MockFileSystem {
public:
explicit MockAsyncFileSystem(TimePoint current_time,
const io::IOContext& io_context = io::default_io_context())
: MockFileSystem(current_time, io_context) {
// NOTE(review): presumably tells the FileSystem base class that the async
// methods of this filesystem are not plain wrappers around the synchronous
// ones -- confirm against FileSystem's documentation of this flag.
default_async_is_sync_ = false;
}
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
};
} // namespace internal
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,133 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "arrow/type_fwd.h"
#include "arrow/util/optional.h"
#include "arrow/util/string_view.h"
namespace arrow {
namespace fs {
namespace internal {
// Separator between the components of an abstract path; the default `sep`
// argument of the helpers below.
constexpr char kSep = '/';
// Computations on abstract paths (not local paths with system-dependent behaviour).
// Abstract paths are typically used in URIs.
// Split an abstract path into its individual components.
ARROW_EXPORT
std::vector<std::string> SplitAbstractPath(const std::string& path, char sep = kSep);
// Return the extension of the file
ARROW_EXPORT
std::string GetAbstractPathExtension(const std::string& s);
// Return the parent directory and basename of an abstract path. Both values may be
// empty.
ARROW_EXPORT
std::pair<std::string, std::string> GetAbstractPathParent(const std::string& s);
// Validate the components of an abstract path.
ARROW_EXPORT
Status ValidateAbstractPathParts(const std::vector<std::string>& parts);
// Append a non-empty stem to an abstract path.
ARROW_EXPORT
std::string ConcatAbstractPath(const std::string& base, const std::string& stem);
// Make path relative to base, if it starts with base. Otherwise error out.
ARROW_EXPORT
Result<std::string> MakeAbstractPathRelative(const std::string& base,
const std::string& path);
// Leading/trailing slash helpers. The Ensure* variants return a new string,
// the Remove* variants return a view.
// NOTE(review): the returned views presumably refer to the argument's
// storage, so the argument must outlive the result -- confirm.
ARROW_EXPORT
std::string EnsureLeadingSlash(util::string_view s);
ARROW_EXPORT
util::string_view RemoveLeadingSlash(util::string_view s);
ARROW_EXPORT
std::string EnsureTrailingSlash(util::string_view s);
ARROW_EXPORT
util::string_view RemoveTrailingSlash(util::string_view s);
// Return whether `ancestor` is an ancestor of `descendant`.
// NOTE(review): whether a path counts as its own ancestor is decided by the
// implementation -- confirm before relying on it.
ARROW_EXPORT
bool IsAncestorOf(util::string_view ancestor, util::string_view descendant);
// Strip `ancestor` from the front of `descendant`.
// NOTE(review): presumably yields an empty optional when `ancestor` is not an
// ancestor of `descendant` -- confirm.
ARROW_EXPORT
util::optional<util::string_view> RemoveAncestor(util::string_view ancestor,
util::string_view descendant);
/// Return a vector of ancestors between a base path and a descendant.
/// For example,
///
/// AncestorsFromBasePath("a/b", "a/b/c/d/e") -> ["a/b/c", "a/b/c/d"]
ARROW_EXPORT
std::vector<std::string> AncestorsFromBasePath(util::string_view base_path,
util::string_view descendant);
/// Given a vector of paths of directories which must be created, produce the minimal
/// subset for passing to CreateDir(recursive=true) by removing redundant parent
/// directories
ARROW_EXPORT
std::vector<std::string> MinimalCreateDirSet(std::vector<std::string> dirs);
// Join the components of an abstract path.
//
// Walks [it, end) and concatenates the components with `sep` between them.
// Empty components are skipped, so they never produce a doubled or leading
// separator. Returns an empty string if every component is empty.
template <class StringIt>
std::string JoinAbstractPath(StringIt it, StringIt end, char sep = kSep) {
  std::string joined;
  while (it != end) {
    if (!it->empty()) {
      // Only separate from a previously appended component.
      if (!joined.empty()) {
        joined.push_back(sep);
      }
      joined += *it;
    }
    ++it;
  }
  return joined;
}
// Range convenience overload: joins every component of `range` by forwarding
// its begin()/end() iterators to the iterator-based JoinAbstractPath.
template <class StringRange>
std::string JoinAbstractPath(const StringRange& range, char sep = kSep) {
  auto first = range.begin();
  auto last = range.end();
  return JoinAbstractPath(first, last, sep);
}
/// Convert slashes to backslashes, on all platforms. Mostly useful for testing.
ARROW_EXPORT
std::string ToBackslashes(util::string_view s);
/// Ensure a local path is abstract, by converting backslashes to regular slashes
/// on Windows. Return the path unchanged on other systems.
ARROW_EXPORT
std::string ToSlashes(util::string_view s);
/// Return whether `s` is an empty path.
/// NOTE(review): what counts as "empty" (e.g. a string made only of
/// separators) is decided by the implementation -- confirm.
ARROW_EXPORT
bool IsEmptyPath(util::string_view s);
/// Heuristic check for whether `s` looks like a URI rather than a plain path.
/// NOTE(review): the heuristic itself lives in the implementation file.
ARROW_EXPORT
bool IsLikelyUri(util::string_view s);
} // namespace internal
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,90 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <gtest/gtest.h>
#include "arrow/filesystem/s3fs.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
namespace arrow {
namespace fs {
// A minio test server, managed as a child process
class MinioTestServer {
public:
MinioTestServer();
~MinioTestServer();
// Start / stop the server; failures are reported through the returned Status.
Status Start();
Status Stop();
// Accessors for the connection details of this server.
// NOTE(review): connect_string() is presumably a "host:port" string suitable
// for an S3 endpoint override -- confirm in the implementation.
std::string connect_string() const;
std::string access_key() const;
std::string secret_key() const;
private:
// Opaque implementation struct, defined outside this header.
struct Impl;
std::unique_ptr<Impl> impl_;
};
// A Minio "environment" that spawns Minio processes in advance, so as
// to hide process launch latencies during testing.
class MinioTestEnvironment : public ::testing::Environment {
 public:
  MinioTestEnvironment();
  // Overrides the virtual destructor of ::testing::Environment. Marked
  // `override` for consistency with the other derived classes, so the compiler
  // verifies that the base destructor is in fact virtual.
  ~MinioTestEnvironment() override;

  // googletest hook, invoked before the tests run.
  void SetUp() override;

  // Return a Minio test server, or an error Result.
  // NOTE(review): presumably hands out one of the servers spawned in advance --
  // confirm in the implementation.
  Result<std::shared_ptr<MinioTestServer>> GetOneServer();

 protected:
  // Opaque implementation struct, defined outside this header.
  struct Impl;
  std::unique_ptr<Impl> impl_;
};
// A global test "environment", to ensure that the S3 API is initialized before
// running unit tests.
class S3Environment : public ::testing::Environment {
public:
// Initialize the S3 APIs with log level Fatal; a failing Status is reported
// through ASSERT_OK.
void SetUp() override {
// Change this to increase logging during tests
S3GlobalOptions options;
options.log_level = S3LogLevel::Fatal;
ASSERT_OK(InitializeS3(options));
}
// Shut the S3 APIs down again with FinalizeS3().
void TearDown() override { ASSERT_OK(FinalizeS3()); }
};
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,319 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/util/macros.h"
#include "arrow/util/uri.h"
namespace Aws {
namespace Auth {
class AWSCredentialsProvider;
class STSAssumeRoleCredentialsProvider;
} // namespace Auth
namespace STS {
class STSClient;
}
} // namespace Aws
namespace arrow {
namespace fs {
/// Options for using a proxy for S3
struct ARROW_EXPORT S3ProxyOptions {
/// Components of the proxy URI (see FromUri() below).
std::string scheme;
std::string host;
/// NOTE(review): -1 presumably means "no port specified" -- confirm.
int port = -1;
std::string username;
std::string password;
/// Initialize from URI such as http://username:password@host:port
/// or http://host:port
static Result<S3ProxyOptions> FromUri(const std::string& uri);
static Result<S3ProxyOptions> FromUri(const ::arrow::internal::Uri& uri);
/// Compare these options with another S3ProxyOptions instance.
bool Equals(const S3ProxyOptions& other) const;
};
/// The kind of credentials used by an S3Options instance
/// (see S3Options::credentials_kind).
enum class S3CredentialsKind : int8_t {
/// Anonymous access (no credentials used)
Anonymous,
/// Use default AWS credentials, configured through environment variables
Default,
/// Use explicitly-provided access key pair
Explicit,
/// Assume role through a role ARN
Role,
/// Use web identity token to assume role, configured through environment variables
WebIdentity
};
/// Pure virtual class for describing custom S3 retry strategies
///
/// Implementations are passed to the filesystem through
/// S3Options::retry_strategy.
/// NOTE(review): unlike the other public types in this header, this class is
/// not marked ARROW_EXPORT -- confirm this is intentional.
class S3RetryStrategy {
public:
virtual ~S3RetryStrategy() = default;
/// Simple struct where each field corresponds to a field in Aws::Client::AWSError
struct AWSErrorDetail {
/// Corresponds to AWSError::GetErrorType()
int error_type;
/// Corresponds to AWSError::GetMessage()
std::string message;
/// Corresponds to AWSError::GetExceptionName()
std::string exception_name;
/// Corresponds to AWSError::ShouldRetry()
bool should_retry;
};
/// Returns true if the S3 request resulting in the provided error should be retried.
///
/// \param error details of the error returned by the failed request
/// \param attempted_retries NOTE(review): presumably the number of retries
/// already attempted for this request -- confirm
virtual bool ShouldRetry(const AWSErrorDetail& error, int64_t attempted_retries) = 0;
/// Returns the time in milliseconds the S3 client should sleep for until retrying.
virtual int64_t CalculateDelayBeforeNextRetry(const AWSErrorDetail& error,
int64_t attempted_retries) = 0;
};
/// Options for the S3FileSystem implementation.
///
/// Instances are usually obtained from one of the static factory functions
/// (Defaults(), Anonymous(), FromAccessKey(), FromAssumeRole(),
/// FromAssumeRoleWithWebIdentity(), FromUri()) and can then be adjusted
/// through the public fields or the Configure*() methods.
struct ARROW_EXPORT S3Options {
  /// \brief AWS region to connect to.
  ///
  /// If unset, the AWS SDK will choose a default value.  The exact algorithm
  /// depends on the SDK version.  Before 1.8, the default is hardcoded
  /// to "us-east-1".  Since 1.8, several heuristics are used to determine
  /// the region (environment variables, configuration profile, EC2 metadata
  /// server).
  std::string region;

  /// If non-empty, override region with a connect string such as "localhost:9000"
  // XXX perhaps instead take a URL like "http://localhost:9000"?
  std::string endpoint_override;
  /// S3 connection transport, default "https"
  std::string scheme = "https";

  /// ARN of role to assume
  std::string role_arn;
  /// Optional identifier for an assumed role session.
  std::string session_name;
  /// Optional external identifier to pass to STS when assuming a role
  std::string external_id;
  /// Frequency (in seconds) to refresh temporary credentials from assumed role
  ///
  /// Defaults to 900, the same default used by ConfigureAssumeRoleCredentials()
  /// and FromAssumeRole(). The explicit initializer guarantees the field never
  /// holds an indeterminate value, e.g. if it is read before one of the
  /// role-related helpers assigns it.
  int load_frequency = 900;

  /// If connection is through a proxy, set options here
  S3ProxyOptions proxy_options;

  /// AWS credentials provider
  std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider;

  /// Type of credentials being used. Set along with credentials_provider.
  S3CredentialsKind credentials_kind = S3CredentialsKind::Default;

  /// Whether OutputStream writes will be issued in the background, without blocking.
  bool background_writes = true;

  /// \brief Default metadata for OpenOutputStream.
  ///
  /// This will be ignored if non-empty metadata is passed to OpenOutputStream.
  std::shared_ptr<const KeyValueMetadata> default_metadata;

  /// Optional retry strategy to determine which error types should be retried, and the
  /// delay between retries.
  std::shared_ptr<S3RetryStrategy> retry_strategy;

  S3Options();

  /// Configure with the default AWS credentials provider chain.
  void ConfigureDefaultCredentials();

  /// Configure with anonymous credentials.  This will only let you access public buckets.
  void ConfigureAnonymousCredentials();

  /// Configure with explicit access and secret key.
  void ConfigureAccessKey(const std::string& access_key, const std::string& secret_key,
                          const std::string& session_token = "");

  /// Configure with credentials from an assumed role.
  void ConfigureAssumeRoleCredentials(
      const std::string& role_arn, const std::string& session_name = "",
      const std::string& external_id = "", int load_frequency = 900,
      const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);

  /// Configure with credentials from role assumed using a web identity token
  void ConfigureAssumeRoleWithWebIdentityCredentials();

  /// Accessors for the currently configured credentials.
  std::string GetAccessKey() const;
  std::string GetSecretKey() const;
  std::string GetSessionToken() const;

  /// Compare these options with another S3Options instance.
  bool Equals(const S3Options& other) const;

  /// \brief Initialize with default credentials provider chain
  ///
  /// This is recommended if you use the standard AWS environment variables
  /// and/or configuration file.
  static S3Options Defaults();

  /// \brief Initialize with anonymous credentials.
  ///
  /// This will only let you access public buckets.
  static S3Options Anonymous();

  /// \brief Initialize with explicit access and secret key.
  ///
  /// Optionally, a session token may also be provided for temporary credentials
  /// (from STS).
  static S3Options FromAccessKey(const std::string& access_key,
                                 const std::string& secret_key,
                                 const std::string& session_token = "");

  /// \brief Initialize from an assumed role.
  static S3Options FromAssumeRole(
      const std::string& role_arn, const std::string& session_name = "",
      const std::string& external_id = "", int load_frequency = 900,
      const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);

  /// \brief Initialize from an assumed role with web-identity.
  /// Uses the AWS SDK which uses environment variables to
  /// generate temporary credentials.
  static S3Options FromAssumeRoleWithWebIdentity();

  /// Build options from a URI, given either as a parsed Uri or as a string.
  ///
  /// `out_path` is an optional output parameter (may be NULLPTR).
  /// NOTE(review): presumably it receives the bucket/key path extracted from
  /// the URI -- confirm in the implementation.
  static Result<S3Options> FromUri(const ::arrow::internal::Uri& uri,
                                   std::string* out_path = NULLPTR);
  static Result<S3Options> FromUri(const std::string& uri,
                                   std::string* out_path = NULLPTR);
};
/// S3-backed FileSystem implementation.
///
/// Some implementation notes:
/// - buckets are special and the operations available on them may be limited
/// or more expensive than desired.
/// - instances are obtained through Make(); the constructor is protected.
class ARROW_EXPORT S3FileSystem : public FileSystem {
public:
~S3FileSystem() override;
/// Filesystem type identifier; always "s3".
std::string type_name() const override { return "s3"; }
/// Return the original S3 options when constructing the filesystem
S3Options options() const;
/// Return the actual region this filesystem connects to
std::string region() const;
bool Equals(const FileSystem& other) const override;
/// \cond FALSE
// Keep the base-class GetFileInfo overloads visible next to the overrides below.
using FileSystem::GetFileInfo;
/// \endcond
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive = true) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
Future<> DeleteDirContentsAsync(const std::string& path,
bool missing_dir_ok = false) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
/// Create a sequential input stream for reading from a S3 object.
///
/// NOTE: Reads from the stream will be synchronous and unbuffered.
/// You may want to wrap the stream in a BufferedInputStream or use
/// a custom readahead strategy to avoid idle waits.
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
/// Create a sequential input stream for reading from a S3 object.
///
/// This override avoids a HEAD request by assuming the FileInfo
/// contains correct information.
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
/// Create a random access file for reading from a S3 object.
///
/// See OpenInputStream for performance notes.
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
/// Create a random access file for reading from a S3 object.
///
/// This override avoids a HEAD request by assuming the FileInfo
/// contains correct information.
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;
/// Create a sequential output stream for writing to a S3 object.
///
/// NOTE: Writes to the stream will be buffered. Depending on
/// S3Options.background_writes, they can be synchronous or not.
/// It is recommended to enable background_writes unless you prefer
/// implementing your own background execution strategy.
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
/// Create a S3FileSystem instance from the given options.
///
/// Returns an error Result if the filesystem cannot be created.
static Result<std::shared_ptr<S3FileSystem>> Make(
const S3Options& options, const io::IOContext& = io::default_io_context());
protected:
explicit S3FileSystem(const S3Options& options, const io::IOContext&);
// Opaque implementation class, defined outside this header. Note that it is
// held through a shared_ptr here.
class Impl;
std::shared_ptr<Impl> impl_;
};
/// Log verbosity levels for the S3 APIs, selected through
/// S3GlobalOptions::log_level.
enum class S3LogLevel : int8_t { Off, Fatal, Error, Warn, Info, Debug, Trace };
/// Process-wide options passed to InitializeS3().
struct ARROW_EXPORT S3GlobalOptions {
/// Log level to use. This field has no default member initializer, so it
/// must be set explicitly before the struct is used.
S3LogLevel log_level;
};
/// Initialize the S3 APIs. It is required to call this function at least once
/// before using S3FileSystem.
ARROW_EXPORT
Status InitializeS3(const S3GlobalOptions& options);
/// Ensure the S3 APIs are initialized, but only if not already done.
/// If necessary, this will call InitializeS3() with some default options.
ARROW_EXPORT
Status EnsureS3Initialized();
/// Shutdown the S3 APIs.
ARROW_EXPORT
Status FinalizeS3();
/// Resolve the region of the given S3 bucket.
///
/// Returns the region name, or an error Result.
/// NOTE(review): how the region is looked up (and whether this issues network
/// requests) is not visible from this header -- confirm in the implementation.
ARROW_EXPORT
Result<std::string> ResolveS3BucketRegion(const std::string& bucket);
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,248 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <chrono>
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/filesystem/mockfs.h"
#include "arrow/testing/visibility.h"
#include "arrow/util/counting_semaphore.h"
namespace arrow {
namespace fs {
// Time tolerance used by the filesystem tests.
// NOTE(review): presumably the slack allowed when comparing file modification
// times against the current time -- confirm against its users.
static constexpr double kTimeSlack = 2.0; // In seconds
// Test shorthand: build a FileInfo of type FileType::File for `path`.
static inline FileInfo File(std::string path) {
  FileInfo file_info(std::move(path), FileType::File);
  return file_info;
}
// Test shorthand: build a FileInfo of type FileType::Directory for `path`.
static inline FileInfo Dir(std::string path) {
  FileInfo dir_info(std::move(path), FileType::Directory);
  return dir_info;
}
// A subclass of MockFileSystem that blocks operations until an unlock method is
// called.
//
// This is intended for testing fine-grained ordering of filesystem operations.
//
// N.B. Only OpenOutputStream supports gating at the moment but this is simply because
// it is all that has been needed so far. Feel free to add support for more methods
// as required.
class ARROW_TESTING_EXPORT GatedMockFilesystem : public internal::MockFileSystem {
public:
// Same parameters as the MockFileSystem constructor.
GatedMockFilesystem(TimePoint current_time,
const io::IOContext& = io::default_io_context());
~GatedMockFilesystem() override;
// Gated override: per the class comment, the call blocks until it is
// unlocked through UnlockOpenOutputStream().
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
// Wait until at least num_waiters are waiting on OpenOutputStream
Status WaitForOpenOutputStream(uint32_t num_waiters);
// Unlock `num_waiters` individual calls to OpenOutputStream
Status UnlockOpenOutputStream(uint32_t num_waiters);
private:
// Semaphore used to gate OpenOutputStream calls.
util::CountingSemaphore open_output_sem_;
};
// Create a file at `path` in `fs` containing `data`.
// NOTE(review): returns void, so failures are presumably reported through
// test assertions -- confirm in the implementation.
ARROW_TESTING_EXPORT
void CreateFile(FileSystem* fs, const std::string& path, const std::string& data);
// Sort a vector of FileInfo by lexicographic path order
ARROW_TESTING_EXPORT
void SortInfos(FileInfoVector* infos);
// Collect the FileInfo entries produced by `gen` into `*out_infos`.
ARROW_TESTING_EXPORT
void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos);
// AssertFileInfo overloads taking an already-fetched FileInfo: check it
// against the expected path and type, plus optionally the modification time
// and/or size.
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type);
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
TimePoint mtime);
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
TimePoint mtime, int64_t size);
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
int64_t size);
// AssertFileInfo overloads taking a filesystem and a path.
// NOTE(review): presumably these fetch the FileInfo for `path` from `fs` and
// then perform the same checks as the overloads above -- confirm.
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type);
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type,
TimePoint mtime);
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type,
TimePoint mtime, int64_t size);
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type, int64_t size);
// Check that the file at `path` in `fs` has contents equal to `expected_data`.
ARROW_TESTING_EXPORT
void AssertFileContents(FileSystem* fs, const std::string& path,
const std::string& expected_data);
template <typename Duration>
void AssertDurationBetween(Duration d, double min_secs, double max_secs) {
auto seconds = std::chrono::duration_cast<std::chrono::duration<double>>(d);
ASSERT_GE(seconds.count(), min_secs);
ASSERT_LE(seconds.count(), max_secs);
}
// Generic tests for FileSystem implementations.
// To use this class, subclass both from it and ::testing::Test,
// implement GetEmptyFileSystem(), and use GENERIC_FS_TEST_FUNCTIONS()
// to define the various tests.
//
// Layout of the class:
// - public no-argument Test*() methods: one per generic test; these are the
//   members that GENERIC_FS_TEST_FUNCTION() calls via `this->Test##NAME()`.
// - protected virtual hooks: the filesystem factory (pure virtual) and boolean
//   switches with default values that a subclass may override.
// - protected Test*(FileSystem*) overloads: same names as the public methods,
//   but taking the filesystem explicitly.
//   NOTE(review): presumably the public methods forward to these overloads
//   with the result of GetEmptyFileSystem(); the definitions are not part of
//   this header -- confirm in the implementation file.
class ARROW_TESTING_EXPORT GenericFileSystemTest {
public:
// Virtual destructor; only declared here (no inline body).
virtual ~GenericFileSystemTest();
// Public test entry points, one per generic test.
void TestEmpty();
void TestNormalizePath();
void TestCreateDir();
void TestDeleteDir();
void TestDeleteDirContents();
void TestDeleteRootDirContents();
void TestDeleteFile();
void TestDeleteFiles();
void TestMoveFile();
void TestMoveDir();
void TestCopyFile();
void TestGetFileInfo();
void TestGetFileInfoVector();
void TestGetFileInfoSelector();
void TestGetFileInfoSelectorWithRecursion();
void TestGetFileInfoAsync();
void TestGetFileInfoGenerator();
void TestOpenOutputStream();
void TestOpenAppendStream();
void TestOpenInputStream();
void TestOpenInputStreamWithFileInfo();
void TestOpenInputStreamAsync();
void TestOpenInputFile();
void TestOpenInputFileWithFileInfo();
void TestOpenInputFileAsync();
void TestSpecialChars();
protected:
// This function should return the filesystem under test.
// It is pure virtual, so every concrete subclass must implement it.
virtual std::shared_ptr<FileSystem> GetEmptyFileSystem() = 0;
// Override the following functions to specify deviations from expected
// filesystem semantics.
// - Whether the filesystem may "implicitly" create intermediate directories
//   (default: false)
virtual bool have_implicit_directories() const { return false; }
// - Whether the filesystem may allow writing a file "over" a directory
//   (default: false)
virtual bool allow_write_file_over_dir() const { return false; }
// - Whether the filesystem allows reading a directory
//   (default: false)
virtual bool allow_read_dir_as_file() const { return false; }
// - Whether the filesystem allows moving a directory
//   (default: true)
virtual bool allow_move_dir() const { return true; }
// - Whether the filesystem allows moving a directory "over" a non-empty destination
//   (default: false)
virtual bool allow_move_dir_over_non_empty_dir() const { return false; }
// - Whether the filesystem allows appending to a file
//   (default: true)
virtual bool allow_append_to_file() const { return true; }
// - Whether the filesystem allows appending to a new (not existent yet) file
//   (default: true)
virtual bool allow_append_to_new_file() const { return true; }
// - Whether the filesystem supports directory modification times
//   (default: true)
virtual bool have_directory_mtimes() const { return true; }
// - Whether some directory tree deletion tests may fail randomly
//   (default: false)
virtual bool have_flaky_directory_tree_deletion() const { return false; }
// - Whether the filesystem stores some metadata alongside files
//   (default: false)
virtual bool have_file_metadata() const { return false; }
// Overloads of the public test methods taking the filesystem explicitly as a
// raw, non-owning-looking pointer.
// NOTE(review): ownership and nullability of `fs` are not visible from these
// declarations -- confirm against the implementation.
void TestEmpty(FileSystem* fs);
void TestNormalizePath(FileSystem* fs);
void TestCreateDir(FileSystem* fs);
void TestDeleteDir(FileSystem* fs);
void TestDeleteDirContents(FileSystem* fs);
void TestDeleteRootDirContents(FileSystem* fs);
void TestDeleteFile(FileSystem* fs);
void TestDeleteFiles(FileSystem* fs);
void TestMoveFile(FileSystem* fs);
void TestMoveDir(FileSystem* fs);
void TestCopyFile(FileSystem* fs);
void TestGetFileInfo(FileSystem* fs);
void TestGetFileInfoVector(FileSystem* fs);
void TestGetFileInfoSelector(FileSystem* fs);
void TestGetFileInfoSelectorWithRecursion(FileSystem* fs);
void TestGetFileInfoAsync(FileSystem* fs);
void TestGetFileInfoGenerator(FileSystem* fs);
void TestOpenOutputStream(FileSystem* fs);
void TestOpenAppendStream(FileSystem* fs);
void TestOpenInputStream(FileSystem* fs);
void TestOpenInputStreamWithFileInfo(FileSystem* fs);
void TestOpenInputStreamAsync(FileSystem* fs);
void TestOpenInputFile(FileSystem* fs);
void TestOpenInputFileWithFileInfo(FileSystem* fs);
void TestOpenInputFileAsync(FileSystem* fs);
void TestSpecialChars(FileSystem* fs);
};
// Defines a single test: expands to TEST_MACRO(TEST_CLASS, NAME) followed by a
// body that calls the member function Test<NAME>() (the name is built by token
// pasting, `Test##NAME`). The call goes through `this->`, so the same macro
// body can be used with the different test-definition macros passed as
// TEST_MACRO by the wrappers below (TEST_F and TYPED_TEST).
#define GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NAME) \
TEST_MACRO(TEST_CLASS, NAME) { this->Test##NAME(); }
// Expands GENERIC_FS_TEST_FUNCTION once per generic test, using TEST_MACRO as
// the test-definition macro and TEST_CLASS as the fixture. Each NAME listed
// here must have a matching no-argument Test<NAME>() member reachable from
// TEST_CLASS; the list mirrors the public Test*() methods of
// GenericFileSystemTest, so a test added there needs a line here as well.
#define GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_MACRO, TEST_CLASS) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, Empty) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NormalizePath) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CreateDir) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDir) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDirContents) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteRootDirContents) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFile) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFiles) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveFile) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveDir) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFile) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfo) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoVector) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelector) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelectorWithRecursion) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoAsync) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoGenerator) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenOutputStream) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenAppendStream) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStream) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamWithFileInfo) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamAsync) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFile) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileWithFileInfo) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileAsync) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, SpecialChars)
// Defines all generic filesystem tests for TEST_CLASS using TEST_F, i.e. for
// a regular (non-typed) test fixture.
#define GENERIC_FS_TEST_FUNCTIONS(TEST_CLASS) \
GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_F, TEST_CLASS)
// Defines all generic filesystem tests for TEST_CLASS using TYPED_TEST, i.e.
// for a typed test suite.
#define GENERIC_FS_TYPED_TEST_FUNCTIONS(TEST_CLASS) \
GENERIC_FS_TEST_FUNCTIONS_MACROS(TYPED_TEST, TEST_CLASS)
} // namespace fs
} // namespace arrow

View File

@ -0,0 +1,49 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
namespace arrow {
namespace fs {
/// \brief FileSystem entry type
///
/// The enumeration is scoped and uses a fixed-width 8-bit signed underlying
/// type (std::int8_t, provided by <cstdint>). Enumerators take the implicit
/// consecutive values starting at 0, in declaration order.
enum class FileType : std::int8_t {
  /// Entry is not found
  NotFound,
  /// Entry exists but its type is unknown
  ///
  /// This can designate a special file such as a Unix socket or character
  /// device, or Windows NUL / CON / ...
  Unknown,
  /// Entry is a regular file
  File,
  /// Entry is a directory
  Directory
};
// Forward declarations of filesystem types. They let other headers name these
// types (e.g. in pointer or reference parameters) without including the
// headers that hold the full definitions.
struct FileInfo;
struct FileSelector;
class FileSystem;
class SubTreeFileSystem;
class SlowFileSystem;
class LocalFileSystem;
class S3FileSystem;
} // namespace fs
} // namespace arrow