first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions


@@ -0,0 +1,42 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from libcpp.memory cimport shared_ptr
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CDataType,
CField, CRecordBatch, CSchema,
CTable, CTensor, CSparseCOOTensor,
CSparseCSRMatrix, CSparseCSCMatrix,
CSparseCSFTensor)
cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
cdef int import_pyarrow() except -1
cdef object wrap_buffer(const shared_ptr[CBuffer]& buffer)
cdef object wrap_data_type(const shared_ptr[CDataType]& type)
cdef object wrap_field(const shared_ptr[CField]& field)
cdef object wrap_schema(const shared_ptr[CSchema]& schema)
cdef object wrap_array(const shared_ptr[CArray]& sp_array)
cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
cdef object wrap_sparse_tensor_coo(
const shared_ptr[CSparseCOOTensor]& sp_sparse_tensor)
cdef object wrap_sparse_tensor_csr(
const shared_ptr[CSparseCSRMatrix]& sp_sparse_tensor)
cdef object wrap_sparse_tensor_csc(
const shared_ptr[CSparseCSCMatrix]& sp_sparse_tensor)
cdef object wrap_sparse_tensor_csf(
const shared_ptr[CSparseCSFTensor]& sp_sparse_tensor)
cdef object wrap_table(const shared_ptr[CTable]& ctable)
cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch)
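These declarations mirror the C++ bridge helpers in arrow/python/pyarrow.h. As a minimal, hypothetical sketch (the cimport path pyarrow.includes.pyarrow is an assumption, since the file's path is not shown in this view), a downstream Cython module could convert C++ Arrow objects into Python objects like this:

# cython: language_level = 3
# Hypothetical consumer module; the cimport path below is assumed, not confirmed.
from libcpp.memory cimport shared_ptr
from pyarrow.includes.libarrow cimport CArray
from pyarrow.includes.pyarrow cimport import_pyarrow, wrap_array

# The pyarrow C API must be initialized once before any wrap_* helper is used.
import_pyarrow()

cdef object array_to_py(const shared_ptr[CArray]& sp_array):
    # Wrap a C++ arrow::Array as a pyarrow.Array Python object.
    return wrap_array(sp_array)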


@@ -0,0 +1,573 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# flake8: noqa
"""
PyArrow is the Python implementation of Apache Arrow.
Apache Arrow is a cross-language development platform for in-memory data.
It specifies a standardized language-independent columnar memory format for
flat and hierarchical data, organized for efficient analytic operations on
modern hardware. It also provides computational libraries and zero-copy
streaming messaging and interprocess communication.
For more information see the official page at https://arrow.apache.org
"""
import gc as _gc
import importlib as _importlib
import os as _os
import platform as _platform
import sys as _sys
import warnings as _warnings
try:
from ._generated_version import version as __version__
except ImportError:
# Package is not installed, parse git tag at runtime
try:
import setuptools_scm
# Code duplicated from setup.py to avoid a dependency on each other
def parse_git(root, **kwargs):
"""
Parse function for setuptools_scm that ignores tags for non-C++
subprojects, e.g. apache-arrow-js-XXX tags.
"""
from setuptools_scm.git import parse
kwargs['describe_command'] = \
"git describe --dirty --tags --long --match 'apache-arrow-[0-9].*'"
return parse(root, **kwargs)
__version__ = setuptools_scm.get_version('../',
parse=parse_git)
except ImportError:
__version__ = None
# ARROW-8684: Disable GC while initializing Cython extension module,
# to workaround Cython bug in https://github.com/cython/cython/issues/3603
_gc_enabled = _gc.isenabled()
_gc.disable()
import pyarrow.lib as _lib
if _gc_enabled:
_gc.enable()
from pyarrow.lib import (BuildInfo, RuntimeInfo, MonthDayNano,
VersionInfo, cpp_build_info, cpp_version,
cpp_version_info, runtime_info, cpu_count,
set_cpu_count, enable_signal_handlers,
io_thread_count, set_io_thread_count)
def show_versions():
"""
Print various version information, to help with error reporting.
"""
def print_entry(label, value):
print(f"{label: <26}: {value: <8}")
print("pyarrow version info\n--------------------")
print_entry("Package kind", cpp_build_info.package_kind
if len(cpp_build_info.package_kind) > 0
else "not indicated")
print_entry("Arrow C++ library version", cpp_build_info.version)
print_entry("Arrow C++ compiler",
f"{cpp_build_info.compiler_id} {cpp_build_info.compiler_version}")
print_entry("Arrow C++ compiler flags", cpp_build_info.compiler_flags)
print_entry("Arrow C++ git revision", cpp_build_info.git_id)
print_entry("Arrow C++ git description", cpp_build_info.git_description)
print_entry("Arrow C++ build type", cpp_build_info.build_type)
def _module_is_available(module):
try:
_importlib.import_module(f'pyarrow.{module}')
except ImportError:
return False
else:
return True
def _filesystem_is_available(fs):
try:
import pyarrow.fs
except ImportError:
return False
try:
getattr(pyarrow.fs, fs)
except (ImportError, AttributeError):
return False
else:
return True
def show_info():
"""
Print detailed version and platform information, for error reporting
"""
show_versions()
def print_entry(label, value):
print(f" {label: <20}: {value: <8}")
print("\nPlatform:")
print_entry("OS / Arch", f"{_platform.system()} {_platform.machine()}")
print_entry("SIMD Level", runtime_info().simd_level)
print_entry("Detected SIMD Level", runtime_info().detected_simd_level)
pool = default_memory_pool()
print("\nMemory:")
print_entry("Default backend", pool.backend_name)
print_entry("Bytes allocated", f"{pool.bytes_allocated()} bytes")
print_entry("Max memory", f"{pool.max_memory()} bytes")
print_entry("Supported Backends", ', '.join(supported_memory_backends()))
print("\nOptional modules:")
modules = ["csv", "cuda", "dataset", "feather", "flight", "fs", "gandiva", "json",
"orc", "parquet", "plasma"]
for module in modules:
status = "Enabled" if _module_is_available(module) else "-"
print(f" {module: <20}: {status: <8}")
print("\nFilesystems:")
filesystems = ["GcsFileSystem", "HadoopFileSystem", "S3FileSystem"]
for fs in filesystems:
status = "Enabled" if _filesystem_is_available(fs) else "-"
print(f" {fs: <20}: {status: <8}")
print("\nCompression Codecs:")
codecs = ["brotli", "bz2", "gzip", "lz4_frame", "lz4", "snappy", "zstd"]
for codec in codecs:
status = "Enabled" if Codec.is_available(codec) else "-"
print(f" {codec: <20}: {status: <8}")
from pyarrow.lib import (null, bool_,
int8, int16, int32, int64,
uint8, uint16, uint32, uint64,
time32, time64, timestamp, date32, date64, duration,
month_day_nano_interval,
float16, float32, float64,
binary, string, utf8,
large_binary, large_string, large_utf8,
decimal128, decimal256,
list_, large_list, map_, struct,
union, sparse_union, dense_union,
dictionary,
field,
type_for_alias,
DataType, DictionaryType, StructType,
ListType, LargeListType, MapType, FixedSizeListType,
UnionType, SparseUnionType, DenseUnionType,
TimestampType, Time32Type, Time64Type, DurationType,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
KeyValueMetadata,
Field,
Schema,
schema,
unify_schemas,
Array, Tensor,
array, chunked_array, record_batch, nulls, repeat,
SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
SparseCSFTensor,
infer_type, from_numpy_dtype,
NullArray,
NumericArray, IntegerArray, FloatingPointArray,
BooleanArray,
Int8Array, UInt8Array,
Int16Array, UInt16Array,
Int32Array, UInt32Array,
Int64Array, UInt64Array,
ListArray, LargeListArray, MapArray,
FixedSizeListArray, UnionArray,
BinaryArray, StringArray,
LargeBinaryArray, LargeStringArray,
FixedSizeBinaryArray,
DictionaryArray,
Date32Array, Date64Array, TimestampArray,
Time32Array, Time64Array, DurationArray,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
HalfFloatScalar, FloatScalar, DoubleScalar,
Decimal128Scalar, Decimal256Scalar,
ListScalar, LargeListScalar, FixedSizeListScalar,
Date32Scalar, Date64Scalar,
Time32Scalar, Time64Scalar,
TimestampScalar, DurationScalar,
MonthDayNanoIntervalScalar,
BinaryScalar, LargeBinaryScalar,
StringScalar, LargeStringScalar,
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
ExtensionScalar)
# Buffers, allocation
from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer,
Codec, compress, decompress, allocate_buffer)
from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool,
total_allocated_bytes, set_memory_pool,
default_memory_pool, system_memory_pool,
jemalloc_memory_pool, mimalloc_memory_pool,
logging_memory_pool, proxy_memory_pool,
log_memory_allocations, jemalloc_set_decay_ms,
supported_memory_backends)
# I/O
from pyarrow.lib import (NativeFile, PythonFile,
BufferedInputStream, BufferedOutputStream,
CompressedInputStream, CompressedOutputStream,
TransformInputStream, transcoding_input_stream,
FixedSizeBufferWriter,
BufferReader, BufferOutputStream,
OSFile, MemoryMappedFile, memory_map,
create_memory_map, MockOutputStream,
input_stream, output_stream)
from pyarrow._hdfsio import HdfsFile, have_libhdfs
from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table,
concat_arrays, concat_tables, TableGroupBy,
RecordBatchReader)
# Exceptions
from pyarrow.lib import (ArrowCancelled,
ArrowCapacityError,
ArrowException,
ArrowKeyError,
ArrowIndexError,
ArrowInvalid,
ArrowIOError,
ArrowMemoryError,
ArrowNotImplementedError,
ArrowTypeError,
ArrowSerializationError)
# Serialization
from pyarrow.lib import (deserialize_from, deserialize,
deserialize_components,
serialize, serialize_to, read_serialized,
SerializationCallbackError,
DeserializationCallbackError)
import pyarrow.hdfs as hdfs
from pyarrow.ipc import serialize_pandas, deserialize_pandas
import pyarrow.ipc as ipc
from pyarrow.serialization import (default_serialization_context,
register_default_serialization_handlers,
register_torch_serialization_handlers)
import pyarrow.types as types
# deprecated top-level access
from pyarrow.filesystem import FileSystem as _FileSystem
from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem
from pyarrow.lib import SerializationContext as _SerializationContext
from pyarrow.lib import SerializedPyObject as _SerializedPyObject
_localfs = _LocalFileSystem._get_instance()
_msg = (
"pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
)
_serialization_msg = (
"'pyarrow.{0}' is deprecated and will be removed in a future version. "
"Use pickle or the pyarrow IPC functionality instead."
)
_deprecated = {
"localfs": (_localfs, "LocalFileSystem"),
"FileSystem": (_FileSystem, "FileSystem"),
"LocalFileSystem": (_LocalFileSystem, "LocalFileSystem"),
"HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
}
_serialization_deprecated = {
"SerializationContext": _SerializationContext,
"SerializedPyObject": _SerializedPyObject,
}
def __getattr__(name):
if name in _deprecated:
obj, new_name = _deprecated[name]
_warnings.warn(_msg.format(name, new_name),
FutureWarning, stacklevel=2)
return obj
elif name in _serialization_deprecated:
_warnings.warn(_serialization_msg.format(name),
FutureWarning, stacklevel=2)
return _serialization_deprecated[name]
raise AttributeError(
"module 'pyarrow' has no attribute '{0}'".format(name)
)
# Entry point for starting the plasma store
def _plasma_store_entry_point():
"""Entry point for starting the plasma store.
This can be used by invoking e.g.
``plasma_store -s /tmp/plasma -m 1000000000``
from the command line and will start the plasma_store executable with the
given arguments.
"""
import pyarrow
plasma_store_executable = _os.path.join(pyarrow.__path__[0],
"plasma-store-server")
_os.execv(plasma_store_executable, _sys.argv)
# ----------------------------------------------------------------------
# Deprecations
from pyarrow.util import _deprecate_api, _deprecate_class
read_message = _deprecate_api("read_message", "ipc.read_message",
ipc.read_message, "0.17.0")
read_record_batch = _deprecate_api("read_record_batch",
"ipc.read_record_batch",
ipc.read_record_batch, "0.17.0")
read_schema = _deprecate_api("read_schema", "ipc.read_schema",
ipc.read_schema, "0.17.0")
read_tensor = _deprecate_api("read_tensor", "ipc.read_tensor",
ipc.read_tensor, "0.17.0")
write_tensor = _deprecate_api("write_tensor", "ipc.write_tensor",
ipc.write_tensor, "0.17.0")
get_record_batch_size = _deprecate_api("get_record_batch_size",
"ipc.get_record_batch_size",
ipc.get_record_batch_size, "0.17.0")
get_tensor_size = _deprecate_api("get_tensor_size",
"ipc.get_tensor_size",
ipc.get_tensor_size, "0.17.0")
open_stream = _deprecate_api("open_stream", "ipc.open_stream",
ipc.open_stream, "0.17.0")
open_file = _deprecate_api("open_file", "ipc.open_file", ipc.open_file,
"0.17.0")
def _deprecate_scalar(ty, symbol):
return _deprecate_class("{}Value".format(ty), symbol, "1.0.0")
ArrayValue = _deprecate_class("ArrayValue", Scalar, "1.0.0")
NullType = _deprecate_class("NullType", NullScalar, "1.0.0")
BooleanValue = _deprecate_scalar("Boolean", BooleanScalar)
Int8Value = _deprecate_scalar("Int8", Int8Scalar)
Int16Value = _deprecate_scalar("Int16", Int16Scalar)
Int32Value = _deprecate_scalar("Int32", Int32Scalar)
Int64Value = _deprecate_scalar("Int64", Int64Scalar)
UInt8Value = _deprecate_scalar("UInt8", UInt8Scalar)
UInt16Value = _deprecate_scalar("UInt16", UInt16Scalar)
UInt32Value = _deprecate_scalar("UInt32", UInt32Scalar)
UInt64Value = _deprecate_scalar("UInt64", UInt64Scalar)
HalfFloatValue = _deprecate_scalar("HalfFloat", HalfFloatScalar)
FloatValue = _deprecate_scalar("Float", FloatScalar)
DoubleValue = _deprecate_scalar("Double", DoubleScalar)
ListValue = _deprecate_scalar("List", ListScalar)
LargeListValue = _deprecate_scalar("LargeList", LargeListScalar)
MapValue = _deprecate_scalar("Map", MapScalar)
FixedSizeListValue = _deprecate_scalar("FixedSizeList", FixedSizeListScalar)
BinaryValue = _deprecate_scalar("Binary", BinaryScalar)
StringValue = _deprecate_scalar("String", StringScalar)
LargeBinaryValue = _deprecate_scalar("LargeBinary", LargeBinaryScalar)
LargeStringValue = _deprecate_scalar("LargeString", LargeStringScalar)
FixedSizeBinaryValue = _deprecate_scalar("FixedSizeBinary",
FixedSizeBinaryScalar)
Decimal128Value = _deprecate_scalar("Decimal128", Decimal128Scalar)
Decimal256Value = _deprecate_scalar("Decimal256", Decimal256Scalar)
UnionValue = _deprecate_scalar("Union", UnionScalar)
StructValue = _deprecate_scalar("Struct", StructScalar)
DictionaryValue = _deprecate_scalar("Dictionary", DictionaryScalar)
Date32Value = _deprecate_scalar("Date32", Date32Scalar)
Date64Value = _deprecate_scalar("Date64", Date64Scalar)
Time32Value = _deprecate_scalar("Time32", Time32Scalar)
Time64Value = _deprecate_scalar("Time64", Time64Scalar)
TimestampValue = _deprecate_scalar("Timestamp", TimestampScalar)
DurationValue = _deprecate_scalar("Duration", DurationScalar)
# TODO: Deprecate these somehow in the pyarrow namespace
from pyarrow.ipc import (Message, MessageReader, MetadataVersion,
RecordBatchFileReader, RecordBatchFileWriter,
RecordBatchStreamReader, RecordBatchStreamWriter)
# ----------------------------------------------------------------------
# Returning absolute path to the pyarrow include directory (if bundled, e.g. in
# wheels)
def get_include():
"""
Return absolute path to directory containing Arrow C++ include
headers. Similar to numpy.get_include
"""
return _os.path.join(_os.path.dirname(__file__), 'include')
def _get_pkg_config_executable():
return _os.environ.get('PKG_CONFIG', 'pkg-config')
def _has_pkg_config(pkgname):
import subprocess
try:
return subprocess.call([_get_pkg_config_executable(),
'--exists', pkgname]) == 0
except FileNotFoundError:
return False
def _read_pkg_config_variable(pkgname, cli_args):
import subprocess
cmd = [_get_pkg_config_executable(), pkgname] + cli_args
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out, err = proc.communicate()
if proc.returncode != 0:
raise RuntimeError("pkg-config failed: " + err.decode('utf8'))
return out.rstrip().decode('utf8')
def get_libraries():
"""
Return list of library names to include in the `libraries` argument for C
or Cython extensions using pyarrow
"""
return ['arrow', 'arrow_python']
def create_library_symlinks():
"""
With Linux and macOS wheels, the bundled shared libraries have an embedded
ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them
with -larrow won't work unless we create symlinks at locations like
site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
prior problems we had with shipping two copies of the shared libraries to
permit third party projects like turbodbc to build their C++ extensions
against the pyarrow wheels.
This function must only be invoked once and only when the shared libraries
are bundled with the Python package, which should only apply to wheel-based
installs. It requires write access to the site-packages/pyarrow directory
and so depending on your system may need to be run with root.
"""
import glob
if _sys.platform == 'win32':
return
package_cwd = _os.path.dirname(__file__)
if _sys.platform == 'linux':
bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))
def get_symlink_path(hard_path):
return hard_path.rsplit('.', 1)[0]
else:
bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))
def get_symlink_path(hard_path):
return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib'))
for lib_hard_path in bundled_libs:
symlink_path = get_symlink_path(lib_hard_path)
if _os.path.exists(symlink_path):
continue
try:
_os.symlink(lib_hard_path, symlink_path)
except PermissionError:
print("Tried creating symlink {}. If you need to link to "
"bundled shared libraries, run "
"pyarrow.create_library_symlinks() as root")
def get_library_dirs():
"""
Return a list of directories likely to contain Arrow C++ libraries for
linking C or Cython extensions using pyarrow
"""
package_cwd = _os.path.dirname(__file__)
library_dirs = [package_cwd]
def append_library_dir(library_dir):
if library_dir not in library_dirs:
library_dirs.append(library_dir)
# Search library paths via pkg-config. This is necessary if the user
# installed libarrow and the other shared libraries manually and they
# are not shipped inside the pyarrow package (see also ARROW-2976).
pkg_config_executable = _os.environ.get('PKG_CONFIG') or 'pkg-config'
for pkgname in ["arrow", "arrow_python"]:
if _has_pkg_config(pkgname):
library_dir = _read_pkg_config_variable(pkgname,
["--libs-only-L"])
# pkg-config output could be empty if Arrow is installed
# as a system package.
if library_dir:
if not library_dir.startswith("-L"):
raise ValueError(
"pkg-config --libs-only-L returned unexpected "
"value {!r}".format(library_dir))
append_library_dir(library_dir[2:])
if _sys.platform == 'win32':
# TODO(wesm): Is this necessary, or does setuptools within a conda
# installation add Library\lib to the linker path for MSVC?
python_base_install = _os.path.dirname(_sys.executable)
library_dir = _os.path.join(python_base_install, 'Library', 'lib')
if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
append_library_dir(library_dir)
# ARROW-4074: Allow for ARROW_HOME to be set to some other directory
if _os.environ.get('ARROW_HOME'):
append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
else:
# Python wheels bundle the Arrow libraries in the pyarrow directory.
append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))
return library_dirs
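get_include(), get_libraries() and get_library_dirs() above are intended to be consumed from a build script. A minimal sketch of a hypothetical setup.py for a Cython extension built against pyarrow (the module name myext and its source file are placeholders):

# Hypothetical setup.py fragment; "myext" / "myext.pyx" are placeholder names.
import pyarrow as pa
from setuptools import setup, Extension
from Cython.Build import cythonize

ext = Extension(
    "myext",
    sources=["myext.pyx"],
    include_dirs=[pa.get_include()],     # bundled Arrow C++ headers
    libraries=pa.get_libraries(),        # ['arrow', 'arrow_python']
    library_dirs=pa.get_library_dirs(),  # bundled libs and/or pkg-config paths
)

setup(name="myext", ext_modules=cythonize([ext]))

For wheel installs, create_library_symlinks() may need to run first so that plain -larrow / -larrow_python links resolve, as the docstring above explains.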


@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
cdef class FunctionOptions(_Weakrefable):
cdef:
shared_ptr[CFunctionOptions] wrapped
cdef const CFunctionOptions* get_options(self) except NULL
cdef void init(self, const shared_ptr[CFunctionOptions]& sp)
cdef inline shared_ptr[CFunctionOptions] unwrap(self)
cdef CExpression _bind(Expression filter, Schema schema) except *
cdef class Expression(_Weakrefable):
cdef:
CExpression expr
cdef void init(self, const CExpression& sp)
@staticmethod
cdef wrap(const CExpression& sp)
cdef inline CExpression unwrap(self)
@staticmethod
cdef Expression _expr_or_scalar(object expr)
cdef CExpression _true

File diff suppressed because it is too large


@@ -0,0 +1,56 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Custom documentation additions for compute functions.
"""
function_doc_additions = {}
function_doc_additions["filter"] = """
Examples
--------
>>> import pyarrow as pa
>>> arr = pa.array(["a", "b", "c", None, "e"])
>>> mask = pa.array([True, False, None, False, True])
>>> arr.filter(mask)
<pyarrow.lib.StringArray object at 0x7fa826df9200>
[
"a",
"e"
]
>>> arr.filter(mask, null_selection_behavior='emit_null')
<pyarrow.lib.StringArray object at 0x7fa826df9200>
[
"a",
null,
"e"
]
"""
function_doc_additions["mode"] = """
Examples
--------
>>> import pyarrow as pa
>>> import pyarrow.compute as pc
>>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
>>> modes = pc.mode(arr, 2)
>>> modes[0]
<pyarrow.StructScalar: {'mode': 2, 'count': 5}>
>>> modes[1]
<pyarrow.StructScalar: {'mode': 1, 'count': 2}>
"""


@@ -0,0 +1,55 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport _Weakrefable
cdef class ConvertOptions(_Weakrefable):
cdef:
unique_ptr[CCSVConvertOptions] options
@staticmethod
cdef ConvertOptions wrap(CCSVConvertOptions options)
cdef class ParseOptions(_Weakrefable):
cdef:
unique_ptr[CCSVParseOptions] options
object _invalid_row_handler
@staticmethod
cdef ParseOptions wrap(CCSVParseOptions options)
cdef class ReadOptions(_Weakrefable):
cdef:
unique_ptr[CCSVReadOptions] options
public object encoding
@staticmethod
cdef ReadOptions wrap(CCSVReadOptions options)
cdef class WriteOptions(_Weakrefable):
cdef:
unique_ptr[CCSVWriteOptions] options
@staticmethod
cdef WriteOptions wrap(CCSVWriteOptions options)
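The cdef classes above back the option types exposed by the public pyarrow.csv module. A minimal usage sketch through that public API (file names are placeholders):

# Read and write CSV through pyarrow.csv, configuring the option classes
# declared above via their public Python counterparts.
import pyarrow as pa
from pyarrow import csv

table = csv.read_csv(
    "data.csv",  # placeholder path
    read_options=csv.ReadOptions(skip_rows=0),
    parse_options=csv.ParseOptions(delimiter=","),
    convert_options=csv.ConvertOptions(column_types={"id": pa.int64()}),
)

# WriteOptions drives csv.write_csv in the same way.
csv.write_csv(table, "out.csv", write_options=csv.WriteOptions(include_header=True))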

File diff suppressed because it is too large


@@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_cuda cimport *
cdef class Context(_Weakrefable):
cdef:
shared_ptr[CCudaContext] context
int device_number
cdef void init(self, const shared_ptr[CCudaContext]& ctx)
cdef class IpcMemHandle(_Weakrefable):
cdef:
shared_ptr[CCudaIpcMemHandle] handle
cdef void init(self, shared_ptr[CCudaIpcMemHandle]& h)
cdef class CudaBuffer(Buffer):
cdef:
shared_ptr[CCudaBuffer] cuda_buffer
object base
cdef void init_cuda(self,
const shared_ptr[CCudaBuffer]& buffer,
object base)
cdef class HostBuffer(Buffer):
cdef:
shared_ptr[CCudaHostBuffer] host_buffer
cdef void init_host(self, const shared_ptr[CCudaHostBuffer]& buffer)
cdef class BufferReader(NativeFile):
cdef:
CCudaBufferReader* reader
CudaBuffer buffer
cdef class BufferWriter(NativeFile):
cdef:
CCudaBufferWriter* writer
CudaBuffer buffer

File diff suppressed because it is too large


@@ -0,0 +1,162 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
"""Dataset is currently unstable. APIs subject to change without notice."""
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.lib cimport *
from pyarrow._fs cimport FileSystem
cdef CFileSource _make_file_source(object file, FileSystem filesystem=*)
cdef class DatasetFactory(_Weakrefable):
cdef:
shared_ptr[CDatasetFactory] wrapped
CDatasetFactory* factory
cdef init(self, const shared_ptr[CDatasetFactory]& sp)
@staticmethod
cdef wrap(const shared_ptr[CDatasetFactory]& sp)
cdef inline shared_ptr[CDatasetFactory] unwrap(self) nogil
cdef class Dataset(_Weakrefable):
cdef:
shared_ptr[CDataset] wrapped
CDataset* dataset
cdef void init(self, const shared_ptr[CDataset]& sp)
@staticmethod
cdef wrap(const shared_ptr[CDataset]& sp)
cdef shared_ptr[CDataset] unwrap(self) nogil
cdef class FragmentScanOptions(_Weakrefable):
cdef:
shared_ptr[CFragmentScanOptions] wrapped
cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp)
@staticmethod
cdef wrap(const shared_ptr[CFragmentScanOptions]& sp)
cdef class FileFormat(_Weakrefable):
cdef:
shared_ptr[CFileFormat] wrapped
CFileFormat* format
cdef void init(self, const shared_ptr[CFileFormat]& sp)
@staticmethod
cdef wrap(const shared_ptr[CFileFormat]& sp)
cdef inline shared_ptr[CFileFormat] unwrap(self)
cdef _set_default_fragment_scan_options(self, FragmentScanOptions options)
# Return a WrittenFile after a file was written.
# May be overridden by subclasses, e.g. to add metadata.
cdef WrittenFile _finish_write(self, path, base_dir,
CFileWriter* file_writer)
cdef class FileWriteOptions(_Weakrefable):
cdef:
shared_ptr[CFileWriteOptions] wrapped
CFileWriteOptions* c_options
cdef void init(self, const shared_ptr[CFileWriteOptions]& sp)
@staticmethod
cdef wrap(const shared_ptr[CFileWriteOptions]& sp)
cdef inline shared_ptr[CFileWriteOptions] unwrap(self)
cdef class Fragment(_Weakrefable):
cdef:
shared_ptr[CFragment] wrapped
CFragment* fragment
cdef void init(self, const shared_ptr[CFragment]& sp)
@staticmethod
cdef wrap(const shared_ptr[CFragment]& sp)
cdef inline shared_ptr[CFragment] unwrap(self)
cdef class FileFragment(Fragment):
cdef:
CFileFragment* file_fragment
cdef void init(self, const shared_ptr[CFragment]& sp)
cdef class Partitioning(_Weakrefable):
cdef:
shared_ptr[CPartitioning] wrapped
CPartitioning* partitioning
cdef init(self, const shared_ptr[CPartitioning]& sp)
@staticmethod
cdef wrap(const shared_ptr[CPartitioning]& sp)
cdef inline shared_ptr[CPartitioning] unwrap(self)
cdef class PartitioningFactory(_Weakrefable):
cdef:
shared_ptr[CPartitioningFactory] wrapped
CPartitioningFactory* factory
cdef init(self, const shared_ptr[CPartitioningFactory]& sp)
@staticmethod
cdef wrap(const shared_ptr[CPartitioningFactory]& sp)
cdef inline shared_ptr[CPartitioningFactory] unwrap(self)
cdef class WrittenFile(_Weakrefable):
# The full path to the created file
cdef public str path
# Optional Parquet metadata
# This metadata will have the file path attribute set to the path of
# the written file.
cdef public object metadata

File diff suppressed because it is too large


@@ -0,0 +1,42 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
"""Dataset support for ORC file format."""
from pyarrow.lib cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow._dataset cimport FileFormat
cdef class OrcFileFormat(FileFormat):
def __init__(self):
self.init(shared_ptr[CFileFormat](new COrcFileFormat()))
def equals(self, OrcFileFormat other):
return True
@property
def default_extname(self):
return "orc"
def __reduce__(self):
return OrcFileFormat, tuple()


@@ -0,0 +1,811 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
"""Dataset support for Parquest file format."""
from cython.operator cimport dereference as deref
import os
import warnings
import pyarrow as pa
from pyarrow.lib cimport *
from pyarrow.lib import frombytes, tobytes
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.includes.libarrow_dataset_parquet cimport *
from pyarrow._fs cimport FileSystem
from pyarrow.util import _is_path_like, _stringify_path
from pyarrow._compute cimport Expression, _bind
from pyarrow._dataset cimport (
_make_file_source,
DatasetFactory,
FileFormat,
FileFragment,
FileWriteOptions,
Fragment,
FragmentScanOptions,
Partitioning,
PartitioningFactory,
WrittenFile
)
from pyarrow._parquet cimport (
_create_writer_properties, _create_arrow_writer_properties,
FileMetaData, RowGroupMetaData, ColumnChunkMetaData
)
cdef Expression _true = Expression._scalar(True)
ctypedef CParquetFileWriter* _CParquetFileWriterPtr
cdef class ParquetFileFormat(FileFormat):
"""
FileFormat for Parquet
Parameters
----------
read_options : ParquetReadOptions
Read options for the file.
default_fragment_scan_options : ParquetFragmentScanOptions
Scan Options for the file.
**kwargs : dict
Additional keyword arguments routed to the read options or the scan options.
"""
cdef:
CParquetFileFormat* parquet_format
def __init__(self, read_options=None,
default_fragment_scan_options=None, **kwargs):
cdef:
shared_ptr[CParquetFileFormat] wrapped
CParquetFileFormatReaderOptions* options
# Read/scan options
read_options_args = {option: kwargs[option] for option in kwargs
if option in _PARQUET_READ_OPTIONS}
scan_args = {option: kwargs[option] for option in kwargs
if option not in _PARQUET_READ_OPTIONS}
if read_options and read_options_args:
duplicates = ', '.join(sorted(read_options_args))
raise ValueError(f'If `read_options` is given, '
f'cannot specify {duplicates}')
if default_fragment_scan_options and scan_args:
duplicates = ', '.join(sorted(scan_args))
raise ValueError(f'If `default_fragment_scan_options` is given, '
f'cannot specify {duplicates}')
if read_options is None:
read_options = ParquetReadOptions(**read_options_args)
elif isinstance(read_options, dict):
# For backwards compatibility
duplicates = []
for option, value in read_options.items():
if option in _PARQUET_READ_OPTIONS:
read_options_args[option] = value
else:
duplicates.append(option)
scan_args[option] = value
if duplicates:
duplicates = ", ".join(duplicates)
warnings.warn(f'The scan options {duplicates} should be '
'specified directly as keyword arguments')
read_options = ParquetReadOptions(**read_options_args)
elif not isinstance(read_options, ParquetReadOptions):
raise TypeError('`read_options` must be either a dictionary or an '
'instance of ParquetReadOptions')
if default_fragment_scan_options is None:
default_fragment_scan_options = ParquetFragmentScanOptions(
**scan_args)
elif isinstance(default_fragment_scan_options, dict):
default_fragment_scan_options = ParquetFragmentScanOptions(
**default_fragment_scan_options)
elif not isinstance(default_fragment_scan_options,
ParquetFragmentScanOptions):
raise TypeError('`default_fragment_scan_options` must be either a '
'dictionary or an instance of '
'ParquetFragmentScanOptions')
wrapped = make_shared[CParquetFileFormat]()
options = &(wrapped.get().reader_options)
if read_options.dictionary_columns is not None:
for column in read_options.dictionary_columns:
options.dict_columns.insert(tobytes(column))
options.coerce_int96_timestamp_unit = \
read_options._coerce_int96_timestamp_unit
self.init(<shared_ptr[CFileFormat]> wrapped)
self.default_fragment_scan_options = default_fragment_scan_options
cdef void init(self, const shared_ptr[CFileFormat]& sp):
FileFormat.init(self, sp)
self.parquet_format = <CParquetFileFormat*> sp.get()
cdef WrittenFile _finish_write(self, path, base_dir,
CFileWriter* file_writer):
cdef:
FileMetaData parquet_metadata
CParquetFileWriter* parquet_file_writer
parquet_metadata = None
parquet_file_writer = dynamic_cast[_CParquetFileWriterPtr](file_writer)
with nogil:
metadata = deref(
deref(parquet_file_writer).parquet_writer()).metadata()
if metadata:
parquet_metadata = FileMetaData()
parquet_metadata.init(metadata)
parquet_metadata.set_file_path(os.path.relpath(path, base_dir))
return WrittenFile(path, parquet_metadata)
@property
def read_options(self):
cdef CParquetFileFormatReaderOptions* options
options = &self.parquet_format.reader_options
parquet_read_options = ParquetReadOptions(
dictionary_columns={frombytes(col)
for col in options.dict_columns},
)
# The read options getter/setter works with strings, so set the
# private property, which uses the C type, directly
parquet_read_options._coerce_int96_timestamp_unit = \
options.coerce_int96_timestamp_unit
return parquet_read_options
def make_write_options(self, **kwargs):
opts = FileFormat.make_write_options(self)
(<ParquetFileWriteOptions> opts).update(**kwargs)
return opts
cdef _set_default_fragment_scan_options(self, FragmentScanOptions options):
if options.type_name == 'parquet':
self.parquet_format.default_fragment_scan_options = options.wrapped
else:
super()._set_default_fragment_scan_options(options)
def equals(self, ParquetFileFormat other):
return (
self.read_options.equals(other.read_options) and
self.default_fragment_scan_options ==
other.default_fragment_scan_options
)
@property
def default_extname(self):
return "parquet"
def __reduce__(self):
return ParquetFileFormat, (self.read_options,
self.default_fragment_scan_options)
def __repr__(self):
return f"<ParquetFileFormat read_options={self.read_options}>"
def make_fragment(self, file, filesystem=None,
Expression partition_expression=None, row_groups=None):
cdef:
vector[int] c_row_groups
if partition_expression is None:
partition_expression = _true
if row_groups is None:
return super().make_fragment(file, filesystem,
partition_expression)
c_source = _make_file_source(file, filesystem)
c_row_groups = [<int> row_group for row_group in set(row_groups)]
c_fragment = <shared_ptr[CFragment]> GetResultValue(
self.parquet_format.MakeFragment(move(c_source),
partition_expression.unwrap(),
<shared_ptr[CSchema]>nullptr,
move(c_row_groups)))
return Fragment.wrap(move(c_fragment))
class RowGroupInfo:
"""
A wrapper class for RowGroup information
Parameters
----------
id : the group id.
metadata : the rowgroup metadata.
schema : schema of the rows.
"""
def __init__(self, id, metadata, schema):
self.id = id
self.metadata = metadata
self.schema = schema
@property
def num_rows(self):
return self.metadata.num_rows
@property
def total_byte_size(self):
return self.metadata.total_byte_size
@property
def statistics(self):
def name_stats(i):
col = self.metadata.column(i)
stats = col.statistics
if stats is None or not stats.has_min_max:
return None, None
name = col.path_in_schema
field_index = self.schema.get_field_index(name)
if field_index < 0:
return None, None
typ = self.schema.field(field_index).type
return col.path_in_schema, {
'min': pa.scalar(stats.min, type=typ).as_py(),
'max': pa.scalar(stats.max, type=typ).as_py()
}
return {
name: stats for name, stats
in map(name_stats, range(self.metadata.num_columns))
if stats is not None
}
def __repr__(self):
return "RowGroupInfo({})".format(self.id)
def __eq__(self, other):
if isinstance(other, int):
return self.id == other
if not isinstance(other, RowGroupInfo):
return False
return self.id == other.id
cdef class ParquetFileFragment(FileFragment):
"""A Fragment representing a parquet file."""
cdef:
CParquetFileFragment* parquet_file_fragment
cdef void init(self, const shared_ptr[CFragment]& sp):
FileFragment.init(self, sp)
self.parquet_file_fragment = <CParquetFileFragment*> sp.get()
def __reduce__(self):
buffer = self.buffer
# parquet_file_fragment.row_groups() is empty if the metadata
# information of the file is not yet populated
if not bool(self.parquet_file_fragment.row_groups()):
row_groups = None
else:
row_groups = [row_group.id for row_group in self.row_groups]
return self.format.make_fragment, (
self.path if buffer is None else buffer,
self.filesystem,
self.partition_expression,
row_groups
)
def ensure_complete_metadata(self):
"""
Ensure that all metadata (statistics, physical schema, ...) have
been read and cached in this fragment.
"""
with nogil:
check_status(self.parquet_file_fragment.EnsureCompleteMetadata())
@property
def row_groups(self):
metadata = self.metadata
cdef vector[int] row_groups = self.parquet_file_fragment.row_groups()
return [RowGroupInfo(i, metadata.row_group(i), self.physical_schema)
for i in row_groups]
@property
def metadata(self):
self.ensure_complete_metadata()
cdef FileMetaData metadata = FileMetaData()
metadata.init(self.parquet_file_fragment.metadata())
return metadata
@property
def num_row_groups(self):
"""
Return the number of row groups viewed by this fragment (not the
number of row groups in the origin file).
"""
self.ensure_complete_metadata()
return self.parquet_file_fragment.row_groups().size()
def split_by_row_group(self, Expression filter=None,
Schema schema=None):
"""
Split the fragment into multiple fragments.
Yield a Fragment wrapping each row group in this ParquetFileFragment.
Row groups will be excluded whose metadata contradicts the optional
filter.
Parameters
----------
filter : Expression, default None
Only include the row groups which satisfy this predicate (using
the Parquet RowGroup statistics).
schema : Schema, default None
Schema to use when filtering row groups. Defaults to the
Fragment's physical schema
Returns
-------
A list of Fragments
"""
cdef:
vector[shared_ptr[CFragment]] c_fragments
CExpression c_filter
shared_ptr[CFragment] c_fragment
schema = schema or self.physical_schema
c_filter = _bind(filter, schema)
with nogil:
c_fragments = move(GetResultValue(
self.parquet_file_fragment.SplitByRowGroup(move(c_filter))))
return [Fragment.wrap(c_fragment) for c_fragment in c_fragments]
def subset(self, Expression filter=None, Schema schema=None,
object row_group_ids=None):
"""
Create a subset of the fragment (viewing a subset of the row groups).
Subset can be specified by either a filter predicate (with optional
schema) or by a list of row group IDs. Note that when using a filter,
the resulting fragment can be empty (viewing no row groups).
Parameters
----------
filter : Expression, default None
Only include the row groups which satisfy this predicate (using
the Parquet RowGroup statistics).
schema : Schema, default None
Schema to use when filtering row groups. Defaults to the
Fragment's physical schema
row_group_ids : list of ints
The row group IDs to include in the subset. Can only be specified
if `filter` is None.
Returns
-------
ParquetFileFragment
"""
cdef:
CExpression c_filter
vector[int] c_row_group_ids
shared_ptr[CFragment] c_fragment
if filter is not None and row_group_ids is not None:
raise ValueError(
"Cannot specify both 'filter' and 'row_group_ids'."
)
if filter is not None:
schema = schema or self.physical_schema
c_filter = _bind(filter, schema)
with nogil:
c_fragment = move(GetResultValue(
self.parquet_file_fragment.SubsetWithFilter(
move(c_filter))))
elif row_group_ids is not None:
c_row_group_ids = [
<int> row_group for row_group in sorted(set(row_group_ids))
]
with nogil:
c_fragment = move(GetResultValue(
self.parquet_file_fragment.SubsetWithIds(
move(c_row_group_ids))))
else:
raise ValueError(
"Need to specify one of 'filter' or 'row_group_ids'"
)
return Fragment.wrap(c_fragment)
cdef class ParquetReadOptions(_Weakrefable):
"""
Parquet format specific options for reading.
Parameters
----------
dictionary_columns : list of string, default None
Names of columns which should be dictionary encoded as
they are read.
coerce_int96_timestamp_unit : str, default None.
Cast timestamps that are stored in INT96 format to a particular
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
and therefore INT96 timestamps will be inferred as timestamps
in nanoseconds.
"""
cdef public:
set dictionary_columns
TimeUnit _coerce_int96_timestamp_unit
# Also see _PARQUET_READ_OPTIONS
def __init__(self, dictionary_columns=None,
coerce_int96_timestamp_unit=None):
self.dictionary_columns = set(dictionary_columns or set())
self.coerce_int96_timestamp_unit = coerce_int96_timestamp_unit
@property
def coerce_int96_timestamp_unit(self):
return timeunit_to_string(self._coerce_int96_timestamp_unit)
@coerce_int96_timestamp_unit.setter
def coerce_int96_timestamp_unit(self, unit):
if unit is not None:
self._coerce_int96_timestamp_unit = string_to_timeunit(unit)
else:
self._coerce_int96_timestamp_unit = TimeUnit_NANO
def equals(self, ParquetReadOptions other):
return (self.dictionary_columns == other.dictionary_columns and
self.coerce_int96_timestamp_unit ==
other.coerce_int96_timestamp_unit)
def __eq__(self, other):
try:
return self.equals(other)
except TypeError:
return False
def __repr__(self):
return (
f"<ParquetReadOptions"
f" dictionary_columns={self.dictionary_columns}"
f" coerce_int96_timestamp_unit={self.coerce_int96_timestamp_unit}>"
)
cdef class ParquetFileWriteOptions(FileWriteOptions):
cdef:
CParquetFileWriteOptions* parquet_options
object _properties
def update(self, **kwargs):
arrow_fields = {
"use_deprecated_int96_timestamps",
"coerce_timestamps",
"allow_truncated_timestamps",
}
setters = set()
for name, value in kwargs.items():
if name not in self._properties:
raise TypeError("unexpected parquet write option: " + name)
self._properties[name] = value
if name in arrow_fields:
setters.add(self._set_arrow_properties)
else:
setters.add(self._set_properties)
for setter in setters:
setter()
def _set_properties(self):
cdef CParquetFileWriteOptions* opts = self.parquet_options
opts.writer_properties = _create_writer_properties(
use_dictionary=self._properties["use_dictionary"],
compression=self._properties["compression"],
version=self._properties["version"],
write_statistics=self._properties["write_statistics"],
data_page_size=self._properties["data_page_size"],
compression_level=self._properties["compression_level"],
use_byte_stream_split=(
self._properties["use_byte_stream_split"]
),
column_encoding=self._properties["column_encoding"],
data_page_version=self._properties["data_page_version"],
)
def _set_arrow_properties(self):
cdef CParquetFileWriteOptions* opts = self.parquet_options
opts.arrow_writer_properties = _create_arrow_writer_properties(
use_deprecated_int96_timestamps=(
self._properties["use_deprecated_int96_timestamps"]
),
coerce_timestamps=self._properties["coerce_timestamps"],
allow_truncated_timestamps=(
self._properties["allow_truncated_timestamps"]
),
writer_engine_version="V2",
use_compliant_nested_type=(
self._properties["use_compliant_nested_type"]
)
)
cdef void init(self, const shared_ptr[CFileWriteOptions]& sp):
FileWriteOptions.init(self, sp)
self.parquet_options = <CParquetFileWriteOptions*> sp.get()
self._properties = dict(
use_dictionary=True,
compression="snappy",
version="1.0",
write_statistics=None,
data_page_size=None,
compression_level=None,
use_byte_stream_split=False,
column_encoding=None,
data_page_version="1.0",
use_deprecated_int96_timestamps=False,
coerce_timestamps=None,
allow_truncated_timestamps=False,
use_compliant_nested_type=False,
)
self._set_properties()
self._set_arrow_properties()
cdef set _PARQUET_READ_OPTIONS = {
'dictionary_columns', 'coerce_int96_timestamp_unit'
}
cdef class ParquetFragmentScanOptions(FragmentScanOptions):
"""
Scan-specific options for Parquet fragments.
Parameters
----------
use_buffered_stream : bool, default False
Read files through buffered input streams rather than loading entire
row groups at once. This may be enabled to reduce memory overhead.
Disabled by default.
buffer_size : int, default 8192
Size of buffered stream, if enabled. Default is 8KB.
pre_buffer : bool, default False
If enabled, pre-buffer the raw Parquet data instead of issuing one
read per column chunk. This can improve performance on high-latency
filesystems.
"""
cdef:
CParquetFragmentScanOptions* parquet_options
# Avoid mistakenly creating attributes
__slots__ = ()
def __init__(self, bint use_buffered_stream=False,
buffer_size=8192,
bint pre_buffer=False):
self.init(shared_ptr[CFragmentScanOptions](
new CParquetFragmentScanOptions()))
self.use_buffered_stream = use_buffered_stream
self.buffer_size = buffer_size
self.pre_buffer = pre_buffer
cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp):
FragmentScanOptions.init(self, sp)
self.parquet_options = <CParquetFragmentScanOptions*> sp.get()
cdef CReaderProperties* reader_properties(self):
return self.parquet_options.reader_properties.get()
cdef ArrowReaderProperties* arrow_reader_properties(self):
return self.parquet_options.arrow_reader_properties.get()
@property
def use_buffered_stream(self):
return self.reader_properties().is_buffered_stream_enabled()
@use_buffered_stream.setter
def use_buffered_stream(self, bint use_buffered_stream):
if use_buffered_stream:
self.reader_properties().enable_buffered_stream()
else:
self.reader_properties().disable_buffered_stream()
@property
def buffer_size(self):
return self.reader_properties().buffer_size()
@buffer_size.setter
def buffer_size(self, buffer_size):
if buffer_size <= 0:
raise ValueError("Buffer size must be larger than zero")
self.reader_properties().set_buffer_size(buffer_size)
@property
def pre_buffer(self):
return self.arrow_reader_properties().pre_buffer()
@pre_buffer.setter
def pre_buffer(self, bint pre_buffer):
self.arrow_reader_properties().set_pre_buffer(pre_buffer)
def equals(self, ParquetFragmentScanOptions other):
return (
self.use_buffered_stream == other.use_buffered_stream and
self.buffer_size == other.buffer_size and
self.pre_buffer == other.pre_buffer
)
def __reduce__(self):
return ParquetFragmentScanOptions, (
self.use_buffered_stream, self.buffer_size, self.pre_buffer
)
cdef class ParquetFactoryOptions(_Weakrefable):
"""
Influences the discovery of a Parquet dataset.
Parameters
----------
partition_base_dir : str, optional
For the purposes of applying the partitioning, paths will be
stripped of the partition_base_dir. Files not matching the
partition_base_dir prefix will be skipped for partitioning discovery.
The ignored files will still be part of the Dataset, but will not
have partition information.
partitioning : Partitioning, PartitioningFactory, optional
The partitioning scheme applied to fragments, see ``Partitioning``.
validate_column_chunk_paths : bool, default False
Assert that all ColumnChunk paths are consistent. The parquet spec
allows for ColumnChunk data to be stored in multiple files, but
ParquetDatasetFactory supports only a single file with all ColumnChunk
data. If this flag is set construction of a ParquetDatasetFactory will
raise an error if ColumnChunk data is not resident in a single file.
"""
cdef:
CParquetFactoryOptions options
__slots__ = () # avoid mistakenly creating attributes
def __init__(self, partition_base_dir=None, partitioning=None,
validate_column_chunk_paths=False):
if isinstance(partitioning, PartitioningFactory):
self.partitioning_factory = partitioning
elif isinstance(partitioning, Partitioning):
self.partitioning = partitioning
if partition_base_dir is not None:
self.partition_base_dir = partition_base_dir
self.options.validate_column_chunk_paths = validate_column_chunk_paths
cdef inline CParquetFactoryOptions unwrap(self):
return self.options
@property
def partitioning(self):
"""Partitioning to apply to discovered files.
NOTE: setting this property will overwrite partitioning_factory.
"""
c_partitioning = self.options.partitioning.partitioning()
if c_partitioning.get() == nullptr:
return None
return Partitioning.wrap(c_partitioning)
@partitioning.setter
def partitioning(self, Partitioning value):
self.options.partitioning = (<Partitioning> value).unwrap()
@property
def partitioning_factory(self):
"""PartitioningFactory to apply to discovered files and
discover a Partitioning.
NOTE: setting this property will overwrite partitioning.
"""
c_factory = self.options.partitioning.factory()
if c_factory.get() == nullptr:
return None
return PartitioningFactory.wrap(c_factory)
@partitioning_factory.setter
def partitioning_factory(self, PartitioningFactory value):
self.options.partitioning = (<PartitioningFactory> value).unwrap()
@property
def partition_base_dir(self):
"""
Base directory to strip paths before applying the partitioning.
"""
return frombytes(self.options.partition_base_dir)
@partition_base_dir.setter
def partition_base_dir(self, value):
self.options.partition_base_dir = tobytes(value)
@property
def validate_column_chunk_paths(self):
"""
Check that all ColumnChunk paths are consistent.
"""
return self.options.validate_column_chunk_paths
@validate_column_chunk_paths.setter
def validate_column_chunk_paths(self, value):
self.options.validate_column_chunk_paths = value
cdef class ParquetDatasetFactory(DatasetFactory):
"""
Create a ParquetDatasetFactory from a Parquet `_metadata` file.
Parameters
----------
metadata_path : str
Path to the `_metadata` parquet metadata-only file generated with
`pyarrow.parquet.write_metadata`.
filesystem : pyarrow.fs.FileSystem
Filesystem to read the metadata_path from, and subsequent parquet
files.
format : ParquetFileFormat
Parquet format options.
options : ParquetFactoryOptions, optional
Various flags influencing the discovery of filesystem paths.
"""
cdef:
CParquetDatasetFactory* parquet_factory
def __init__(self, metadata_path, FileSystem filesystem not None,
FileFormat format not None,
ParquetFactoryOptions options=None):
cdef:
c_string c_path
shared_ptr[CFileSystem] c_filesystem
shared_ptr[CParquetFileFormat] c_format
CResult[shared_ptr[CDatasetFactory]] result
CParquetFactoryOptions c_options
c_path = tobytes(metadata_path)
c_filesystem = filesystem.unwrap()
c_format = static_pointer_cast[CParquetFileFormat, CFileFormat](
format.unwrap())
options = options or ParquetFactoryOptions()
c_options = options.unwrap()
with nogil:
result = CParquetDatasetFactory.MakeFromMetaDataPath(
c_path, c_filesystem, c_format, c_options)
self.init(GetResultValue(result))
cdef init(self, shared_ptr[CDatasetFactory]& sp):
DatasetFactory.init(self, sp)
self.parquet_factory = <CParquetDatasetFactory*> sp.get()


@@ -0,0 +1,353 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ---------------------------------------------------------------------
# Implement Internal ExecPlan bindings
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from cython.operator cimport dereference as deref, preincrement as inc
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.lib cimport (Table, check_status, pyarrow_unwrap_table, pyarrow_wrap_table)
from pyarrow.lib import tobytes
from pyarrow._compute cimport Expression, _true
from pyarrow._dataset cimport Dataset
from pyarrow._dataset import InMemoryDataset
Initialize() # Initialize support for Datasets in ExecPlan
cdef execplan(inputs, output_type, vector[CDeclaration] plan, c_bool use_threads=True):
"""
    Internal function to create an ExecPlan and run it.
Parameters
----------
inputs : list of Table or Dataset
The sources from which the ExecPlan should fetch data.
In most cases this is only one, unless the first node of the
plan is able to get data from multiple different sources.
output_type : Table or InMemoryDataset
In which format the output should be provided.
plan : vector[CDeclaration]
The nodes of the plan that should be applied to the sources
to produce the output.
use_threads : bool, default True
        Whether to use multithreading or not.
"""
cdef:
CExecutor *c_executor
shared_ptr[CExecContext] c_exec_context
shared_ptr[CExecPlan] c_exec_plan
vector[CDeclaration] c_decls
vector[CExecNode*] _empty
vector[CExecNode*] c_final_node_vec
CExecNode *c_node
CTable* c_table
shared_ptr[CTable] c_in_table
shared_ptr[CTable] c_out_table
shared_ptr[CTableSourceNodeOptions] c_tablesourceopts
shared_ptr[CScanNodeOptions] c_scanopts
shared_ptr[CExecNodeOptions] c_input_node_opts
shared_ptr[CSinkNodeOptions] c_sinkopts
shared_ptr[CAsyncExecBatchGenerator] c_async_exec_batch_gen
shared_ptr[CRecordBatchReader] c_recordbatchreader
vector[CDeclaration].iterator plan_iter
vector[CDeclaration.Input] no_c_inputs
CStatus c_plan_status
if use_threads:
c_executor = GetCpuThreadPool()
else:
c_executor = NULL
c_exec_context = make_shared[CExecContext](
c_default_memory_pool(), c_executor)
c_exec_plan = GetResultValue(CExecPlan.Make(c_exec_context.get()))
plan_iter = plan.begin()
# Create source nodes for each input
for ipt in inputs:
if isinstance(ipt, Table):
node_factory = "table_source"
c_in_table = pyarrow_unwrap_table(ipt)
c_tablesourceopts = make_shared[CTableSourceNodeOptions](
c_in_table, 1 << 20)
c_input_node_opts = static_pointer_cast[CExecNodeOptions, CTableSourceNodeOptions](
c_tablesourceopts)
elif isinstance(ipt, Dataset):
node_factory = "scan"
c_in_dataset = (<Dataset>ipt).unwrap()
c_scanopts = make_shared[CScanNodeOptions](
c_in_dataset, make_shared[CScanOptions]())
deref(deref(c_scanopts).scan_options).use_threads = use_threads
c_input_node_opts = static_pointer_cast[CExecNodeOptions, CScanNodeOptions](
c_scanopts)
else:
raise TypeError("Unsupported type")
if plan_iter != plan.end():
# Flag the source as the input of the first plan node.
deref(plan_iter).inputs.push_back(CDeclaration.Input(
CDeclaration(tobytes(node_factory),
no_c_inputs, c_input_node_opts)
))
else:
# Empty plan, make the source the first plan node.
c_decls.push_back(
CDeclaration(tobytes(node_factory),
no_c_inputs, c_input_node_opts)
)
    # Append the remaining plan nodes
while plan_iter != plan.end():
c_decls.push_back(deref(plan_iter))
inc(plan_iter)
# Add all CDeclarations to the plan
c_node = GetResultValue(
CDeclaration.Sequence(c_decls).AddToPlan(&deref(c_exec_plan))
)
c_final_node_vec.push_back(c_node)
# Create the output node
c_async_exec_batch_gen = make_shared[CAsyncExecBatchGenerator]()
c_sinkopts = make_shared[CSinkNodeOptions](c_async_exec_batch_gen.get())
GetResultValue(
MakeExecNode(tobytes("sink"), &deref(c_exec_plan),
c_final_node_vec, deref(c_sinkopts))
)
    # Convert the async generator to a sync batch reader
c_recordbatchreader = MakeGeneratorReader(c_node.output_schema(),
deref(c_async_exec_batch_gen),
deref(c_exec_context).memory_pool())
# Start execution of the ExecPlan
deref(c_exec_plan).Validate()
deref(c_exec_plan).StartProducing()
# Convert output to the expected one.
c_out_table = GetResultValue(
CTable.FromRecordBatchReader(c_recordbatchreader.get()))
if output_type == Table:
output = pyarrow_wrap_table(c_out_table)
elif output_type == InMemoryDataset:
output = InMemoryDataset(pyarrow_wrap_table(c_out_table))
else:
raise TypeError("Unsupported output type")
with nogil:
c_plan_status = deref(c_exec_plan).finished().status()
check_status(c_plan_status)
return output
def _perform_join(join_type, left_operand not None, left_keys,
right_operand not None, right_keys,
left_suffix=None, right_suffix=None,
use_threads=True, coalesce_keys=False,
output_type=Table):
"""
    Perform a join between two tables or datasets.
    The result will be an output table (or dataset) with the rows produced
    by the join operation.
Parameters
----------
join_type : str
One of supported join types.
left_operand : Table or Dataset
The left operand for the join operation.
left_keys : str or list[str]
The left key (or keys) on which the join operation should be performed.
right_operand : Table or Dataset
The right operand for the join operation.
right_keys : str or list[str]
The right key (or keys) on which the join operation should be performed.
    left_suffix : str, default None
        Which suffix to add to left column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    right_suffix : str, default None
        Which suffix to add to right column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    use_threads : bool, default True
        Whether to use multithreading or not.
    coalesce_keys : bool, default False
        Whether the duplicated join keys should be omitted from one of the
        sides in the join result.
    output_type : Table or InMemoryDataset
        The output type for the exec plan result.
Returns
-------
result_table : Table
"""
cdef:
vector[CFieldRef] c_left_keys
vector[CFieldRef] c_right_keys
vector[CFieldRef] c_left_columns
vector[CFieldRef] c_right_columns
vector[CDeclaration] c_decl_plan
vector[CExpression] c_projections
vector[c_string] c_projected_col_names
CJoinType c_join_type
# Prepare left and right tables Keys to send them to the C++ function
left_keys_order = {}
if isinstance(left_keys, str):
left_keys = [left_keys]
for idx, key in enumerate(left_keys):
left_keys_order[key] = idx
c_left_keys.push_back(CFieldRef(<c_string>tobytes(key)))
right_keys_order = {}
if isinstance(right_keys, str):
right_keys = [right_keys]
for idx, key in enumerate(right_keys):
right_keys_order[key] = idx
c_right_keys.push_back(CFieldRef(<c_string>tobytes(key)))
# By default expose all columns on both left and right table
if isinstance(left_operand, Table):
left_columns = left_operand.column_names
elif isinstance(left_operand, Dataset):
left_columns = left_operand.schema.names
else:
raise TypeError("Unsupported left join member type")
if isinstance(right_operand, Table):
right_columns = right_operand.column_names
elif isinstance(right_operand, Dataset):
right_columns = right_operand.schema.names
else:
raise TypeError("Unsupported right join member type")
# Pick the join type
if join_type == "left semi":
c_join_type = CJoinType_LEFT_SEMI
right_columns = []
elif join_type == "right semi":
c_join_type = CJoinType_RIGHT_SEMI
left_columns = []
elif join_type == "left anti":
c_join_type = CJoinType_LEFT_ANTI
right_columns = []
elif join_type == "right anti":
c_join_type = CJoinType_RIGHT_ANTI
left_columns = []
elif join_type == "inner":
c_join_type = CJoinType_INNER
right_columns = set(right_columns) - set(right_keys)
elif join_type == "left outer":
c_join_type = CJoinType_LEFT_OUTER
right_columns = set(right_columns) - set(right_keys)
elif join_type == "right outer":
c_join_type = CJoinType_RIGHT_OUTER
left_columns = set(left_columns) - set(left_keys)
elif join_type == "full outer":
c_join_type = CJoinType_FULL_OUTER
else:
raise ValueError("Unsupported join type")
# Turn the columns to vectors of FieldRefs
# and set aside indices of keys.
left_column_keys_indices = {}
for idx, colname in enumerate(left_columns):
c_left_columns.push_back(CFieldRef(<c_string>tobytes(colname)))
if colname in left_keys:
left_column_keys_indices[colname] = idx
right_column_keys_indices = {}
for idx, colname in enumerate(right_columns):
c_right_columns.push_back(CFieldRef(<c_string>tobytes(colname)))
if colname in right_keys:
right_column_keys_indices[colname] = idx
# Add the join node to the execplan
if coalesce_keys:
c_decl_plan.push_back(
CDeclaration(tobytes("hashjoin"), CHashJoinNodeOptions(
c_join_type, c_left_keys, c_right_keys,
c_left_columns, c_right_columns,
_true,
<c_string>tobytes(left_suffix or ""),
<c_string>tobytes(right_suffix or "")
))
)
if join_type == "full outer":
# In case of full outer joins, the join operation will output all columns
# so that we can coalesce the keys and exclude duplicates in a subsequent projection.
left_columns_set = set(left_columns)
right_columns_set = set(right_columns)
# Where the right table columns start.
right_operand_index = len(left_columns)
for idx, col in enumerate(left_columns + right_columns):
if idx < len(left_columns) and col in left_column_keys_indices:
# Include keys only once and coalesce left+right table keys.
c_projected_col_names.push_back(tobytes(col))
# Get the index of the right key that is being paired
# with this left key. We do so by retrieving the name
# of the right key that is in the same position in the provided keys
# and then looking up the index for that name in the right table.
right_key_index = right_column_keys_indices[right_keys[left_keys_order[col]]]
c_projections.push_back(Expression.unwrap(
Expression._call("coalesce", [
Expression._field(idx), Expression._field(
right_operand_index+right_key_index)
])
))
elif idx >= right_operand_index and col in right_column_keys_indices:
                # Do not include right table keys, as they would lead to duplicated keys.
continue
else:
                # For all the other columns, include them as they are.
                # Just recompute the suffixes that the join produced, as the
                # projection would lose them otherwise.
if left_suffix and idx < right_operand_index and col in right_columns_set:
col += left_suffix
if right_suffix and idx >= right_operand_index and col in left_columns_set:
col += right_suffix
c_projected_col_names.push_back(tobytes(col))
c_projections.push_back(
Expression.unwrap(Expression._field(idx)))
c_decl_plan.push_back(
CDeclaration(tobytes("project"), CProjectNodeOptions(
c_projections, c_projected_col_names))
)
else:
c_decl_plan.push_back(
CDeclaration(tobytes("hashjoin"), CHashJoinNodeOptions(
c_join_type, c_left_keys, c_right_keys,
_true,
<c_string>tobytes(left_suffix or ""),
<c_string>tobytes(right_suffix or "")
))
)
result_table = execplan([left_operand, right_operand],
plan=c_decl_plan,
output_type=output_type,
use_threads=use_threads)
return result_table
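
# A minimal sketch of how the helper above is exercised; in practice it is
# typically reached through the public ``Table.join`` API. The toy tables and
# column names below are made up for illustration.
def _example_table_join():  # illustrative helper only
    import pyarrow as pa

    left = pa.table({"id": [1, 2, 3], "colA": ["a", "b", "c"]})
    right = pa.table({"id": [2, 3, 4], "colB": ["x", "y", "z"]})

    # Table.join delegates to _perform_join above; only "id" is a shared key.
    return left.join(right, keys="id", join_type="left outer",
                     coalesce_keys=True)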

View File

@ -0,0 +1,117 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ---------------------------------------------------------------------
# Implement Feather file format
# cython: profile=False
# distutils: language = c++
# cython: language_level=3
from cython.operator cimport dereference as deref
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_feather cimport *
from pyarrow.lib cimport (check_status, Table, _Weakrefable,
get_writer, get_reader, pyarrow_wrap_table)
from pyarrow.lib import tobytes
class FeatherError(Exception):
pass
def write_feather(Table table, object dest, compression=None,
compression_level=None, chunksize=None, version=2):
cdef shared_ptr[COutputStream] sink
get_writer(dest, &sink)
cdef CFeatherProperties properties
if version == 2:
properties.version = kFeatherV2Version
else:
properties.version = kFeatherV1Version
if compression == 'zstd':
properties.compression = CCompressionType_ZSTD
elif compression == 'lz4':
properties.compression = CCompressionType_LZ4_FRAME
else:
properties.compression = CCompressionType_UNCOMPRESSED
if chunksize is not None:
properties.chunksize = chunksize
if compression_level is not None:
properties.compression_level = compression_level
with nogil:
check_status(WriteFeather(deref(table.table), sink.get(),
properties))
cdef class FeatherReader(_Weakrefable):
cdef:
shared_ptr[CFeatherReader] reader
def __cinit__(self, source, c_bool use_memory_map, c_bool use_threads):
cdef:
shared_ptr[CRandomAccessFile] reader
CIpcReadOptions options = CIpcReadOptions.Defaults()
options.use_threads = use_threads
get_reader(source, use_memory_map, &reader)
with nogil:
self.reader = GetResultValue(CFeatherReader.Open(reader, options))
@property
def version(self):
return self.reader.get().version()
def read(self):
cdef shared_ptr[CTable] sp_table
with nogil:
check_status(self.reader.get()
.Read(&sp_table))
return pyarrow_wrap_table(sp_table)
def read_indices(self, indices):
cdef:
shared_ptr[CTable] sp_table
vector[int] c_indices
for index in indices:
c_indices.push_back(index)
with nogil:
check_status(self.reader.get()
.Read(c_indices, &sp_table))
return pyarrow_wrap_table(sp_table)
def read_names(self, names):
cdef:
shared_ptr[CTable] sp_table
vector[c_string] c_names
for name in names:
c_names.push_back(tobytes(name))
with nogil:
check_status(self.reader.get()
.Read(c_names, &sp_table))
return pyarrow_wrap_table(sp_table)
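
# A minimal round-trip sketch for the bindings above (normally used through the
# ``pyarrow.feather`` wrappers); the module path ``pyarrow._feather`` and the
# file name are assumptions of this sketch.
def _example_feather_roundtrip():  # illustrative helper only
    import pyarrow as pa
    from pyarrow._feather import FeatherReader, write_feather

    table = pa.table({"x": [1, 2, 3]})
    write_feather(table, "example.feather", compression="zstd", version=2)

    reader = FeatherReader("example.feather", use_memory_map=False,
                           use_threads=True)
    return reader.read()  # -> pyarrow.Table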

File diff suppressed because it is too large

View File

@ -0,0 +1,94 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow.lib import _detect_compression, frombytes, tobytes
from pyarrow.lib cimport *
cpdef enum FileType:
NotFound = <int8_t> CFileType_NotFound
Unknown = <int8_t> CFileType_Unknown
File = <int8_t> CFileType_File
Directory = <int8_t> CFileType_Directory
cdef class FileInfo(_Weakrefable):
cdef:
CFileInfo info
@staticmethod
cdef wrap(CFileInfo info)
cdef inline CFileInfo unwrap(self) nogil
@staticmethod
cdef CFileInfo unwrap_safe(obj)
cdef class FileSelector(_Weakrefable):
cdef:
CFileSelector selector
@staticmethod
cdef FileSelector wrap(CFileSelector selector)
cdef inline CFileSelector unwrap(self) nogil
cdef class FileSystem(_Weakrefable):
cdef:
shared_ptr[CFileSystem] wrapped
CFileSystem* fs
cdef init(self, const shared_ptr[CFileSystem]& wrapped)
@staticmethod
cdef wrap(const shared_ptr[CFileSystem]& sp)
cdef inline shared_ptr[CFileSystem] unwrap(self) nogil
cdef class LocalFileSystem(FileSystem):
cdef:
CLocalFileSystem* localfs
cdef init(self, const shared_ptr[CFileSystem]& wrapped)
cdef class SubTreeFileSystem(FileSystem):
cdef:
CSubTreeFileSystem* subtreefs
cdef init(self, const shared_ptr[CFileSystem]& wrapped)
cdef class _MockFileSystem(FileSystem):
cdef:
CMockFileSystem* mockfs
cdef init(self, const shared_ptr[CFileSystem]& wrapped)
cdef class PyFileSystem(FileSystem):
cdef:
CPyFileSystem* pyfs
cdef init(self, const shared_ptr[CFileSystem]& wrapped)

File diff suppressed because it is too large

View File

@ -0,0 +1,5 @@
# coding: utf-8
# file generated by setuptools_scm
# don't change, don't track in version control
version = '8.0.0'
version_tuple = (8, 0, 0)

View File

@ -0,0 +1,149 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport check_status
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem
from pyarrow.lib import frombytes, tobytes
from pyarrow.util import _stringify_path
cdef class HadoopFileSystem(FileSystem):
"""
HDFS backed FileSystem implementation
Parameters
----------
host : str
HDFS host to connect to. Set to "default" for fs.defaultFS from
core-site.xml.
port : int, default 8020
HDFS port to connect to. Set to 0 for default or logical (HA) nodes.
user : str, default None
Username when connecting to HDFS; None implies login user.
replication : int, default 3
Number of copies each block will have.
buffer_size : int, default 0
        If 0, no buffering will happen; otherwise, this is the size of the
        temporary read and write buffer.
default_block_size : int, default None
        None means the default configuration for HDFS; a typical block size is
        128 MB.
kerb_ticket : string or path, default None
If not None, the path to the Kerberos ticket cache.
extra_conf : dict, default None
Extra key/value pairs for configuration; will override any
hdfs-site.xml properties.
"""
cdef:
CHadoopFileSystem* hdfs
def __init__(self, str host, int port=8020, *, str user=None,
int replication=3, int buffer_size=0,
default_block_size=None, kerb_ticket=None,
extra_conf=None):
cdef:
CHdfsOptions options
shared_ptr[CHadoopFileSystem] wrapped
if not host.startswith(('hdfs://', 'viewfs://')) and host != "default":
# TODO(kszucs): do more sanitization
host = 'hdfs://{}'.format(host)
options.ConfigureEndPoint(tobytes(host), int(port))
options.ConfigureReplication(replication)
options.ConfigureBufferSize(buffer_size)
if user is not None:
options.ConfigureUser(tobytes(user))
if default_block_size is not None:
options.ConfigureBlockSize(default_block_size)
if kerb_ticket is not None:
options.ConfigureKerberosTicketCachePath(
tobytes(_stringify_path(kerb_ticket)))
if extra_conf is not None:
for k, v in extra_conf.items():
options.ConfigureExtraConf(tobytes(k), tobytes(v))
with nogil:
wrapped = GetResultValue(CHadoopFileSystem.Make(options))
self.init(<shared_ptr[CFileSystem]> wrapped)
cdef init(self, const shared_ptr[CFileSystem]& wrapped):
FileSystem.init(self, wrapped)
self.hdfs = <CHadoopFileSystem*> wrapped.get()
@staticmethod
def from_uri(uri):
"""
        Instantiate HadoopFileSystem object from a URI string.
        The following two calls are equivalent:
* ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\
&replication=1')``
* ``HadoopFileSystem('localhost', port=8020, user='test', \
replication=1)``
Parameters
----------
uri : str
A string URI describing the connection to HDFS.
In order to change the user, replication, buffer_size or
default_block_size pass the values as query parts.
Returns
-------
HadoopFileSystem
"""
cdef:
HadoopFileSystem self = HadoopFileSystem.__new__(HadoopFileSystem)
shared_ptr[CHadoopFileSystem] wrapped
CHdfsOptions options
options = GetResultValue(CHdfsOptions.FromUriString(tobytes(uri)))
with nogil:
wrapped = GetResultValue(CHadoopFileSystem.Make(options))
self.init(<shared_ptr[CFileSystem]> wrapped)
return self
@classmethod
def _reconstruct(cls, kwargs):
return cls(**kwargs)
def __reduce__(self):
cdef CHdfsOptions opts = self.hdfs.options()
return (
HadoopFileSystem._reconstruct, (dict(
host=frombytes(opts.connection_config.host),
port=opts.connection_config.port,
user=frombytes(opts.connection_config.user),
replication=opts.replication,
buffer_size=opts.buffer_size,
default_block_size=opts.default_block_size,
kerb_ticket=frombytes(opts.connection_config.kerb_ticket),
extra_conf={frombytes(k): frombytes(v)
for k, v in opts.connection_config.extra_conf},
),)
)
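
# A minimal connection sketch for the class above; the host, port and user are
# placeholders, and a reachable HDFS cluster with libhdfs available is assumed.
def _example_hdfs_connect():  # illustrative helper only
    from pyarrow import fs

    hdfs = fs.HadoopFileSystem.from_uri(
        "hdfs://namenode.example.com:8020/?user=alice&replication=3")
    # Equivalent to: fs.HadoopFileSystem("namenode.example.com", port=8020,
    #                                    user="alice", replication=3)
    return hdfs.get_file_info(fs.FileSelector("/", recursive=False))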

View File

@ -0,0 +1,480 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------
# HDFS IO implementation
# cython: language_level = 3
import re
from pyarrow.lib cimport check_status, _Weakrefable, NativeFile
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow.lib import frombytes, tobytes, ArrowIOError
from queue import Queue, Empty as QueueEmpty, Full as QueueFull
_HDFS_PATH_RE = re.compile(r'hdfs://(.*):(\d+)(.*)')
def have_libhdfs():
try:
with nogil:
check_status(HaveLibHdfs())
return True
except Exception:
return False
def strip_hdfs_abspath(path):
m = _HDFS_PATH_RE.match(path)
if m:
return m.group(3)
else:
return path
cdef class HadoopFileSystem(_Weakrefable):
cdef:
shared_ptr[CIOHadoopFileSystem] client
cdef readonly:
bint is_open
object host
object user
object kerb_ticket
int port
dict extra_conf
def _connect(self, host, port, user, kerb_ticket, extra_conf):
cdef HdfsConnectionConfig conf
if host is not None:
conf.host = tobytes(host)
self.host = host
conf.port = port
self.port = port
if user is not None:
conf.user = tobytes(user)
self.user = user
if kerb_ticket is not None:
conf.kerb_ticket = tobytes(kerb_ticket)
self.kerb_ticket = kerb_ticket
with nogil:
check_status(HaveLibHdfs())
if extra_conf is not None and isinstance(extra_conf, dict):
conf.extra_conf = {tobytes(k): tobytes(v)
for k, v in extra_conf.items()}
self.extra_conf = extra_conf
with nogil:
check_status(CIOHadoopFileSystem.Connect(&conf, &self.client))
self.is_open = True
@classmethod
def connect(cls, *args, **kwargs):
return cls(*args, **kwargs)
def __dealloc__(self):
if self.is_open:
self.close()
def close(self):
"""
Disconnect from the HDFS cluster
"""
self._ensure_client()
with nogil:
check_status(self.client.get().Disconnect())
self.is_open = False
cdef _ensure_client(self):
if self.client.get() == NULL:
raise IOError('HDFS client improperly initialized')
elif not self.is_open:
raise IOError('HDFS client is closed')
def exists(self, path):
"""
        Returns True if the path is known to the cluster, False if it is not
        (or there is an RPC error).
"""
self._ensure_client()
cdef c_string c_path = tobytes(path)
cdef c_bool result
with nogil:
result = self.client.get().Exists(c_path)
return result
def isdir(self, path):
cdef HdfsPathInfo info
try:
self._path_info(path, &info)
except ArrowIOError:
return False
return info.kind == ObjectType_DIRECTORY
def isfile(self, path):
cdef HdfsPathInfo info
try:
self._path_info(path, &info)
except ArrowIOError:
return False
return info.kind == ObjectType_FILE
def get_capacity(self):
"""
Get reported total capacity of file system
Returns
-------
capacity : int
"""
cdef int64_t capacity = 0
with nogil:
check_status(self.client.get().GetCapacity(&capacity))
return capacity
def get_space_used(self):
"""
Get space used on file system
Returns
-------
space_used : int
"""
cdef int64_t space_used = 0
with nogil:
check_status(self.client.get().GetUsed(&space_used))
return space_used
def df(self):
"""
Return free space on disk, like the UNIX df command
Returns
-------
space : int
"""
return self.get_capacity() - self.get_space_used()
def rename(self, path, new_path):
cdef c_string c_path = tobytes(path)
cdef c_string c_new_path = tobytes(new_path)
with nogil:
check_status(self.client.get().Rename(c_path, c_new_path))
def info(self, path):
"""
Return detailed HDFS information for path
Parameters
----------
path : string
Path to file or directory
Returns
-------
path_info : dict
"""
cdef HdfsPathInfo info
self._path_info(path, &info)
return {
'path': frombytes(info.name),
'owner': frombytes(info.owner),
'group': frombytes(info.group),
'size': info.size,
'block_size': info.block_size,
'last_modified': info.last_modified_time,
'last_accessed': info.last_access_time,
'replication': info.replication,
'permissions': info.permissions,
'kind': ('directory' if info.kind == ObjectType_DIRECTORY
else 'file')
}
def stat(self, path):
"""
Return basic file system statistics about path
Parameters
----------
path : string
Path to file or directory
Returns
-------
stat : dict
"""
cdef FileStatistics info
cdef c_string c_path = tobytes(path)
with nogil:
check_status(self.client.get()
.Stat(c_path, &info))
return {
'size': info.size,
'kind': ('directory' if info.kind == ObjectType_DIRECTORY
else 'file')
}
cdef _path_info(self, path, HdfsPathInfo* info):
cdef c_string c_path = tobytes(path)
with nogil:
check_status(self.client.get()
.GetPathInfo(c_path, info))
def ls(self, path, bint full_info):
cdef:
c_string c_path = tobytes(path)
vector[HdfsPathInfo] listing
list results = []
int i
self._ensure_client()
with nogil:
check_status(self.client.get()
.ListDirectory(c_path, &listing))
cdef const HdfsPathInfo* info
for i in range(<int> listing.size()):
info = &listing[i]
# Try to trim off the hdfs://HOST:PORT piece
name = strip_hdfs_abspath(frombytes(info.name))
if full_info:
kind = ('file' if info.kind == ObjectType_FILE
else 'directory')
results.append({
'kind': kind,
'name': name,
'owner': frombytes(info.owner),
'group': frombytes(info.group),
'last_modified_time': info.last_modified_time,
'last_access_time': info.last_access_time,
'size': info.size,
'replication': info.replication,
'block_size': info.block_size,
'permissions': info.permissions
})
else:
results.append(name)
return results
def chmod(self, path, mode):
"""
Change file permissions
Parameters
----------
path : string
absolute path to file or directory
mode : int
POSIX-like bitmask
"""
self._ensure_client()
cdef c_string c_path = tobytes(path)
cdef int c_mode = mode
with nogil:
check_status(self.client.get()
.Chmod(c_path, c_mode))
def chown(self, path, owner=None, group=None):
"""
        Change file owner and/or group
Parameters
----------
path : string
absolute path to file or directory
owner : string, default None
New owner, None for no change
group : string, default None
New group, None for no change
"""
cdef:
c_string c_path
c_string c_owner
c_string c_group
const char* c_owner_ptr = NULL
const char* c_group_ptr = NULL
self._ensure_client()
c_path = tobytes(path)
if owner is not None:
c_owner = tobytes(owner)
c_owner_ptr = c_owner.c_str()
if group is not None:
c_group = tobytes(group)
c_group_ptr = c_group.c_str()
with nogil:
check_status(self.client.get()
.Chown(c_path, c_owner_ptr, c_group_ptr))
def mkdir(self, path):
"""
Create indicated directory and any necessary parent directories
"""
self._ensure_client()
cdef c_string c_path = tobytes(path)
with nogil:
check_status(self.client.get()
.MakeDirectory(c_path))
def delete(self, path, bint recursive=False):
"""
Delete the indicated file or directory
Parameters
----------
path : string
recursive : boolean, default False
If True, also delete child paths for directories
"""
self._ensure_client()
cdef c_string c_path = tobytes(path)
with nogil:
check_status(self.client.get()
.Delete(c_path, recursive == 1))
def open(self, path, mode='rb', buffer_size=None, replication=None,
default_block_size=None):
"""
Open HDFS file for reading or writing
Parameters
----------
mode : string
Must be one of 'rb', 'wb', 'ab'
Returns
-------
handle : HdfsFile
"""
self._ensure_client()
cdef HdfsFile out = HdfsFile()
if mode not in ('rb', 'wb', 'ab'):
raise Exception("Mode must be 'rb' (read), "
"'wb' (write, new file), or 'ab' (append)")
cdef c_string c_path = tobytes(path)
cdef c_bool append = False
# 0 in libhdfs means "use the default"
cdef int32_t c_buffer_size = buffer_size or 0
cdef int16_t c_replication = replication or 0
cdef int64_t c_default_block_size = default_block_size or 0
cdef shared_ptr[HdfsOutputStream] wr_handle
cdef shared_ptr[HdfsReadableFile] rd_handle
if mode in ('wb', 'ab'):
if mode == 'ab':
append = True
with nogil:
check_status(
self.client.get()
.OpenWritable(c_path, append, c_buffer_size,
c_replication, c_default_block_size,
&wr_handle))
out.set_output_stream(<shared_ptr[COutputStream]> wr_handle)
out.is_writable = True
else:
with nogil:
check_status(self.client.get()
.OpenReadable(c_path, &rd_handle))
out.set_random_access_file(
<shared_ptr[CRandomAccessFile]> rd_handle)
out.is_readable = True
assert not out.closed
if c_buffer_size == 0:
c_buffer_size = 2 ** 16
out.mode = mode
out.buffer_size = c_buffer_size
out.parent = _HdfsFileNanny(self, out)
out.own_file = True
return out
def download(self, path, stream, buffer_size=None):
with self.open(path, 'rb') as f:
f.download(stream, buffer_size=buffer_size)
def upload(self, path, stream, buffer_size=None):
"""
Upload file-like object to HDFS path
"""
with self.open(path, 'wb') as f:
f.upload(stream, buffer_size=buffer_size)
# ARROW-404: Helper class to ensure that files are closed before the
# client. During deallocation of the extension class, the attributes are
# decref'd which can cause the client to get closed first if the file has the
# last remaining reference
cdef class _HdfsFileNanny(_Weakrefable):
cdef:
object client
object file_handle_ref
def __cinit__(self, client, file_handle):
import weakref
self.client = client
self.file_handle_ref = weakref.ref(file_handle)
def __dealloc__(self):
fh = self.file_handle_ref()
if fh:
fh.close()
# avoid cyclic GC
self.file_handle_ref = None
self.client = None
cdef class HdfsFile(NativeFile):
cdef readonly:
int32_t buffer_size
object mode
object parent
def __dealloc__(self):
self.parent = None
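
# A minimal sketch of the legacy client above, reached through the deprecated
# ``pyarrow.hdfs.connect`` wrapper; connection details and paths are
# placeholders, and libhdfs must be available at runtime.
def _example_legacy_hdfs():  # illustrative helper only
    from pyarrow import hdfs

    client = hdfs.connect(host="namenode.example.com", port=8020, user="alice")
    client.mkdir("/tmp/pyarrow-example")
    with client.open("/tmp/pyarrow-example/data.bin", "wb") as f:
        f.write(b"hello hdfs")
    listing = client.ls("/tmp/pyarrow-example")
    client.delete("/tmp/pyarrow-example", recursive=True)
    client.close()
    return listing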

View File

@ -0,0 +1,261 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (check_status, _Weakrefable, Field, MemoryPool,
ensure_type, maybe_unbox_memory_pool,
get_input_stream, pyarrow_wrap_table,
pyarrow_wrap_data_type, pyarrow_unwrap_data_type,
pyarrow_wrap_schema, pyarrow_unwrap_schema)
cdef class ReadOptions(_Weakrefable):
"""
Options for reading JSON files.
Parameters
----------
use_threads : bool, optional (default True)
Whether to use multiple threads to accelerate reading
block_size : int, optional
        How many bytes to process at a time from the input stream.
This will determine multi-threading granularity as well as
the size of individual chunks in the Table.
"""
cdef:
CJSONReadOptions options
    # Avoid mistakenly creating attributes
__slots__ = ()
def __init__(self, use_threads=None, block_size=None):
self.options = CJSONReadOptions.Defaults()
if use_threads is not None:
self.use_threads = use_threads
if block_size is not None:
self.block_size = block_size
@property
def use_threads(self):
"""
Whether to use multiple threads to accelerate reading.
"""
return self.options.use_threads
@use_threads.setter
def use_threads(self, value):
self.options.use_threads = value
@property
def block_size(self):
"""
        How many bytes to process at a time from the input stream.
This will determine multi-threading granularity as well as the size of
individual chunks in the Table.
"""
return self.options.block_size
@block_size.setter
def block_size(self, value):
self.options.block_size = value
def __reduce__(self):
return ReadOptions, (
self.use_threads,
self.block_size
)
cdef class ParseOptions(_Weakrefable):
"""
Options for parsing JSON files.
Parameters
----------
explicit_schema : Schema, optional (default None)
Optional explicit schema (no type inference, ignores other fields).
newlines_in_values : bool, optional (default False)
Whether objects may be printed across multiple lines (for example
pretty printed). If false, input must end with an empty line.
unexpected_field_behavior : str, default "infer"
How JSON fields outside of explicit_schema (if given) are treated.
Possible behaviors:
- "ignore": unexpected JSON fields are ignored
- "error": error out on unexpected JSON fields
- "infer": unexpected JSON fields are type-inferred and included in
the output
"""
cdef:
CJSONParseOptions options
__slots__ = ()
def __init__(self, explicit_schema=None, newlines_in_values=None,
unexpected_field_behavior=None):
self.options = CJSONParseOptions.Defaults()
if explicit_schema is not None:
self.explicit_schema = explicit_schema
if newlines_in_values is not None:
self.newlines_in_values = newlines_in_values
if unexpected_field_behavior is not None:
self.unexpected_field_behavior = unexpected_field_behavior
def __reduce__(self):
return ParseOptions, (
self.explicit_schema,
self.newlines_in_values,
self.unexpected_field_behavior
)
@property
def explicit_schema(self):
"""
Optional explicit schema (no type inference, ignores other fields)
"""
if self.options.explicit_schema.get() == NULL:
return None
else:
return pyarrow_wrap_schema(self.options.explicit_schema)
@explicit_schema.setter
def explicit_schema(self, value):
self.options.explicit_schema = pyarrow_unwrap_schema(value)
@property
def newlines_in_values(self):
"""
Whether newline characters are allowed in JSON values.
Setting this to True reduces the performance of multi-threaded
JSON reading.
"""
return self.options.newlines_in_values
@newlines_in_values.setter
def newlines_in_values(self, value):
self.options.newlines_in_values = value
@property
def unexpected_field_behavior(self):
"""
How JSON fields outside of explicit_schema (if given) are treated.
Possible behaviors:
- "ignore": unexpected JSON fields are ignored
- "error": error out on unexpected JSON fields
- "infer": unexpected JSON fields are type-inferred and included in
the output
Set to "infer" by default.
"""
v = self.options.unexpected_field_behavior
if v == CUnexpectedFieldBehavior_Ignore:
return "ignore"
elif v == CUnexpectedFieldBehavior_Error:
return "error"
elif v == CUnexpectedFieldBehavior_InferType:
return "infer"
else:
raise ValueError('Unexpected value for unexpected_field_behavior')
@unexpected_field_behavior.setter
def unexpected_field_behavior(self, value):
cdef CUnexpectedFieldBehavior v
if value == "ignore":
v = CUnexpectedFieldBehavior_Ignore
elif value == "error":
v = CUnexpectedFieldBehavior_Error
elif value == "infer":
v = CUnexpectedFieldBehavior_InferType
else:
raise ValueError(
"Unexpected value `{}` for `unexpected_field_behavior`, pass "
"either `ignore`, `error` or `infer`.".format(value)
)
self.options.unexpected_field_behavior = v
cdef _get_reader(input_file, shared_ptr[CInputStream]* out):
use_memory_map = False
get_input_stream(input_file, use_memory_map, out)
cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out):
if read_options is None:
out[0] = CJSONReadOptions.Defaults()
else:
out[0] = read_options.options
cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
if parse_options is None:
out[0] = CJSONParseOptions.Defaults()
else:
out[0] = parse_options.options
def read_json(input_file, read_options=None, parse_options=None,
MemoryPool memory_pool=None):
"""
Read a Table from a stream of JSON data.
Parameters
----------
input_file : str, path or file-like object
The location of JSON data. Currently only the line-delimited JSON
format is supported.
read_options : pyarrow.json.ReadOptions, optional
Options for the JSON reader (see ReadOptions constructor for defaults).
parse_options : pyarrow.json.ParseOptions, optional
Options for the JSON parser
(see ParseOptions constructor for defaults).
memory_pool : MemoryPool, optional
Pool to allocate Table memory from.
Returns
-------
:class:`pyarrow.Table`
        Contents of the JSON file as an in-memory table.
"""
cdef:
shared_ptr[CInputStream] stream
CJSONReadOptions c_read_options
CJSONParseOptions c_parse_options
shared_ptr[CJSONReader] reader
shared_ptr[CTable] table
_get_reader(input_file, &stream)
_get_read_options(read_options, &c_read_options)
_get_parse_options(parse_options, &c_parse_options)
reader = GetResultValue(
CJSONReader.Make(maybe_unbox_memory_pool(memory_pool),
stream, c_read_options, c_parse_options))
with nogil:
table = GetResultValue(reader.get().Read())
return pyarrow_wrap_table(table)
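
# A minimal sketch of the reader above: parse newline-delimited JSON with
# explicit read/parse options. The file name and schema are made up for
# illustration.
def _example_read_json():  # illustrative helper only
    import pyarrow as pa
    from pyarrow import json

    read_opts = json.ReadOptions(use_threads=True, block_size=1 << 20)
    parse_opts = json.ParseOptions(
        explicit_schema=pa.schema([("id", pa.int64()), ("name", pa.string())]),
        unexpected_field_behavior="ignore",
    )
    return json.read_json("records.jsonl",
                          read_options=read_opts, parse_options=parse_opts)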

View File

@ -0,0 +1,134 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# distutils: language = c++
# cython: language_level = 3
from libcpp cimport bool as c_bool
from libc.string cimport const_char
from libcpp.vector cimport vector as std_vector
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus,
CResult, CTable, CMemoryPool,
CKeyValueMetadata,
CRecordBatch,
CTable, CCompressionType,
CRandomAccessFile, COutputStream,
TimeUnit)
cdef extern from "arrow/adapters/orc/options.h" \
namespace "arrow::adapters::orc" nogil:
cdef enum CompressionStrategy \
" arrow::adapters::orc::CompressionStrategy":
_CompressionStrategy_SPEED \
" arrow::adapters::orc::CompressionStrategy::kSpeed"
_CompressionStrategy_COMPRESSION \
" arrow::adapters::orc::CompressionStrategy::kCompression"
cdef enum WriterId" arrow::adapters::orc::WriterId":
_WriterId_ORC_JAVA_WRITER" arrow::adapters::orc::WriterId::kOrcJava"
_WriterId_ORC_CPP_WRITER" arrow::adapters::orc::WriterId::kOrcCpp"
_WriterId_PRESTO_WRITER" arrow::adapters::orc::WriterId::kPresto"
_WriterId_SCRITCHLEY_GO \
" arrow::adapters::orc::WriterId::kScritchleyGo"
_WriterId_TRINO_WRITER" arrow::adapters::orc::WriterId::kTrino"
_WriterId_UNKNOWN_WRITER" arrow::adapters::orc::WriterId::kUnknown"
cdef enum WriterVersion" arrow::adapters::orc::WriterVersion":
_WriterVersion_ORIGINAL \
" arrow::adapters::orc::WriterVersion::kOriginal"
_WriterVersion_HIVE_8732 \
" arrow::adapters::orc::WriterVersion::kHive8732"
_WriterVersion_HIVE_4243 \
" arrow::adapters::orc::WriterVersion::kHive4243"
_WriterVersion_HIVE_12055 \
" arrow::adapters::orc::WriterVersion::kHive12055"
_WriterVersion_HIVE_13083 \
" arrow::adapters::orc::WriterVersion::kHive13083"
_WriterVersion_ORC_101" arrow::adapters::orc::WriterVersion::kOrc101"
_WriterVersion_ORC_135" arrow::adapters::orc::WriterVersion::kOrc135"
_WriterVersion_ORC_517" arrow::adapters::orc::WriterVersion::kOrc517"
_WriterVersion_ORC_203" arrow::adapters::orc::WriterVersion::kOrc203"
_WriterVersion_ORC_14" arrow::adapters::orc::WriterVersion::kOrc14"
_WriterVersion_MAX" arrow::adapters::orc::WriterVersion::kMax"
cdef cppclass FileVersion" arrow::adapters::orc::FileVersion":
FileVersion(uint32_t major_version, uint32_t minor_version)
uint32_t major_version()
uint32_t minor_version()
c_string ToString()
cdef struct WriteOptions" arrow::adapters::orc::WriteOptions":
int64_t batch_size
FileVersion file_version
int64_t stripe_size
CCompressionType compression
int64_t compression_block_size
CompressionStrategy compression_strategy
int64_t row_index_stride
double padding_tolerance
double dictionary_key_size_threshold
std_vector[int64_t] bloom_filter_columns
double bloom_filter_fpp
cdef extern from "arrow/adapters/orc/adapter.h" \
namespace "arrow::adapters::orc" nogil:
cdef cppclass ORCFileReader:
@staticmethod
CResult[unique_ptr[ORCFileReader]] Open(
const shared_ptr[CRandomAccessFile]& file,
CMemoryPool* pool)
CResult[shared_ptr[const CKeyValueMetadata]] ReadMetadata()
CResult[shared_ptr[CSchema]] ReadSchema()
CResult[shared_ptr[CRecordBatch]] ReadStripe(int64_t stripe)
CResult[shared_ptr[CRecordBatch]] ReadStripe(
int64_t stripe, std_vector[c_string])
CResult[shared_ptr[CTable]] Read()
CResult[shared_ptr[CTable]] Read(std_vector[c_string])
int64_t NumberOfStripes()
int64_t NumberOfRows()
FileVersion GetFileVersion()
c_string GetSoftwareVersion()
CResult[CCompressionType] GetCompression()
int64_t GetCompressionSize()
int64_t GetRowIndexStride()
WriterId GetWriterId()
int32_t GetWriterIdValue()
WriterVersion GetWriterVersion()
int64_t GetNumberOfStripeStatistics()
int64_t GetContentLength()
int64_t GetStripeStatisticsLength()
int64_t GetFileFooterLength()
int64_t GetFilePostscriptLength()
int64_t GetFileLength()
c_string GetSerializedFileTail()
cdef cppclass ORCFileWriter:
@staticmethod
CResult[unique_ptr[ORCFileWriter]] Open(
COutputStream* output_stream, const WriteOptions& writer_options)
CStatus Write(const CTable& table)
CStatus Close()

View File

@ -0,0 +1,449 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
from cython.operator cimport dereference as deref
from libcpp.vector cimport vector as std_vector
from libcpp.utility cimport move
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (check_status, _Weakrefable,
MemoryPool, maybe_unbox_memory_pool,
Schema, pyarrow_wrap_schema,
KeyValueMetadata,
pyarrow_wrap_batch,
RecordBatch,
Table,
pyarrow_wrap_table,
pyarrow_unwrap_schema,
pyarrow_wrap_metadata,
pyarrow_unwrap_table,
get_reader,
get_writer)
from pyarrow.lib import frombytes, tobytes
from pyarrow.util import _stringify_path
cdef compression_type_from_enum(CCompressionType compression_type):
compression_map = {
CCompressionType_UNCOMPRESSED: 'UNCOMPRESSED',
CCompressionType_GZIP: 'ZLIB',
CCompressionType_SNAPPY: 'SNAPPY',
CCompressionType_LZ4: 'LZ4',
CCompressionType_ZSTD: 'ZSTD',
}
if compression_type in compression_map:
return compression_map[compression_type]
raise ValueError('Unsupported compression')
cdef CCompressionType compression_type_from_name(name) except *:
if not isinstance(name, str):
raise TypeError('compression must be a string')
name = name.upper()
if name == 'ZLIB':
return CCompressionType_GZIP
elif name == 'SNAPPY':
return CCompressionType_SNAPPY
elif name == 'LZ4':
return CCompressionType_LZ4
elif name == 'ZSTD':
return CCompressionType_ZSTD
elif name == 'UNCOMPRESSED':
return CCompressionType_UNCOMPRESSED
raise ValueError(f'Unknown CompressionKind: {name}')
cdef compression_strategy_from_enum(
CompressionStrategy compression_strategy
):
compression_strategy_map = {
_CompressionStrategy_SPEED: 'SPEED',
_CompressionStrategy_COMPRESSION: 'COMPRESSION',
}
if compression_strategy in compression_strategy_map:
return compression_strategy_map[compression_strategy]
raise ValueError('Unsupported compression strategy')
cdef CompressionStrategy compression_strategy_from_name(name) except *:
if not isinstance(name, str):
raise TypeError('compression strategy must be a string')
name = name.upper()
if name == 'COMPRESSION':
return _CompressionStrategy_COMPRESSION
elif name == 'SPEED':
return _CompressionStrategy_SPEED
raise ValueError(f'Unknown CompressionStrategy: {name}')
cdef file_version_from_class(FileVersion file_version):
return frombytes(file_version.ToString())
cdef writer_id_from_enum(WriterId writer_id):
writer_id_map = {
_WriterId_ORC_JAVA_WRITER: 'ORC_JAVA',
_WriterId_ORC_CPP_WRITER: 'ORC_CPP',
_WriterId_PRESTO_WRITER: 'PRESTO',
_WriterId_SCRITCHLEY_GO: 'SCRITCHLEY_GO',
        _WriterId_TRINO_WRITER: 'TRINO',
        _WriterId_UNKNOWN_WRITER: 'UNKNOWN',
}
if writer_id in writer_id_map:
return writer_id_map[writer_id]
raise ValueError('Unsupported writer ID')
cdef writer_version_from_enum(WriterVersion writer_version):
writer_version_map = {
_WriterVersion_ORIGINAL: 'ORIGINAL',
_WriterVersion_HIVE_8732: 'HIVE_8732',
_WriterVersion_HIVE_4243: 'HIVE_4243',
_WriterVersion_HIVE_12055: 'HIVE_12055',
_WriterVersion_HIVE_13083: 'HIVE_13083',
_WriterVersion_ORC_101: 'ORC_101',
_WriterVersion_ORC_135: 'ORC_135',
_WriterVersion_ORC_517: 'ORC_517',
_WriterVersion_ORC_203: 'ORC_203',
_WriterVersion_ORC_14: 'ORC_14',
}
if writer_version in writer_version_map:
return writer_version_map[writer_version]
raise ValueError('Unsupported writer version')
cdef shared_ptr[WriteOptions] _create_write_options(
file_version=None,
batch_size=None,
stripe_size=None,
compression=None,
compression_block_size=None,
compression_strategy=None,
row_index_stride=None,
padding_tolerance=None,
dictionary_key_size_threshold=None,
bloom_filter_columns=None,
bloom_filter_fpp=None
) except *:
"""General writer options"""
cdef:
shared_ptr[WriteOptions] options
options = make_shared[WriteOptions]()
# batch_size
if batch_size is not None:
if isinstance(batch_size, int) and batch_size > 0:
deref(options).batch_size = batch_size
else:
raise ValueError(f"Invalid ORC writer batch size: {batch_size}")
# file_version
if file_version is not None:
if file_version == "0.12":
deref(options).file_version = FileVersion(0, 12)
elif file_version == "0.11":
deref(options).file_version = FileVersion(0, 11)
else:
raise ValueError(f"Unsupported ORC file version: {file_version}")
# stripe_size
if stripe_size is not None:
if isinstance(stripe_size, int) and stripe_size > 0:
deref(options).stripe_size = stripe_size
else:
raise ValueError(f"Invalid ORC stripe size: {stripe_size}")
# compression
if compression is not None:
if isinstance(compression, str):
deref(options).compression = compression_type_from_name(
compression)
else:
raise TypeError("Unsupported ORC compression type: "
f"{compression}")
# compression_block_size
if compression_block_size is not None:
if (isinstance(compression_block_size, int) and
compression_block_size > 0):
deref(options).compression_block_size = compression_block_size
else:
raise ValueError("Invalid ORC compression block size: "
f"{compression_block_size}")
# compression_strategy
if compression_strategy is not None:
        if isinstance(compression_strategy, str):
deref(options).compression_strategy = \
compression_strategy_from_name(compression_strategy)
else:
raise TypeError("Unsupported ORC compression strategy: "
f"{compression_strategy}")
# row_index_stride
if row_index_stride is not None:
if isinstance(row_index_stride, int) and row_index_stride > 0:
deref(options).row_index_stride = row_index_stride
else:
raise ValueError("Invalid ORC row index stride: "
f"{row_index_stride}")
# padding_tolerance
if padding_tolerance is not None:
try:
padding_tolerance = float(padding_tolerance)
deref(options).padding_tolerance = padding_tolerance
except Exception:
raise ValueError("Invalid ORC padding tolerance: "
f"{padding_tolerance}")
# dictionary_key_size_threshold
if dictionary_key_size_threshold is not None:
try:
dictionary_key_size_threshold = float(
dictionary_key_size_threshold)
assert 0 <= dictionary_key_size_threshold <= 1
deref(options).dictionary_key_size_threshold = \
dictionary_key_size_threshold
except Exception:
raise ValueError("Invalid ORC dictionary key size threshold: "
f"{dictionary_key_size_threshold}")
# bloom_filter_columns
if bloom_filter_columns is not None:
try:
bloom_filter_columns = list(bloom_filter_columns)
for col in bloom_filter_columns:
assert isinstance(col, int) and col >= 0
deref(options).bloom_filter_columns = bloom_filter_columns
except Exception:
raise ValueError("Invalid ORC BloomFilter columns: "
f"{bloom_filter_columns}")
# Max false positive rate of the Bloom Filter
if bloom_filter_fpp is not None:
try:
bloom_filter_fpp = float(bloom_filter_fpp)
assert 0 <= bloom_filter_fpp <= 1
deref(options).bloom_filter_fpp = bloom_filter_fpp
except Exception:
raise ValueError("Invalid ORC BloomFilter false positive rate: "
f"{bloom_filter_fpp}")
return options
cdef class ORCReader(_Weakrefable):
cdef:
object source
CMemoryPool* allocator
unique_ptr[ORCFileReader] reader
def __cinit__(self, MemoryPool memory_pool=None):
self.allocator = maybe_unbox_memory_pool(memory_pool)
def open(self, object source, c_bool use_memory_map=True):
cdef:
shared_ptr[CRandomAccessFile] rd_handle
self.source = source
get_reader(source, use_memory_map, &rd_handle)
with nogil:
self.reader = move(GetResultValue(
ORCFileReader.Open(rd_handle, self.allocator)
))
def metadata(self):
"""
The arrow metadata for this file.
Returns
-------
metadata : pyarrow.KeyValueMetadata
"""
cdef:
shared_ptr[const CKeyValueMetadata] sp_arrow_metadata
with nogil:
sp_arrow_metadata = GetResultValue(
deref(self.reader).ReadMetadata()
)
return pyarrow_wrap_metadata(sp_arrow_metadata)
def schema(self):
"""
The arrow schema for this file.
Returns
-------
schema : pyarrow.Schema
"""
cdef:
shared_ptr[CSchema] sp_arrow_schema
with nogil:
sp_arrow_schema = GetResultValue(deref(self.reader).ReadSchema())
return pyarrow_wrap_schema(sp_arrow_schema)
def nrows(self):
return deref(self.reader).NumberOfRows()
def nstripes(self):
return deref(self.reader).NumberOfStripes()
def file_version(self):
return file_version_from_class(deref(self.reader).GetFileVersion())
def software_version(self):
return frombytes(deref(self.reader).GetSoftwareVersion())
def compression(self):
return compression_type_from_enum(
GetResultValue(deref(self.reader).GetCompression()))
def compression_size(self):
return deref(self.reader).GetCompressionSize()
def row_index_stride(self):
return deref(self.reader).GetRowIndexStride()
def writer(self):
writer_name = writer_id_from_enum(deref(self.reader).GetWriterId())
if writer_name == 'UNKNOWN':
return deref(self.reader).GetWriterIdValue()
else:
return writer_name
def writer_version(self):
return writer_version_from_enum(deref(self.reader).GetWriterVersion())
def nstripe_statistics(self):
return deref(self.reader).GetNumberOfStripeStatistics()
def content_length(self):
return deref(self.reader).GetContentLength()
def stripe_statistics_length(self):
return deref(self.reader).GetStripeStatisticsLength()
def file_footer_length(self):
return deref(self.reader).GetFileFooterLength()
def file_postscript_length(self):
return deref(self.reader).GetFilePostscriptLength()
def file_length(self):
return deref(self.reader).GetFileLength()
def serialized_file_tail(self):
return deref(self.reader).GetSerializedFileTail()
def read_stripe(self, n, columns=None):
cdef:
shared_ptr[CRecordBatch] sp_record_batch
RecordBatch batch
int64_t stripe
std_vector[c_string] c_names
stripe = n
if columns is None:
with nogil:
sp_record_batch = GetResultValue(
deref(self.reader).ReadStripe(stripe)
)
else:
c_names = [tobytes(name) for name in columns]
with nogil:
sp_record_batch = GetResultValue(
deref(self.reader).ReadStripe(stripe, c_names)
)
return pyarrow_wrap_batch(sp_record_batch)
def read(self, columns=None):
cdef:
shared_ptr[CTable] sp_table
std_vector[c_string] c_names
if columns is None:
with nogil:
sp_table = GetResultValue(deref(self.reader).Read())
else:
c_names = [tobytes(name) for name in columns]
with nogil:
sp_table = GetResultValue(deref(self.reader).Read(c_names))
return pyarrow_wrap_table(sp_table)
cdef class ORCWriter(_Weakrefable):
cdef:
unique_ptr[ORCFileWriter] writer
shared_ptr[COutputStream] sink
c_bool own_sink
def open(self, object where, *,
file_version=None,
batch_size=None,
stripe_size=None,
compression=None,
compression_block_size=None,
compression_strategy=None,
row_index_stride=None,
padding_tolerance=None,
dictionary_key_size_threshold=None,
bloom_filter_columns=None,
bloom_filter_fpp=None):
cdef:
shared_ptr[WriteOptions] write_options
c_string c_where
try:
where = _stringify_path(where)
except TypeError:
get_writer(where, &self.sink)
self.own_sink = False
else:
c_where = tobytes(where)
with nogil:
self.sink = GetResultValue(FileOutputStream.Open(c_where))
self.own_sink = True
write_options = _create_write_options(
file_version=file_version,
batch_size=batch_size,
stripe_size=stripe_size,
compression=compression,
compression_block_size=compression_block_size,
compression_strategy=compression_strategy,
row_index_stride=row_index_stride,
padding_tolerance=padding_tolerance,
dictionary_key_size_threshold=dictionary_key_size_threshold,
bloom_filter_columns=bloom_filter_columns,
bloom_filter_fpp=bloom_filter_fpp
)
with nogil:
self.writer = move(GetResultValue(
ORCFileWriter.Open(self.sink.get(),
deref(write_options))))
def write(self, Table table):
cdef:
shared_ptr[CTable] sp_table
sp_table = pyarrow_unwrap_table(table)
with nogil:
check_status(deref(self.writer).Write(deref(sp_table)))
def close(self):
with nogil:
check_status(deref(self.writer).Close())
if self.own_sink:
check_status(deref(self.sink).Close())
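
# A minimal ORC round trip through the public ``pyarrow.orc`` wrappers, which
# sit on top of the ORCReader/ORCWriter classes above; the file name and option
# values are illustrative only.
def _example_orc_roundtrip():  # illustrative helper only
    import pyarrow as pa
    from pyarrow import orc

    table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
    orc.write_table(table, "example.orc", compression="zstd")

    f = orc.ORCFile("example.orc")
    assert f.nstripes >= 1
    return f.read(columns=["x"])  # column projection, see Read(std_vector[c_string])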

View File

@ -0,0 +1,638 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# distutils: language = c++
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CChunkedArray, CScalar, CSchema, CStatus,
CTable, CMemoryPool, CBuffer,
CKeyValueMetadata,
CRandomAccessFile, COutputStream,
TimeUnit, CRecordBatchReader)
from pyarrow.lib cimport _Weakrefable
cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
cdef cppclass Node:
pass
cdef cppclass GroupNode(Node):
pass
cdef cppclass PrimitiveNode(Node):
pass
cdef cppclass ColumnPath:
c_string ToDotString()
vector[c_string] ToDotVector()
cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
enum ParquetType" parquet::Type::type":
ParquetType_BOOLEAN" parquet::Type::BOOLEAN"
ParquetType_INT32" parquet::Type::INT32"
ParquetType_INT64" parquet::Type::INT64"
ParquetType_INT96" parquet::Type::INT96"
ParquetType_FLOAT" parquet::Type::FLOAT"
ParquetType_DOUBLE" parquet::Type::DOUBLE"
ParquetType_BYTE_ARRAY" parquet::Type::BYTE_ARRAY"
ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY"
enum ParquetLogicalTypeId" parquet::LogicalType::Type::type":
ParquetLogicalType_UNDEFINED" parquet::LogicalType::Type::UNDEFINED"
ParquetLogicalType_STRING" parquet::LogicalType::Type::STRING"
ParquetLogicalType_MAP" parquet::LogicalType::Type::MAP"
ParquetLogicalType_LIST" parquet::LogicalType::Type::LIST"
ParquetLogicalType_ENUM" parquet::LogicalType::Type::ENUM"
ParquetLogicalType_DECIMAL" parquet::LogicalType::Type::DECIMAL"
ParquetLogicalType_DATE" parquet::LogicalType::Type::DATE"
ParquetLogicalType_TIME" parquet::LogicalType::Type::TIME"
ParquetLogicalType_TIMESTAMP" parquet::LogicalType::Type::TIMESTAMP"
ParquetLogicalType_INT" parquet::LogicalType::Type::INT"
ParquetLogicalType_JSON" parquet::LogicalType::Type::JSON"
ParquetLogicalType_BSON" parquet::LogicalType::Type::BSON"
ParquetLogicalType_UUID" parquet::LogicalType::Type::UUID"
ParquetLogicalType_NONE" parquet::LogicalType::Type::NONE"
enum ParquetTimeUnit" parquet::LogicalType::TimeUnit::unit":
ParquetTimeUnit_UNKNOWN" parquet::LogicalType::TimeUnit::UNKNOWN"
ParquetTimeUnit_MILLIS" parquet::LogicalType::TimeUnit::MILLIS"
ParquetTimeUnit_MICROS" parquet::LogicalType::TimeUnit::MICROS"
ParquetTimeUnit_NANOS" parquet::LogicalType::TimeUnit::NANOS"
enum ParquetConvertedType" parquet::ConvertedType::type":
ParquetConvertedType_NONE" parquet::ConvertedType::NONE"
ParquetConvertedType_UTF8" parquet::ConvertedType::UTF8"
ParquetConvertedType_MAP" parquet::ConvertedType::MAP"
ParquetConvertedType_MAP_KEY_VALUE \
" parquet::ConvertedType::MAP_KEY_VALUE"
ParquetConvertedType_LIST" parquet::ConvertedType::LIST"
ParquetConvertedType_ENUM" parquet::ConvertedType::ENUM"
ParquetConvertedType_DECIMAL" parquet::ConvertedType::DECIMAL"
ParquetConvertedType_DATE" parquet::ConvertedType::DATE"
ParquetConvertedType_TIME_MILLIS" parquet::ConvertedType::TIME_MILLIS"
ParquetConvertedType_TIME_MICROS" parquet::ConvertedType::TIME_MICROS"
ParquetConvertedType_TIMESTAMP_MILLIS \
" parquet::ConvertedType::TIMESTAMP_MILLIS"
ParquetConvertedType_TIMESTAMP_MICROS \
" parquet::ConvertedType::TIMESTAMP_MICROS"
ParquetConvertedType_UINT_8" parquet::ConvertedType::UINT_8"
ParquetConvertedType_UINT_16" parquet::ConvertedType::UINT_16"
ParquetConvertedType_UINT_32" parquet::ConvertedType::UINT_32"
ParquetConvertedType_UINT_64" parquet::ConvertedType::UINT_64"
ParquetConvertedType_INT_8" parquet::ConvertedType::INT_8"
ParquetConvertedType_INT_16" parquet::ConvertedType::INT_16"
ParquetConvertedType_INT_32" parquet::ConvertedType::INT_32"
ParquetConvertedType_INT_64" parquet::ConvertedType::INT_64"
ParquetConvertedType_JSON" parquet::ConvertedType::JSON"
ParquetConvertedType_BSON" parquet::ConvertedType::BSON"
ParquetConvertedType_INTERVAL" parquet::ConvertedType::INTERVAL"
enum ParquetRepetition" parquet::Repetition::type":
ParquetRepetition_REQUIRED" parquet::REPETITION::REQUIRED"
ParquetRepetition_OPTIONAL" parquet::REPETITION::OPTIONAL"
ParquetRepetition_REPEATED" parquet::REPETITION::REPEATED"
enum ParquetEncoding" parquet::Encoding::type":
ParquetEncoding_PLAIN" parquet::Encoding::PLAIN"
ParquetEncoding_PLAIN_DICTIONARY" parquet::Encoding::PLAIN_DICTIONARY"
ParquetEncoding_RLE" parquet::Encoding::RLE"
ParquetEncoding_BIT_PACKED" parquet::Encoding::BIT_PACKED"
ParquetEncoding_DELTA_BINARY_PACKED \
" parquet::Encoding::DELTA_BINARY_PACKED"
ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY \
" parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY"
ParquetEncoding_DELTA_BYTE_ARRAY" parquet::Encoding::DELTA_BYTE_ARRAY"
ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY"
ParquetEncoding_BYTE_STREAM_SPLIT \
" parquet::Encoding::BYTE_STREAM_SPLIT"
enum ParquetCompression" parquet::Compression::type":
ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED"
ParquetCompression_SNAPPY" parquet::Compression::SNAPPY"
ParquetCompression_GZIP" parquet::Compression::GZIP"
ParquetCompression_LZO" parquet::Compression::LZO"
ParquetCompression_BROTLI" parquet::Compression::BROTLI"
ParquetCompression_LZ4" parquet::Compression::LZ4"
ParquetCompression_ZSTD" parquet::Compression::ZSTD"
enum ParquetVersion" parquet::ParquetVersion::type":
ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0"
ParquetVersion_V2_0" parquet::ParquetVersion::PARQUET_2_0"
ParquetVersion_V2_4" parquet::ParquetVersion::PARQUET_2_4"
ParquetVersion_V2_6" parquet::ParquetVersion::PARQUET_2_6"
enum ParquetSortOrder" parquet::SortOrder::type":
ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED"
ParquetSortOrder_UNSIGNED" parquet::SortOrder::UNSIGNED"
ParquetSortOrder_UNKNOWN" parquet::SortOrder::UNKNOWN"
cdef cppclass CParquetLogicalType" parquet::LogicalType":
c_string ToString() const
c_string ToJSON() const
ParquetLogicalTypeId type() const
cdef cppclass CParquetDecimalType \
" parquet::DecimalLogicalType"(CParquetLogicalType):
int32_t precision() const
int32_t scale() const
cdef cppclass CParquetIntType \
" parquet::IntLogicalType"(CParquetLogicalType):
int bit_width() const
c_bool is_signed() const
cdef cppclass CParquetTimeType \
" parquet::TimeLogicalType"(CParquetLogicalType):
c_bool is_adjusted_to_utc() const
ParquetTimeUnit time_unit() const
cdef cppclass CParquetTimestampType \
" parquet::TimestampLogicalType"(CParquetLogicalType):
c_bool is_adjusted_to_utc() const
ParquetTimeUnit time_unit() const
cdef cppclass ColumnDescriptor" parquet::ColumnDescriptor":
c_bool Equals(const ColumnDescriptor& other)
shared_ptr[ColumnPath] path()
int16_t max_definition_level()
int16_t max_repetition_level()
ParquetType physical_type()
const shared_ptr[const CParquetLogicalType]& logical_type()
ParquetConvertedType converted_type()
const c_string& name()
int type_length()
int type_precision()
int type_scale()
cdef cppclass SchemaDescriptor:
const ColumnDescriptor* Column(int i)
shared_ptr[Node] schema()
GroupNode* group()
c_bool Equals(const SchemaDescriptor& other)
c_string ToString()
int num_columns()
cdef c_string FormatStatValue(ParquetType parquet_type, c_string val)
enum ParquetCipher" parquet::ParquetCipher::type":
ParquetCipher_AES_GCM_V1" parquet::ParquetCipher::AES_GCM_V1"
ParquetCipher_AES_GCM_CTR_V1" parquet::ParquetCipher::AES_GCM_CTR_V1"
struct AadMetadata:
c_string aad_prefix
c_string aad_file_unique
c_bool supply_aad_prefix
struct EncryptionAlgorithm:
ParquetCipher algorithm
AadMetadata aad
cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
cdef cppclass ColumnReader:
pass
cdef cppclass BoolReader(ColumnReader):
pass
cdef cppclass Int32Reader(ColumnReader):
pass
cdef cppclass Int64Reader(ColumnReader):
pass
cdef cppclass Int96Reader(ColumnReader):
pass
cdef cppclass FloatReader(ColumnReader):
pass
cdef cppclass DoubleReader(ColumnReader):
pass
cdef cppclass ByteArrayReader(ColumnReader):
pass
cdef cppclass RowGroupReader:
pass
cdef cppclass CEncodedStatistics" parquet::EncodedStatistics":
const c_string& max() const
const c_string& min() const
int64_t null_count
int64_t distinct_count
bint has_min
bint has_max
bint has_null_count
bint has_distinct_count
cdef cppclass ParquetByteArray" parquet::ByteArray":
uint32_t len
const uint8_t* ptr
cdef cppclass ParquetFLBA" parquet::FLBA":
const uint8_t* ptr
cdef cppclass CStatistics" parquet::Statistics":
int64_t null_count() const
int64_t distinct_count() const
int64_t num_values() const
bint HasMinMax()
bint HasNullCount()
bint HasDistinctCount()
c_bool Equals(const CStatistics&) const
void Reset()
c_string EncodeMin()
c_string EncodeMax()
CEncodedStatistics Encode()
void SetComparator()
ParquetType physical_type() const
const ColumnDescriptor* descr() const
cdef cppclass CBoolStatistics" parquet::BoolStatistics"(CStatistics):
c_bool min()
c_bool max()
cdef cppclass CInt32Statistics" parquet::Int32Statistics"(CStatistics):
int32_t min()
int32_t max()
cdef cppclass CInt64Statistics" parquet::Int64Statistics"(CStatistics):
int64_t min()
int64_t max()
cdef cppclass CFloatStatistics" parquet::FloatStatistics"(CStatistics):
float min()
float max()
cdef cppclass CDoubleStatistics" parquet::DoubleStatistics"(CStatistics):
double min()
double max()
cdef cppclass CByteArrayStatistics \
" parquet::ByteArrayStatistics"(CStatistics):
ParquetByteArray min()
ParquetByteArray max()
cdef cppclass CFLBAStatistics" parquet::FLBAStatistics"(CStatistics):
ParquetFLBA min()
ParquetFLBA max()
cdef cppclass CColumnCryptoMetaData" parquet::ColumnCryptoMetaData":
shared_ptr[ColumnPath] path_in_schema() const
c_bool encrypted_with_footer_key() const
const c_string& key_metadata() const
cdef cppclass CColumnChunkMetaData" parquet::ColumnChunkMetaData":
int64_t file_offset() const
const c_string& file_path() const
c_bool is_metadata_set() const
ParquetType type() const
int64_t num_values() const
shared_ptr[ColumnPath] path_in_schema() const
bint is_stats_set() const
shared_ptr[CStatistics] statistics() const
ParquetCompression compression() const
const vector[ParquetEncoding]& encodings() const
c_bool Equals(const CColumnChunkMetaData&) const
int64_t has_dictionary_page() const
int64_t dictionary_page_offset() const
int64_t data_page_offset() const
int64_t index_page_offset() const
int64_t total_compressed_size() const
int64_t total_uncompressed_size() const
unique_ptr[CColumnCryptoMetaData] crypto_metadata() const
cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData":
c_bool Equals(const CRowGroupMetaData&) const
int num_columns()
int64_t num_rows()
int64_t total_byte_size()
unique_ptr[CColumnChunkMetaData] ColumnChunk(int i) const
cdef cppclass CFileMetaData" parquet::FileMetaData":
c_bool Equals(const CFileMetaData&) const
uint32_t size()
int num_columns()
int64_t num_rows()
int num_row_groups()
ParquetVersion version()
const c_string created_by()
int num_schema_elements()
void set_file_path(const c_string& path)
void AppendRowGroups(const CFileMetaData& other) except +
unique_ptr[CRowGroupMetaData] RowGroup(int i)
const SchemaDescriptor* schema()
shared_ptr[const CKeyValueMetadata] key_value_metadata() const
void WriteTo(COutputStream* dst) const
inline c_bool is_encryption_algorithm_set() const
inline EncryptionAlgorithm encryption_algorithm() const
inline const c_string& footer_signing_key_metadata() const
cdef shared_ptr[CFileMetaData] CFileMetaData_Make \
" parquet::FileMetaData::Make"(const void* serialized_metadata,
uint32_t* metadata_len)
cdef cppclass CReaderProperties" parquet::ReaderProperties":
c_bool is_buffered_stream_enabled() const
void enable_buffered_stream()
void disable_buffered_stream()
void set_buffer_size(int64_t buf_size)
int64_t buffer_size() const
void file_decryption_properties(shared_ptr[CFileDecryptionProperties]
decryption)
shared_ptr[CFileDecryptionProperties] file_decryption_properties() \
const
CReaderProperties default_reader_properties()
cdef cppclass ArrowReaderProperties:
ArrowReaderProperties()
void set_read_dictionary(int column_index, c_bool read_dict)
c_bool read_dictionary()
void set_batch_size(int64_t batch_size)
int64_t batch_size()
void set_pre_buffer(c_bool pre_buffer)
c_bool pre_buffer() const
void set_coerce_int96_timestamp_unit(TimeUnit unit)
TimeUnit coerce_int96_timestamp_unit() const
ArrowReaderProperties default_arrow_reader_properties()
cdef cppclass ParquetFileReader:
shared_ptr[CFileMetaData] metadata()
cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
cdef cppclass WriterProperties:
cppclass Builder:
Builder* data_page_version(ParquetDataPageVersion version)
Builder* version(ParquetVersion version)
Builder* compression(ParquetCompression codec)
Builder* compression(const c_string& path,
ParquetCompression codec)
Builder* compression_level(int compression_level)
Builder* compression_level(const c_string& path,
int compression_level)
Builder* encryption(
shared_ptr[CFileEncryptionProperties]
file_encryption_properties)
Builder* disable_dictionary()
Builder* enable_dictionary()
Builder* enable_dictionary(const c_string& path)
Builder* disable_statistics()
Builder* enable_statistics()
Builder* enable_statistics(const c_string& path)
Builder* data_pagesize(int64_t size)
Builder* encoding(ParquetEncoding encoding)
Builder* encoding(const c_string& path,
ParquetEncoding encoding)
Builder* write_batch_size(int64_t batch_size)
Builder* dictionary_pagesize_limit(int64_t dictionary_pagesize_limit)
shared_ptr[WriterProperties] build()
cdef cppclass ArrowWriterProperties:
cppclass Builder:
Builder()
Builder* disable_deprecated_int96_timestamps()
Builder* enable_deprecated_int96_timestamps()
Builder* coerce_timestamps(TimeUnit unit)
Builder* allow_truncated_timestamps()
Builder* disallow_truncated_timestamps()
Builder* store_schema()
Builder* enable_compliant_nested_types()
Builder* disable_compliant_nested_types()
Builder* set_engine_version(ArrowWriterEngineVersion version)
shared_ptr[ArrowWriterProperties] build()
c_bool support_deprecated_int96_timestamps()
cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil:
cdef cppclass FileReader:
FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader)
CStatus GetSchema(shared_ptr[CSchema]* out)
CStatus ReadColumn(int i, shared_ptr[CChunkedArray]* out)
CStatus ReadSchemaField(int i, shared_ptr[CChunkedArray]* out)
int num_row_groups()
CStatus ReadRowGroup(int i, shared_ptr[CTable]* out)
CStatus ReadRowGroup(int i, const vector[int]& column_indices,
shared_ptr[CTable]* out)
CStatus ReadRowGroups(const vector[int]& row_groups,
shared_ptr[CTable]* out)
CStatus ReadRowGroups(const vector[int]& row_groups,
const vector[int]& column_indices,
shared_ptr[CTable]* out)
CStatus GetRecordBatchReader(const vector[int]& row_group_indices,
const vector[int]& column_indices,
unique_ptr[CRecordBatchReader]* out)
CStatus GetRecordBatchReader(const vector[int]& row_group_indices,
unique_ptr[CRecordBatchReader]* out)
CStatus ReadTable(shared_ptr[CTable]* out)
CStatus ReadTable(const vector[int]& column_indices,
shared_ptr[CTable]* out)
CStatus ScanContents(vector[int] columns, int32_t column_batch_size,
int64_t* num_rows)
const ParquetFileReader* parquet_reader()
void set_use_threads(c_bool use_threads)
void set_batch_size(int64_t batch_size)
cdef cppclass FileReaderBuilder:
FileReaderBuilder()
CStatus Open(const shared_ptr[CRandomAccessFile]& file,
const CReaderProperties& properties,
const shared_ptr[CFileMetaData]& metadata)
ParquetFileReader* raw_reader()
FileReaderBuilder* memory_pool(CMemoryPool*)
FileReaderBuilder* properties(const ArrowReaderProperties&)
CStatus Build(unique_ptr[FileReader]* out)
CStatus FromParquetSchema(
const SchemaDescriptor* parquet_schema,
const ArrowReaderProperties& properties,
const shared_ptr[const CKeyValueMetadata]& key_value_metadata,
shared_ptr[CSchema]* out)
CStatus StatisticsAsScalars(const CStatistics& Statistics,
shared_ptr[CScalar]* min,
shared_ptr[CScalar]* max)
cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil:
CStatus ToParquetSchema(
const CSchema* arrow_schema,
const ArrowReaderProperties& properties,
const shared_ptr[const CKeyValueMetadata]& key_value_metadata,
shared_ptr[SchemaDescriptor]* out)
cdef extern from "parquet/properties.h" namespace "parquet" nogil:
cdef enum ArrowWriterEngineVersion:
V1 "parquet::ArrowWriterProperties::V1",
V2 "parquet::ArrowWriterProperties::V2"
cdef cppclass ParquetDataPageVersion:
pass
cdef ParquetDataPageVersion ParquetDataPageVersion_V1 \
" parquet::ParquetDataPageVersion::V1"
cdef ParquetDataPageVersion ParquetDataPageVersion_V2 \
" parquet::ParquetDataPageVersion::V2"
cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
cdef cppclass FileWriter:
@staticmethod
CStatus Open(const CSchema& schema, CMemoryPool* pool,
const shared_ptr[COutputStream]& sink,
const shared_ptr[WriterProperties]& properties,
const shared_ptr[ArrowWriterProperties]& arrow_properties,
unique_ptr[FileWriter]* writer)
CStatus WriteTable(const CTable& table, int64_t chunk_size)
CStatus NewRowGroup(int64_t chunk_size)
CStatus Close()
const shared_ptr[CFileMetaData] metadata() const
CStatus WriteMetaDataFile(
const CFileMetaData& file_metadata,
const COutputStream* sink)
cdef class FileEncryptionProperties:
"""File-level encryption properties for the low-level API"""
cdef:
shared_ptr[CFileEncryptionProperties] properties
@staticmethod
cdef inline FileEncryptionProperties wrap(
shared_ptr[CFileEncryptionProperties] properties):
result = FileEncryptionProperties()
result.properties = properties
return result
cdef inline shared_ptr[CFileEncryptionProperties] unwrap(self):
return self.properties
cdef shared_ptr[WriterProperties] _create_writer_properties(
use_dictionary=*,
compression=*,
version=*,
write_statistics=*,
data_page_size=*,
compression_level=*,
use_byte_stream_split=*,
column_encoding=*,
data_page_version=*,
FileEncryptionProperties encryption_properties=*,
write_batch_size=*,
dictionary_pagesize_limit=*) except *
cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
use_deprecated_int96_timestamps=*,
coerce_timestamps=*,
allow_truncated_timestamps=*,
writer_engine_version=*,
use_compliant_nested_type=*) except *
cdef class ParquetSchema(_Weakrefable):
cdef:
FileMetaData parent # the FileMetaData owning the SchemaDescriptor
const SchemaDescriptor* schema
cdef class FileMetaData(_Weakrefable):
cdef:
shared_ptr[CFileMetaData] sp_metadata
CFileMetaData* _metadata
ParquetSchema _schema
cdef inline init(self, const shared_ptr[CFileMetaData]& metadata):
self.sp_metadata = metadata
self._metadata = metadata.get()
cdef class RowGroupMetaData(_Weakrefable):
cdef:
int index # for pickling support
unique_ptr[CRowGroupMetaData] up_metadata
CRowGroupMetaData* metadata
FileMetaData parent
cdef class ColumnChunkMetaData(_Weakrefable):
cdef:
unique_ptr[CColumnChunkMetaData] up_metadata
CColumnChunkMetaData* metadata
RowGroupMetaData parent
cdef inline init(self, RowGroupMetaData parent, int i):
self.up_metadata = parent.metadata.ColumnChunk(i)
self.metadata = self.up_metadata.get()
self.parent = parent
cdef class Statistics(_Weakrefable):
cdef:
shared_ptr[CStatistics] statistics
ColumnChunkMetaData parent
cdef inline init(self, const shared_ptr[CStatistics]& statistics,
ColumnChunkMetaData parent):
self.statistics = statistics
self.parent = parent
cdef extern from "parquet/encryption/encryption.h" namespace "parquet" nogil:
cdef cppclass CFileDecryptionProperties\
" parquet::FileDecryptionProperties":
pass
cdef cppclass CFileEncryptionProperties\
" parquet::FileEncryptionProperties":
pass
cdef class FileDecryptionProperties:
"""File-level decryption properties for the low-level API"""
cdef:
shared_ptr[CFileDecryptionProperties] properties
@staticmethod
cdef inline FileDecryptionProperties wrap(
shared_ptr[CFileDecryptionProperties] properties):
result = FileDecryptionProperties()
result.properties = properties
return result
cdef inline shared_ptr[CFileDecryptionProperties] unwrap(self):
return self.properties
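# Illustrative sketch (not part of this header): the accompanying .pyx module
# typically consumes the WriterProperties declarations above with a
# stack-allocated Builder, roughly as follows (the enum values are the ones
# declared in this file):
#
#   cdef:
#       WriterProperties.Builder builder
#       shared_ptr[WriterProperties] props
#   builder.compression(ParquetCompression_SNAPPY)
#   builder.version(ParquetVersion_V2_6)
#   builder.data_pagesize(1024 * 1024)
#   props = builder.build()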

File diff suppressed because it is too large

View File

@ -0,0 +1,133 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# distutils: language = c++
# cython: language_level = 3
from pyarrow.includes.common cimport *
from pyarrow._parquet cimport (ParquetCipher,
CFileEncryptionProperties,
CFileDecryptionProperties,
FileEncryptionProperties,
FileDecryptionProperties,
ParquetCipher_AES_GCM_V1,
ParquetCipher_AES_GCM_CTR_V1)
cdef extern from "parquet/encryption/kms_client.h" \
namespace "parquet::encryption" nogil:
cdef cppclass CKmsClient" parquet::encryption::KmsClient":
c_string WrapKey(const c_string& key_bytes,
const c_string& master_key_identifier) except +
c_string UnwrapKey(const c_string& wrapped_key,
const c_string& master_key_identifier) except +
cdef cppclass CKeyAccessToken" parquet::encryption::KeyAccessToken":
CKeyAccessToken(const c_string value)
void Refresh(const c_string& new_value)
const c_string& value() const
cdef cppclass CKmsConnectionConfig \
" parquet::encryption::KmsConnectionConfig":
CKmsConnectionConfig()
c_string kms_instance_id
c_string kms_instance_url
shared_ptr[CKeyAccessToken] refreshable_key_access_token
unordered_map[c_string, c_string] custom_kms_conf
# Callbacks for implementing Python kms clients
# Use typedef to emulate syntax for std::function<void(..)>
ctypedef void CallbackWrapKey(
object, const c_string&, const c_string&, c_string*)
ctypedef void CallbackUnwrapKey(
object, const c_string&, const c_string&, c_string*)
cdef extern from "parquet/encryption/kms_client_factory.h" \
namespace "parquet::encryption" nogil:
cdef cppclass CKmsClientFactory" parquet::encryption::KmsClientFactory":
shared_ptr[CKmsClient] CreateKmsClient(
const CKmsConnectionConfig& kms_connection_config) except +
# Callbacks for implementing Python kms client factories
# Use typedef to emulate syntax for std::function<void(..)>
ctypedef void CallbackCreateKmsClient(
object,
const CKmsConnectionConfig&, shared_ptr[CKmsClient]*)
cdef extern from "parquet/encryption/crypto_factory.h" \
namespace "parquet::encryption" nogil:
cdef cppclass CEncryptionConfiguration\
" parquet::encryption::EncryptionConfiguration":
CEncryptionConfiguration(const c_string& footer_key) except +
c_string footer_key
c_string column_keys
ParquetCipher encryption_algorithm
c_bool plaintext_footer
c_bool double_wrapping
double cache_lifetime_seconds
c_bool internal_key_material
int32_t data_key_length_bits
cdef cppclass CDecryptionConfiguration\
" parquet::encryption::DecryptionConfiguration":
CDecryptionConfiguration() except +
double cache_lifetime_seconds
cdef cppclass CCryptoFactory" parquet::encryption::CryptoFactory":
void RegisterKmsClientFactory(
shared_ptr[CKmsClientFactory] kms_client_factory) except +
shared_ptr[CFileEncryptionProperties] GetFileEncryptionProperties(
const CKmsConnectionConfig& kms_connection_config,
const CEncryptionConfiguration& encryption_config) except +*
shared_ptr[CFileDecryptionProperties] GetFileDecryptionProperties(
const CKmsConnectionConfig& kms_connection_config,
const CDecryptionConfiguration& decryption_config) except +*
void RemoveCacheEntriesForToken(const c_string& access_token) except +
void RemoveCacheEntriesForAllTokens() except +
cdef extern from "arrow/python/parquet_encryption.h" \
namespace "arrow::py::parquet::encryption" nogil:
cdef cppclass CPyKmsClientVtable \
" arrow::py::parquet::encryption::PyKmsClientVtable":
CPyKmsClientVtable()
function[CallbackWrapKey] wrap_key
function[CallbackUnwrapKey] unwrap_key
cdef cppclass CPyKmsClient\
" arrow::py::parquet::encryption::PyKmsClient"(CKmsClient):
CPyKmsClient(object handler, CPyKmsClientVtable vtable)
cdef cppclass CPyKmsClientFactoryVtable\
" arrow::py::parquet::encryption::PyKmsClientFactoryVtable":
CPyKmsClientFactoryVtable()
function[CallbackCreateKmsClient] create_kms_client
cdef cppclass CPyKmsClientFactory\
" arrow::py::parquet::encryption::PyKmsClientFactory"(
CKmsClientFactory):
CPyKmsClientFactory(object handler, CPyKmsClientFactoryVtable vtable)
cdef cppclass CPyCryptoFactory\
" arrow::py::parquet::encryption::PyCryptoFactory"(CCryptoFactory):
CResult[shared_ptr[CFileEncryptionProperties]] \
SafeGetFileEncryptionProperties(
const CKmsConnectionConfig& kms_connection_config,
const CEncryptionConfiguration& encryption_config)
CResult[shared_ptr[CFileDecryptionProperties]] \
SafeGetFileDecryptionProperties(
const CKmsConnectionConfig& kms_connection_config,
const CDecryptionConfiguration& decryption_config)
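# Illustrative sketch: the vtables declared above are filled with cdef
# callbacks matching the ctypedef signatures and then handed to the C++ shims;
# the same pattern appears in the accompanying .pyx module:
#
#   cdef CPyKmsClientVtable vtable = CPyKmsClientVtable()
#   vtable.wrap_key = _cb_wrap_key        # cdef function matching CallbackWrapKey
#   vtable.unwrap_key = _cb_unwrap_key    # cdef function matching CallbackUnwrapKey
#   client.reset(new CPyKmsClient(handler, vtable))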

View File

@ -0,0 +1,475 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
from datetime import timedelta
import io
import warnings
from libcpp cimport nullptr
from cython.operator cimport dereference as deref
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport _Weakrefable
from pyarrow.lib import (ArrowException,
tobytes, frombytes)
cimport cpython as cp
cdef ParquetCipher cipher_from_name(name):
name = name.upper()
if name == 'AES_GCM_V1':
return ParquetCipher_AES_GCM_V1
elif name == 'AES_GCM_CTR_V1':
return ParquetCipher_AES_GCM_CTR_V1
else:
raise ValueError(f'Invalid cipher name: {name!r}')
cdef cipher_to_name(ParquetCipher cipher):
if ParquetCipher_AES_GCM_V1 == cipher:
return 'AES_GCM_V1'
elif ParquetCipher_AES_GCM_CTR_V1 == cipher:
return 'AES_GCM_CTR_V1'
else:
raise ValueError('Invalid cipher value: {0}'.format(cipher))
cdef class EncryptionConfiguration(_Weakrefable):
"""Configuration of the encryption, such as which columns to encrypt"""
cdef:
shared_ptr[CEncryptionConfiguration] configuration
# Avoid mistakenly creating attributes
__slots__ = ()
def __init__(self, footer_key, *, column_keys=None,
encryption_algorithm=None,
plaintext_footer=None, double_wrapping=None,
cache_lifetime=None, internal_key_material=None,
data_key_length_bits=None):
self.configuration.reset(
new CEncryptionConfiguration(tobytes(footer_key)))
if column_keys is not None:
self.column_keys = column_keys
if encryption_algorithm is not None:
self.encryption_algorithm = encryption_algorithm
if plaintext_footer is not None:
self.plaintext_footer = plaintext_footer
if double_wrapping is not None:
self.double_wrapping = double_wrapping
if cache_lifetime is not None:
self.cache_lifetime = cache_lifetime
if internal_key_material is not None:
self.internal_key_material = internal_key_material
if data_key_length_bits is not None:
self.data_key_length_bits = data_key_length_bits
@property
def footer_key(self):
"""ID of the master key for footer encryption/signing"""
return frombytes(self.configuration.get().footer_key)
@property
def column_keys(self):
"""
List of columns to encrypt, with master key IDs.
"""
column_keys_str = frombytes(self.configuration.get().column_keys)
# Convert from "masterKeyID:colName,colName;masterKeyID:colName..."
# (see HIVE-21848) to dictionary of master key ID to column name lists
column_keys_to_key_list_str = dict(subString.replace(" ", "").split(
":") for subString in column_keys_str.split(";"))
column_keys_dict = {k: v.split(
",") for k, v in column_keys_to_key_list_str.items()}
return column_keys_dict
@column_keys.setter
def column_keys(self, dict value):
if value is not None:
# convert a dictionary such as
# '{"key1": ["col1 ", "col2"], "key2": ["col3 ", "col4"]}''
# to the string defined by the spec
# 'key1: col1 , col2; key2: col3 , col4'
column_keys = "; ".join(
["{}: {}".format(k, ", ".join(v)) for k, v in value.items()])
self.configuration.get().column_keys = tobytes(column_keys)
@property
def encryption_algorithm(self):
"""Parquet encryption algorithm.
Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1"."""
return cipher_to_name(self.configuration.get().encryption_algorithm)
@encryption_algorithm.setter
def encryption_algorithm(self, value):
cipher = cipher_from_name(value)
self.configuration.get().encryption_algorithm = cipher
@property
def plaintext_footer(self):
"""Write files with plaintext footer."""
return self.configuration.get().plaintext_footer
@plaintext_footer.setter
def plaintext_footer(self, value):
self.configuration.get().plaintext_footer = value
@property
def double_wrapping(self):
"""Use double wrapping - where data encryption keys (DEKs) are
encrypted with key encryption keys (KEKs), which in turn are
encrypted with master keys.
If set to false, use single wrapping - where DEKs are
encrypted directly with master keys."""
return self.configuration.get().double_wrapping
@double_wrapping.setter
def double_wrapping(self, value):
self.configuration.get().double_wrapping = value
@property
def cache_lifetime(self):
"""Lifetime of cached entities (key encryption keys,
local wrapping keys, KMS client objects)."""
return timedelta(
seconds=self.configuration.get().cache_lifetime_seconds)
@cache_lifetime.setter
def cache_lifetime(self, value):
if not isinstance(value, timedelta):
raise TypeError("cache_lifetime should be a timedelta")
self.configuration.get().cache_lifetime_seconds = value.total_seconds()
@property
def internal_key_material(self):
"""Store key material inside Parquet file footers; this mode doesnt
produce additional files. If set to false, key material is stored in
separate files in the same folder, which enables key rotation for
immutable Parquet files."""
return self.configuration.get().internal_key_material
@internal_key_material.setter
def internal_key_material(self, value):
self.configuration.get().internal_key_material = value
@property
def data_key_length_bits(self):
"""Length of data encryption keys (DEKs), randomly generated by parquet key
management tools. Can be 128, 192 or 256 bits."""
return self.configuration.get().data_key_length_bits
@data_key_length_bits.setter
def data_key_length_bits(self, value):
self.configuration.get().data_key_length_bits = value
cdef inline shared_ptr[CEncryptionConfiguration] unwrap(self) nogil:
return self.configuration
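# Example (illustrative; "kf" and "kc1" are placeholder master key IDs): the
# column_keys dict below round-trips through the spec string
# 'kc1: col1, col2' handled by the property above. timedelta is imported at
# the top of this module.
#
#   config = EncryptionConfiguration(
#       footer_key="kf",
#       column_keys={"kc1": ["col1", "col2"]},
#       encryption_algorithm="AES_GCM_V1",
#       cache_lifetime=timedelta(minutes=5),
#       data_key_length_bits=256)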
cdef class DecryptionConfiguration(_Weakrefable):
"""Configuration of the decryption, such as cache timeout."""
cdef:
shared_ptr[CDecryptionConfiguration] configuration
# Avoid mistakenly creating attributes
__slots__ = ()
def __init__(self, *, cache_lifetime=None):
    self.configuration.reset(new CDecryptionConfiguration())
    if cache_lifetime is not None:
        self.cache_lifetime = cache_lifetime
@property
def cache_lifetime(self):
"""Lifetime of cached entities (key encryption keys,
local wrapping keys, KMS client objects)."""
return timedelta(
seconds=self.configuration.get().cache_lifetime_seconds)
@cache_lifetime.setter
def cache_lifetime(self, value):
self.configuration.get().cache_lifetime_seconds = value.total_seconds()
cdef inline shared_ptr[CDecryptionConfiguration] unwrap(self) nogil:
return self.configuration
cdef class KmsConnectionConfig(_Weakrefable):
"""Configuration of the connection to the Key Management Service (KMS)"""
cdef:
shared_ptr[CKmsConnectionConfig] configuration
# Avoid mistakenly creating attributes
__slots__ = ()
def __init__(self, *, kms_instance_id=None, kms_instance_url=None,
key_access_token=None, custom_kms_conf=None):
self.configuration.reset(new CKmsConnectionConfig())
if kms_instance_id is not None:
self.kms_instance_id = kms_instance_id
if kms_instance_url is not None:
self.kms_instance_url = kms_instance_url
if key_access_token is None:
self.key_access_token = b'DEFAULT'
else:
self.key_access_token = key_access_token
if custom_kms_conf is not None:
self.custom_kms_conf = custom_kms_conf
@property
def kms_instance_id(self):
"""ID of the KMS instance that will be used for encryption
(if multiple KMS instances are available)."""
return frombytes(self.configuration.get().kms_instance_id)
@kms_instance_id.setter
def kms_instance_id(self, value):
self.configuration.get().kms_instance_id = tobytes(value)
@property
def kms_instance_url(self):
"""URL of the KMS instance."""
return frombytes(self.configuration.get().kms_instance_url)
@kms_instance_url.setter
def kms_instance_url(self, value):
self.configuration.get().kms_instance_url = tobytes(value)
@property
def key_access_token(self):
"""Authorization token that will be passed to KMS."""
return frombytes(self.configuration.get()
.refreshable_key_access_token.get().value())
@key_access_token.setter
def key_access_token(self, value):
self.refresh_key_access_token(value)
@property
def custom_kms_conf(self):
"""A dictionary with KMS-type-specific configuration"""
custom_kms_conf = {
frombytes(k): frombytes(v)
for k, v in self.configuration.get().custom_kms_conf
}
return custom_kms_conf
@custom_kms_conf.setter
def custom_kms_conf(self, dict value):
if value is not None:
for k, v in value.items():
if isinstance(k, str) and isinstance(v, str):
self.configuration.get().custom_kms_conf[tobytes(k)] = \
tobytes(v)
else:
raise TypeError("Expected custom_kms_conf to be " +
"a dictionary of strings")
def refresh_key_access_token(self, value):
cdef:
shared_ptr[CKeyAccessToken] c_key_access_token = \
self.configuration.get().refreshable_key_access_token
c_key_access_token.get().Refresh(tobytes(value))
cdef inline shared_ptr[CKmsConnectionConfig] unwrap(self) nogil:
return self.configuration
@staticmethod
cdef wrap(const CKmsConnectionConfig& config):
result = KmsConnectionConfig()
result.configuration = make_shared[CKmsConnectionConfig](move(config))
return result
# Callback definitions for CPyKmsClientVtable
cdef void _cb_wrap_key(
handler, const c_string& key_bytes,
const c_string& master_key_identifier, c_string* out) except *:
mkid_str = frombytes(master_key_identifier)
wrapped_key = handler.wrap_key(key_bytes, mkid_str)
out[0] = tobytes(wrapped_key)
cdef void _cb_unwrap_key(
handler, const c_string& wrapped_key,
const c_string& master_key_identifier, c_string* out) except *:
mkid_str = frombytes(master_key_identifier)
wk_str = frombytes(wrapped_key)
key = handler.unwrap_key(wk_str, mkid_str)
out[0] = tobytes(key)
cdef class KmsClient(_Weakrefable):
"""The abstract base class for KmsClient implementations."""
cdef:
shared_ptr[CKmsClient] client
def __init__(self):
self.init()
cdef init(self):
cdef:
CPyKmsClientVtable vtable = CPyKmsClientVtable()
vtable.wrap_key = _cb_wrap_key
vtable.unwrap_key = _cb_unwrap_key
self.client.reset(new CPyKmsClient(self, vtable))
def wrap_key(self, key_bytes, master_key_identifier):
"""Wrap a key - encrypt it with the master key."""
raise NotImplementedError()
def unwrap_key(self, wrapped_key, master_key_identifier):
"""Unwrap a key - decrypt it with the master key."""
raise NotImplementedError()
cdef inline shared_ptr[CKmsClient] unwrap(self) nogil:
return self.client
# Callback definition for CPyKmsClientFactoryVtable
cdef void _cb_create_kms_client(
handler,
const CKmsConnectionConfig& kms_connection_config,
shared_ptr[CKmsClient]* out) except *:
connection_config = KmsConnectionConfig.wrap(kms_connection_config)
result = handler(connection_config)
if not isinstance(result, KmsClient):
raise TypeError(
"callable must return KmsClient instances, but got {}".format(
type(result)))
out[0] = (<KmsClient> result).unwrap()
cdef class CryptoFactory(_Weakrefable):
""" A factory that produces the low-level FileEncryptionProperties and
FileDecryptionProperties objects, from the high-level parameters."""
cdef:
unique_ptr[CPyCryptoFactory] factory
# Avoid mistakenly creating attributes
__slots__ = ()
def __init__(self, kms_client_factory):
"""Create CryptoFactory.
Parameters
----------
kms_client_factory : a callable that accepts KmsConnectionConfig
and returns a KmsClient
"""
self.factory.reset(new CPyCryptoFactory())
if callable(kms_client_factory):
self.init(kms_client_factory)
else:
raise TypeError("Parameter kms_client_factory must be a callable")
cdef init(self, callable_client_factory):
cdef:
CPyKmsClientFactoryVtable vtable
shared_ptr[CPyKmsClientFactory] kms_client_factory
vtable.create_kms_client = _cb_create_kms_client
kms_client_factory.reset(
new CPyKmsClientFactory(callable_client_factory, vtable))
# A KmsClientFactory object must be registered
# via this method before calling any of
# file_encryption_properties()/file_decryption_properties() methods.
self.factory.get().RegisterKmsClientFactory(
static_pointer_cast[CKmsClientFactory, CPyKmsClientFactory](
kms_client_factory))
def file_encryption_properties(self,
KmsConnectionConfig kms_connection_config,
EncryptionConfiguration encryption_config):
"""Create file encryption properties.
Parameters
----------
kms_connection_config : KmsConnectionConfig
Configuration of connection to KMS
encryption_config : EncryptionConfiguration
Configuration of the encryption, such as which columns to encrypt
Returns
-------
file_encryption_properties : FileEncryptionProperties
File encryption properties.
"""
cdef:
CResult[shared_ptr[CFileEncryptionProperties]] \
file_encryption_properties_result
with nogil:
file_encryption_properties_result = \
self.factory.get().SafeGetFileEncryptionProperties(
deref(kms_connection_config.unwrap().get()),
deref(encryption_config.unwrap().get()))
file_encryption_properties = GetResultValue(
file_encryption_properties_result)
return FileEncryptionProperties.wrap(file_encryption_properties)
def file_decryption_properties(
self,
KmsConnectionConfig kms_connection_config,
DecryptionConfiguration decryption_config=None):
"""Create file decryption properties.
Parameters
----------
kms_connection_config : KmsConnectionConfig
Configuration of connection to KMS
decryption_config : DecryptionConfiguration, default None
Configuration of the decryption, such as cache timeout.
Can be None.
Returns
-------
file_decryption_properties : FileDecryptionProperties
File decryption properties.
"""
cdef:
CDecryptionConfiguration c_decryption_config
CResult[shared_ptr[CFileDecryptionProperties]] \
c_file_decryption_properties
if decryption_config is None:
c_decryption_config = CDecryptionConfiguration()
else:
c_decryption_config = deref(decryption_config.unwrap().get())
with nogil:
c_file_decryption_properties = \
self.factory.get().SafeGetFileDecryptionProperties(
deref(kms_connection_config.unwrap().get()),
c_decryption_config)
file_decryption_properties = GetResultValue(
c_file_decryption_properties)
return FileDecryptionProperties.wrap(file_decryption_properties)
def remove_cache_entries_for_token(self, access_token):
self.factory.get().RemoveCacheEntriesForToken(tobytes(access_token))
def remove_cache_entries_for_all_tokens(self):
self.factory.get().RemoveCacheEntriesForAllTokens()
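# Illustrative sketch (hypothetical names; a real deployment would talk to an
# actual KMS service): a toy KmsClient subclass wired into the CryptoFactory
# defined above. Key "wrapping" here is plain base64, which is NOT secure.
#
#   import base64
#
#   class DemoKmsClient(KmsClient):
#       def wrap_key(self, key_bytes, master_key_identifier):
#           return base64.b64encode(key_bytes)
#       def unwrap_key(self, wrapped_key, master_key_identifier):
#           return base64.b64decode(wrapped_key)
#
#   factory = CryptoFactory(lambda kms_connection_config: DemoKmsClient())
#   encryption_properties = factory.file_encryption_properties(
#       KmsConnectionConfig(), EncryptionConfiguration(footer_key="kf"))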

View File

@ -0,0 +1,875 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from libcpp cimport bool as c_bool, nullptr
from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector as c_vector
from libcpp.unordered_map cimport unordered_map
from libc.stdint cimport int64_t, uint8_t, uintptr_t
from cython.operator cimport dereference as deref, preincrement as inc
from cpython.pycapsule cimport *
from collections.abc import Sequence
import random
import socket
import warnings
import pyarrow
from pyarrow.lib cimport (Buffer, NativeFile, _Weakrefable,
check_status, pyarrow_wrap_buffer)
from pyarrow.lib import ArrowException, frombytes
from pyarrow.includes.libarrow cimport (CBuffer, CMutableBuffer,
CFixedSizeBufferWriter, CStatus)
from pyarrow.includes.libplasma cimport *
PLASMA_WAIT_TIMEOUT = 2 ** 30
cdef extern from "plasma/common.h" nogil:
cdef cppclass CCudaIpcPlaceholder" plasma::internal::CudaIpcPlaceholder":
pass
cdef cppclass CUniqueID" plasma::UniqueID":
@staticmethod
CUniqueID from_binary(const c_string& binary)
@staticmethod
CUniqueID from_random()
c_bool operator==(const CUniqueID& rhs) const
c_string hex() const
c_string binary() const
@staticmethod
int64_t size()
cdef enum CObjectState" plasma::ObjectState":
PLASMA_CREATED" plasma::ObjectState::PLASMA_CREATED"
PLASMA_SEALED" plasma::ObjectState::PLASMA_SEALED"
cdef struct CObjectTableEntry" plasma::ObjectTableEntry":
int fd
int device_num
int64_t map_size
ptrdiff_t offset
uint8_t* pointer
int64_t data_size
int64_t metadata_size
int ref_count
int64_t create_time
int64_t construct_duration
CObjectState state
shared_ptr[CCudaIpcPlaceholder] ipc_handle
ctypedef unordered_map[CUniqueID, unique_ptr[CObjectTableEntry]] \
CObjectTable" plasma::ObjectTable"
cdef extern from "plasma/common.h":
cdef int64_t kDigestSize" plasma::kDigestSize"
cdef extern from "plasma/client.h" nogil:
cdef cppclass CPlasmaClient" plasma::PlasmaClient":
CPlasmaClient()
CStatus Connect(const c_string& store_socket_name,
const c_string& manager_socket_name,
int release_delay, int num_retries)
CStatus Create(const CUniqueID& object_id,
int64_t data_size, const uint8_t* metadata, int64_t
metadata_size, const shared_ptr[CBuffer]* data)
CStatus CreateAndSeal(const CUniqueID& object_id,
const c_string& data, const c_string& metadata)
CStatus Get(const c_vector[CUniqueID] object_ids, int64_t timeout_ms,
c_vector[CObjectBuffer]* object_buffers)
CStatus Seal(const CUniqueID& object_id)
CStatus Evict(int64_t num_bytes, int64_t& num_bytes_evicted)
CStatus Hash(const CUniqueID& object_id, uint8_t* digest)
CStatus Release(const CUniqueID& object_id)
CStatus Contains(const CUniqueID& object_id, c_bool* has_object)
CStatus List(CObjectTable* objects)
CStatus Subscribe(int* fd)
CStatus DecodeNotifications(const uint8_t* buffer,
c_vector[CUniqueID]* object_ids,
c_vector[int64_t]* data_sizes,
c_vector[int64_t]* metadata_sizes)
CStatus GetNotification(int fd, CUniqueID* object_id,
int64_t* data_size, int64_t* metadata_size)
CStatus Disconnect()
CStatus Delete(const c_vector[CUniqueID] object_ids)
CStatus SetClientOptions(const c_string& client_name,
int64_t limit_output_memory)
c_string DebugString()
int64_t store_capacity()
cdef extern from "plasma/client.h" nogil:
cdef struct CObjectBuffer" plasma::ObjectBuffer":
shared_ptr[CBuffer] data
shared_ptr[CBuffer] metadata
def make_object_id(object_id):
return ObjectID(object_id)
cdef class ObjectID(_Weakrefable):
"""
An ObjectID represents a string of bytes used to identify Plasma objects.
"""
cdef:
CUniqueID data
def __cinit__(self, object_id):
if (not isinstance(object_id, bytes) or
len(object_id) != CUniqueID.size()):
raise ValueError("Object ID must by 20 bytes,"
" is " + str(object_id))
self.data = CUniqueID.from_binary(object_id)
def __eq__(self, other):
try:
return self.data == (<ObjectID?>other).data
except TypeError:
return False
def __hash__(self):
return hash(self.data.binary())
def __repr__(self):
return "ObjectID(" + self.data.hex().decode() + ")"
def __reduce__(self):
return (make_object_id, (self.data.binary(),))
def binary(self):
"""
Return the binary representation of this ObjectID.
Returns
-------
bytes
Binary representation of the ObjectID.
"""
return self.data.binary()
@staticmethod
def from_random():
"""
Returns a randomly generated ObjectID.
Returns
-------
ObjectID
A randomly generated ObjectID.
"""
random_id = bytes(bytearray(
random.getrandbits(8) for _ in range(CUniqueID.size())))
return ObjectID(random_id)
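# Example (illustrative): ObjectIDs are fixed-size 20-byte identifiers.
#
#   oid = ObjectID(b"a" * 20)          # explicit 20-byte id
#   rid = ObjectID.from_random()       # random id
#   assert len(rid.binary()) == 20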
cdef class ObjectNotAvailable(_Weakrefable):
"""
Placeholder for an object that was not available within the given timeout.
"""
pass
cdef class PlasmaBuffer(Buffer):
"""
This is the type returned by calls to get with a PlasmaClient.
We define our own class instead of directly returning a buffer object so
that we can add a custom destructor which notifies Plasma that the object
is no longer being used, so the memory in the Plasma store backing the
object can potentially be freed.
Attributes
----------
object_id : ObjectID
The ID of the object in the buffer.
client : PlasmaClient
The PlasmaClient that we use to communicate with the store and manager.
"""
cdef:
ObjectID object_id
PlasmaClient client
@staticmethod
cdef PlasmaBuffer create(ObjectID object_id, PlasmaClient client,
const shared_ptr[CBuffer]& buffer):
cdef PlasmaBuffer self = PlasmaBuffer.__new__(PlasmaBuffer)
self.object_id = object_id
self.client = client
self.init(buffer)
return self
def __init__(self):
raise TypeError("Do not call PlasmaBuffer's constructor directly, use "
"`PlasmaClient.create` instead.")
def __dealloc__(self):
"""
Notify Plasma that the object is no longer needed.
If the plasma client has been shut down, then don't do anything.
"""
self.client._release(self.object_id)
class PlasmaObjectNotFound(ArrowException):
pass
class PlasmaStoreFull(ArrowException):
pass
class PlasmaObjectExists(ArrowException):
pass
cdef int plasma_check_status(const CStatus& status) nogil except -1:
if status.ok():
return 0
with gil:
message = frombytes(status.message())
if IsPlasmaObjectExists(status):
raise PlasmaObjectExists(message)
elif IsPlasmaObjectNotFound(status):
raise PlasmaObjectNotFound(message)
elif IsPlasmaStoreFull(status):
raise PlasmaStoreFull(message)
return check_status(status)
def get_socket_from_fd(fileno, family, type):
import socket
return socket.socket(fileno=fileno, family=family, type=type)
cdef class PlasmaClient(_Weakrefable):
"""
The PlasmaClient is used to interface with a plasma store and manager.
The PlasmaClient can ask the PlasmaStore to allocate a new buffer, seal a
buffer, and get a buffer. Buffers are referred to by object IDs, which are
strings.
"""
cdef:
shared_ptr[CPlasmaClient] client
int notification_fd
c_string store_socket_name
def __cinit__(self):
self.client.reset(new CPlasmaClient())
self.notification_fd = -1
self.store_socket_name = b""
cdef _get_object_buffers(self, object_ids, int64_t timeout_ms,
c_vector[CObjectBuffer]* result):
cdef:
c_vector[CUniqueID] ids
ObjectID object_id
for object_id in object_ids:
ids.push_back(object_id.data)
with nogil:
plasma_check_status(self.client.get().Get(ids, timeout_ms, result))
# XXX C++ API should instead expose some kind of CreateAuto()
cdef _make_mutable_plasma_buffer(self, ObjectID object_id, uint8_t* data,
int64_t size):
cdef shared_ptr[CBuffer] buffer
buffer.reset(new CMutableBuffer(data, size))
return PlasmaBuffer.create(object_id, self, buffer)
@property
def store_socket_name(self):
return self.store_socket_name.decode()
def create(self, ObjectID object_id, int64_t data_size,
c_string metadata=b""):
"""
Create a new buffer in the PlasmaStore for a particular object ID.
The returned buffer is mutable until ``seal()`` is called.
Parameters
----------
object_id : ObjectID
The object ID used to identify an object.
data_size : int
The size in bytes of the created buffer.
metadata : bytes
An optional string of bytes encoding whatever metadata the user
wishes to encode.
Returns
-------
buffer : Buffer
A mutable buffer in which to write the object data.
Raises
------
PlasmaObjectExists
This exception is raised if the object could not be created because
there already is an object with the same ID in the plasma store.
PlasmaStoreFull
This exception is raised if the object could
not be created because the plasma store is unable to evict
enough objects to create room for it.
"""
cdef shared_ptr[CBuffer] data
with nogil:
plasma_check_status(
self.client.get().Create(object_id.data, data_size,
<uint8_t*>(metadata.data()),
metadata.size(), &data))
return self._make_mutable_plasma_buffer(object_id,
data.get().mutable_data(),
data_size)
def create_and_seal(self, ObjectID object_id, c_string data,
c_string metadata=b""):
"""
Store a new object in the PlasmaStore for a particular object ID.
Parameters
----------
object_id : ObjectID
The object ID used to identify an object.
data : bytes
The object to store.
metadata : bytes
An optional string of bytes encoding whatever metadata the user
wishes to encode.
Raises
------
PlasmaObjectExists
This exception is raised if the object could not be created because
there already is an object with the same ID in the plasma store.
PlasmaStoreFull
    This exception is raised if the object could not be created
    because the plasma store is unable to evict enough objects to
    create room for it.
"""
with nogil:
plasma_check_status(
self.client.get().CreateAndSeal(object_id.data, data,
metadata))
def get_buffers(self, object_ids, timeout_ms=-1, with_meta=False):
"""
Return data buffers from the PlasmaStore based on the given object IDs.
If the object has not been sealed yet, this call will block. The
retrieved buffer is immutable.
Parameters
----------
object_ids : list
A list of ObjectIDs used to identify some objects.
timeout_ms : int
The number of milliseconds that the get call should block before
timing out and returning. Pass -1 if the call should block and 0
if the call should return immediately.
with_meta : bool
    If True, also return the metadata bytes alongside each data buffer.
Returns
-------
list
If with_meta=False, this is a list of PlasmaBuffers for the data
associated with the object_ids and None if the object was not
available. If with_meta=True, this is a list of (metadata bytes,
PlasmaBuffer) tuples.
"""
cdef c_vector[CObjectBuffer] object_buffers
self._get_object_buffers(object_ids, timeout_ms, &object_buffers)
result = []
for i in range(object_buffers.size()):
if object_buffers[i].data.get() != nullptr:
data = pyarrow_wrap_buffer(object_buffers[i].data)
else:
data = None
if not with_meta:
result.append(data)
else:
if object_buffers[i].metadata.get() != nullptr:
size = object_buffers[i].metadata.get().size()
metadata = object_buffers[i].metadata.get().data()[:size]
else:
metadata = None
result.append((metadata, data))
return result
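# Usage sketch (illustrative; assumes a client obtained from the module-level
# connect() helper defined later in this file, and a hypothetical socket path):
#
#   client = connect("/tmp/plasma")
#   oid = ObjectID.from_random()
#   payload = b"hello"
#   buf = client.create(oid, len(payload))
#   stream = pyarrow.FixedSizeBufferWriter(buf)
#   stream.write(payload)
#   client.seal(oid)
#   [result] = client.get_buffers([oid])
#   assert result.to_pybytes() == payload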
def get_metadata(self, object_ids, timeout_ms=-1):
"""
Return metadata buffers from the PlasmaStore based on the given object IDs.
If the object has not been sealed yet, this call will block. The
retrieved buffer is immutable.
Parameters
----------
object_ids : list
A list of ObjectIDs used to identify some objects.
timeout_ms : int
The number of milliseconds that the get call should block before
timing out and returning. Pass -1 if the call should block and 0
if the call should return immediately.
Returns
-------
list
List of PlasmaBuffers for the metadata associated with the
object_ids and None if the object was not available.
"""
cdef c_vector[CObjectBuffer] object_buffers
self._get_object_buffers(object_ids, timeout_ms, &object_buffers)
result = []
for i in range(object_buffers.size()):
if object_buffers[i].metadata.get() != nullptr:
result.append(pyarrow_wrap_buffer(object_buffers[i].metadata))
else:
result.append(None)
return result
def put_raw_buffer(self, object value, ObjectID object_id=None,
c_string metadata=b"", int memcopy_threads=6):
"""
Store Python buffer into the object store.
Parameters
----------
value : Python object that implements the buffer protocol
A Python buffer object to store.
object_id : ObjectID, default None
If this is provided, the specified object ID will be used to refer
to the object.
metadata : bytes
An optional string of bytes encoding whatever metadata the user
wishes to encode.
memcopy_threads : int, default 6
The number of threads to use to write the serialized object into
the object store for large objects.
Returns
-------
ObjectID
The object ID associated with the Python buffer object.
"""
cdef ObjectID target_id = (object_id if object_id
else ObjectID.from_random())
cdef Buffer arrow_buffer = pyarrow.py_buffer(value)
write_buffer = self.create(target_id, len(value), metadata)
stream = pyarrow.FixedSizeBufferWriter(write_buffer)
stream.set_memcopy_threads(memcopy_threads)
stream.write(arrow_buffer)
self.seal(target_id)
return target_id
def put(self, object value, ObjectID object_id=None, int memcopy_threads=6,
serialization_context=None):
"""
Store a Python value into the object store.
Parameters
----------
value : object
A Python object to store.
object_id : ObjectID, default None
If this is provided, the specified object ID will be used to refer
to the object.
memcopy_threads : int, default 6
The number of threads to use to write the serialized object into
the object store for large objects.
serialization_context : pyarrow.SerializationContext, default None
Custom serialization and deserialization context.
Returns
-------
ObjectID
The object ID associated with the Python object.
"""
cdef ObjectID target_id = (object_id if object_id
else ObjectID.from_random())
if serialization_context is not None:
warnings.warn(
"'serialization_context' is deprecated and will be removed "
"in a future version.",
FutureWarning, stacklevel=2
)
serialized = pyarrow.lib._serialize(value, serialization_context)
buffer = self.create(target_id, serialized.total_bytes)
stream = pyarrow.FixedSizeBufferWriter(buffer)
stream.set_memcopy_threads(memcopy_threads)
serialized.write_to(stream)
self.seal(target_id)
return target_id
def get(self, object_ids, int timeout_ms=-1, serialization_context=None):
"""
Get one or more Python values from the object store.
Parameters
----------
object_ids : list or ObjectID
Object ID or list of object IDs associated with the values we get
from the store.
timeout_ms : int, default -1
The number of milliseconds that the get call should block before
timing out and returning. Pass -1 if the call should block and 0
if the call should return immediately.
serialization_context : pyarrow.SerializationContext, default None
Custom serialization and deserialization context.
Returns
-------
list or object
Python value or list of Python values for the data associated with
the object_ids and ObjectNotAvailable if the object was not
available.
"""
if serialization_context is not None:
warnings.warn(
"'serialization_context' is deprecated and will be removed "
"in a future version.",
FutureWarning, stacklevel=2
)
if isinstance(object_ids, Sequence):
results = []
buffers = self.get_buffers(object_ids, timeout_ms)
for i in range(len(object_ids)):
# buffers[i] is None if this object was not available within
# the timeout
if buffers[i]:
val = pyarrow.lib._deserialize(buffers[i],
serialization_context)
results.append(val)
else:
results.append(ObjectNotAvailable)
return results
else:
return self.get([object_ids], timeout_ms, serialization_context)[0]
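# Convenience round trip (illustrative) using put()/get(), which rely on the
# deprecated pyarrow serialization layer mentioned above:
#
#   oid = client.put({"x": 1, "y": [1, 2, 3]})
#   assert client.get(oid) == {"x": 1, "y": [1, 2, 3]}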
def seal(self, ObjectID object_id):
"""
Seal the buffer in the PlasmaStore for a particular object ID.
Once a buffer has been sealed, the buffer is immutable and can only be
accessed through get.
Parameters
----------
object_id : ObjectID
A string used to identify an object.
"""
with nogil:
plasma_check_status(self.client.get().Seal(object_id.data))
def _release(self, ObjectID object_id):
"""
Notify Plasma that the object is no longer needed.
Parameters
----------
object_id : ObjectID
A string used to identify an object.
"""
with nogil:
plasma_check_status(self.client.get().Release(object_id.data))
def contains(self, ObjectID object_id):
"""
Check if the object is present and sealed in the PlasmaStore.
Parameters
----------
object_id : ObjectID
A string used to identify an object.
"""
cdef c_bool is_contained
with nogil:
plasma_check_status(self.client.get().Contains(object_id.data,
&is_contained))
return is_contained
def hash(self, ObjectID object_id):
"""
Compute the checksum of an object in the object store.
Parameters
----------
object_id : ObjectID
A string used to identify an object.
Returns
-------
bytes
A digest string of the object's hash. If the object isn't in the object
store, the string will have length zero.
"""
cdef c_vector[uint8_t] digest = c_vector[uint8_t](kDigestSize)
with nogil:
plasma_check_status(self.client.get().Hash(object_id.data,
digest.data()))
return bytes(digest[:])
def evict(self, int64_t num_bytes):
"""
Evict some objects to recover some bytes.
Recover at least num_bytes bytes if possible.
Parameters
----------
num_bytes : int
The number of bytes to attempt to recover.
"""
cdef int64_t num_bytes_evicted = -1
with nogil:
plasma_check_status(
self.client.get().Evict(num_bytes, num_bytes_evicted))
return num_bytes_evicted
def subscribe(self):
"""Subscribe to notifications about sealed objects."""
with nogil:
plasma_check_status(
self.client.get().Subscribe(&self.notification_fd))
def get_notification_socket(self):
"""
Get the notification socket.
"""
return get_socket_from_fd(self.notification_fd,
family=socket.AF_UNIX,
type=socket.SOCK_STREAM)
def decode_notifications(self, const uint8_t* buf):
"""
Get the notification from the buffer.
Returns
-------
[ObjectID]
The list of object IDs in the notification message.
c_vector[int64_t]
The data sizes of the objects in the notification message.
c_vector[int64_t]
The metadata sizes of the objects in the notification message.
"""
cdef c_vector[CUniqueID] ids
cdef c_vector[int64_t] data_sizes
cdef c_vector[int64_t] metadata_sizes
with nogil:
status = self.client.get().DecodeNotifications(buf,
&ids,
&data_sizes,
&metadata_sizes)
plasma_check_status(status)
object_ids = []
for object_id in ids:
object_ids.append(ObjectID(object_id.binary()))
return object_ids, data_sizes, metadata_sizes
def get_next_notification(self):
"""
Get the next notification from the notification socket.
Returns
-------
ObjectID
The object ID of the object that was stored.
int
The data size of the object that was stored.
int
The metadata size of the object that was stored.
"""
cdef ObjectID object_id = ObjectID(CUniqueID.size() * b"\0")
cdef int64_t data_size
cdef int64_t metadata_size
with nogil:
status = self.client.get().GetNotification(self.notification_fd,
&object_id.data,
&data_size,
&metadata_size)
plasma_check_status(status)
return object_id, data_size, metadata_size
def to_capsule(self):
return PyCapsule_New(<void *>self.client.get(), "plasma", NULL)
def disconnect(self):
"""
Disconnect this client from the Plasma store.
"""
with nogil:
plasma_check_status(self.client.get().Disconnect())
def delete(self, object_ids):
"""
Delete the objects with the given IDs from the object store.
Parameters
----------
object_ids : list
A list of strings used to identify the objects.
"""
cdef c_vector[CUniqueID] ids
cdef ObjectID object_id
for object_id in object_ids:
ids.push_back(object_id.data)
with nogil:
plasma_check_status(self.client.get().Delete(ids))
def set_client_options(self, client_name, int64_t limit_output_memory):
cdef c_string name
name = client_name.encode()
with nogil:
plasma_check_status(
self.client.get().SetClientOptions(name, limit_output_memory))
def debug_string(self):
cdef c_string result
with nogil:
result = self.client.get().DebugString()
return result.decode()
def list(self):
"""
Experimental: List the objects in the store.
Returns
-------
dict
Dictionary from ObjectIDs to an "info" dictionary describing the
object. The "info" dictionary has the following entries:
data_size
size of the object in bytes
metadata_size
size of the object metadata in bytes
ref_count
Number of clients referencing the object buffer
create_time
Unix timestamp of the creation of the object
construct_duration
Time the creation of the object took in seconds
state
"created" if the object is still being created and
"sealed" if it is already sealed
"""
cdef CObjectTable objects
with nogil:
plasma_check_status(self.client.get().List(&objects))
result = dict()
cdef ObjectID object_id
cdef CObjectTableEntry entry
it = objects.begin()
while it != objects.end():
object_id = ObjectID(deref(it).first.binary())
entry = deref(deref(it).second)
if entry.state == CObjectState.PLASMA_CREATED:
state = "created"
else:
state = "sealed"
result[object_id] = {
"data_size": entry.data_size,
"metadata_size": entry.metadata_size,
"ref_count": entry.ref_count,
"create_time": entry.create_time,
"construct_duration": entry.construct_duration,
"state": state
}
inc(it)
return result
def store_capacity(self):
"""
Get the memory capacity of the store.
Returns
-------
int
The memory capacity of the store in bytes.
"""
return self.client.get().store_capacity()
def connect(store_socket_name, int num_retries=-1):
"""
Return a new PlasmaClient that is connected to a plasma store.
Parameters
----------
store_socket_name : str
Name of the socket the plasma store is listening at.
num_retries : int, default -1
Number of times to try to connect to plasma store. Default value of -1
uses the default (50)
"""
cdef PlasmaClient result = PlasmaClient()
cdef int deprecated_release_delay = 0
result.store_socket_name = store_socket_name.encode()
with nogil:
plasma_check_status(
result.client.get().Connect(result.store_socket_name, b"",
deprecated_release_delay, num_retries))
return result
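# A minimal end-to-end sketch (not executed here), assuming a Plasma store
# was started separately, e.g. ``plasma_store -m 1000000000 -s /tmp/plasma``;
# the socket path and the stored payload are illustrative only:
#
#   import pyarrow.plasma as plasma
#
#   client = plasma.connect("/tmp/plasma")
#   object_id = client.put({"answer": 42})          # serialize, write, seal
#   assert client.contains(object_id)
#   value = client.get(object_id, timeout_ms=1000)  # {'answer': 42}
#   print(client.list()[object_id]["state"])        # "sealed"
#   client.disconnect()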

View File

@ -0,0 +1,314 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata,
pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, KeyValueMetadata
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem
cpdef enum S3LogLevel:
Off = <int8_t> CS3LogLevel_Off
Fatal = <int8_t> CS3LogLevel_Fatal
Error = <int8_t> CS3LogLevel_Error
Warn = <int8_t> CS3LogLevel_Warn
Info = <int8_t> CS3LogLevel_Info
Debug = <int8_t> CS3LogLevel_Debug
Trace = <int8_t> CS3LogLevel_Trace
def initialize_s3(S3LogLevel log_level=S3LogLevel.Fatal):
"""
Initialize S3 support.
Parameters
----------
log_level : S3LogLevel
Level of logging.
"""
cdef CS3GlobalOptions options
options.log_level = <CS3LogLevel> log_level
check_status(CInitializeS3(options))
def finalize_s3():
check_status(CFinalizeS3())
def resolve_s3_region(bucket):
"""
Resolve the S3 region of a bucket.
Parameters
----------
bucket : str
An S3 bucket name
Returns
-------
region : str
An S3 region name
Examples
--------
>>> resolve_s3_region('ursa-labs-taxi-data')
'us-east-2'
"""
cdef:
c_string c_bucket
c_string c_region
c_bucket = tobytes(bucket)
with nogil:
c_region = GetResultValue(ResolveS3BucketRegion(c_bucket))
return frombytes(c_region)
cdef class S3FileSystem(FileSystem):
"""
S3-backed FileSystem implementation
If neither access_key nor secret_key is provided, and role_arn is also not
provided, the filesystem attempts to initialize from AWS environment
variables; otherwise both access_key and secret_key must be provided.
If role_arn is provided instead of access_key and secret_key, temporary
credentials will be fetched by issuing a request to STS to assume the
specified role.
Note: S3 buckets are special and the operations available on them may be
limited or more expensive than desired.
Parameters
----------
access_key : str, default None
AWS Access Key ID. Pass None to use the standard AWS environment
variables and/or configuration file.
secret_key : str, default None
AWS Secret Access key. Pass None to use the standard AWS environment
variables and/or configuration file.
session_token : str, default None
AWS Session Token. An optional session token, required if access_key
and secret_key are temporary credentials from STS.
anonymous : boolean, default False
Whether to connect anonymously if access_key and secret_key are None.
If true, will not attempt to look up credentials using standard AWS
configuration methods.
role_arn : str, default None
AWS Role ARN. If provided instead of access_key and secret_key,
temporary credentials will be fetched by assuming this role.
session_name : str, default None
An optional identifier for the assumed role session.
external_id : str, default None
An optional unique identifier that might be required when you assume
a role in another account.
load_frequency : int, default 900
The frequency (in seconds) with which temporary credentials from an
assumed role session will be refreshed.
region : str, default 'us-east-1'
AWS region to connect to.
scheme : str, default 'https'
S3 connection transport scheme.
endpoint_override : str, default None
Override region with a connect string such as "localhost:9000"
background_writes : boolean, default True
Whether file writes will be issued in the background, without
blocking.
default_metadata : mapping or pyarrow.KeyValueMetadata, default None
Default metadata for open_output_stream. This will be ignored if
non-empty metadata is passed to open_output_stream.
proxy_options : dict or str, default None
If a proxy is used, provide the options here. Supported options are:
'scheme' (str: 'http' or 'https'; required), 'host' (str; required),
'port' (int; required), 'username' (str; optional),
'password' (str; optional).
A proxy URI (str) can also be provided, in which case these options
will be derived from the provided URI.
The following are equivalent::
S3FileSystem(proxy_options='http://username:password@localhost:8020')
S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost',
'port': 8020, 'username': 'username',
'password': 'password'})
"""
cdef:
CS3FileSystem* s3fs
def __init__(self, *, access_key=None, secret_key=None, session_token=None,
bint anonymous=False, region=None, scheme=None,
endpoint_override=None, bint background_writes=True,
default_metadata=None, role_arn=None, session_name=None,
external_id=None, load_frequency=900, proxy_options=None):
cdef:
CS3Options options
shared_ptr[CS3FileSystem] wrapped
if access_key is not None and secret_key is None:
raise ValueError(
'In order to initialize with explicit credentials both '
'access_key and secret_key must be provided, '
'`secret_key` is not set.'
)
elif access_key is None and secret_key is not None:
raise ValueError(
'In order to initialize with explicit credentials both '
'access_key and secret_key must be provided, '
'`access_key` is not set.'
)
elif session_token is not None and (access_key is None or
secret_key is None):
raise ValueError(
'In order to initialize a session with temporary credentials, '
'both secret_key and access_key must be provided in addition '
'to session_token.'
)
elif (access_key is not None or secret_key is not None):
if anonymous:
raise ValueError(
'Cannot pass anonymous=True together with access_key '
'and secret_key.')
if role_arn:
raise ValueError(
'Cannot provide role_arn with access_key and secret_key')
if session_token is None:
session_token = ""
options = CS3Options.FromAccessKey(
tobytes(access_key),
tobytes(secret_key),
tobytes(session_token)
)
elif anonymous:
if role_arn:
raise ValueError(
'Cannot provide role_arn with anonymous=True')
options = CS3Options.Anonymous()
elif role_arn:
options = CS3Options.FromAssumeRole(
tobytes(role_arn),
tobytes(session_name),
tobytes(external_id),
load_frequency
)
else:
options = CS3Options.Defaults()
if region is not None:
options.region = tobytes(region)
if scheme is not None:
options.scheme = tobytes(scheme)
if endpoint_override is not None:
options.endpoint_override = tobytes(endpoint_override)
if background_writes is not None:
options.background_writes = background_writes
if default_metadata is not None:
if not isinstance(default_metadata, KeyValueMetadata):
default_metadata = KeyValueMetadata(default_metadata)
options.default_metadata = pyarrow_unwrap_metadata(
default_metadata)
if proxy_options is not None:
if isinstance(proxy_options, dict):
options.proxy_options.scheme = tobytes(proxy_options["scheme"])
options.proxy_options.host = tobytes(proxy_options["host"])
options.proxy_options.port = proxy_options["port"]
proxy_username = proxy_options.get("username", None)
if proxy_username:
options.proxy_options.username = tobytes(proxy_username)
proxy_password = proxy_options.get("password", None)
if proxy_password:
options.proxy_options.password = tobytes(proxy_password)
elif isinstance(proxy_options, str):
options.proxy_options = GetResultValue(
CS3ProxyOptions.FromUriString(tobytes(proxy_options)))
else:
raise TypeError(
"'proxy_options': expected 'dict' or 'str', "
f"got {type(proxy_options)} instead.")
with nogil:
wrapped = GetResultValue(CS3FileSystem.Make(options))
self.init(<shared_ptr[CFileSystem]> wrapped)
cdef init(self, const shared_ptr[CFileSystem]& wrapped):
FileSystem.init(self, wrapped)
self.s3fs = <CS3FileSystem*> wrapped.get()
@classmethod
def _reconstruct(cls, kwargs):
return cls(**kwargs)
def __reduce__(self):
cdef CS3Options opts = self.s3fs.options()
# if creds were explicitly provided, then use them
# else obtain them as they were last time.
if opts.credentials_kind == CS3CredentialsKind_Explicit:
access_key = frombytes(opts.GetAccessKey())
secret_key = frombytes(opts.GetSecretKey())
session_token = frombytes(opts.GetSessionToken())
else:
access_key = None
secret_key = None
session_token = None
return (
S3FileSystem._reconstruct, (dict(
access_key=access_key,
secret_key=secret_key,
session_token=session_token,
anonymous=(opts.credentials_kind ==
CS3CredentialsKind_Anonymous),
region=frombytes(opts.region),
scheme=frombytes(opts.scheme),
endpoint_override=frombytes(opts.endpoint_override),
role_arn=frombytes(opts.role_arn),
session_name=frombytes(opts.session_name),
external_id=frombytes(opts.external_id),
load_frequency=opts.load_frequency,
background_writes=opts.background_writes,
default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
proxy_options={'scheme': frombytes(opts.proxy_options.scheme),
'host': frombytes(opts.proxy_options.host),
'port': opts.proxy_options.port,
'username': frombytes(
opts.proxy_options.username),
'password': frombytes(
opts.proxy_options.password)}
),)
)
@property
def region(self):
"""
The AWS region this filesystem connects to.
"""
return frombytes(self.s3fs.region())
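# A short usage sketch (not executed here); the public ``ursa-labs-taxi-data``
# bucket and its ``us-east-2`` region are taken from the resolve_s3_region
# example above, everything else is illustrative:
#
#   from pyarrow.fs import S3FileSystem, FileSelector
#
#   fs = S3FileSystem(anonymous=True, region="us-east-2")
#   infos = fs.get_file_info(FileSelector("ursa-labs-taxi-data"))
#   for info in infos[:3]:
#       print(info.path, info.size)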

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
def benchmark_PandasObjectIsNull(list obj):
Benchmark_PandasObjectIsNull(obj)

View File

@ -0,0 +1,21 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# flake8: noqa
from pyarrow.lib import benchmark_PandasObjectIsNull

View File

@ -0,0 +1,82 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
cdef class StringBuilder(_Weakrefable):
"""
Builder class for UTF8 strings.
This class exposes facilities for incrementally adding string values and
building the null bitmap for a pyarrow.Array (type='string').
"""
cdef:
unique_ptr[CStringBuilder] builder
def __cinit__(self, MemoryPool memory_pool=None):
cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
self.builder.reset(new CStringBuilder(pool))
def append(self, value):
"""
Append a single value to the builder.
The value can either be a string/bytes object or a null value
(np.nan or None).
Parameters
----------
value : string/bytes or np.nan/None
The value to append to the string array builder.
"""
if value is None or value is np.nan:
self.builder.get().AppendNull()
elif isinstance(value, (bytes, str)):
self.builder.get().Append(tobytes(value))
else:
raise TypeError('StringBuilder only accepts string objects')
def append_values(self, values):
"""
Append all the values from an iterable.
Parameters
----------
values : iterable of string/bytes or np.nan/None values
The values to append to the string array builder.
"""
for value in values:
self.append(value)
def finish(self):
"""
Return result of builder as an Array object; also resets the builder.
Returns
-------
array : pyarrow.Array
"""
cdef shared_ptr[CArray] out
with nogil:
self.builder.get().Finish(&out)
return pyarrow_wrap_array(out)
@property
def null_count(self):
return self.builder.get().null_count()
def __len__(self):
return self.builder.get().length()
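# A minimal sketch of the intended usage (not executed here); the class is
# exposed to Python as ``pyarrow.lib.StringBuilder``:
#
#   from pyarrow.lib import StringBuilder
#
#   builder = StringBuilder()
#   builder.append("foo")
#   builder.append_values(["bar", None, "baz"])
#   arr = builder.finish()      # StringArray of length 4
#   assert arr.null_count == 1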

View File

@ -0,0 +1,71 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import
import cffi
c_source = """
struct ArrowSchema {
// Array type description
const char* format;
const char* name;
const char* metadata;
int64_t flags;
int64_t n_children;
struct ArrowSchema** children;
struct ArrowSchema* dictionary;
// Release callback
void (*release)(struct ArrowSchema*);
// Opaque producer-specific data
void* private_data;
};
struct ArrowArray {
// Array data description
int64_t length;
int64_t null_count;
int64_t offset;
int64_t n_buffers;
int64_t n_children;
const void** buffers;
struct ArrowArray** children;
struct ArrowArray* dictionary;
// Release callback
void (*release)(struct ArrowArray*);
// Opaque producer-specific data
void* private_data;
};
struct ArrowArrayStream {
int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
const char* (*get_last_error)(struct ArrowArrayStream*);
// Release callback
void (*release)(struct ArrowArrayStream*);
// Opaque producer-specific data
void* private_data;
};
"""
# TODO use out-of-line mode for faster import and avoid C parsing
ffi = cffi.FFI()
ffi.cdef(c_source)
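# A sketch of moving an array across the C data interface with these
# declarations (not executed here); ``_export_to_c`` and ``_import_from_c``
# are private pyarrow hooks and may change without notice:
#
#   import pyarrow as pa
#   from pyarrow.cffi import ffi
#
#   c_schema = ffi.new("struct ArrowSchema*")
#   c_array = ffi.new("struct ArrowArray*")
#   schema_ptr = int(ffi.cast("uintptr_t", c_schema))
#   array_ptr = int(ffi.cast("uintptr_t", c_array))
#
#   arr = pa.array([1, 2, 3])
#   arr._export_to_c(array_ptr, schema_ptr)
#   roundtripped = pa.Array._import_from_c(array_ptr, schema_ptr)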

View File

@ -0,0 +1,77 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
def encode_file_path(path):
if isinstance(path, str):
# POSIX systems can handle utf-8. UTF8 is converted to utf16-le in
# libarrow
encoded_path = path.encode('utf-8')
else:
encoded_path = path
# Windows file system requires utf-16le for file names; Arrow C++ libraries
# will convert utf8 to utf16
return encoded_path
# Starting with Python 3.7, dicts are guaranteed to be insertion-ordered.
ordered_dict = dict
try:
import pickle5 as builtin_pickle
except ImportError:
import pickle as builtin_pickle
try:
import cloudpickle as pickle
except ImportError:
pickle = builtin_pickle
def tobytes(o):
"""
Encode a unicode or bytes string to bytes.
Parameters
----------
o : str or bytes
Input string.
"""
if isinstance(o, str):
return o.encode('utf8')
else:
return o
def frombytes(o, *, safe=False):
"""
Decode the given bytestring to unicode.
Parameters
----------
o : bytes-like
Input object.
safe : bool, default False
If true, replace undecodable bytes with a replacement character
instead of raising an error.
"""
if safe:
return o.decode('utf8', errors='replace')
else:
return o.decode('utf8')
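# For illustration (sketch, not executed here):
#
#   >>> tobytes("héllo")
#   b'h\xc3\xa9llo'
#   >>> frombytes(b'h\xc3\xa9llo')
#   'héllo'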

View File

@ -0,0 +1,29 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# flake8: noqa
import pyarrow.util as util
import warnings
warnings.warn("pyarrow.compat has been deprecated and will be removed in a "
"future release", FutureWarning)
guid = util._deprecate_api("compat.guid", "util.guid",
util.guid, "1.0.0")

View File

@ -0,0 +1,655 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from pyarrow._compute import ( # noqa
Function,
FunctionOptions,
FunctionRegistry,
HashAggregateFunction,
HashAggregateKernel,
Kernel,
ScalarAggregateFunction,
ScalarAggregateKernel,
ScalarFunction,
ScalarKernel,
VectorFunction,
VectorKernel,
# Option classes
ArraySortOptions,
AssumeTimezoneOptions,
CastOptions,
CountOptions,
DayOfWeekOptions,
DictionaryEncodeOptions,
ElementWiseAggregateOptions,
ExtractRegexOptions,
FilterOptions,
IndexOptions,
JoinOptions,
MakeStructOptions,
MapLookupOptions,
MatchSubstringOptions,
ModeOptions,
NullOptions,
PadOptions,
PartitionNthOptions,
QuantileOptions,
RandomOptions,
ReplaceSliceOptions,
ReplaceSubstringOptions,
RoundOptions,
RoundTemporalOptions,
RoundToMultipleOptions,
ScalarAggregateOptions,
SelectKOptions,
SetLookupOptions,
SliceOptions,
SortOptions,
SplitOptions,
SplitPatternOptions,
StrftimeOptions,
StrptimeOptions,
StructFieldOptions,
TakeOptions,
TDigestOptions,
TrimOptions,
Utf8NormalizeOptions,
VarianceOptions,
WeekOptions,
# Functions
call_function,
function_registry,
get_function,
list_functions,
_group_by,
# Expressions
Expression,
)
from collections import namedtuple
import inspect
from textwrap import dedent
import warnings
import pyarrow as pa
from pyarrow import _compute_docstrings
from pyarrow.vendored import docscrape
def _get_arg_names(func):
return func._doc.arg_names
_OptionsClassDoc = namedtuple('_OptionsClassDoc', ('params',))
def _scrape_options_class_doc(options_class):
if not options_class.__doc__:
return None
doc = docscrape.NumpyDocString(options_class.__doc__)
return _OptionsClassDoc(doc['Parameters'])
def _decorate_compute_function(wrapper, exposed_name, func, options_class):
# Decorate the given compute function wrapper with useful metadata
# and documentation.
cpp_doc = func._doc
wrapper.__arrow_compute_function__ = dict(
name=func.name,
arity=func.arity,
options_class=cpp_doc.options_class,
options_required=cpp_doc.options_required)
wrapper.__name__ = exposed_name
wrapper.__qualname__ = exposed_name
doc_pieces = []
# 1. One-line summary
summary = cpp_doc.summary
if not summary:
arg_str = "arguments" if func.arity > 1 else "argument"
summary = ("Call compute function {!r} with the given {}"
.format(func.name, arg_str))
doc_pieces.append(f"{summary}.\n\n")
# 2. Multi-line description
description = cpp_doc.description
if description:
doc_pieces.append(f"{description}\n\n")
doc_addition = _compute_docstrings.function_doc_additions.get(func.name)
# 3. Parameter description
doc_pieces.append(dedent("""\
Parameters
----------
"""))
# 3a. Compute function parameters
arg_names = _get_arg_names(func)
for arg_name in arg_names:
if func.kind in ('vector', 'scalar_aggregate'):
arg_type = 'Array-like'
else:
arg_type = 'Array-like or scalar-like'
doc_pieces.append(f"{arg_name} : {arg_type}\n")
doc_pieces.append(" Argument to compute function.\n")
# 3b. Compute function option values
if options_class is not None:
options_class_doc = _scrape_options_class_doc(options_class)
if options_class_doc:
for p in options_class_doc.params:
doc_pieces.append(f"{p.name} : {p.type}\n")
for s in p.desc:
doc_pieces.append(f" {s}\n")
else:
warnings.warn(f"Options class {options_class.__name__} "
f"does not have a docstring", RuntimeWarning)
options_sig = inspect.signature(options_class)
for p in options_sig.parameters.values():
doc_pieces.append(dedent("""\
{0} : optional
Parameter for {1} constructor. Either `options`
or `{0}` can be passed, but not both at the same time.
""".format(p.name, options_class.__name__)))
doc_pieces.append(dedent(f"""\
options : pyarrow.compute.{options_class.__name__}, optional
Alternative way of passing options.
"""))
doc_pieces.append(dedent("""\
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the default memory pool.
"""))
# 4. Custom addition (e.g. examples)
if doc_addition is not None:
doc_pieces.append("\n{}\n".format(dedent(doc_addition).strip("\n")))
wrapper.__doc__ = "".join(doc_pieces)
return wrapper
def _get_options_class(func):
class_name = func._doc.options_class
if not class_name:
return None
try:
return globals()[class_name]
except KeyError:
warnings.warn("Python binding for {} not exposed"
.format(class_name), RuntimeWarning)
return None
def _handle_options(name, options_class, options, args, kwargs):
if args or kwargs:
if options is not None:
raise TypeError(
"Function {!r} called with both an 'options' argument "
"and additional arguments"
.format(name))
return options_class(*args, **kwargs)
if options is not None:
if isinstance(options, dict):
return options_class(**options)
elif isinstance(options, options_class):
return options
raise TypeError(
"Function {!r} expected a {} parameter, got {}"
.format(name, options_class, type(options)))
return None
def _make_generic_wrapper(func_name, func, options_class, arity):
if options_class is None:
def wrapper(*args, memory_pool=None):
if arity is not Ellipsis and len(args) != arity:
raise TypeError(
f"{func_name} takes {arity} positional argument(s), "
f"but {len(args)} were given"
)
if args and isinstance(args[0], Expression):
return Expression._call(func_name, list(args))
return func.call(args, None, memory_pool)
else:
def wrapper(*args, memory_pool=None, options=None, **kwargs):
if arity is not Ellipsis:
if len(args) < arity:
raise TypeError(
f"{func_name} takes {arity} positional argument(s), "
f"but {len(args)} were given"
)
option_args = args[arity:]
args = args[:arity]
else:
option_args = ()
options = _handle_options(func_name, options_class, options,
option_args, kwargs)
if args and isinstance(args[0], Expression):
return Expression._call(func_name, list(args), options)
return func.call(args, options, memory_pool)
return wrapper
def _make_signature(arg_names, var_arg_names, options_class):
from inspect import Parameter
params = []
for name in arg_names:
params.append(Parameter(name, Parameter.POSITIONAL_ONLY))
for name in var_arg_names:
params.append(Parameter(name, Parameter.VAR_POSITIONAL))
if options_class is not None:
options_sig = inspect.signature(options_class)
for p in options_sig.parameters.values():
assert p.kind in (Parameter.POSITIONAL_OR_KEYWORD,
Parameter.KEYWORD_ONLY)
if var_arg_names:
# Cannot have a positional argument after a *args
p = p.replace(kind=Parameter.KEYWORD_ONLY)
params.append(p)
params.append(Parameter("options", Parameter.KEYWORD_ONLY,
default=None))
params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY,
default=None))
return inspect.Signature(params)
def _wrap_function(name, func):
options_class = _get_options_class(func)
arg_names = _get_arg_names(func)
has_vararg = arg_names and arg_names[-1].startswith('*')
if has_vararg:
var_arg_names = [arg_names.pop().lstrip('*')]
else:
var_arg_names = []
wrapper = _make_generic_wrapper(
name, func, options_class, arity=func.arity)
wrapper.__signature__ = _make_signature(arg_names, var_arg_names,
options_class)
return _decorate_compute_function(wrapper, name, func, options_class)
def _make_global_functions():
"""
Make global functions wrapping each compute function.
Note that some of the automatically-generated wrappers may be overridden
by custom versions below.
"""
g = globals()
reg = function_registry()
# Avoid clashes with Python keywords
rewrites = {'and': 'and_',
'or': 'or_'}
for cpp_name in reg.list_functions():
name = rewrites.get(cpp_name, cpp_name)
func = reg.get_function(cpp_name)
if func.kind == "hash_aggregate":
# Hash aggregate functions are not callable,
# so let's not expose them at module level.
continue
assert name not in g, name
g[cpp_name] = g[name] = _wrap_function(name, func)
_make_global_functions()
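# A sketch of what the generated wrappers accept (not executed here): option
# values may be passed as keyword arguments or bundled into an options object,
# but not both at once; ``round``/``RoundOptions`` are one such generated pair:
#
#   import pyarrow as pa
#   import pyarrow.compute as pc
#
#   arr = pa.array([1.234, 5.678])
#   pc.round(arr, ndigits=1)                           # keyword form
#   pc.round(arr, options=pc.RoundOptions(ndigits=1))  # options object
#   pc.round(arr, options={"ndigits": 1})              # dict form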
def cast(arr, target_type, safe=True):
"""
Cast array values to another data type. Can also be invoked as an array
instance method.
Parameters
----------
arr : Array-like
target_type : DataType or str
Type to cast to
safe : bool, default True
Check for overflows or other unsafe conversions
Examples
--------
>>> from datetime import datetime
>>> import pyarrow as pa
>>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
>>> arr.type
TimestampType(timestamp[us])
You can use ``pyarrow.DataType`` objects to specify the target type:
>>> cast(arr, pa.timestamp('ms'))
<pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
[
2010-01-01 00:00:00.000,
2015-01-01 00:00:00.000
]
>>> cast(arr, pa.timestamp('ms')).type
TimestampType(timestamp[ms])
Alternatively, it is also supported to use the string aliases for these
types:
>>> arr.cast('timestamp[ms]')
<pyarrow.lib.TimestampArray object at 0x10420eb88>
[
1262304000000,
1420070400000
]
>>> arr.cast('timestamp[ms]').type
TimestampType(timestamp[ms])
Returns
-------
casted : Array
"""
if target_type is None:
raise ValueError("Cast target type must not be None")
if safe:
options = CastOptions.safe(target_type)
else:
options = CastOptions.unsafe(target_type)
return call_function("cast", [arr], options)
def index(data, value, start=None, end=None, *, memory_pool=None):
"""
Find the index of the first occurrence of a given value.
Parameters
----------
data : Array-like
value : Scalar-like object
The value to search for.
start : int, optional
end : int, optional
memory_pool : MemoryPool, optional
If not passed, will allocate memory from the default memory pool.
Returns
-------
index : int
the index, or -1 if not found
"""
if start is not None:
if end is not None:
data = data.slice(start, end - start)
else:
data = data.slice(start)
elif end is not None:
data = data.slice(0, end)
if not isinstance(value, pa.Scalar):
value = pa.scalar(value, type=data.type)
elif data.type != value.type:
value = pa.scalar(value.as_py(), type=data.type)
options = IndexOptions(value=value)
result = call_function('index', [data], options, memory_pool)
if start is not None and result.as_py() >= 0:
result = pa.scalar(result.as_py() + start, type=pa.int64())
return result
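# For illustration, the start/end handling above keeps the returned index
# relative to the original array (sketch, not executed here):
#
#   >>> import pyarrow as pa
#   >>> import pyarrow.compute as pc
#   >>> arr = pa.array(["a", "b", "c", "b"])
#   >>> pc.index(arr, "b").as_py()
#   1
#   >>> pc.index(arr, "b", start=2).as_py()
#   3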
def take(data, indices, *, boundscheck=True, memory_pool=None):
"""
Select values (or records) from array- or table-like data given integer
selection indices.
The result will be of the same type(s) as the input, with elements taken
from the input array (or record batch / table fields) at the given
indices. If an index is null then the corresponding value in the output
will be null.
Parameters
----------
data : Array, ChunkedArray, RecordBatch, or Table
indices : Array, ChunkedArray
Must be of integer type
boundscheck : boolean, default True
Whether to boundscheck the indices. If False and there is an out of
bounds index, will likely cause the process to crash.
memory_pool : MemoryPool, optional
If not passed, will allocate memory from the default memory pool.
Returns
-------
result : depends on inputs
Examples
--------
>>> import pyarrow as pa
>>> arr = pa.array(["a", "b", "c", None, "e", "f"])
>>> indices = pa.array([0, None, 4, 3])
>>> arr.take(indices)
<pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
[
"a",
null,
"e",
null
]
"""
options = TakeOptions(boundscheck=boundscheck)
return call_function('take', [data, indices], options, memory_pool)
def fill_null(values, fill_value):
"""
Replace each null element in values with fill_value. The fill_value must be
the same type as values or able to be implicitly cast to the array's
type.
This is an alias for :func:`coalesce`.
Parameters
----------
values : Array, ChunkedArray, or Scalar-like object
Each null element is replaced with the corresponding value
from fill_value.
fill_value : Array, ChunkedArray, or Scalar-like object
If not same type as data will attempt to cast.
Returns
-------
result : depends on inputs
Examples
--------
>>> import pyarrow as pa
>>> arr = pa.array([1, 2, None, 3], type=pa.int8())
>>> fill_value = pa.scalar(5, type=pa.int8())
>>> arr.fill_null(fill_value)
<pyarrow.lib.Int8Array object at 0x7f95437f01a0>
[
1,
2,
5,
3
]
"""
if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
fill_value = pa.scalar(fill_value, type=values.type)
elif values.type != fill_value.type:
fill_value = pa.scalar(fill_value.as_py(), type=values.type)
return call_function("coalesce", [values, fill_value])
def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
"""
Select the indices of the top-k ordered elements from array- or table-like
data.
This is a specialization for :func:`select_k_unstable`. Output is not
guaranteed to be stable.
Parameters
----------
values : Array, ChunkedArray, RecordBatch, or Table
Data to sort and get top indices from.
k : int
The number of `k` elements to keep.
sort_keys : List-like
Column key names to order by when input is table-like data.
memory_pool : MemoryPool, optional
If not passed, will allocate memory from the default memory pool.
Returns
-------
result : Array of indices
Examples
--------
>>> import pyarrow as pa
>>> import pyarrow.compute as pc
>>> arr = pa.array(["a", "b", "c", None, "e", "f"])
>>> pc.top_k_unstable(arr, k=3)
<pyarrow.lib.UInt64Array object at 0x7fdcb19d7f30>
[
5,
4,
2
]
"""
if sort_keys is None:
sort_keys = []
if isinstance(values, (pa.Array, pa.ChunkedArray)):
sort_keys.append(("dummy", "descending"))
else:
sort_keys = map(lambda key_name: (key_name, "descending"), sort_keys)
options = SelectKOptions(k, sort_keys)
return call_function("select_k_unstable", [values], options, memory_pool)
def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
"""
Select the indices of the bottom-k ordered elements from
array- or table-like data.
This is a specialization for :func:`select_k_unstable`. Output is not
guaranteed to be stable.
Parameters
----------
values : Array, ChunkedArray, RecordBatch, or Table
Data to sort and get bottom indices from.
k : int
The number of `k` elements to keep.
sort_keys : List-like
Column key names to order by when input is table-like data.
memory_pool : MemoryPool, optional
If not passed, will allocate memory from the default memory pool.
Returns
-------
result : Array of indices
Examples
--------
>>> import pyarrow as pa
>>> import pyarrow.compute as pc
>>> arr = pa.array(["a", "b", "c", None, "e", "f"])
>>> pc.bottom_k_unstable(arr, k=3)
<pyarrow.lib.UInt64Array object at 0x7fdcb19d7fa0>
[
0,
1,
2
]
"""
if sort_keys is None:
sort_keys = []
if isinstance(values, (pa.Array, pa.ChunkedArray)):
sort_keys.append(("dummy", "ascending"))
else:
sort_keys = map(lambda key_name: (key_name, "ascending"), sort_keys)
options = SelectKOptions(k, sort_keys)
return call_function("select_k_unstable", [values], options, memory_pool)
def field(*name_or_index):
"""Reference a column of the dataset.
Stores only the field's name. Type and other information are known only when
the expression is bound to a dataset having an explicit schema.
Nested references are allowed by passing multiple names or a tuple of
names. For example ``('foo', 'bar')`` references the field named "bar"
inside the field named "foo".
Parameters
----------
*name_or_index : string, multiple strings, tuple or int
The name or index of the (possibly nested) field the expression
references to.
Returns
-------
field_expr : Expression
Examples
--------
>>> import pyarrow.compute as pc
>>> pc.field("a")
<pyarrow.compute.Expression a>
>>> pc.field(1)
<pyarrow.compute.Expression FieldPath(1)>
>>> pc.field(("a", "b"))
<pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
>>> pc.field("a", "b")
<pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
"""
n = len(name_or_index)
if n == 1:
if isinstance(name_or_index[0], (str, int)):
return Expression._field(name_or_index[0])
elif isinstance(name_or_index[0], tuple):
return Expression._nested_field(name_or_index[0])
else:
raise TypeError(
"field reference should be str, multiple str, tuple or "
f"integer, got {type(name_or_index[0])}"
)
# In case of multiple strings not supplied in a tuple
else:
return Expression._nested_field(name_or_index)
def scalar(value):
"""Expression representing a scalar value.
Parameters
----------
value : bool, int, float or string
Python value of the scalar. Note that only a subset of types are
currently supported.
Returns
-------
scalar_expr : Expression
"""
return Expression._scalar(value)
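# A small sketch combining ``field`` and ``scalar`` into a filter expression
# (not executed here); the column name is illustrative:
#
#   import pyarrow.compute as pc
#
#   expr = (pc.field("year") >= pc.scalar(2009)) & (pc.field("year") < 2015)
#   # such expressions are typically passed to dataset scans,
#   # e.g. dataset.to_table(filter=expr)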

View File

@ -0,0 +1,76 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from pyarrow.includes.libarrow cimport GetBuildInfo
from collections import namedtuple
VersionInfo = namedtuple('VersionInfo', ('major', 'minor', 'patch'))
BuildInfo = namedtuple(
'BuildInfo',
('version', 'version_info', 'so_version', 'full_so_version',
'compiler_id', 'compiler_version', 'compiler_flags',
'git_id', 'git_description', 'package_kind', 'build_type'))
RuntimeInfo = namedtuple('RuntimeInfo',
('simd_level', 'detected_simd_level'))
cdef _build_info():
cdef:
const CBuildInfo* c_info
c_info = &GetBuildInfo()
return BuildInfo(version=frombytes(c_info.version_string),
version_info=VersionInfo(c_info.version_major,
c_info.version_minor,
c_info.version_patch),
so_version=frombytes(c_info.so_version),
full_so_version=frombytes(c_info.full_so_version),
compiler_id=frombytes(c_info.compiler_id),
compiler_version=frombytes(c_info.compiler_version),
compiler_flags=frombytes(c_info.compiler_flags),
git_id=frombytes(c_info.git_id),
git_description=frombytes(c_info.git_description),
package_kind=frombytes(c_info.package_kind),
build_type=frombytes(c_info.build_type).lower(),
)
cpp_build_info = _build_info()
cpp_version = cpp_build_info.version
cpp_version_info = cpp_build_info.version_info
def runtime_info():
"""
Get runtime information.
Returns
-------
info : pyarrow.RuntimeInfo
"""
cdef:
CRuntimeInfo c_info
c_info = GetRuntimeInfo()
return RuntimeInfo(
simd_level=frombytes(c_info.simd_level),
detected_simd_level=frombytes(c_info.detected_simd_level))
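# These values are re-exported at the pyarrow top level; a quick sketch of how
# they are typically inspected (actual values depend on the local build):
#
#   import pyarrow as pa
#
#   pa.cpp_build_info.version      # e.g. "7.0.0"
#   pa.cpp_version_info.major      # e.g. 7
#   pa.runtime_info().simd_level   # e.g. "avx2"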

View File

@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from pyarrow._csv import ( # noqa
ReadOptions, ParseOptions, ConvertOptions, ISO8601,
open_csv, read_csv, CSVStreamingReader, write_csv,
WriteOptions, CSVWriter, InvalidRow)

View File

@ -0,0 +1,25 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# flake8: noqa
from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer,
HostBuffer, BufferReader, BufferWriter,
new_host_buffer,
serialize_record_batch, read_message,
read_record_batch)

View File

@ -0,0 +1,935 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Dataset is currently unstable. APIs subject to change without notice."""
import pyarrow as pa
from pyarrow.util import _is_iterable, _stringify_path, _is_path_like
from pyarrow._dataset import ( # noqa
CsvFileFormat,
CsvFragmentScanOptions,
Dataset,
DatasetFactory,
DirectoryPartitioning,
FilenamePartitioning,
FileFormat,
FileFragment,
FileSystemDataset,
FileSystemDatasetFactory,
FileSystemFactoryOptions,
FileWriteOptions,
Fragment,
FragmentScanOptions,
HivePartitioning,
IpcFileFormat,
IpcFileWriteOptions,
InMemoryDataset,
Partitioning,
PartitioningFactory,
Scanner,
TaggedRecordBatch,
UnionDataset,
UnionDatasetFactory,
_get_partition_keys,
_filesystemdataset_write,
)
# keep Expression functionality exposed here for backwards compatibility
from pyarrow.compute import Expression, scalar, field # noqa
_orc_available = False
_orc_msg = (
"The pyarrow installation is not built with support for the ORC file "
"format."
)
try:
from pyarrow._dataset_orc import OrcFileFormat
_orc_available = True
except ImportError:
pass
_parquet_available = False
_parquet_msg = (
"The pyarrow installation is not built with support for the Parquet file "
"format."
)
try:
from pyarrow._dataset_parquet import ( # noqa
ParquetDatasetFactory,
ParquetFactoryOptions,
ParquetFileFormat,
ParquetFileFragment,
ParquetFileWriteOptions,
ParquetFragmentScanOptions,
ParquetReadOptions,
RowGroupInfo,
)
_parquet_available = True
except ImportError:
pass
def __getattr__(name):
if name == "OrcFileFormat" and not _orc_available:
raise ImportError(_orc_msg)
if name == "ParquetFileFormat" and not _parquet_available:
raise ImportError(_parquet_msg)
raise AttributeError(
"module 'pyarrow.dataset' has no attribute '{0}'".format(name)
)
def partitioning(schema=None, field_names=None, flavor=None,
dictionaries=None):
"""
Specify a partitioning scheme.
The supported schemes include:
- "DirectoryPartitioning": this scheme expects one segment in the file path
for each field in the specified schema (all fields are required to be
present). For example given schema<year:int16, month:int8> the path
"/2009/11" would be parsed to ("year"_ == 2009 and "month"_ == 11).
- "HivePartitioning": a scheme for "/$key=$value/" nested directories as
found in Apache Hive. This is a multi-level, directory based partitioning
scheme. Data is partitioned by static values of a particular column in
the schema. Partition keys are represented in the form $key=$value in
directory names. Field order is ignored, as are missing or unrecognized
field names.
For example, given schema<year:int16, month:int8, day:int8>, a possible
path would be "/year=2009/month=11/day=15" (but the field order does not
need to match).
- "FilenamePartitioning": this scheme expects the partitions will have
filenames containing the field values separated by "_".
For example, given schema<year:int16, month:int8, day:int8>, a possible
partition filename "2009_11_part-0.parquet" would be parsed
to ("year"_ == 2009 and "month"_ == 11).
Parameters
----------
schema : pyarrow.Schema, default None
The schema that describes the partitions present in the file path.
If not specified, and `field_names` and/or `flavor` are specified,
the schema will be inferred from the file path (and a
PartitioningFactory is returned).
field_names : list of str, default None
A list of strings (field names). If specified, the schema's types are
inferred from the file paths (only valid for DirectoryPartitioning).
flavor : str, default None
The default is DirectoryPartitioning. Specify ``flavor="hive"`` for
a HivePartitioning, and ``flavor="filename"`` for a
FilenamePartitioning.
dictionaries : dict[str, Array]
If the type of any field of `schema` is a dictionary type, the
corresponding entry of `dictionaries` must be an array containing
every value which may be taken by the corresponding column or an
error will be raised in parsing. Alternatively, pass `infer` to have
Arrow discover the dictionary values, in which case a
PartitioningFactory is returned.
Returns
-------
Partitioning or PartitioningFactory
Examples
--------
Specify the Schema for paths like "/2009/June":
>>> partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())]))
or let the types be inferred by only specifying the field names:
>>> partitioning(field_names=["year", "month"])
For paths like "/2009/June", the year will be inferred as int32 while month
will be inferred as string.
Specify a Schema with dictionary encoding, providing dictionary values:
>>> partitioning(
... pa.schema([
... ("year", pa.int16()),
... ("month", pa.dictionary(pa.int8(), pa.string()))
... ]),
... dictionaries={
... "month": pa.array(["January", "February", "March"]),
... })
Alternatively, specify a Schema with dictionary encoding, but have Arrow
infer the dictionary values:
>>> partitioning(
... pa.schema([
... ("year", pa.int16()),
... ("month", pa.dictionary(pa.int8(), pa.string()))
... ]),
... dictionaries="infer")
Create a Hive scheme for a path like "/year=2009/month=11":
>>> partitioning(
... pa.schema([("year", pa.int16()), ("month", pa.int8())]),
... flavor="hive")
A Hive scheme can also be discovered from the directory structure (and
types will be inferred):
>>> partitioning(flavor="hive")
"""
if flavor is None:
# default flavor
if schema is not None:
if field_names is not None:
raise ValueError(
"Cannot specify both 'schema' and 'field_names'")
if dictionaries == 'infer':
return DirectoryPartitioning.discover(schema=schema)
return DirectoryPartitioning(schema, dictionaries)
elif field_names is not None:
if isinstance(field_names, list):
return DirectoryPartitioning.discover(field_names)
else:
raise ValueError(
"Expected list of field names, got {}".format(
type(field_names)))
else:
raise ValueError(
"For the default directory flavor, need to specify "
"a Schema or a list of field names")
if flavor == "filename":
# default flavor
if schema is not None:
if field_names is not None:
raise ValueError(
"Cannot specify both 'schema' and 'field_names'")
if dictionaries == 'infer':
return FilenamePartitioning.discover(schema=schema)
return FilenamePartitioning(schema, dictionaries)
elif field_names is not None:
if isinstance(field_names, list):
return FilenamePartitioning.discover(field_names)
else:
raise ValueError(
"Expected list of field names, got {}".format(
type(field_names)))
else:
raise ValueError(
"For the filename flavor, need to specify "
"a Schema or a list of field names")
elif flavor == 'hive':
if field_names is not None:
raise ValueError("Cannot specify 'field_names' for flavor 'hive'")
elif schema is not None:
if isinstance(schema, pa.Schema):
if dictionaries == 'infer':
return HivePartitioning.discover(schema=schema)
return HivePartitioning(schema, dictionaries)
else:
raise ValueError(
"Expected Schema for 'schema', got {}".format(
type(schema)))
else:
return HivePartitioning.discover()
else:
raise ValueError("Unsupported flavor")
def _ensure_partitioning(scheme):
"""
Validate input and return a Partitioning(Factory).
It passes None through if no partitioning scheme is defined.
"""
if scheme is None:
pass
elif isinstance(scheme, str):
scheme = partitioning(flavor=scheme)
elif isinstance(scheme, list):
scheme = partitioning(field_names=scheme)
elif isinstance(scheme, (Partitioning, PartitioningFactory)):
pass
else:
ValueError("Expected Partitioning or PartitioningFactory, got {}"
.format(type(scheme)))
return scheme
def _ensure_format(obj):
if isinstance(obj, FileFormat):
return obj
elif obj == "parquet":
if not _parquet_available:
raise ValueError(_parquet_msg)
return ParquetFileFormat()
elif obj in {"ipc", "arrow", "feather"}:
return IpcFileFormat()
elif obj == "csv":
return CsvFileFormat()
elif obj == "orc":
if not _orc_available:
raise ValueError(_orc_msg)
return OrcFileFormat()
else:
raise ValueError("format '{}' is not supported".format(obj))
def _ensure_multiple_sources(paths, filesystem=None):
"""
Treat a list of paths as files belonging to a single file system.
If the file system is local, also validate that all paths reference
existing *files*; otherwise any non-file paths will be silently skipped
(for example on a remote filesystem).
Parameters
----------
paths : list of path-like
Note that URIs are not allowed.
filesystem : FileSystem or str, optional
If a URI is passed, then its path component will act as a prefix for
the file paths.
Returns
-------
(FileSystem, list of str)
File system object and a list of normalized paths.
Raises
------
TypeError
If the passed filesystem has wrong type.
IOError
If the file system is local and a referenced path is not available or
not a file.
"""
from pyarrow.fs import (
LocalFileSystem, SubTreeFileSystem, _MockFileSystem, FileType,
_ensure_filesystem
)
if filesystem is None:
# fall back to local file system as the default
filesystem = LocalFileSystem()
else:
# construct a filesystem if it is a valid URI
filesystem = _ensure_filesystem(filesystem)
is_local = (
isinstance(filesystem, (LocalFileSystem, _MockFileSystem)) or
(isinstance(filesystem, SubTreeFileSystem) and
isinstance(filesystem.base_fs, LocalFileSystem))
)
# allow normalizing irregular paths such as Windows local paths
paths = [filesystem.normalize_path(_stringify_path(p)) for p in paths]
# validate that all of the paths are pointing to existing *files*
# possible improvement is to group the file_infos by type and raise for
# multiple paths per error category
if is_local:
for info in filesystem.get_file_info(paths):
file_type = info.type
if file_type == FileType.File:
continue
elif file_type == FileType.NotFound:
raise FileNotFoundError(info.path)
elif file_type == FileType.Directory:
raise IsADirectoryError(
'Path {} points to a directory, but only file paths are '
'supported. To construct a nested or union dataset pass '
'a list of dataset objects instead.'.format(info.path)
)
else:
raise IOError(
'Path {} exists but its type is unknown (could be a '
'special file such as a Unix socket or character device, '
'or Windows NUL / CON / ...)'.format(info.path)
)
return filesystem, paths
def _ensure_single_source(path, filesystem=None):
"""
Treat path as either a recursively traversable directory or a single file.
Parameters
----------
path : path-like
filesystem : FileSystem or str, optional
If a URI is passed, then its path component will act as a prefix for
the file paths.
Returns
-------
(FileSystem, list of str or fs.Selector)
File system object and either a single item list pointing to a file or
an fs.Selector object pointing to a directory.
Raises
------
TypeError
If the passed filesystem has wrong type.
FileNotFoundError
If the referenced file or directory doesn't exist.
"""
from pyarrow.fs import FileType, FileSelector, _resolve_filesystem_and_path
# at this point we already checked that `path` is a path-like
filesystem, path = _resolve_filesystem_and_path(path, filesystem)
# ensure that the path is normalized before passing to dataset discovery
path = filesystem.normalize_path(path)
# retrieve the file descriptor
file_info = filesystem.get_file_info(path)
# depending on the path type either return with a recursive
# directory selector or as a list containing a single file
if file_info.type == FileType.Directory:
paths_or_selector = FileSelector(path, recursive=True)
elif file_info.type == FileType.File:
paths_or_selector = [path]
else:
raise FileNotFoundError(path)
return filesystem, paths_or_selector
def _filesystem_dataset(source, schema=None, filesystem=None,
partitioning=None, format=None,
partition_base_dir=None, exclude_invalid_files=None,
selector_ignore_prefixes=None):
"""
Create a FileSystemDataset which can be used to build a Dataset.
Parameters are documented in the dataset function.
Returns
-------
FileSystemDataset
"""
format = _ensure_format(format or 'parquet')
partitioning = _ensure_partitioning(partitioning)
if isinstance(source, (list, tuple)):
fs, paths_or_selector = _ensure_multiple_sources(source, filesystem)
else:
fs, paths_or_selector = _ensure_single_source(source, filesystem)
options = FileSystemFactoryOptions(
partitioning=partitioning,
partition_base_dir=partition_base_dir,
exclude_invalid_files=exclude_invalid_files,
selector_ignore_prefixes=selector_ignore_prefixes
)
factory = FileSystemDatasetFactory(fs, paths_or_selector, format, options)
return factory.finish(schema)
def _in_memory_dataset(source, schema=None, **kwargs):
if any(v is not None for v in kwargs.values()):
raise ValueError(
"For in-memory datasets, you cannot pass any additional arguments")
return InMemoryDataset(source, schema)
def _union_dataset(children, schema=None, **kwargs):
if any(v is not None for v in kwargs.values()):
raise ValueError(
"When passing a list of Datasets, you cannot pass any additional "
"arguments"
)
if schema is None:
# unify the children datasets' schemas
schema = pa.unify_schemas([child.schema for child in children])
# create datasets with the requested schema
children = [child.replace_schema(schema) for child in children]
return UnionDataset(schema, children)
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
partitioning=None, partition_base_dir=None):
"""
Create a FileSystemDataset from a `_metadata` file created via
`pyarrow.parquet.write_metadata`.
Parameters
----------
metadata_path : path
Path pointing to a single-file Parquet metadata file
schema : Schema, optional
Optionally provide the Schema for the Dataset, in which case it will
not be inferred from the source.
filesystem : FileSystem or URI string, default None
If a single path is given as source and filesystem is None, then the
filesystem will be inferred from the path.
If an URI string is passed, then a filesystem object is constructed
using the URI's optional path component as a directory prefix. See the
examples below.
Note that the URIs on Windows must follow 'file:///C:...' or
'file:/C:...' patterns.
format : ParquetFileFormat
An instance of a ParquetFileFormat if special options need to be
passed.
partitioning : Partitioning, PartitioningFactory, str, list of str
The partitioning scheme specified with the ``partitioning()``
function. A flavor string can be used as shortcut, and with a list of
field names a DirectoryPartitioning will be inferred.
partition_base_dir : str, optional
For the purposes of applying the partitioning, paths will be
stripped of the partition_base_dir. Files not matching the
partition_base_dir prefix will be skipped for partitioning discovery.
The ignored files will still be part of the Dataset, but will not
have partition information.
Returns
-------
FileSystemDataset
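Examples
--------
A minimal sketch, assuming a ``_metadata`` file was written beforehand
with ``pyarrow.parquet.write_metadata`` (the path below is hypothetical):
>>> dataset = parquet_dataset("/path/to/dataset/_metadata")
>>> table = dataset.to_table()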
"""
from pyarrow.fs import LocalFileSystem, _ensure_filesystem
if format is None:
format = ParquetFileFormat()
elif not isinstance(format, ParquetFileFormat):
raise ValueError("format argument must be a ParquetFileFormat")
if filesystem is None:
filesystem = LocalFileSystem()
else:
filesystem = _ensure_filesystem(filesystem)
metadata_path = filesystem.normalize_path(_stringify_path(metadata_path))
options = ParquetFactoryOptions(
partition_base_dir=partition_base_dir,
partitioning=_ensure_partitioning(partitioning)
)
factory = ParquetDatasetFactory(
metadata_path, filesystem, format, options=options)
return factory.finish(schema)
def dataset(source, schema=None, format=None, filesystem=None,
partitioning=None, partition_base_dir=None,
exclude_invalid_files=None, ignore_prefixes=None):
"""
Open a dataset.
Datasets provide functionality to efficiently work with tabular,
potentially larger-than-memory, and multi-file datasets.
- A unified interface for different sources, like Parquet and Feather
- Discovery of sources (crawling directories, handle directory-based
partitioned datasets, basic schema normalization)
- Optimized reading with predicate pushdown (filtering rows), projection
(selecting columns), parallel reading or fine-grained managing of tasks.
Note that this is the high-level API; to have more control over the
dataset construction, use the low-level API classes (FileSystemDataset,
FileSystemDatasetFactory, etc.)
Parameters
----------
source : path, list of paths, dataset, list of datasets, (list of) \
RecordBatch or Table, iterable of RecordBatch, RecordBatchReader, or URI
Path pointing to a single file:
Open a FileSystemDataset from a single file.
Path pointing to a directory:
The directory gets discovered recursively according to a
partitioning scheme if given.
List of file paths:
Create a FileSystemDataset from explicitly given files. The files
must be located on the same filesystem given by the filesystem
parameter.
Note that, contrary to construction from a single file, passing
URIs as paths is not allowed.
List of datasets:
A nested UnionDataset gets constructed; it allows arbitrary
composition of other datasets.
Note that additional keyword arguments are not allowed.
(List of) batches or tables, iterable of batches, or RecordBatchReader:
Create an InMemoryDataset. If an iterable or empty list is given,
a schema must also be given. If an iterable or RecordBatchReader
is given, the resulting dataset can only be scanned once; further
attempts will raise an error.
schema : Schema, optional
Optionally provide the Schema for the Dataset, in which case it will
not be inferred from the source.
format : FileFormat or str
Currently "parquet", "ipc"/"arrow"/"feather", "csv", and "orc" are
supported. For Feather, only version 2 files are supported.
filesystem : FileSystem or URI string, default None
If a single path is given as source and filesystem is None, then the
filesystem will be inferred from the path.
If an URI string is passed, then a filesystem object is constructed
using the URI's optional path component as a directory prefix. See the
examples below.
Note that the URIs on Windows must follow 'file:///C:...' or
'file:/C:...' patterns.
partitioning : Partitioning, PartitioningFactory, str, list of str
The partitioning scheme specified with the ``partitioning()``
function. A flavor string can be used as shortcut, and with a list of
field names a DirectoryPartitioning will be inferred.
partition_base_dir : str, optional
For the purposes of applying the partitioning, paths will be
stripped of the partition_base_dir. Files not matching the
partition_base_dir prefix will be skipped for partitioning discovery.
The ignored files will still be part of the Dataset, but will not
have partition information.
exclude_invalid_files : bool, optional (default True)
If True, invalid files will be excluded (file format specific check).
This will incur IO for each file in a serial and single-threaded
fashion. Disabling this feature will skip the IO, but unsupported
files may be present in the Dataset (resulting in an error at scan
time).
ignore_prefixes : list, optional
Files matching any of these prefixes will be ignored by the
discovery process. This is matched to the basename of a path.
By default this is ['.', '_'].
Note that discovery happens only if a directory is passed as source.
Returns
-------
dataset : Dataset
Either a FileSystemDataset or a UnionDataset depending on the source
parameter.
Examples
--------
Opening a single file:
>>> dataset("path/to/file.parquet", format="parquet")
Opening a single file with an explicit schema:
>>> dataset("path/to/file.parquet", schema=myschema, format="parquet")
Opening a dataset for a single directory:
>>> dataset("path/to/nyc-taxi/", format="parquet")
>>> dataset("s3://mybucket/nyc-taxi/", format="parquet")
Opening a dataset from a list of relative local paths:
>>> dataset([
... "part0/data.parquet",
... "part1/data.parquet",
... "part3/data.parquet",
... ], format='parquet')
With filesystem provided:
>>> paths = [
... 'part0/data.parquet',
... 'part1/data.parquet',
... 'part3/data.parquet',
... ]
>>> dataset(paths, filesystem='file:///directory/prefix', format='parquet')
Which is equivalent to:
>>> fs = SubTreeFileSystem("/directory/prefix", LocalFileSystem())
>>> dataset(paths, filesystem=fs, format='parquet')
With a remote filesystem URI:
>>> paths = [
... 'nested/directory/part0/data.parquet',
... 'nested/directory/part1/data.parquet',
... 'nested/directory/part3/data.parquet',
... ]
>>> dataset(paths, filesystem='s3://bucket/', format='parquet')
Similarly to the local example, the directory prefix may be included in the
filesystem URI:
>>> dataset(paths, filesystem='s3://bucket/nested/directory',
... format='parquet')
Construction of a nested dataset:
>>> dataset([
... dataset("s3://old-taxi-data", format="parquet"),
... dataset("local/path/to/data", format="ipc")
... ])
"""
# collect the keyword arguments for later reuse
kwargs = dict(
schema=schema,
filesystem=filesystem,
partitioning=partitioning,
format=format,
partition_base_dir=partition_base_dir,
exclude_invalid_files=exclude_invalid_files,
selector_ignore_prefixes=ignore_prefixes
)
if _is_path_like(source):
return _filesystem_dataset(source, **kwargs)
elif isinstance(source, (tuple, list)):
if all(_is_path_like(elem) for elem in source):
return _filesystem_dataset(source, **kwargs)
elif all(isinstance(elem, Dataset) for elem in source):
return _union_dataset(source, **kwargs)
elif all(isinstance(elem, (pa.RecordBatch, pa.Table))
for elem in source):
return _in_memory_dataset(source, **kwargs)
else:
unique_types = set(type(elem).__name__ for elem in source)
type_names = ', '.join('{}'.format(t) for t in unique_types)
raise TypeError(
'Expected a list of path-like or dataset objects, or a list '
'of batches or tables. The given list contains the following '
'types: {}'.format(type_names)
)
elif isinstance(source, (pa.RecordBatch, pa.Table)):
return _in_memory_dataset(source, **kwargs)
else:
raise TypeError(
'Expected a path-like, list of path-likes or a list of Datasets '
'instead of the given type: {}'.format(type(source).__name__)
)
def _ensure_write_partitioning(part, schema, flavor):
if isinstance(part, PartitioningFactory):
raise ValueError("A PartitioningFactory cannot be used. "
"Did you call the partitioning function "
"without supplying a schema?")
if isinstance(part, Partitioning) and flavor:
raise ValueError(
"Providing a partitioning_flavor with "
"a Partitioning object is not supported"
)
elif isinstance(part, (tuple, list)):
# Names of fields were provided instead of a partitioning object.
# Create a partitioning factory with those field names.
part = partitioning(
schema=pa.schema([schema.field(f) for f in part]),
flavor=flavor
)
elif part is None:
part = partitioning(pa.schema([]), flavor=flavor)
if not isinstance(part, Partitioning):
raise ValueError(
"partitioning must be a Partitioning object or "
"a list of column names"
)
return part
def write_dataset(data, base_dir, basename_template=None, format=None,
partitioning=None, partitioning_flavor=None, schema=None,
filesystem=None, file_options=None, use_threads=True,
max_partitions=None, max_open_files=None,
max_rows_per_file=None, min_rows_per_group=None,
max_rows_per_group=None, file_visitor=None,
existing_data_behavior='error', create_dir=True):
"""
Write a dataset to a given format and partitioning.
Parameters
----------
data : Dataset, Table/RecordBatch, RecordBatchReader, list of \
Table/RecordBatch, or iterable of RecordBatch
The data to write. This can be a Dataset instance or
in-memory Arrow data. If an iterable is given, the schema must
also be given.
base_dir : str
The root directory to write the dataset to.
basename_template : str, optional
A template string used to generate basenames of written data files.
The token '{i}' will be replaced with an automatically incremented
integer. If not specified, it defaults to
"part-{i}." + format.default_extname
format : FileFormat or str
The format in which to write the dataset. Currently supported:
"parquet", "ipc"/"arrow"/"feather", and "csv". If a FileSystemDataset
is being written and `format` is not specified, it defaults to the
same format as the specified FileSystemDataset. When writing a
Table or RecordBatch, this keyword is required.
partitioning : Partitioning or list[str], optional
The partitioning scheme specified with the ``partitioning()``
function or a list of field names. When providing a list of
field names, you can use ``partitioning_flavor`` to drive which
partitioning type should be used.
partitioning_flavor : str, optional
One of the partitioning flavors supported by
``pyarrow.dataset.partitioning``. If omitted will use the
default of ``partitioning()`` which is directory partitioning.
schema : Schema, optional
filesystem : FileSystem, optional
file_options : pyarrow.dataset.FileWriteOptions, optional
FileFormat specific write options, created using the
``FileFormat.make_write_options()`` function.
use_threads : bool, default True
Write files in parallel. If enabled, the maximum parallelism, as
determined by the number of available CPU cores, will be used.
max_partitions : int, default 1024
Maximum number of partitions any batch may be written into.
max_open_files : int, default 1024
If greater than 0 then this will limit the maximum number of
files that can be left open. If an attempt is made to open
too many files then the least recently used file will be closed.
If this setting is set too low you may end up fragmenting your
data into many small files.
max_rows_per_file : int, default 0
Maximum number of rows per file. If greater than 0 then this will
limit how many rows are placed in any single file. Otherwise there
will be no limit and one file will be created in each output
directory unless files need to be closed to respect max_open_files.
min_rows_per_group : int, default 0
Minimum number of rows per group. When the value is greater than 0,
the dataset writer will batch incoming data and only write the row
groups to the disk when sufficient rows have accumulated.
max_rows_per_group : int, default 1024 * 1024
Maximum number of rows per group. If the value is greater than 0,
then the dataset writer may split up large incoming batches into
multiple row groups. If this value is set, then min_rows_per_group
should also be set. Otherwise it could end up with very small row
groups.
file_visitor : function
If set, this function will be called with a WrittenFile instance
for each file created during the call. This object will have both
a path attribute and a metadata attribute.
The path attribute will be a string containing the path to
the created file.
The metadata attribute will be the parquet metadata of the file.
This metadata will have the file path attribute set and can be used
to build a _metadata file. The metadata attribute will be None if
the format is not parquet.
Example visitor which simply collects the filenames created::
visited_paths = []
def file_visitor(written_file):
visited_paths.append(written_file.path)
existing_data_behavior : 'error' | 'overwrite_or_ignore' | \
'delete_matching'
Controls how the dataset will handle data that already exists in
the destination. The default behavior ('error') is to raise an error
if any data exists in the destination.
'overwrite_or_ignore' will ignore any existing data and will
overwrite files with the same name as an output file. Other
existing files will be ignored. This behavior, in combination
with a unique basename_template for each write, will allow for
an append workflow.
'delete_matching' is useful when you are writing a partitioned
dataset. The first time each partition directory is encountered
the entire directory will be deleted. This allows you to overwrite
old partitions completely.
create_dir : bool, default True
If False, directories will not be created. This can be useful for
filesystems that do not require directories.
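Examples
--------
A minimal sketch writing an in-memory table as a partitioned Parquet
dataset (the table, output path and column names are hypothetical):
>>> import pyarrow as pa
>>> import pyarrow.dataset as ds
>>> table = pa.table({"year": [2020, 2021], "n": [1, 2]})
>>> ds.write_dataset(table, "path/to/output", format="parquet",
...                  partitioning=["year"])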
"""
from pyarrow.fs import _resolve_filesystem_and_path
if isinstance(data, (list, tuple)):
schema = schema or data[0].schema
data = InMemoryDataset(data, schema=schema)
elif isinstance(data, (pa.RecordBatch, pa.Table)):
schema = schema or data.schema
data = InMemoryDataset(data, schema=schema)
elif isinstance(data, pa.ipc.RecordBatchReader) or _is_iterable(data):
data = Scanner.from_batches(data, schema=schema)
schema = None
elif not isinstance(data, (Dataset, Scanner)):
raise ValueError(
"Only Dataset, Scanner, Table/RecordBatch, RecordBatchReader, "
"a list of Tables/RecordBatches, or iterable of batches are "
"supported."
)
if format is None and isinstance(data, FileSystemDataset):
format = data.format
else:
format = _ensure_format(format)
if file_options is None:
file_options = format.make_write_options()
if format != file_options.format:
raise TypeError("Supplied FileWriteOptions have format {}, "
"which doesn't match supplied FileFormat {}".format(
file_options.format, format))
if basename_template is None:
basename_template = "part-{i}." + format.default_extname
if max_partitions is None:
max_partitions = 1024
if max_open_files is None:
max_open_files = 1024
if max_rows_per_file is None:
max_rows_per_file = 0
if max_rows_per_group is None:
max_rows_per_group = 1 << 20
if min_rows_per_group is None:
min_rows_per_group = 0
# at this point data is a Scanner or a Dataset, anything else
# was converted to one of those two. So we can grab the schema
# to build the partitioning object from Dataset.
if isinstance(data, Scanner):
partitioning_schema = data.dataset_schema
else:
partitioning_schema = data.schema
partitioning = _ensure_write_partitioning(partitioning,
schema=partitioning_schema,
flavor=partitioning_flavor)
filesystem, base_dir = _resolve_filesystem_and_path(base_dir, filesystem)
if isinstance(data, Dataset):
scanner = data.scanner(use_threads=use_threads)
else:
# scanner was passed directly by the user, in which case a schema
# cannot be passed
if schema is not None:
raise ValueError("Cannot specify a schema when writing a Scanner")
scanner = data
_filesystemdataset_write(
scanner, base_dir, basename_template, filesystem, partitioning,
file_options, max_partitions, file_visitor, existing_data_behavior,
max_open_files, max_rows_per_file,
min_rows_per_group, max_rows_per_group, create_dir
)

View File

@ -0,0 +1,250 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetInterrupt
from pyarrow.includes.libarrow cimport CStatus
from pyarrow.includes.libarrow_python cimport IsPyError, RestorePyError
from pyarrow.includes.common cimport c_string
from contextlib import contextmanager
import os
import signal
import threading
from pyarrow.util import _break_traceback_cycle_from_frame
class ArrowException(Exception):
pass
class ArrowInvalid(ValueError, ArrowException):
pass
class ArrowMemoryError(MemoryError, ArrowException):
pass
class ArrowKeyError(KeyError, ArrowException):
def __str__(self):
# Override KeyError.__str__, as it uses the repr() of the key
return ArrowException.__str__(self)
class ArrowTypeError(TypeError, ArrowException):
pass
class ArrowNotImplementedError(NotImplementedError, ArrowException):
pass
class ArrowCapacityError(ArrowException):
pass
class ArrowIndexError(IndexError, ArrowException):
pass
class ArrowSerializationError(ArrowException):
pass
class ArrowCancelled(ArrowException):
def __init__(self, message, signum=None):
super().__init__(message)
self.signum = signum
# Compatibility alias
ArrowIOError = IOError
# This function could be written directly in C++ if we didn't
# define Arrow-specific subclasses (ArrowInvalid etc.)
cdef int check_status(const CStatus& status) nogil except -1:
if status.ok():
return 0
with gil:
if IsPyError(status):
RestorePyError(status)
return -1
# We don't use Status::ToString() as it would redundantly include
# the C++ class name.
message = frombytes(status.message(), safe=True)
detail = status.detail()
if detail != nullptr:
message += ". Detail: " + frombytes(detail.get().ToString(),
safe=True)
if status.IsInvalid():
raise ArrowInvalid(message)
elif status.IsIOError():
# Note: OSError constructor is
# OSError(message)
# or
# OSError(errno, message, filename=None)
# or (on Windows)
# OSError(errno, message, filename, winerror)
errno = ErrnoFromStatus(status)
winerror = WinErrorFromStatus(status)
if winerror != 0:
raise IOError(errno, message, None, winerror)
elif errno != 0:
raise IOError(errno, message)
else:
raise IOError(message)
elif status.IsOutOfMemory():
raise ArrowMemoryError(message)
elif status.IsKeyError():
raise ArrowKeyError(message)
elif status.IsNotImplemented():
raise ArrowNotImplementedError(message)
elif status.IsTypeError():
raise ArrowTypeError(message)
elif status.IsCapacityError():
raise ArrowCapacityError(message)
elif status.IsIndexError():
raise ArrowIndexError(message)
elif status.IsSerializationError():
raise ArrowSerializationError(message)
elif status.IsCancelled():
signum = SignalFromStatus(status)
if signum > 0:
raise ArrowCancelled(message, signum)
else:
raise ArrowCancelled(message)
else:
message = frombytes(status.ToString(), safe=True)
raise ArrowException(message)
# This is an API function for C++ PyArrow
cdef api int pyarrow_internal_check_status(const CStatus& status) \
nogil except -1:
return check_status(status)
cdef class StopToken:
cdef void init(self, CStopToken stop_token):
self.stop_token = move(stop_token)
cdef c_bool signal_handlers_enabled = True
def enable_signal_handlers(c_bool enable):
"""
Enable or disable interruption of long-running operations.
By default, certain long running operations will detect user
interruptions, such as by pressing Ctrl-C. This detection relies
on setting a signal handler for the duration of the long-running
operation, and may therefore interfere with other frameworks or
libraries (such as an event loop).
Parameters
----------
enable : bool
Whether to enable user interruption by setting a temporary
signal handler.
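Examples
--------
A minimal sketch, assuming this function is exposed as
``pyarrow.enable_signal_handlers`` (e.g. when running inside an event loop):
>>> import pyarrow as pa
>>> pa.enable_signal_handlers(False)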
"""
global signal_handlers_enabled
signal_handlers_enabled = enable
# For internal use
# Whether we need a workaround for https://bugs.python.org/issue42248
have_signal_refcycle = (sys.version_info < (3, 8, 10) or
(3, 9) <= sys.version_info < (3, 9, 5) or
sys.version_info[:2] == (3, 10))
cdef class SignalStopHandler:
cdef:
StopToken _stop_token
vector[int] _signals
c_bool _enabled
def __cinit__(self):
self._enabled = False
self._init_signals()
if have_signal_refcycle:
_break_traceback_cycle_from_frame(sys._getframe(0))
self._stop_token = StopToken()
if not self._signals.empty():
self._stop_token.init(GetResultValue(
SetSignalStopSource()).token())
self._enabled = True
def _init_signals(self):
if (signal_handlers_enabled and
threading.current_thread() is threading.main_thread()):
self._signals = [
sig for sig in (signal.SIGINT, signal.SIGTERM)
if signal.getsignal(sig) not in (signal.SIG_DFL,
signal.SIG_IGN, None)]
def __enter__(self):
if self._enabled:
check_status(RegisterCancellingSignalHandler(self._signals))
return self
def __exit__(self, exc_type, exc_value, exc_tb):
if self._enabled:
UnregisterCancellingSignalHandler()
if exc_value is None:
# Make sure we didn't lose a signal
try:
check_status(self._stop_token.stop_token.Poll())
except ArrowCancelled as e:
exc_value = e
if isinstance(exc_value, ArrowCancelled):
if exc_value.signum:
# Re-emit the exact same signal. We restored the Python signal
# handler above, so it should receive it.
if os.name == 'nt':
SendSignal(exc_value.signum)
else:
SendSignalToThread(exc_value.signum,
threading.main_thread().ident)
else:
# Simulate Python receiving a SIGINT
# (see https://bugs.python.org/issue43356 for why we can't
# simulate the exact signal number)
PyErr_SetInterrupt()
# Maximize chances of the Python signal handler being executed now.
# Otherwise a potential KeyboardInterrupt might be missed by an
# immediately enclosing try/except block.
PyErr_CheckSignals()
# ArrowCancelled will be re-raised if PyErr_CheckSignals()
# returned successfully.
def __dealloc__(self):
if self._enabled:
ResetSignalStopSource()
@property
def stop_token(self):
return self._stop_token

View File

@ -0,0 +1,279 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
from pyarrow.pandas_compat import _pandas_api # noqa
from pyarrow.lib import (Codec, Table, # noqa
concat_tables, schema)
import pyarrow.lib as ext
from pyarrow import _feather
from pyarrow._feather import FeatherError # noqa: F401
from pyarrow.vendored.version import Version
def _check_pandas_version():
if _pandas_api.loose_version < Version('0.17.0'):
raise ImportError("feather requires pandas >= 0.17.0")
class FeatherDataset:
"""
Encapsulates details of reading a list of Feather files.
Parameters
----------
path_or_paths : List[str]
A list of file names
validate_schema : bool, default True
Check that individual file schemas are all the same / compatible
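Examples
--------
A minimal sketch combining two files with compatible schemas (the file
names and column names are hypothetical):
>>> dataset = FeatherDataset(["data1.feather", "data2.feather"])
>>> table = dataset.read_table(columns=["a", "b"])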
"""
def __init__(self, path_or_paths, validate_schema=True):
self.paths = path_or_paths
self.validate_schema = validate_schema
def read_table(self, columns=None):
"""
Read multiple feather files as a single pyarrow.Table
Parameters
----------
columns : List[str]
Names of columns to read from the file
Returns
-------
pyarrow.Table
Content of the file as a table (of columns)
"""
_fil = read_table(self.paths[0], columns=columns)
self._tables = [_fil]
self.schema = _fil.schema
for path in self.paths[1:]:
table = read_table(path, columns=columns)
if self.validate_schema:
self.validate_schemas(path, table)
self._tables.append(table)
return concat_tables(self._tables)
def validate_schemas(self, piece, table):
if not self.schema.equals(table.schema):
raise ValueError('Schema in {!s} was different. \n'
'{!s}\n\nvs\n\n{!s}'
.format(piece, self.schema,
table.schema))
def read_pandas(self, columns=None, use_threads=True):
"""
Read multiple Feather files as a single pandas DataFrame
Parameters
----------
columns : List[str]
Names of columns to read from the file
use_threads : bool, default True
Use multiple threads when converting to pandas
Returns
-------
pandas.DataFrame
Content of the file as a pandas DataFrame (of columns)
"""
_check_pandas_version()
return self.read_table(columns=columns).to_pandas(
use_threads=use_threads)
def check_chunked_overflow(name, col):
if col.num_chunks == 1:
return
if col.type in (ext.binary(), ext.string()):
raise ValueError("Column '{}' exceeds 2GB maximum capacity of "
"a Feather binary column. This restriction may be "
"lifted in the future".format(name))
else:
# TODO(wesm): Not sure when else this might be reached
raise ValueError("Column '{}' of type {} was chunked on conversion "
"to Arrow and cannot be currently written to "
"Feather format".format(name, str(col.type)))
_FEATHER_SUPPORTED_CODECS = {'lz4', 'zstd', 'uncompressed'}
def write_feather(df, dest, compression=None, compression_level=None,
chunksize=None, version=2):
"""
Write a pandas.DataFrame to Feather format.
Parameters
----------
df : pandas.DataFrame or pyarrow.Table
Data to write out as Feather format.
dest : str
Local destination path.
compression : string, default None
Can be one of {"zstd", "lz4", "uncompressed"}. The default of None uses
LZ4 for V2 files if it is available, otherwise uncompressed.
compression_level : int, default None
Use a compression level particular to the chosen compressor. If None
use the default compression level
chunksize : int, default None
For V2 files, the internal maximum size of Arrow RecordBatch chunks
when writing the Arrow IPC file format. None means use the default,
which is currently 64K
version : int, default 2
Feather file version. Version 2 is the current version. Version 1 is
the more limited legacy format.
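Examples
--------
A minimal sketch, assuming pandas is installed (the DataFrame and
destination path are hypothetical):
>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2, 3]})
>>> write_feather(df, "/tmp/example.feather")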
"""
if _pandas_api.have_pandas:
_check_pandas_version()
if (_pandas_api.has_sparse and
isinstance(df, _pandas_api.pd.SparseDataFrame)):
df = df.to_dense()
if _pandas_api.is_data_frame(df):
# Feather v1 creates a new column in the resultant Table to
# store index information if index type is not RangeIndex
if version == 1:
preserve_index = False
elif version == 2:
preserve_index = None
else:
raise ValueError("Version value should either be 1 or 2")
table = Table.from_pandas(df, preserve_index=preserve_index)
if version == 1:
# Version 1 does not support chunking
for i, name in enumerate(table.schema.names):
col = table[i]
check_chunked_overflow(name, col)
else:
table = df
if version == 1:
if len(table.column_names) > len(set(table.column_names)):
raise ValueError("cannot serialize duplicate column names")
if compression is not None:
raise ValueError("Feather V1 files do not support compression "
"option")
if chunksize is not None:
raise ValueError("Feather V1 files do not support chunksize "
"option")
else:
if compression is None and Codec.is_available('lz4_frame'):
compression = 'lz4'
elif (compression is not None and
compression not in _FEATHER_SUPPORTED_CODECS):
raise ValueError('compression="{}" not supported, must be '
'one of {}'.format(compression,
_FEATHER_SUPPORTED_CODECS))
try:
_feather.write_feather(table, dest, compression=compression,
compression_level=compression_level,
chunksize=chunksize, version=version)
except Exception:
if isinstance(dest, str):
try:
os.remove(dest)
except os.error:
pass
raise
def read_feather(source, columns=None, use_threads=True, memory_map=True):
"""
Read a pandas.DataFrame from Feather format. To read as pyarrow.Table use
feather.read_table.
Parameters
----------
source : str file path, or file-like object
columns : sequence, optional
Only read a specific set of columns. If not provided, all columns are
read.
use_threads : bool, default True
Whether to parallelize reading using multiple threads. If false,
reading from the Feather format and the conversion to pandas are
both done on a single thread.
memory_map : boolean, default True
Use memory mapping when opening file on disk
Returns
-------
df : pandas.DataFrame
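Examples
--------
A minimal sketch reading back a previously written file (the path and
column name are hypothetical):
>>> df = read_feather("/tmp/example.feather", columns=["a"])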
"""
_check_pandas_version()
return (read_table(
source, columns=columns, memory_map=memory_map,
use_threads=use_threads).to_pandas(use_threads=use_threads))
def read_table(source, columns=None, memory_map=True, use_threads=True):
"""
Read a pyarrow.Table from Feather format
Parameters
----------
source : str file path, or file-like object
columns : sequence, optional
Only read a specific set of columns. If not provided, all columns are
read.
memory_map : boolean, default True
Use memory mapping when opening file on disk
use_threads : bool, default True
Whether to parallelize reading using multiple threads.
Returns
-------
table : pyarrow.Table
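Examples
--------
A minimal sketch (the path and column name are hypothetical):
>>> table = read_table("/tmp/example.feather", columns=["a"])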
"""
reader = _feather.FeatherReader(
source, use_memory_map=memory_map, use_threads=use_threads)
if columns is None:
return reader.read()
column_types = [type(column) for column in columns]
if all(map(lambda t: t == int, column_types)):
table = reader.read_indices(columns)
elif all(map(lambda t: t == str, column_types)):
table = reader.read_names(columns)
else:
column_type_names = [t.__name__ for t in column_types]
raise TypeError("Columns must be indices or names. "
"Got columns {} of types {}"
.format(columns, column_type_names))
# Feather v1 already respects the column selection
if reader.version < 3:
return table
# Feather v2 reads with sorted / deduplicated selection
elif sorted(set(columns)) == columns:
return table
else:
# follow exact order / selection of names
return table.select(columns)

View File

@ -0,0 +1,511 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import posixpath
import sys
import urllib.parse
import warnings
from os.path import join as pjoin
import pyarrow as pa
from pyarrow.util import implements, _stringify_path, _is_path_like, _DEPR_MSG
_FS_DEPR_MSG = _DEPR_MSG.format(
"filesystem.LocalFileSystem", "2.0.0", "fs.LocalFileSystem"
)
class FileSystem:
"""
Abstract filesystem interface.
"""
def cat(self, path):
"""
Return contents of file as a bytes object.
Parameters
----------
path : str
File path to read content from.
Returns
-------
contents : bytes
"""
with self.open(path, 'rb') as f:
return f.read()
def ls(self, path):
"""
Return list of file paths.
Parameters
----------
path : str
Directory to list contents from.
"""
raise NotImplementedError
def delete(self, path, recursive=False):
"""
Delete the indicated file or directory.
Parameters
----------
path : str
Path to delete.
recursive : bool, default False
If True, also delete child paths for directories.
"""
raise NotImplementedError
def disk_usage(self, path):
"""
Compute bytes used by all contents under indicated path in file tree.
Parameters
----------
path : str
Can be a file path or directory.
Returns
-------
usage : int
"""
path = _stringify_path(path)
path_info = self.stat(path)
if path_info['kind'] == 'file':
return path_info['size']
total = 0
for root, directories, files in self.walk(path):
for child_path in files:
abspath = self._path_join(root, child_path)
total += self.stat(abspath)['size']
return total
def _path_join(self, *args):
return self.pathsep.join(args)
def stat(self, path):
"""
Information about a filesystem entry.
Returns
-------
stat : dict
"""
raise NotImplementedError('FileSystem.stat')
def rm(self, path, recursive=False):
"""
Alias for FileSystem.delete.
"""
return self.delete(path, recursive=recursive)
def mv(self, path, new_path):
"""
Alias for FileSystem.rename.
"""
return self.rename(path, new_path)
def rename(self, path, new_path):
"""
Rename file, like UNIX mv command.
Parameters
----------
path : str
Path to alter.
new_path : str
Path to move to.
"""
raise NotImplementedError('FileSystem.rename')
def mkdir(self, path, create_parents=True):
"""
Create a directory.
Parameters
----------
path : str
Path to the directory.
create_parents : bool, default True
If the parent directories don't exist, create them as well.
"""
raise NotImplementedError
def exists(self, path):
"""
Return True if path exists.
Parameters
----------
path : str
Path to check.
"""
raise NotImplementedError
def isdir(self, path):
"""
Return True if path is a directory.
Parameters
----------
path : str
Path to check.
"""
raise NotImplementedError
def isfile(self, path):
"""
Return True if path is a file.
Parameters
----------
path : str
Path to check.
"""
raise NotImplementedError
def _isfilestore(self):
"""
Returns True if this FileSystem is a unix-style file store with
directories.
"""
raise NotImplementedError
def read_parquet(self, path, columns=None, metadata=None, schema=None,
use_threads=True, use_pandas_metadata=False):
"""
Read Parquet data from path in file system. Can read from a single file
or a directory of files.
Parameters
----------
path : str
Single file path or directory
columns : List[str], optional
Subset of columns to read.
metadata : pyarrow.parquet.FileMetaData
Known metadata to validate files against.
schema : pyarrow.parquet.Schema
Known schema to validate files against. Alternative to metadata
argument.
use_threads : bool, default True
Perform multi-threaded column reads.
use_pandas_metadata : bool, default False
If True and file has custom pandas schema metadata, ensure that
index columns are also loaded.
Returns
-------
table : pyarrow.Table
"""
from pyarrow.parquet import ParquetDataset
dataset = ParquetDataset(path, schema=schema, metadata=metadata,
filesystem=self)
return dataset.read(columns=columns, use_threads=use_threads,
use_pandas_metadata=use_pandas_metadata)
def open(self, path, mode='rb'):
"""
Open file for reading or writing.
"""
raise NotImplementedError
@property
def pathsep(self):
return '/'
class LocalFileSystem(FileSystem):
_instance = None
def __init__(self):
warnings.warn(_FS_DEPR_MSG, FutureWarning, stacklevel=2)
super().__init__()
@classmethod
def _get_instance(cls):
if cls._instance is None:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
cls._instance = LocalFileSystem()
return cls._instance
@classmethod
def get_instance(cls):
warnings.warn(_FS_DEPR_MSG, FutureWarning, stacklevel=2)
return cls._get_instance()
@implements(FileSystem.ls)
def ls(self, path):
path = _stringify_path(path)
return sorted(pjoin(path, x) for x in os.listdir(path))
@implements(FileSystem.mkdir)
def mkdir(self, path, create_parents=True):
path = _stringify_path(path)
if create_parents:
os.makedirs(path)
else:
os.mkdir(path)
@implements(FileSystem.isdir)
def isdir(self, path):
path = _stringify_path(path)
return os.path.isdir(path)
@implements(FileSystem.isfile)
def isfile(self, path):
path = _stringify_path(path)
return os.path.isfile(path)
@implements(FileSystem._isfilestore)
def _isfilestore(self):
return True
@implements(FileSystem.exists)
def exists(self, path):
path = _stringify_path(path)
return os.path.exists(path)
@implements(FileSystem.open)
def open(self, path, mode='rb'):
"""
Open file for reading or writing.
"""
path = _stringify_path(path)
return open(path, mode=mode)
@property
def pathsep(self):
return os.path.sep
def walk(self, path):
"""
Directory tree generator, see os.walk.
"""
path = _stringify_path(path)
return os.walk(path)
class DaskFileSystem(FileSystem):
"""
Wraps Dask filesystem implementations such as s3fs, gcsfs, etc.
"""
def __init__(self, fs):
warnings.warn(
"The pyarrow.filesystem.DaskFileSystem/S3FSWrapper are deprecated "
"as of pyarrow 3.0.0, and will be removed in a future version.",
FutureWarning, stacklevel=2)
self.fs = fs
@implements(FileSystem.isdir)
def isdir(self, path):
raise NotImplementedError("Unsupported file system API")
@implements(FileSystem.isfile)
def isfile(self, path):
raise NotImplementedError("Unsupported file system API")
@implements(FileSystem._isfilestore)
def _isfilestore(self):
"""
Object Stores like S3 and GCSFS are based on key lookups, not true
file-paths.
"""
return False
@implements(FileSystem.delete)
def delete(self, path, recursive=False):
path = _stringify_path(path)
return self.fs.rm(path, recursive=recursive)
@implements(FileSystem.exists)
def exists(self, path):
path = _stringify_path(path)
return self.fs.exists(path)
@implements(FileSystem.mkdir)
def mkdir(self, path, create_parents=True):
path = _stringify_path(path)
if create_parents:
return self.fs.mkdirs(path)
else:
return self.fs.mkdir(path)
@implements(FileSystem.open)
def open(self, path, mode='rb'):
"""
Open file for reading or writing.
"""
path = _stringify_path(path)
return self.fs.open(path, mode=mode)
def ls(self, path, detail=False):
path = _stringify_path(path)
return self.fs.ls(path, detail=detail)
def walk(self, path):
"""
Directory tree generator, like os.walk.
"""
path = _stringify_path(path)
return self.fs.walk(path)
class S3FSWrapper(DaskFileSystem):
@implements(FileSystem.isdir)
def isdir(self, path):
path = _sanitize_s3(_stringify_path(path))
try:
contents = self.fs.ls(path)
if len(contents) == 1 and contents[0] == path:
return False
else:
return True
except OSError:
return False
@implements(FileSystem.isfile)
def isfile(self, path):
path = _sanitize_s3(_stringify_path(path))
try:
contents = self.fs.ls(path)
return len(contents) == 1 and contents[0] == path
except OSError:
return False
def walk(self, path, refresh=False):
"""
Directory tree generator, like os.walk.
Generator version of what is in s3fs, which yields a flattened list of
files.
"""
path = _sanitize_s3(_stringify_path(path))
directories = set()
files = set()
for key in list(self.fs._ls(path, refresh=refresh)):
path = key['Key']
if key['StorageClass'] == 'DIRECTORY':
directories.add(path)
elif key['StorageClass'] == 'BUCKET':
pass
else:
files.add(path)
# s3fs creates duplicate 'DIRECTORY' entries
files = sorted([posixpath.split(f)[1] for f in files
if f not in directories])
directories = sorted([posixpath.split(x)[1]
for x in directories])
yield path, directories, files
for directory in directories:
yield from self.walk(directory, refresh=refresh)
def _sanitize_s3(path):
if path.startswith('s3://'):
return path.replace('s3://', '')
else:
return path
def _ensure_filesystem(fs):
fs_type = type(fs)
# If the arrow filesystem was subclassed, assume it supports the full
# interface and return it
if not issubclass(fs_type, FileSystem):
if "fsspec" in sys.modules:
fsspec = sys.modules["fsspec"]
if isinstance(fs, fsspec.AbstractFileSystem):
# for recent fsspec versions that stop inheriting from
# pyarrow.filesystem.FileSystem, still allow fsspec
# filesystems (which should be compatible with our legacy fs)
return fs
raise OSError('Unrecognized filesystem: {}'.format(fs_type))
else:
return fs
def resolve_filesystem_and_path(where, filesystem=None):
"""
Return filesystem from path which could be an HDFS URI, a local URI,
or a plain filesystem path.
"""
if not _is_path_like(where):
if filesystem is not None:
raise ValueError("filesystem passed but where is file-like, so"
" there is nothing to open with filesystem.")
return filesystem, where
if filesystem is not None:
filesystem = _ensure_filesystem(filesystem)
if isinstance(filesystem, LocalFileSystem):
path = _stringify_path(where)
elif not isinstance(where, str):
raise TypeError(
"Expected string path; path-like objects are only allowed "
"with a local filesystem"
)
else:
path = where
return filesystem, path
path = _stringify_path(where)
parsed_uri = urllib.parse.urlparse(path)
if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
# Input is hdfs URI such as hdfs://host:port/myfile.parquet
netloc_split = parsed_uri.netloc.split(':')
host = netloc_split[0]
if host == '':
host = 'default'
else:
host = parsed_uri.scheme + "://" + host
port = 0
if len(netloc_split) == 2 and netloc_split[1].isnumeric():
port = int(netloc_split[1])
fs = pa.hdfs._connect(host=host, port=port)
fs_path = parsed_uri.path
elif parsed_uri.scheme == 'file':
# Input is local URI such as file:///home/user/myfile.parquet
fs = LocalFileSystem._get_instance()
fs_path = parsed_uri.path
else:
# Input is local path such as /home/user/myfile.parquet
fs = LocalFileSystem._get_instance()
fs_path = path
return fs, fs_path

View File

@ -0,0 +1,63 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from pyarrow._flight import ( # noqa:F401
connect,
Action,
ActionType,
BasicAuth,
CallInfo,
CertKeyPair,
ClientAuthHandler,
ClientMiddleware,
ClientMiddlewareFactory,
DescriptorType,
FlightCallOptions,
FlightCancelledError,
FlightClient,
FlightDataStream,
FlightDescriptor,
FlightEndpoint,
FlightError,
FlightInfo,
FlightInternalError,
FlightMetadataReader,
FlightMetadataWriter,
FlightMethod,
FlightServerBase,
FlightServerError,
FlightStreamChunk,
FlightStreamReader,
FlightStreamWriter,
FlightTimedOutError,
FlightUnauthenticatedError,
FlightUnauthorizedError,
FlightUnavailableError,
FlightWriteSizeExceededError,
GeneratorStream,
Location,
MetadataRecordBatchReader,
MetadataRecordBatchWriter,
RecordBatchStream,
Result,
SchemaResult,
ServerAuthHandler,
ServerCallContext,
ServerMiddleware,
ServerMiddlewareFactory,
Ticket,
)

View File

@ -0,0 +1,412 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
FileSystem abstraction to interact with various local and remote filesystems.
"""
from pyarrow.util import _is_path_like, _stringify_path
from pyarrow._fs import ( # noqa
FileSelector,
FileType,
FileInfo,
FileSystem,
LocalFileSystem,
SubTreeFileSystem,
_MockFileSystem,
FileSystemHandler,
PyFileSystem,
_copy_files,
_copy_files_selector,
)
# For backward compatibility.
FileStats = FileInfo
_not_imported = []
try:
from pyarrow._hdfs import HadoopFileSystem # noqa
except ImportError:
_not_imported.append("HadoopFileSystem")
try:
from pyarrow._s3fs import ( # noqa
S3FileSystem, S3LogLevel, initialize_s3, finalize_s3,
resolve_s3_region)
except ImportError:
_not_imported.append("S3FileSystem")
else:
initialize_s3()
def __getattr__(name):
if name in _not_imported:
raise ImportError(
"The pyarrow installation is not built with support for "
"'{0}'".format(name)
)
raise AttributeError(
"module 'pyarrow.fs' has no attribute '{0}'".format(name)
)
def _filesystem_from_str(uri):
# instantiate the file system from a URI; if the URI has a path
# component then it will be treated as a path prefix
filesystem, prefix = FileSystem.from_uri(uri)
prefix = filesystem.normalize_path(prefix)
if prefix:
# validate that the prefix is pointing to a directory
prefix_info = filesystem.get_file_info([prefix])[0]
if prefix_info.type != FileType.Directory:
raise ValueError(
"The path component of the filesystem URI must point to a "
"directory but it has a type: `{}`. The path component "
"is `{}` and the given filesystem URI is `{}`".format(
prefix_info.type.name, prefix_info.path, uri
)
)
filesystem = SubTreeFileSystem(prefix, filesystem)
return filesystem
def _ensure_filesystem(
filesystem, use_mmap=False, allow_legacy_filesystem=False
):
if isinstance(filesystem, FileSystem):
return filesystem
elif isinstance(filesystem, str):
if use_mmap:
raise ValueError(
"Specifying to use memory mapping not supported for "
"filesystem specified as an URI string"
)
return _filesystem_from_str(filesystem)
# handle fsspec-compatible filesystems
try:
import fsspec
except ImportError:
pass
else:
if isinstance(filesystem, fsspec.AbstractFileSystem):
if type(filesystem).__name__ == 'LocalFileSystem':
# In case its a simple LocalFileSystem, use native arrow one
return LocalFileSystem(use_mmap=use_mmap)
return PyFileSystem(FSSpecHandler(filesystem))
# map old filesystems to new ones
import pyarrow.filesystem as legacyfs
if isinstance(filesystem, legacyfs.LocalFileSystem):
return LocalFileSystem(use_mmap=use_mmap)
# TODO handle HDFS?
if allow_legacy_filesystem and isinstance(filesystem, legacyfs.FileSystem):
return filesystem
raise TypeError(
"Unrecognized filesystem: {}. `filesystem` argument must be a "
"FileSystem instance or a valid file system URI'".format(
type(filesystem))
)
def _resolve_filesystem_and_path(
path, filesystem=None, allow_legacy_filesystem=False
):
"""
Return filesystem/path from path which could be an URI or a plain
filesystem path.
"""
if not _is_path_like(path):
if filesystem is not None:
raise ValueError(
"'filesystem' passed but the specified path is file-like, so"
" there is nothing to open with 'filesystem'."
)
return filesystem, path
if filesystem is not None:
filesystem = _ensure_filesystem(
filesystem, allow_legacy_filesystem=allow_legacy_filesystem
)
if isinstance(filesystem, LocalFileSystem):
path = _stringify_path(path)
elif not isinstance(path, str):
raise TypeError(
"Expected string path; path-like objects are only allowed "
"with a local filesystem"
)
if not allow_legacy_filesystem:
path = filesystem.normalize_path(path)
return filesystem, path
path = _stringify_path(path)
# if filesystem is not given, try to automatically determine one
# first check if the file exists as a local (relative) file path
# if not then try to parse the path as an URI
filesystem = LocalFileSystem()
try:
file_info = filesystem.get_file_info(path)
except ValueError: # ValueError means path is likely an URI
file_info = None
exists_locally = False
else:
exists_locally = (file_info.type != FileType.NotFound)
# if the file or directory doesn't exist locally, then assume that
# the path is an URI describing the file system as well
if not exists_locally:
try:
filesystem, path = FileSystem.from_uri(path)
except ValueError as e:
# neither an URI nor a locally existing path, so assume that
# local path was given and propagate a nicer file not found error
# instead of a more confusing scheme parsing error
if "empty scheme" not in str(e):
raise
else:
path = filesystem.normalize_path(path)
return filesystem, path
def copy_files(source, destination,
source_filesystem=None, destination_filesystem=None,
*, chunk_size=1024*1024, use_threads=True):
"""
Copy files between FileSystems.
This function allows you to recursively copy directories of files from
one file system to another, such as from S3 to your local machine.
Parameters
----------
source : string
Source file path or URI to a single file or directory.
If a directory, files will be copied recursively from this path.
destination : string
Destination file path or URI. If `source` is a file, `destination`
is also interpreted as the destination file (not directory).
Directories will be created as necessary.
source_filesystem : FileSystem, optional
Source filesystem, needs to be specified if `source` is not a URI,
otherwise inferred.
destination_filesystem : FileSystem, optional
Destination filesystem, needs to be specified if `destination` is not
a URI, otherwise inferred.
chunk_size : int, default 1MB
The maximum size of block to read before flushing to the
destination file. A larger chunk_size will use more memory while
copying but may help accommodate high latency FileSystems.
use_threads : bool, default True
Whether to use multiple threads to accelerate copying.
Examples
--------
Copy an S3 bucket's files to a local directory:
>>> copy_files("s3://your-bucket-name", "local-directory")
Using a FileSystem object:
>>> copy_files("your-bucket-name", "local-directory",
... source_filesystem=S3FileSystem(...))
"""
source_fs, source_path = _resolve_filesystem_and_path(
source, source_filesystem
)
destination_fs, destination_path = _resolve_filesystem_and_path(
destination, destination_filesystem
)
file_info = source_fs.get_file_info(source_path)
if file_info.type == FileType.Directory:
source_sel = FileSelector(source_path, recursive=True)
_copy_files_selector(source_fs, source_sel,
destination_fs, destination_path,
chunk_size, use_threads)
else:
_copy_files(source_fs, source_path,
destination_fs, destination_path,
chunk_size, use_threads)
class FSSpecHandler(FileSystemHandler):
"""
Handler for fsspec-based Python filesystems.
https://filesystem-spec.readthedocs.io/en/latest/index.html
Parameters
----------
fs : FSSpec-compliant filesystem instance.
Examples
--------
>>> PyFileSystem(FSSpecHandler(fsspec_fs))
"""
def __init__(self, fs):
self.fs = fs
def __eq__(self, other):
if isinstance(other, FSSpecHandler):
return self.fs == other.fs
return NotImplemented
def __ne__(self, other):
if isinstance(other, FSSpecHandler):
return self.fs != other.fs
return NotImplemented
def get_type_name(self):
protocol = self.fs.protocol
if isinstance(protocol, list):
protocol = protocol[0]
return "fsspec+{0}".format(protocol)
def normalize_path(self, path):
return path
@staticmethod
def _create_file_info(path, info):
size = info["size"]
if info["type"] == "file":
ftype = FileType.File
elif info["type"] == "directory":
ftype = FileType.Directory
# some fsspec filesystems include a file size for directories
size = None
else:
ftype = FileType.Unknown
return FileInfo(path, ftype, size=size, mtime=info.get("mtime", None))
def get_file_info(self, paths):
infos = []
for path in paths:
try:
info = self.fs.info(path)
except FileNotFoundError:
infos.append(FileInfo(path, FileType.NotFound))
else:
infos.append(self._create_file_info(path, info))
return infos
def get_file_info_selector(self, selector):
if not self.fs.isdir(selector.base_dir):
if self.fs.exists(selector.base_dir):
raise NotADirectoryError(selector.base_dir)
else:
if selector.allow_not_found:
return []
else:
raise FileNotFoundError(selector.base_dir)
if selector.recursive:
maxdepth = None
else:
maxdepth = 1
infos = []
selected_files = self.fs.find(
selector.base_dir, maxdepth=maxdepth, withdirs=True, detail=True
)
for path, info in selected_files.items():
infos.append(self._create_file_info(path, info))
return infos
def create_dir(self, path, recursive):
# mkdir also raises FileNotFoundError when base directory is not found
try:
self.fs.mkdir(path, create_parents=recursive)
except FileExistsError:
pass
def delete_dir(self, path):
self.fs.rm(path, recursive=True)
def _delete_dir_contents(self, path, missing_dir_ok=False):
try:
subpaths = self.fs.listdir(path, detail=False)
except FileNotFoundError:
if missing_dir_ok:
return
raise
for subpath in subpaths:
if self.fs.isdir(subpath):
self.fs.rm(subpath, recursive=True)
elif self.fs.isfile(subpath):
self.fs.rm(subpath)
def delete_dir_contents(self, path, missing_dir_ok):
if path.strip("/") == "":
raise ValueError(
"delete_dir_contents called on path '{}'".format(path))
self._delete_dir_contents(path, missing_dir_ok)
def delete_root_dir_contents(self):
self._delete_dir_contents("/")
def delete_file(self, path):
# fs.rm correctly raises IsADirectoryError when `path` is a directory
# instead of a file and `recursive` is not set to True
if not self.fs.exists(path):
raise FileNotFoundError(path)
self.fs.rm(path)
def move(self, src, dest):
self.fs.mv(src, dest, recursive=True)
def copy_file(self, src, dest):
# fs.copy correctly raises IsADirectoryError when `src` is a directory
# instead of a file
self.fs.copy(src, dest)
# TODO can we read/pass metadata (e.g. Content-Type) in the methods below?
def open_input_stream(self, path):
from pyarrow import PythonFile
if not self.fs.isfile(path):
raise FileNotFoundError(path)
return PythonFile(self.fs.open(path, mode="rb"), mode="r")
def open_input_file(self, path):
from pyarrow import PythonFile
if not self.fs.isfile(path):
raise FileNotFoundError(path)
return PythonFile(self.fs.open(path, mode="rb"), mode="r")
def open_output_stream(self, path, metadata):
from pyarrow import PythonFile
return PythonFile(self.fs.open(path, mode="wb"), mode="w")
def open_append_stream(self, path, metadata):
from pyarrow import PythonFile
return PythonFile(self.fs.open(path, mode="ab"), mode="w")

View File

@ -0,0 +1,568 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from libcpp cimport bool as c_bool, nullptr
from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector as c_vector
from libcpp.unordered_set cimport unordered_set as c_unordered_set
from libc.stdint cimport int64_t, int32_t, uint8_t, uintptr_t
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (Array, DataType, Field, MemoryPool, RecordBatch,
Schema, check_status, pyarrow_wrap_array,
pyarrow_wrap_data_type, ensure_type, _Weakrefable,
pyarrow_wrap_field)
from pyarrow.lib import frombytes
from pyarrow.includes.libgandiva cimport (
CCondition, CGandivaExpression,
CNode, CProjector, CFilter,
CSelectionVector,
CSelectionVector_Mode,
_ensure_selection_mode,
CConfiguration,
CConfigurationBuilder,
TreeExprBuilder_MakeExpression,
TreeExprBuilder_MakeFunction,
TreeExprBuilder_MakeBoolLiteral,
TreeExprBuilder_MakeUInt8Literal,
TreeExprBuilder_MakeUInt16Literal,
TreeExprBuilder_MakeUInt32Literal,
TreeExprBuilder_MakeUInt64Literal,
TreeExprBuilder_MakeInt8Literal,
TreeExprBuilder_MakeInt16Literal,
TreeExprBuilder_MakeInt32Literal,
TreeExprBuilder_MakeInt64Literal,
TreeExprBuilder_MakeFloatLiteral,
TreeExprBuilder_MakeDoubleLiteral,
TreeExprBuilder_MakeStringLiteral,
TreeExprBuilder_MakeBinaryLiteral,
TreeExprBuilder_MakeField,
TreeExprBuilder_MakeIf,
TreeExprBuilder_MakeAnd,
TreeExprBuilder_MakeOr,
TreeExprBuilder_MakeCondition,
TreeExprBuilder_MakeInExpressionInt32,
TreeExprBuilder_MakeInExpressionInt64,
TreeExprBuilder_MakeInExpressionTime32,
TreeExprBuilder_MakeInExpressionTime64,
TreeExprBuilder_MakeInExpressionDate32,
TreeExprBuilder_MakeInExpressionDate64,
TreeExprBuilder_MakeInExpressionTimeStamp,
TreeExprBuilder_MakeInExpressionString,
TreeExprBuilder_MakeInExpressionBinary,
SelectionVector_MakeInt16,
SelectionVector_MakeInt32,
SelectionVector_MakeInt64,
Projector_Make,
Filter_Make,
CFunctionSignature,
GetRegisteredFunctionSignatures)
cdef class Node(_Weakrefable):
cdef:
shared_ptr[CNode] node
def __init__(self):
raise TypeError("Do not call {}'s constructor directly, use the "
"TreeExprBuilder API directly"
.format(self.__class__.__name__))
@staticmethod
cdef create(shared_ptr[CNode] node):
cdef Node self = Node.__new__(Node)
self.node = node
return self
def __str__(self):
return self.node.get().ToString().decode()
def __repr__(self):
type_format = object.__repr__(self)
return '{0}\n{1}'.format(type_format, str(self))
def return_type(self):
return pyarrow_wrap_data_type(self.node.get().return_type())
cdef class Expression(_Weakrefable):
cdef:
shared_ptr[CGandivaExpression] expression
cdef void init(self, shared_ptr[CGandivaExpression] expression):
self.expression = expression
def __str__(self):
return self.expression.get().ToString().decode()
def __repr__(self):
type_format = object.__repr__(self)
return '{0}\n{1}'.format(type_format, str(self))
def root(self):
return Node.create(self.expression.get().root())
def result(self):
return pyarrow_wrap_field(self.expression.get().result())
cdef class Condition(_Weakrefable):
cdef:
shared_ptr[CCondition] condition
def __init__(self):
raise TypeError("Do not call {}'s constructor directly, use the "
"TreeExprBuilder API instead"
.format(self.__class__.__name__))
@staticmethod
cdef create(shared_ptr[CCondition] condition):
cdef Condition self = Condition.__new__(Condition)
self.condition = condition
return self
def __str__(self):
return self.condition.get().ToString().decode()
def __repr__(self):
type_format = object.__repr__(self)
return '{0}\n{1}'.format(type_format, str(self))
def root(self):
return Node.create(self.condition.get().root())
def result(self):
return pyarrow_wrap_field(self.condition.get().result())
cdef class SelectionVector(_Weakrefable):
cdef:
shared_ptr[CSelectionVector] selection_vector
def __init__(self):
raise TypeError("Do not call {}'s constructor directly."
.format(self.__class__.__name__))
@staticmethod
cdef create(shared_ptr[CSelectionVector] selection_vector):
cdef SelectionVector self = SelectionVector.__new__(SelectionVector)
self.selection_vector = selection_vector
return self
def to_array(self):
cdef shared_ptr[CArray] result = self.selection_vector.get().ToArray()
return pyarrow_wrap_array(result)
cdef class Projector(_Weakrefable):
cdef:
shared_ptr[CProjector] projector
MemoryPool pool
def __init__(self):
raise TypeError("Do not call {}'s constructor directly, use "
"make_projector instead"
.format(self.__class__.__name__))
@staticmethod
cdef create(shared_ptr[CProjector] projector, MemoryPool pool):
cdef Projector self = Projector.__new__(Projector)
self.projector = projector
self.pool = pool
return self
@property
def llvm_ir(self):
return self.projector.get().DumpIR().decode()
def evaluate(self, RecordBatch batch, SelectionVector selection=None):
cdef vector[shared_ptr[CArray]] results
if selection is None:
check_status(self.projector.get().Evaluate(
batch.sp_batch.get()[0], self.pool.pool, &results))
else:
check_status(
self.projector.get().Evaluate(
batch.sp_batch.get()[0], selection.selection_vector.get(),
self.pool.pool, &results))
cdef shared_ptr[CArray] result
arrays = []
for result in results:
arrays.append(pyarrow_wrap_array(result))
return arrays
cdef class Filter(_Weakrefable):
cdef:
shared_ptr[CFilter] filter
def __init__(self):
raise TypeError("Do not call {}'s constructor directly, use "
"make_filter instead"
.format(self.__class__.__name__))
@staticmethod
cdef create(shared_ptr[CFilter] filter):
cdef Filter self = Filter.__new__(Filter)
self.filter = filter
return self
@property
def llvm_ir(self):
return self.filter.get().DumpIR().decode()
def evaluate(self, RecordBatch batch, MemoryPool pool, dtype='int32'):
cdef:
DataType type = ensure_type(dtype)
shared_ptr[CSelectionVector] selection
if type.id == _Type_INT16:
check_status(SelectionVector_MakeInt16(
batch.num_rows, pool.pool, &selection))
elif type.id == _Type_INT32:
check_status(SelectionVector_MakeInt32(
batch.num_rows, pool.pool, &selection))
elif type.id == _Type_INT64:
check_status(SelectionVector_MakeInt64(
batch.num_rows, pool.pool, &selection))
else:
raise ValueError("'dtype' of the selection vector should be "
"one of 'int16', 'int32' and 'int64'.")
check_status(self.filter.get().Evaluate(
batch.sp_batch.get()[0], selection))
return SelectionVector.create(selection)
cdef class TreeExprBuilder(_Weakrefable):
def make_literal(self, value, dtype):
cdef:
DataType type = ensure_type(dtype)
shared_ptr[CNode] r
if type.id == _Type_BOOL:
r = TreeExprBuilder_MakeBoolLiteral(value)
elif type.id == _Type_UINT8:
r = TreeExprBuilder_MakeUInt8Literal(value)
elif type.id == _Type_UINT16:
r = TreeExprBuilder_MakeUInt16Literal(value)
elif type.id == _Type_UINT32:
r = TreeExprBuilder_MakeUInt32Literal(value)
elif type.id == _Type_UINT64:
r = TreeExprBuilder_MakeUInt64Literal(value)
elif type.id == _Type_INT8:
r = TreeExprBuilder_MakeInt8Literal(value)
elif type.id == _Type_INT16:
r = TreeExprBuilder_MakeInt16Literal(value)
elif type.id == _Type_INT32:
r = TreeExprBuilder_MakeInt32Literal(value)
elif type.id == _Type_INT64:
r = TreeExprBuilder_MakeInt64Literal(value)
elif type.id == _Type_FLOAT:
r = TreeExprBuilder_MakeFloatLiteral(value)
elif type.id == _Type_DOUBLE:
r = TreeExprBuilder_MakeDoubleLiteral(value)
elif type.id == _Type_STRING:
r = TreeExprBuilder_MakeStringLiteral(value.encode('UTF-8'))
elif type.id == _Type_BINARY:
r = TreeExprBuilder_MakeBinaryLiteral(value)
else:
raise TypeError("Didn't recognize dtype " + str(dtype))
return Node.create(r)
def make_expression(self, Node root_node, Field return_field):
cdef shared_ptr[CGandivaExpression] r = TreeExprBuilder_MakeExpression(
root_node.node, return_field.sp_field)
cdef Expression expression = Expression()
expression.init(r)
return expression
def make_function(self, name, children, DataType return_type):
cdef c_vector[shared_ptr[CNode]] c_children
cdef Node child
for child in children:
c_children.push_back(child.node)
cdef shared_ptr[CNode] r = TreeExprBuilder_MakeFunction(
name.encode(), c_children, return_type.sp_type)
return Node.create(r)
def make_field(self, Field field):
cdef shared_ptr[CNode] r = TreeExprBuilder_MakeField(field.sp_field)
return Node.create(r)
def make_if(self, Node condition, Node this_node,
Node else_node, DataType return_type):
cdef shared_ptr[CNode] r = TreeExprBuilder_MakeIf(
condition.node, this_node.node, else_node.node,
return_type.sp_type)
return Node.create(r)
def make_and(self, children):
cdef c_vector[shared_ptr[CNode]] c_children
cdef Node child
for child in children:
c_children.push_back(child.node)
cdef shared_ptr[CNode] r = TreeExprBuilder_MakeAnd(c_children)
return Node.create(r)
def make_or(self, children):
cdef c_vector[shared_ptr[CNode]] c_children
cdef Node child
for child in children:
c_children.push_back(child.node)
cdef shared_ptr[CNode] r = TreeExprBuilder_MakeOr(c_children)
return Node.create(r)
def _make_in_expression_int32(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[int32_t] c_values
cdef int32_t v
for v in values:
c_values.insert(v)
r = TreeExprBuilder_MakeInExpressionInt32(node.node, c_values)
return Node.create(r)
def _make_in_expression_int64(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[int64_t] c_values
cdef int64_t v
for v in values:
c_values.insert(v)
r = TreeExprBuilder_MakeInExpressionInt64(node.node, c_values)
return Node.create(r)
def _make_in_expression_time32(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[int32_t] c_values
cdef int32_t v
for v in values:
c_values.insert(v)
r = TreeExprBuilder_MakeInExpressionTime32(node.node, c_values)
return Node.create(r)
def _make_in_expression_time64(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[int64_t] c_values
cdef int64_t v
for v in values:
c_values.insert(v)
r = TreeExprBuilder_MakeInExpressionTime64(node.node, c_values)
return Node.create(r)
def _make_in_expression_date32(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[int32_t] c_values
cdef int32_t v
for v in values:
c_values.insert(v)
r = TreeExprBuilder_MakeInExpressionDate32(node.node, c_values)
return Node.create(r)
def _make_in_expression_date64(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[int64_t] c_values
cdef int64_t v
for v in values:
c_values.insert(v)
r = TreeExprBuilder_MakeInExpressionDate64(node.node, c_values)
return Node.create(r)
def _make_in_expression_timestamp(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[int64_t] c_values
cdef int64_t v
for v in values:
c_values.insert(v)
r = TreeExprBuilder_MakeInExpressionTimeStamp(node.node, c_values)
return Node.create(r)
def _make_in_expression_binary(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[c_string] c_values
cdef c_string v
for v in values:
c_values.insert(v)
r = TreeExprBuilder_MakeInExpressionString(node.node, c_values)
return Node.create(r)
def _make_in_expression_string(self, Node node, values):
cdef shared_ptr[CNode] r
cdef c_unordered_set[c_string] c_values
cdef c_string _v
for v in values:
_v = v.encode('UTF-8')
c_values.insert(_v)
r = TreeExprBuilder_MakeInExpressionString(node.node, c_values)
return Node.create(r)
def make_in_expression(self, Node node, values, dtype):
cdef DataType type = ensure_type(dtype)
if type.id == _Type_INT32:
return self._make_in_expression_int32(node, values)
elif type.id == _Type_INT64:
return self._make_in_expression_int64(node, values)
elif type.id == _Type_TIME32:
return self._make_in_expression_time32(node, values)
elif type.id == _Type_TIME64:
return self._make_in_expression_time64(node, values)
elif type.id == _Type_TIMESTAMP:
return self._make_in_expression_timestamp(node, values)
elif type.id == _Type_DATE32:
return self._make_in_expression_date32(node, values)
elif type.id == _Type_DATE64:
return self._make_in_expression_date64(node, values)
elif type.id == _Type_BINARY:
return self._make_in_expression_binary(node, values)
elif type.id == _Type_STRING:
return self._make_in_expression_string(node, values)
else:
raise TypeError("Data type " + str(dtype) + " not supported.")
def make_condition(self, Node condition):
cdef shared_ptr[CCondition] r = TreeExprBuilder_MakeCondition(
condition.node)
return Condition.create(r)
cpdef make_projector(Schema schema, children, MemoryPool pool,
str selection_mode="NONE"):
"""
Construct a projection using expressions.
A projector is built for a specific schema and vector of expressions.
Once the projector is built, it can be used to evaluate many row batches.
Parameters
----------
schema : pyarrow.Schema
Schema for the record batches, and the expressions.
children : list[pyarrow.gandiva.Expression]
List of projectable expression objects.
pool : pyarrow.MemoryPool
Memory pool used to allocate output arrays.
selection_mode : str, default "NONE"
Possible values are NONE, UINT16, UINT32, UINT64.
Returns
-------
Projector instance
"""
cdef:
Expression child
c_vector[shared_ptr[CGandivaExpression]] c_children
shared_ptr[CProjector] result
for child in children:
c_children.push_back(child.expression)
check_status(
Projector_Make(schema.sp_schema, c_children,
_ensure_selection_mode(selection_mode),
CConfigurationBuilder.DefaultConfiguration(),
&result))
return Projector.create(result, pool)
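
A usage sketch for the projector API above (assuming PyArrow was built with Gandiva enabled and is importable as pyarrow.gandiva):

import pyarrow as pa
import pyarrow.gandiva as gandiva

table = pa.table({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
builder = gandiva.TreeExprBuilder()
node_a = builder.make_field(table.schema.field("a"))
node_b = builder.make_field(table.schema.field("b"))
add_node = builder.make_function("add", [node_a, node_b], pa.float64())
expr = builder.make_expression(add_node, pa.field("a_plus_b", pa.float64()))
projector = gandiva.make_projector(table.schema, [expr], pa.default_memory_pool())

# Evaluate the projection batch by batch; each call returns a list of
# arrays, one per expression.
for batch in table.to_batches():
    print(projector.evaluate(batch))
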
cpdef make_filter(Schema schema, Condition condition):
"""
Construct a filter based on a condition.
A filter is built for a specific schema and condition. Once the filter is
built, it can be used to evaluate many row batches.
Parameters
----------
schema : pyarrow.Schema
Schema for the record batches, and the condition.
condition : pyarrow.gandiva.Condition
Filter condition.
Returns
-------
Filter instance
"""
cdef shared_ptr[CFilter] result
check_status(
Filter_Make(schema.sp_schema, condition.condition, &result))
return Filter.create(result)
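
A corresponding sketch for the filter API (same assumptions as the projector example above):

import pyarrow as pa
import pyarrow.gandiva as gandiva

table = pa.table({"a": [1.0, 2.0, 3.0]})
builder = gandiva.TreeExprBuilder()
node_a = builder.make_field(table.schema.field("a"))
threshold = builder.make_literal(1.5, pa.float64())
condition = builder.make_condition(
    builder.make_function("greater_than", [node_a, threshold], pa.bool_()))
filt = gandiva.make_filter(table.schema, condition)

# Filter.evaluate returns a SelectionVector with the indices of the rows
# that pass the condition.
for batch in table.to_batches():
    selection = filt.evaluate(batch, pa.default_memory_pool())
    print(selection.to_array())
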
cdef class FunctionSignature(_Weakrefable):
"""
Signature of a Gandiva function including name, parameter types
and return type.
"""
cdef:
shared_ptr[CFunctionSignature] signature
def __init__(self):
raise TypeError("Do not call {}'s constructor directly."
.format(self.__class__.__name__))
@staticmethod
cdef create(shared_ptr[CFunctionSignature] signature):
cdef FunctionSignature self = FunctionSignature.__new__(
FunctionSignature)
self.signature = signature
return self
def return_type(self):
return pyarrow_wrap_data_type(self.signature.get().ret_type())
def param_types(self):
result = []
cdef vector[shared_ptr[CDataType]] types = \
self.signature.get().param_types()
for t in types:
result.append(pyarrow_wrap_data_type(t))
return result
def name(self):
return self.signature.get().base_name().decode()
def __repr__(self):
signature = self.signature.get().ToString().decode()
return "FunctionSignature(" + signature + ")"
def get_registered_function_signatures():
"""
Return the list of function signatures registered in Gandiva's
ExpressionRegistry.
Returns
-------
registry: a list of registered function signatures
"""
results = []
cdef vector[shared_ptr[CFunctionSignature]] signatures = \
GetRegisteredFunctionSignatures()
for signature in signatures:
results.append(FunctionSignature.create(signature))
return results
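
The registry accessor above can be used to discover which functions the local Gandiva build supports, for example:

import pyarrow.gandiva as gandiva

for sig in gandiva.get_registered_function_signatures()[:5]:
    params = [str(t) for t in sig.param_types()]
    print(sig.name(), params, "->", sig.return_type())
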

View File

@ -0,0 +1,240 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import posixpath
import sys
import warnings
from pyarrow.util import implements, _DEPR_MSG
from pyarrow.filesystem import FileSystem
import pyarrow._hdfsio as _hdfsio
class HadoopFileSystem(_hdfsio.HadoopFileSystem, FileSystem):
"""
DEPRECATED: FileSystem interface for HDFS cluster.
See pyarrow.hdfs.connect for full connection details
.. deprecated:: 2.0
``pyarrow.hdfs.HadoopFileSystem`` is deprecated,
please use ``pyarrow.fs.HadoopFileSystem`` instead.
"""
def __init__(self, host="default", port=0, user=None, kerb_ticket=None,
driver='libhdfs', extra_conf=None):
warnings.warn(
_DEPR_MSG.format(
"hdfs.HadoopFileSystem", "2.0.0", "fs.HadoopFileSystem"),
FutureWarning, stacklevel=2)
if driver == 'libhdfs':
_maybe_set_hadoop_classpath()
self._connect(host, port, user, kerb_ticket, extra_conf)
def __reduce__(self):
return (HadoopFileSystem, (self.host, self.port, self.user,
self.kerb_ticket, self.extra_conf))
def _isfilestore(self):
"""
Return True if this is a Unix-style file store with directories.
"""
return True
@implements(FileSystem.isdir)
def isdir(self, path):
return super().isdir(path)
@implements(FileSystem.isfile)
def isfile(self, path):
return super().isfile(path)
@implements(FileSystem.delete)
def delete(self, path, recursive=False):
return super().delete(path, recursive)
def mkdir(self, path, **kwargs):
"""
Create directory in HDFS.
Parameters
----------
path : str
Directory path to create, including any parent directories.
Notes
-----
libhdfs does not support create_parents=False, so we ignore this here
"""
return super().mkdir(path)
@implements(FileSystem.rename)
def rename(self, path, new_path):
return super().rename(path, new_path)
@implements(FileSystem.exists)
def exists(self, path):
return super().exists(path)
def ls(self, path, detail=False):
"""
Retrieve directory contents and metadata, if requested.
Parameters
----------
path : str
HDFS path to retrieve contents of.
detail : bool, default False
If False, only return list of paths.
Returns
-------
result : list of dicts (detail=True) or strings (detail=False)
"""
return super().ls(path, detail)
def walk(self, top_path):
"""
Directory tree generator for HDFS, like os.walk.
Parameters
----------
top_path : str
Root directory for tree traversal.
Returns
-------
Generator yielding 3-tuple (dirpath, dirnames, filenames)
"""
contents = self.ls(top_path, detail=True)
directories, files = _libhdfs_walk_files_dirs(top_path, contents)
yield top_path, directories, files
for dirname in directories:
yield from self.walk(self._path_join(top_path, dirname))
def _maybe_set_hadoop_classpath():
import re
if re.search(r'hadoop-common[^/]+.jar', os.environ.get('CLASSPATH', '')):
return
if 'HADOOP_HOME' in os.environ:
if sys.platform != 'win32':
classpath = _derive_hadoop_classpath()
else:
hadoop_bin = '{}/bin/hadoop'.format(os.environ['HADOOP_HOME'])
classpath = _hadoop_classpath_glob(hadoop_bin)
else:
classpath = _hadoop_classpath_glob('hadoop')
os.environ['CLASSPATH'] = classpath.decode('utf-8')
def _derive_hadoop_classpath():
import subprocess
find_args = ('find', '-L', os.environ['HADOOP_HOME'], '-name', '*.jar')
find = subprocess.Popen(find_args, stdout=subprocess.PIPE)
xargs_echo = subprocess.Popen(('xargs', 'echo'),
stdin=find.stdout,
stdout=subprocess.PIPE)
jars = subprocess.check_output(('tr', "' '", "':'"),
stdin=xargs_echo.stdout)
hadoop_conf = os.environ["HADOOP_CONF_DIR"] \
if "HADOOP_CONF_DIR" in os.environ \
else os.environ["HADOOP_HOME"] + "/etc/hadoop"
return (hadoop_conf + ":").encode("utf-8") + jars
def _hadoop_classpath_glob(hadoop_bin):
import subprocess
hadoop_classpath_args = (hadoop_bin, 'classpath', '--glob')
return subprocess.check_output(hadoop_classpath_args)
def _libhdfs_walk_files_dirs(top_path, contents):
files = []
directories = []
for c in contents:
scrubbed_name = posixpath.split(c['name'])[1]
if c['kind'] == 'file':
files.append(scrubbed_name)
else:
directories.append(scrubbed_name)
return directories, files
def connect(host="default", port=0, user=None, kerb_ticket=None,
extra_conf=None):
"""
DEPRECATED: Connect to an HDFS cluster.
All parameters are optional and should only be set if the defaults need
to be overridden.
Authentication should be automatic if the HDFS cluster uses Kerberos.
However, if a username is specified, then the ticket cache will likely
be required.
.. deprecated:: 2.0
``pyarrow.hdfs.connect`` is deprecated,
please use ``pyarrow.fs.HadoopFileSystem`` instead.
Parameters
----------
host : NameNode. Set to "default" for fs.defaultFS from core-site.xml.
port : NameNode's port. Set to 0 for default or logical (HA) nodes.
user : Username when connecting to HDFS; None implies login user.
kerb_ticket : Path to Kerberos ticket cache.
extra_conf : dict, default None
extra Key/Value pairs for config; Will override any
hdfs-site.xml properties
Notes
-----
The first time you call this method, it will take longer than usual due
to JNI spin-up time.
Returns
-------
filesystem : HadoopFileSystem
"""
warnings.warn(
_DEPR_MSG.format("hdfs.connect", "2.0.0", "fs.HadoopFileSystem"),
FutureWarning, stacklevel=2
)
return _connect(
host=host, port=port, user=user, kerb_ticket=kerb_ticket,
extra_conf=extra_conf
)
def _connect(host="default", port=0, user=None, kerb_ticket=None,
extra_conf=None):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
fs = HadoopFileSystem(host=host, port=port, user=user,
kerb_ticket=kerb_ticket,
extra_conf=extra_conf)
return fs
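
Since the module above is deprecated in favor of pyarrow.fs, here is a minimal sketch of the recommended replacement (assuming a reachable HDFS NameNode and a correctly configured CLASSPATH/libhdfs, as discussed above):

from pyarrow import fs

# Connects using fs.defaultFS from core-site.xml when host="default".
hdfs = fs.HadoopFileSystem("default", user=None)
infos = hdfs.get_file_info(fs.FileSelector("/", recursive=False))
for info in infos:
    print(info.path, info.type)
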

View File

@ -0,0 +1,128 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "tensorflow/core/framework/op.h"
#include "arrow/type.h"
// These utilities are supposed to be included in TensorFlow operators
// that need to be compiled separately from Arrow because of ABI issues.
// They therefore need to be header-only.
namespace arrow {
namespace adapters {
namespace tensorflow {
Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr<DataType>* out) {
switch (dtype) {
case ::tensorflow::DT_BOOL:
*out = arrow::boolean();
break;
case ::tensorflow::DT_FLOAT:
*out = arrow::float32();
break;
case ::tensorflow::DT_DOUBLE:
*out = arrow::float64();
break;
case ::tensorflow::DT_HALF:
*out = arrow::float16();
break;
case ::tensorflow::DT_INT8:
*out = arrow::int8();
break;
case ::tensorflow::DT_INT16:
*out = arrow::int16();
break;
case ::tensorflow::DT_INT32:
*out = arrow::int32();
break;
case ::tensorflow::DT_INT64:
*out = arrow::int64();
break;
case ::tensorflow::DT_UINT8:
*out = arrow::uint8();
break;
case ::tensorflow::DT_UINT16:
*out = arrow::uint16();
break;
case ::tensorflow::DT_UINT32:
*out = arrow::uint32();
break;
case ::tensorflow::DT_UINT64:
*out = arrow::uint64();
break;
default:
return Status::TypeError("TensorFlow data type is not supported");
}
return Status::OK();
}
Status GetTensorFlowType(std::shared_ptr<DataType> dtype, ::tensorflow::DataType* out) {
switch (dtype->id()) {
case Type::BOOL:
*out = ::tensorflow::DT_BOOL;
break;
case Type::UINT8:
*out = ::tensorflow::DT_UINT8;
break;
case Type::INT8:
*out = ::tensorflow::DT_INT8;
break;
case Type::UINT16:
*out = ::tensorflow::DT_UINT16;
break;
case Type::INT16:
*out = ::tensorflow::DT_INT16;
break;
case Type::UINT32:
*out = ::tensorflow::DT_UINT32;
break;
case Type::INT32:
*out = ::tensorflow::DT_INT32;
break;
case Type::UINT64:
*out = ::tensorflow::DT_UINT64;
break;
case Type::INT64:
*out = ::tensorflow::DT_INT64;
break;
case Type::HALF_FLOAT:
*out = ::tensorflow::DT_HALF;
break;
case Type::FLOAT:
*out = ::tensorflow::DT_FLOAT;
break;
case Type::DOUBLE:
*out = ::tensorflow::DT_DOUBLE;
break;
default:
return Status::TypeError("Arrow data type is not supported");
}
return arrow::Status::OK();
}
} // namespace tensorflow
} // namespace adapters
} // namespace arrow
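
The header above maps TensorFlow dtypes to Arrow types and back for operators compiled against both libraries. From Python, the same correspondence can be illustrated through NumPy as the common denominator (a sketch assuming TensorFlow 2.x eager mode and pyarrow are both installed):

import pyarrow as pa
import tensorflow as tf

tensor = tf.constant([1.0, 2.0, 3.0], dtype=tf.float32)
np_values = tensor.numpy()                          # DT_FLOAT -> numpy float32
arrow_type = pa.from_numpy_dtype(np_values.dtype)   # float32 -> arrow float
print(arrow_type)
print(pa.array(np_values).type)
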

View File

@ -0,0 +1,46 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Coarse public API while the library is in development
#pragma once
#include "arrow/array.h" // IYWU pragma: export
#include "arrow/array/concatenate.h" // IYWU pragma: export
#include "arrow/buffer.h" // IYWU pragma: export
#include "arrow/builder.h" // IYWU pragma: export
#include "arrow/chunked_array.h" // IYWU pragma: export
#include "arrow/compare.h" // IYWU pragma: export
#include "arrow/config.h" // IYWU pragma: export
#include "arrow/datum.h" // IYWU pragma: export
#include "arrow/extension_type.h" // IYWU pragma: export
#include "arrow/memory_pool.h" // IYWU pragma: export
#include "arrow/pretty_print.h" // IYWU pragma: export
#include "arrow/record_batch.h" // IYWU pragma: export
#include "arrow/result.h" // IYWU pragma: export
#include "arrow/status.h" // IYWU pragma: export
#include "arrow/table.h" // IYWU pragma: export
#include "arrow/table_builder.h" // IYWU pragma: export
#include "arrow/tensor.h" // IYWU pragma: export
#include "arrow/type.h" // IYWU pragma: export
#include "arrow/util/key_value_metadata.h" // IWYU pragma: export
#include "arrow/visit_array_inline.h" // IYWU pragma: export
#include "arrow/visit_scalar_inline.h" // IYWU pragma: export
#include "arrow/visitor.h" // IYWU pragma: export
/// \brief Top-level namespace for Apache Arrow C++ API
namespace arrow {}

View File

@ -0,0 +1,44 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Kitchen-sink public API for arrow::Array data structures. C++ library code
// (especially header files) in Apache Arrow should use more specific headers
// unless it's a file that uses most or all Array types in which case using
// arrow/array.h is fine.
#pragma once
/// \defgroup numeric-arrays Concrete classes for numeric arrays
/// @{
/// @}
/// \defgroup binary-arrays Concrete classes for binary/string arrays
/// @{
/// @}
/// \defgroup nested-arrays Concrete classes for nested arrays
/// @{
/// @}
#include "arrow/array/array_base.h" // IWYU pragma: keep
#include "arrow/array/array_binary.h" // IWYU pragma: keep
#include "arrow/array/array_decimal.h" // IWYU pragma: keep
#include "arrow/array/array_dict.h" // IWYU pragma: keep
#include "arrow/array/array_nested.h" // IWYU pragma: keep
#include "arrow/array/array_primitive.h" // IWYU pragma: keep
#include "arrow/array/data.h" // IWYU pragma: keep
#include "arrow/array/util.h" // IWYU pragma: keep

View File

@ -0,0 +1,264 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <iosfwd>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
#include "arrow/visitor.h"
namespace arrow {
// ----------------------------------------------------------------------
// User array accessor types
/// \brief Array base type
/// Immutable data array with some logical type and some length.
///
/// Any memory is owned by the respective Buffer instance (or its parents).
///
/// The base class is only required to have a null bitmap buffer if the null
/// count is greater than 0
///
/// If known, the null count can be provided in the base Array constructor. If
/// the null count is not known, pass -1 to indicate that the null count is to
/// be computed on the first call to null_count()
class ARROW_EXPORT Array {
public:
virtual ~Array() = default;
/// \brief Return true if value at index is null. Does not boundscheck
bool IsNull(int64_t i) const {
return null_bitmap_data_ != NULLPTR
? !bit_util::GetBit(null_bitmap_data_, i + data_->offset)
: data_->null_count == data_->length;
}
/// \brief Return true if value at index is valid (not null). Does not
/// boundscheck
bool IsValid(int64_t i) const {
return null_bitmap_data_ != NULLPTR
? bit_util::GetBit(null_bitmap_data_, i + data_->offset)
: data_->null_count != data_->length;
}
/// \brief Return a Scalar containing the value of this array at i
Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;
/// Size in the number of elements this array contains.
int64_t length() const { return data_->length; }
/// A relative position into another array's data, to enable zero-copy
/// slicing. This value defaults to zero
int64_t offset() const { return data_->offset; }
/// The number of null entries in the array. If the null count was not known
/// at time of construction (and set to a negative value), then the null
/// count will be computed and cached on the first invocation of this
/// function
int64_t null_count() const;
std::shared_ptr<DataType> type() const { return data_->type; }
Type::type type_id() const { return data_->type->id(); }
/// Buffer for the validity (null) bitmap, if any. Note that Union types
/// never have a null bitmap.
///
/// Note that for `null_count == 0` or for null type, this will be null.
/// This buffer does not account for any slice offset
const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
/// Raw pointer to the null bitmap.
///
/// Note that for `null_count == 0` or for null type, this will be null.
/// This buffer does not account for any slice offset
const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
/// Equality comparison with another array
bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
bool Equals(const std::shared_ptr<Array>& arr,
const EqualOptions& = EqualOptions::Defaults()) const;
/// \brief Return the formatted unified diff of arrow::Diff between this
/// Array and another Array
std::string Diff(const Array& other) const;
/// Approximate equality comparison with another array
///
/// epsilon is only used if this is FloatArray or DoubleArray
bool ApproxEquals(const std::shared_ptr<Array>& arr,
const EqualOptions& = EqualOptions::Defaults()) const;
bool ApproxEquals(const Array& arr,
const EqualOptions& = EqualOptions::Defaults()) const;
/// Compare if the range of slots specified are equal for the given array and
/// this array. end_idx is exclusive. This method does not bounds check.
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
const Array& other,
const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
const std::shared_ptr<Array>& other,
const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
int64_t other_start_idx,
const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
int64_t end_idx, int64_t other_start_idx,
const EqualOptions& = EqualOptions::Defaults()) const;
/// \brief Apply the ArrayVisitor::Visit() method specialized to the array type
Status Accept(ArrayVisitor* visitor) const;
/// Construct a zero-copy view of this array with the given type.
///
/// This method checks if the types are layout-compatible.
/// Nested types are traversed in depth-first order. Data buffers must have
/// the same item sizes, even though the logical types may be different.
/// An error is returned if the types are not layout-compatible.
Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;
/// Construct a zero-copy slice of the array with the indicated offset and
/// length
///
/// \param[in] offset the position of the first element in the constructed
/// slice
/// \param[in] length the length of the slice. If there are not enough
/// elements in the array, the length will be adjusted accordingly
///
/// \return a new object wrapped in std::shared_ptr<Array>
std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;
/// Slice from offset until end of the array
std::shared_ptr<Array> Slice(int64_t offset) const;
/// Input-checking variant of Array::Slice
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
/// Input-checking variant of Array::Slice
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
const std::shared_ptr<ArrayData>& data() const { return data_; }
int num_fields() const { return static_cast<int>(data_->child_data.size()); }
/// \return PrettyPrint representation of array suitable for debugging
std::string ToString() const;
/// \brief Perform cheap validation checks to determine obvious inconsistencies
/// within the array's internal data.
///
/// This is O(k) where k is the number of descendants.
///
/// \return Status
Status Validate() const;
/// \brief Perform extensive validation checks to determine inconsistencies
/// within the array's internal data.
///
/// This is potentially O(k*n) where k is the number of descendants and n
/// is the array length.
///
/// \return Status
Status ValidateFull() const;
protected:
Array() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
std::shared_ptr<ArrayData> data_;
const uint8_t* null_bitmap_data_ = NULLPTR;
/// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
if (data->buffers.size() > 0) {
null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
} else {
null_bitmap_data_ = NULLPTR;
}
data_ = data;
}
private:
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
ARROW_EXPORT friend void PrintTo(const Array& x, std::ostream* os);
};
static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
os << x.ToString();
return os;
}
/// Base class for non-nested arrays
class ARROW_EXPORT FlatArray : public Array {
protected:
using Array::Array;
};
/// Base class for arrays of fixed-size logical types
class ARROW_EXPORT PrimitiveArray : public FlatArray {
public:
PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// Does not account for any slice offset
std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }
protected:
PrimitiveArray() : raw_values_(NULLPTR) {}
void SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
}
explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
const uint8_t* raw_values_;
};
/// Degenerate null type Array
class ARROW_EXPORT NullArray : public FlatArray {
public:
using TypeClass = NullType;
explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
explicit NullArray(int64_t length);
private:
void SetData(const std::shared_ptr<ArrayData>& data) {
null_bitmap_data_ = NULLPTR;
data->null_count = data->length;
data_ = data;
}
};
} // namespace arrow
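
The accessors declared above (null_count, IsNull, Slice, type) surface on pyarrow.Array in the Python bindings; a small sketch of the equivalent operations (the C++-to-Python mapping itself is assumed, not shown in this header):

import pyarrow as pa

arr = pa.array([1, None, 3], type=pa.int64())
print(arr.null_count)   # 1
print(arr.is_null())    # boolean array marking the null slots
print(arr.slice(1, 2))  # zero-copy slice: [null, 3]
print(arr.type)         # int64
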

View File

@ -0,0 +1,269 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor classes for Binary, LargeBinary, String, LargeString,
// FixedSizeBinary
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h" // IWYU pragma: export
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup binary-arrays
///
/// @{
// ----------------------------------------------------------------------
// Binary and String
/// Base class for variable-sized binary arrays, regardless of offset size
/// and logical interpretation.
template <typename TYPE>
class BaseBinaryArray : public FlatArray {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
/// Return the pointer to the given element's bytes
// XXX should GetValue(int64_t i) return a string_view?
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
// Account for base offset
i += data_->offset;
const offset_type pos = raw_value_offsets_[i];
*out_length = raw_value_offsets_[i + 1] - pos;
return raw_data_ + pos;
}
/// \brief Get binary value as a string_view
///
/// \param i the value index
/// \return the view over the selected value
util::string_view GetView(int64_t i) const {
// Account for base offset
i += data_->offset;
const offset_type pos = raw_value_offsets_[i];
return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
raw_value_offsets_[i + 1] - pos);
}
util::optional<util::string_view> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
/// \brief Get binary value as a string_view
/// Provided for consistency with other arrays.
///
/// \param i the value index
/// \return the view over the selected value
util::string_view Value(int64_t i) const { return GetView(i); }
/// \brief Get binary value as a std::string
///
/// \param i the value index
/// \return the value copied into a std::string
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
const offset_type* raw_value_offsets() const {
return raw_value_offsets_ + data_->offset;
}
const uint8_t* raw_data() const { return raw_data_; }
/// \brief Return the data buffer absolute offset of the data for the value
/// at the passed index.
///
/// Does not perform boundschecking
offset_type value_offset(int64_t i) const {
return raw_value_offsets_[i + data_->offset];
}
/// \brief Return the length of the data for the value at the passed index.
///
/// Does not perform boundschecking
offset_type value_length(int64_t i) const {
i += data_->offset;
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
/// \brief Return the total length of the memory in the data buffer
/// referenced by this array. If the array has been sliced then this may be
/// less than the size of the data buffer (data_->buffers[2]).
offset_type total_values_length() const {
if (data_->length > 0) {
return raw_value_offsets_[data_->length + data_->offset] -
raw_value_offsets_[data_->offset];
} else {
return 0;
}
}
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
// For subclasses
BaseBinaryArray() = default;
// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
}
const offset_type* raw_value_offsets_ = NULLPTR;
const uint8_t* raw_data_ = NULLPTR;
};
/// Concrete Array class for variable-size binary data
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
public:
explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
protected:
// For subclasses such as StringArray
BinaryArray() : BaseBinaryArray() {}
};
/// Concrete Array class for variable-size string (utf-8) data
class ARROW_EXPORT StringArray : public BinaryArray {
public:
using TypeClass = StringType;
explicit StringArray(const std::shared_ptr<ArrayData>& data);
StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};
/// Concrete Array class for large variable-size binary data
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
public:
explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
protected:
// For subclasses such as LargeStringArray
LargeBinaryArray() : BaseBinaryArray() {}
};
/// Concrete Array class for large variable-size string (utf-8) data
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
public:
using TypeClass = LargeStringType;
explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};
// ----------------------------------------------------------------------
// Fixed width binary
/// Concrete Array class for fixed-size binary data
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
public:
using TypeClass = FixedSizeBinaryType;
using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
const uint8_t* GetValue(int64_t i) const;
const uint8_t* Value(int64_t i) const { return GetValue(i); }
util::string_view GetView(int64_t i) const {
return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width());
}
util::optional<util::string_view> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
std::string GetString(int64_t i) const { return std::string(GetView(i)); }
int32_t byte_width() const { return byte_width_; }
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
void SetData(const std::shared_ptr<ArrayData>& data) {
this->PrimitiveArray::SetData(data);
byte_width_ =
internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
}
int32_t byte_width_;
};
/// @}
} // namespace arrow
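
As described above, variable-size binary/string arrays keep a validity bitmap, an offsets buffer, and a contiguous data buffer. The layout can be inspected from Python (a sketch; for string arrays the buffer order is validity, offsets, data):

import pyarrow as pa

arr = pa.array(["foo", None, "barbaz"])
validity, offsets, data = arr.buffers()
print(data.to_pybytes())     # b'foobarbaz'
print(arr.offset, len(arr))  # slice offset and length
print(arr[2].as_py())        # 'barbaz'
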

View File

@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include "arrow/array/array_binary.h"
#include "arrow/array/data.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-arrays
///
/// @{
// ----------------------------------------------------------------------
// Decimal128Array
/// Concrete Array class for 128-bit decimal data
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
public:
using TypeClass = Decimal128Type;
using FixedSizeBinaryArray::FixedSizeBinaryArray;
/// \brief Construct Decimal128Array from ArrayData instance
explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);
std::string FormatValue(int64_t i) const;
};
// Backward compatibility
using DecimalArray = Decimal128Array;
// ----------------------------------------------------------------------
// Decimal256Array
/// Concrete Array class for 256-bit decimal data
class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
public:
using TypeClass = Decimal256Type;
using FixedSizeBinaryArray::FixedSizeBinaryArray;
/// \brief Construct Decimal256Array from ArrayData instance
explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
std::string FormatValue(int64_t i) const;
};
/// @}
} // namespace arrow
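
Decimal128Array and Decimal256Array are fixed-size binary arrays whose type carries a precision and scale; from Python, for example:

import pyarrow as pa
from decimal import Decimal

arr = pa.array([Decimal("123.45"), None], type=pa.decimal128(7, 2))
print(arr.type)        # decimal128(7, 2)
print(arr[0].as_py())  # Decimal('123.45')
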

View File

@ -0,0 +1,180 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
// ----------------------------------------------------------------------
// DictionaryArray
/// \brief Array type for dictionary-encoded data with a
/// data-dependent dictionary
///
/// A dictionary array contains an array of non-negative integers (the
/// "dictionary indices") along with a data type containing a "dictionary"
/// corresponding to the distinct values represented in the data.
///
/// For example, the array
///
/// ["foo", "bar", "foo", "bar", "foo", "bar"]
///
/// with dictionary ["bar", "foo"], would have dictionary array representation
///
/// indices: [1, 0, 1, 0, 1, 0]
/// dictionary: ["bar", "foo"]
///
/// The indices in principle may be any integer type.
class ARROW_EXPORT DictionaryArray : public Array {
public:
using TypeClass = DictionaryType;
explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);
DictionaryArray(const std::shared_ptr<DataType>& type,
const std::shared_ptr<Array>& indices,
const std::shared_ptr<Array>& dictionary);
/// \brief Construct DictionaryArray from dictionary and indices
/// array and validate
///
/// This function does the validation of the indices and input type. It checks if
/// all indices are non-negative and smaller than the size of the dictionary.
///
/// \param[in] type a dictionary type
/// \param[in] dictionary the dictionary with same value type as the
/// type object
/// \param[in] indices an array of non-negative integers smaller than the
/// size of the dictionary
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
const std::shared_ptr<Array>& dictionary);
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
dictionary);
}
/// \brief Transpose this DictionaryArray
///
/// This method constructs a new dictionary array with the given dictionary
/// type, transposing indices using the transpose map. The type and the
/// transpose map are typically computed using DictionaryUnifier.
///
/// \param[in] type the new type object
/// \param[in] dictionary the new dictionary
/// \param[in] transpose_map transposition array of this array's indices
/// into the target array's indices
/// \param[in] pool a pool to allocate the array data from
Result<std::shared_ptr<Array>> Transpose(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;
/// \brief Determine whether dictionary arrays may be compared without unification
bool CanCompareIndices(const DictionaryArray& other) const;
/// \brief Return the dictionary for this array, which is stored as
/// a member of the ArrayData internal structure
std::shared_ptr<Array> dictionary() const;
std::shared_ptr<Array> indices() const;
/// \brief Return the ith value of indices, cast to int64_t. Not recommended
/// for use in performance-sensitive code. Does not validate whether the
/// value is null or out-of-bounds.
int64_t GetValueIndex(int64_t i) const;
const DictionaryType* dict_type() const { return dict_type_; }
private:
void SetData(const std::shared_ptr<ArrayData>& data);
const DictionaryType* dict_type_;
std::shared_ptr<Array> indices_;
// Lazily initialized when invoking dictionary()
mutable std::shared_ptr<Array> dictionary_;
};
/// \brief Helper class for incremental dictionary unification
class ARROW_EXPORT DictionaryUnifier {
public:
virtual ~DictionaryUnifier() = default;
/// \brief Construct a DictionaryUnifier
/// \param[in] value_type the data type of the dictionaries
/// \param[in] pool MemoryPool to use for memory allocations
static Result<std::unique_ptr<DictionaryUnifier>> Make(
std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
/// \brief Unify dictionaries across array chunks
///
/// The dictionaries in the array chunks will be unified, their indices
/// accordingly transposed.
///
/// Only dictionaries with a primitive value type are currently supported.
/// However, dictionaries nested inside a more complex type are correctly unified.
static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
const std::shared_ptr<ChunkedArray>& array,
MemoryPool* pool = default_memory_pool());
/// \brief Unify dictionaries across the chunks of each table column
///
/// The dictionaries in each table column will be unified, their indices
/// accordingly transposed.
///
/// Only dictionaries with a primitive value type are currently supported.
/// However, dictionaries nested inside a more complex type are correctly unified.
static Result<std::shared_ptr<Table>> UnifyTable(
const Table& table, MemoryPool* pool = default_memory_pool());
/// \brief Append dictionary to the internal memo
virtual Status Unify(const Array& dictionary) = 0;
/// \brief Append dictionary and compute transpose indices
/// \param[in] dictionary the dictionary values to unify
/// \param[out] out_transpose a Buffer containing computed transpose indices
/// as int32_t values equal in length to the passed dictionary. The value in
/// each slot corresponds to the new index value for each original index
/// for a DictionaryArray with the old dictionary
virtual Status Unify(const Array& dictionary,
std::shared_ptr<Buffer>* out_transpose) = 0;
/// \brief Return a result DictionaryType with the smallest possible index
/// type to accommodate the unified dictionary. The unifier cannot be used
/// after this is called
virtual Status GetResult(std::shared_ptr<DataType>* out_type,
std::shared_ptr<Array>* out_dict) = 0;
/// \brief Return a unified dictionary with the given index type. If
/// the index type is not large enough then an invalid status will be returned.
/// The unifier cannot be used after this is called
virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
std::shared_ptr<Array>* out_dict) = 0;
};
} // namespace arrow
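
A Python sketch of the indices/dictionary representation described in the comments above, mirroring the ["foo", "bar", ...] example:

import pyarrow as pa

indices = pa.array([1, 0, 1, 0, 1, 0], type=pa.int8())
dictionary = pa.array(["bar", "foo"])
dict_arr = pa.DictionaryArray.from_arrays(indices, dictionary)
print(dict_arr.to_pylist())  # ['foo', 'bar', 'foo', 'bar', 'foo', 'bar']
print(dict_arr.indices)      # the int8 index array
print(dict_arr.dictionary)   # ["bar", "foo"]
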

View File

@ -0,0 +1,569 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and
// Union
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-arrays
///
/// @{
// ----------------------------------------------------------------------
// ListArray
template <typename TYPE>
class BaseListArray;
namespace internal {
// Private helper for ListArray::SetData.
// Unfortunately, trying to define BaseListArray::SetData outside of this header
// doesn't play well with MSVC.
template <typename TYPE>
void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
Type::type expected_type_id = TYPE::type_id);
} // namespace internal
/// Base class for variable-sized list arrays, regardless of offset size.
template <typename TYPE>
class BaseListArray : public Array {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
const TypeClass* list_type() const { return list_type_; }
/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const { return values_; }
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
/// Return pointer to raw value offsets accounting for any slice offset
const offset_type* raw_value_offsets() const {
return raw_value_offsets_ + data_->offset;
}
// The following functions will not perform boundschecking
offset_type value_offset(int64_t i) const {
return raw_value_offsets_[i + data_->offset];
}
offset_type value_length(int64_t i) const {
i += data_->offset;
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
std::shared_ptr<Array> value_slice(int64_t i) const {
return values_->Slice(value_offset(i), value_length(i));
}
protected:
friend void internal::SetListData<TYPE>(BaseListArray<TYPE>* self,
const std::shared_ptr<ArrayData>& data,
Type::type expected_type_id);
const TypeClass* list_type_ = NULLPTR;
std::shared_ptr<Array> values_;
const offset_type* raw_value_offsets_ = NULLPTR;
};
/// Concrete Array class for list data
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
public:
explicit ListArray(std::shared_ptr<ArrayData> data);
ListArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct ListArray from array of offsets and child value array
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int32 type
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
static Result<std::shared_ptr<ListArray>> FromArrays(
const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());
static Result<std::shared_ptr<ListArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());
/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that this differs from `values()` in that it takes this array's
/// offsets into account, as well as null elements backed by non-empty lists
/// (those are skipped, so copying may be needed).
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Return list offsets as an Int32Array
///
/// The returned array will not have a validity bitmap, so you cannot expect
/// to pass it to ListArray::FromArrays() and get back the same list array
/// if the original one has nulls.
std::shared_ptr<Array> offsets() const;
protected:
// This constructor defers SetData to a derived array class
ListArray() = default;
void SetData(const std::shared_ptr<ArrayData>& data);
};
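// --- Illustrative usage sketch (added for this document, not part of the
// upstream header) ---
// FromArrays stitches an int32 offsets array of length n + 1 together with a
// child values array, and Flatten concatenates the lists back into a single
// array. This belongs in user code with <arrow/api.h> included; the function
// and parameter names are hypothetical.
inline Result<std::shared_ptr<Array>> RoundTripListExample(const Array& offsets,
                                                           const Array& values) {
  // e.g. offsets = [0, 2, 2, 5] and values = [1, 2, 3, 4, 5]
  // describe the lists [1, 2], [] and [3, 4, 5].
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ListArray> list_array,
                        ListArray::FromArrays(offsets, values));
  return list_array->Flatten();
}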
/// Concrete Array class for large list data (with 64-bit offsets)
class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
public:
explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct LargeListArray from array of offsets and child value array
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int64 type
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
static Result<std::shared_ptr<LargeListArray>> FromArrays(
const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());
static Result<std::shared_ptr<LargeListArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());
/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that this differs from `values()` in that it takes this array's
/// offsets into account, as well as null elements backed by non-empty lists
/// (those are skipped, so copying may be needed).
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Return list offsets as an Int64Array
std::shared_ptr<Array> offsets() const;
protected:
void SetData(const std::shared_ptr<ArrayData>& data);
};
// ----------------------------------------------------------------------
// MapArray
/// Concrete Array class for map data
///
/// NB: "value" in this context refers to a pair of a key and the corresponding item
class ARROW_EXPORT MapArray : public ListArray {
public:
using TypeClass = MapType;
explicit MapArray(const std::shared_ptr<ArrayData>& data);
MapArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
MapArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Construct MapArray from array of offsets and child key, item arrays
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int32 type
/// \param[in] keys Array containing key values
/// \param[in] items Array containing item values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool());
static Result<std::shared_ptr<Array>> FromArrays(
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
MemoryPool* pool = default_memory_pool());
const MapType* map_type() const { return map_type_; }
/// \brief Return array object containing all map keys
std::shared_ptr<Array> keys() const { return keys_; }
/// \brief Return array object containing all mapped items
std::shared_ptr<Array> items() const { return items_; }
/// Validate child data before constructing the actual MapArray.
static Status ValidateChildData(
const std::vector<std::shared_ptr<ArrayData>>& child_data);
protected:
void SetData(const std::shared_ptr<ArrayData>& data);
static Result<std::shared_ptr<Array>> FromArraysInternal(
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
MemoryPool* pool);
private:
const MapType* map_type_;
std::shared_ptr<Array> keys_, items_;
};
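// --- Illustrative usage sketch (added for this document, not part of the
// upstream header) ---
// A MapArray is laid out like list<struct<key, item>>: FromArrays takes the
// int32 offsets plus separate key and item child arrays of equal length.
// This belongs in user code with <arrow/api.h> included; the names are
// hypothetical.
inline Result<std::shared_ptr<Array>> BuildMapExample(
    const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
    const std::shared_ptr<Array>& items) {
  // offsets has one more element than there are map slots, exactly as for
  // ListArray::FromArrays.
  return MapArray::FromArrays(offsets, keys, items);
}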
// ----------------------------------------------------------------------
// FixedSizeListArray
/// Concrete Array class for fixed size list data
class ARROW_EXPORT FixedSizeListArray : public Array {
public:
using TypeClass = FixedSizeListType;
using offset_type = TypeClass::offset_type;
explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
const FixedSizeListType* list_type() const;
/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const;
std::shared_ptr<DataType> value_type() const;
// The following functions will not perform boundschecking
int64_t value_offset(int64_t i) const {
i += data_->offset;
return list_size_ * i;
}
int32_t value_length(int64_t i = 0) const {
ARROW_UNUSED(i);
return list_size_;
}
std::shared_ptr<Array> value_slice(int64_t i) const {
return values_->Slice(value_offset(i), value_length(i));
}
/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that this differs from `values()` in that it takes null elements
/// into account (those are skipped, so copying may be needed).
Result<std::shared_ptr<Array>> Flatten(
MemoryPool* memory_pool = default_memory_pool()) const;
/// \brief Construct FixedSizeListArray from child value array and value_length
///
/// \param[in] values Array containing list values
/// \param[in] list_size The fixed length of each list
/// \return Will have length equal to values.length() / list_size
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
int32_t list_size);
/// \brief Construct FixedSizeListArray from child value array and type
///
/// \param[in] values Array containing list values
/// \param[in] type The fixed sized list type
/// \return Will have length equal to values.length() / type.list_size()
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
std::shared_ptr<DataType> type);
protected:
void SetData(const std::shared_ptr<ArrayData>& data);
int32_t list_size_;
private:
std::shared_ptr<Array> values_;
};
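// --- Illustrative usage sketch (added for this document, not part of the
// upstream header) ---
// Fixed-size lists need no offsets buffer: FromArrays only takes the child
// values and the list size, and the result has values.length() / list_size
// slots. This belongs in user code with <arrow/api.h> included; the names are
// hypothetical.
inline Result<std::shared_ptr<Array>> BuildFixedSizeListExample(
    const std::shared_ptr<Array>& values) {
  // e.g. a values array of length 6 with list_size 3 yields 2 lists.
  return FixedSizeListArray::FromArrays(values, /*list_size=*/3);
}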
// ----------------------------------------------------------------------
// Struct
/// Concrete Array class for struct data
class ARROW_EXPORT StructArray : public Array {
public:
using TypeClass = StructType;
explicit StructArray(const std::shared_ptr<ArrayData>& data);
StructArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Array>>& children,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Return a StructArray from child arrays and field names.
///
/// The length and data type are automatically inferred from the arguments.
/// There should be at least one child array.
static Result<std::shared_ptr<StructArray>> Make(
const ArrayVector& children, const std::vector<std::string>& field_names,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
/// \brief Return a StructArray from child arrays and fields.
///
/// The length is automatically inferred from the arguments.
/// There should be at least one child array. This method does not
/// check that field types and child array types are consistent.
static Result<std::shared_ptr<StructArray>> Make(
const ArrayVector& children, const FieldVector& fields,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
const StructType* struct_type() const;
// Return a shared pointer in case the requestor desires to share ownership
// with this array. The returned array has its offset, length and null
// count adjusted.
std::shared_ptr<Array> field(int pos) const;
const ArrayVector& fields() const;
/// Returns null if name not found
std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
/// \brief Flatten this array as a vector of arrays, one for each field
///
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;
/// \brief Get one of the child arrays, combining its null bitmap
/// with the parent struct array's bitmap.
///
/// \param[in] index Which child array to get
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
Result<std::shared_ptr<Array>> GetFlattenedField(
int index, MemoryPool* pool = default_memory_pool()) const;
private:
// For caching boxed child data
// XXX This is not handled in a thread-safe manner.
mutable ArrayVector boxed_fields_;
};
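// --- Illustrative usage sketch (added for this document, not part of the
// upstream header) ---
// StructArray::Make infers the struct type from the child arrays and field
// names, and GetFlattenedField recovers a child with the parent's validity
// bitmap merged in. This belongs in user code with <arrow/api.h> included;
// the names are hypothetical.
inline Result<std::shared_ptr<Array>> StructFieldExample(
    const std::shared_ptr<Array>& xs, const std::shared_ptr<Array>& ys) {
  ARROW_ASSIGN_OR_RAISE(
      std::shared_ptr<StructArray> points,
      StructArray::Make({xs, ys}, std::vector<std::string>{"x", "y"}));
  // Child 0 ("x"), with any struct-level nulls reflected in its bitmap.
  return points->GetFlattenedField(0);
}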
// ----------------------------------------------------------------------
// Union
/// Base class for SparseUnionArray and DenseUnionArray
class ARROW_EXPORT UnionArray : public Array {
public:
using type_code_t = int8_t;
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }
const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }
/// The logical type code of the value at index.
type_code_t type_code(int64_t i) const { return raw_type_codes_[i + data_->offset]; }
/// The physical child id containing value at index.
int child_id(int64_t i) const {
return union_type_->child_ids()[raw_type_codes_[i + data_->offset]];
}
const UnionType* union_type() const { return union_type_; }
UnionMode::type mode() const { return union_type_->mode(); }
/// \brief Return the given field as an individual array.
///
/// For sparse unions, the returned array has its offset, length and null
/// count adjusted.
std::shared_ptr<Array> field(int pos) const;
protected:
void SetData(std::shared_ptr<ArrayData> data);
const type_code_t* raw_type_codes_;
const UnionType* union_type_;
// For caching boxed child data
mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
};
/// Concrete Array class for sparse union data
class ARROW_EXPORT SparseUnionArray : public UnionArray {
public:
using TypeClass = SparseUnionType;
explicit SparseUnionArray(std::shared_ptr<ArrayData> data);
SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
std::shared_ptr<Buffer> type_ids, int64_t offset = 0);
/// \brief Construct SparseUnionArray from type_ids and children
///
/// This function does the bare minimum of validation of the input types.
///
/// \param[in] type_ids An array of logical type ids for the union type
/// \param[in] children Vector of children Arrays containing the data for each type.
/// \param[in] type_codes Vector of type codes.
static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
std::vector<type_code_t> type_codes) {
return Make(std::move(type_ids), std::move(children), std::vector<std::string>{},
std::move(type_codes));
}
/// \brief Construct SparseUnionArray with custom field names from type_ids and children
///
/// This function does the bare minimum of validation of the input types.
///
/// \param[in] type_ids An array of logical type ids for the union type
/// \param[in] children Vector of children Arrays containing the data for each type.
/// \param[in] field_names Vector of strings containing the name of each field.
/// \param[in] type_codes Vector of type codes.
static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
std::vector<std::string> field_names = {},
std::vector<type_code_t> type_codes = {});
const SparseUnionType* union_type() const {
return internal::checked_cast<const SparseUnionType*>(union_type_);
}
/// \brief Get one of the child arrays, adjusting its null bitmap
/// where the union array type code does not match.
///
/// \param[in] index Which child array to get (i.e. the physical index, not the
/// type code)
/// \param[in] pool The pool to allocate null bitmaps from, if necessary
Result<std::shared_ptr<Array>> GetFlattenedField(
int index, MemoryPool* pool = default_memory_pool()) const;
protected:
void SetData(std::shared_ptr<ArrayData> data);
};
/// \brief Concrete Array class for dense union data
///
/// Note that union types do not have a validity bitmap
class ARROW_EXPORT DenseUnionArray : public UnionArray {
public:
using TypeClass = DenseUnionType;
explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);
DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
std::shared_ptr<Buffer> type_ids,
std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);
/// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
///
/// This function does the bare minimum of validation of the offsets and
/// input types.
///
/// \param[in] type_ids An array of logical type ids for the union type
/// \param[in] value_offsets An array of signed int32 values indicating the
/// relative offset into the respective child array for the type in a given slot.
/// The respective offsets for each child value array must be in order / increasing.
/// \param[in] children Vector of children Arrays containing the data for each type.
/// \param[in] type_codes Vector of type codes.
static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
const Array& value_offsets,
ArrayVector children,
std::vector<type_code_t> type_codes) {
return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
std::move(type_codes));
}
/// \brief Construct DenseUnionArray with custom field names from type_ids,
/// value_offsets, and children
///
/// This function does the bare minimum of validation of the offsets and
/// input types.
///
/// \param[in] type_ids An array of logical type ids for the union type
/// \param[in] value_offsets An array of signed int32 values indicating the
/// relative offset into the respective child array for the type in a given slot.
/// The respective offsets for each child value array must be in order / increasing.
/// \param[in] children Vector of children Arrays containing the data for each type.
/// \param[in] field_names Vector of strings containing the name of each field.
/// \param[in] type_codes Vector of type codes.
static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
const Array& value_offsets,
ArrayVector children,
std::vector<std::string> field_names = {},
std::vector<type_code_t> type_codes = {});
const DenseUnionType* union_type() const {
return internal::checked_cast<const DenseUnionType*>(union_type_);
}
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
protected:
const int32_t* raw_value_offsets_;
void SetData(const std::shared_ptr<ArrayData>& data);
};
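// --- Illustrative usage sketch (added for this document, not part of the
// upstream header) ---
// A dense union stores one int8 type code and one int32 offset per slot; the
// offset indexes into the child selected by the type code. Make performs only
// minimal validation. This belongs in user code with <arrow/api.h> included;
// the names are hypothetical.
inline Result<std::shared_ptr<Array>> BuildDenseUnionExample(
    const Array& type_ids, const Array& value_offsets,
    const std::shared_ptr<Array>& ints, const std::shared_ptr<Array>& strings) {
  // e.g. type_ids = [0, 1, 0] and value_offsets = [0, 0, 1] pick
  // ints[0], strings[0], ints[1] for the three slots.
  return DenseUnionArray::Make(type_ids, value_offsets, {ints, strings},
                               {"ints", "strings"}, {0, 1});
}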
/// @}
} // namespace arrow

View File

@ -0,0 +1,202 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Array accessor types for primitive/C-type-based arrays, such as numbers,
// boolean, and temporal types.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h" // IWYU pragma: export
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// Concrete Array class for boolean data
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
public:
using TypeClass = BooleanType;
using IteratorType = stl::ArrayIterator<BooleanArray>;
explicit BooleanArray(const std::shared_ptr<ArrayData>& data);
BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
bool Value(int64_t i) const {
return bit_util::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
i + data_->offset);
}
bool GetView(int64_t i) const { return Value(i); }
util::optional<bool> operator[](int64_t i) const { return *IteratorType(*this, i); }
/// \brief Return the number of false (0) values among the valid
/// values. Result is not cached.
int64_t false_count() const;
/// \brief Return the number of true (1) values among the valid
/// values. Result is not cached.
int64_t true_count() const;
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
using PrimitiveArray::PrimitiveArray;
};
/// \addtogroup numeric-arrays
///
/// @{
/// \brief Concrete Array class for numeric data with a corresponding C type
///
/// This class is templated on the corresponding DataType subclass for the
/// given data, for example NumericArray<Int8Type> or NumericArray<Date32Type>.
///
/// Note that convenience aliases are available for all accepted types
/// (for example Int8Array for NumericArray<Int8Type>).
template <typename TYPE>
class NumericArray : public PrimitiveArray {
public:
using TypeClass = TYPE;
using value_type = typename TypeClass::c_type;
using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}
// Only enable this constructor without a type argument for types without additional
// metadata
template <typename T1 = TYPE>
NumericArray(enable_if_parameter_free<T1, int64_t> length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
null_count, offset) {}
const value_type* raw_values() const {
return reinterpret_cast<const value_type*>(raw_values_) + data_->offset;
}
value_type Value(int64_t i) const { return raw_values()[i]; }
// For API compatibility with BinaryArray etc.
value_type GetView(int64_t i) const { return Value(i); }
util::optional<value_type> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
protected:
using PrimitiveArray::PrimitiveArray;
};
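// --- Illustrative usage sketch (added for this document, not part of the
// upstream header) ---
// Typical element access on a NumericArray: IsValid() consults the validity
// bitmap while Value() reads the slot without bounds checking, mirroring
// raw_values(). This belongs in user code with <arrow/api.h> included; the
// function name is hypothetical.
inline int64_t SumValidExample(const NumericArray<Int64Type>& arr) {  // aka Int64Array
  int64_t sum = 0;
  for (int64_t i = 0; i < arr.length(); ++i) {
    if (arr.IsValid(i)) {
      sum += arr.Value(i);
    }
  }
  return sum;
}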
/// DayTimeArray
/// ---------------------
/// \brief Array of Day and Millisecond values.
class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
public:
using TypeClass = DayTimeIntervalType;
using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;
explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);
DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
DayTimeIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
TypeClass::DayMilliseconds GetValue(int64_t i) const;
TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); }
// For compatibility with Take kernel.
TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
util::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); }
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
};
/// \brief Array of Month, Day and nanosecond values.
class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
public:
using TypeClass = MonthDayNanoIntervalType;
using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;
explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);
MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
TypeClass::MonthDayNanos GetValue(int64_t i) const;
TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); }
// For compatibility with Take kernel.
TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }
IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }
util::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
return *IteratorType(*this, i);
}
int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); }
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
};
/// @}
} // namespace arrow

View File

@ -0,0 +1,213 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>
#include "arrow/array/builder_base.h"
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-builders
///
/// @{
namespace internal {
class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
public:
AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool);
explicit AdaptiveIntBuilderBase(MemoryPool* pool)
: AdaptiveIntBuilderBase(sizeof(uint8_t), pool) {}
/// \brief Append multiple nulls
/// \param[in] length the number of nulls to append
Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(CommitPendingData());
if (ARROW_PREDICT_TRUE(length > 0)) {
ARROW_RETURN_NOT_OK(Reserve(length));
memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
UnsafeSetNull(length);
}
return Status::OK();
}
Status AppendNull() final {
pending_data_[pending_pos_] = 0;
pending_valid_[pending_pos_] = 0;
pending_has_nulls_ = true;
++pending_pos_;
++length_;
++null_count_;
if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
return CommitPendingData();
}
return Status::OK();
}
Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(CommitPendingData());
if (ARROW_PREDICT_TRUE(length > 0)) {
ARROW_RETURN_NOT_OK(Reserve(length));
memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
UnsafeSetNotNull(length);
}
return Status::OK();
}
Status AppendEmptyValue() final {
pending_data_[pending_pos_] = 0;
pending_valid_[pending_pos_] = 1;
++pending_pos_;
++length_;
if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
return CommitPendingData();
}
return Status::OK();
}
void Reset() override;
Status Resize(int64_t capacity) override;
protected:
Status AppendInternal(const uint64_t val) {
pending_data_[pending_pos_] = val;
pending_valid_[pending_pos_] = 1;
++pending_pos_;
++length_;
if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
return CommitPendingData();
}
return Status::OK();
}
virtual Status CommitPendingData() = 0;
template <typename new_type, typename old_type>
typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
ExpandIntSizeInternal();
template <typename new_type, typename old_type>
typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
ExpandIntSizeInternal();
std::shared_ptr<ResizableBuffer> data_;
uint8_t* raw_data_ = NULLPTR;
const uint8_t start_int_size_;
uint8_t int_size_;
static constexpr int32_t pending_size_ = 1024;
uint8_t pending_valid_[pending_size_];
uint64_t pending_data_[pending_size_];
int32_t pending_pos_ = 0;
bool pending_has_nulls_ = false;
};
} // namespace internal
class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
public:
explicit AdaptiveUIntBuilder(uint8_t start_int_size,
MemoryPool* pool = default_memory_pool());
explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
: AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}
using ArrayBuilder::Advance;
using internal::AdaptiveIntBuilderBase::Reset;
/// Scalar append
Status Append(const uint64_t val) { return AppendInternal(val); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const uint64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
std::shared_ptr<DataType> type() const override;
protected:
Status CommitPendingData() override;
Status ExpandIntSize(uint8_t new_int_size);
Status AppendValuesInternal(const uint64_t* values, int64_t length,
const uint8_t* valid_bytes);
template <typename new_type>
Status ExpandIntSizeN();
};
class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
public:
explicit AdaptiveIntBuilder(uint8_t start_int_size,
MemoryPool* pool = default_memory_pool());
explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool())
: AdaptiveIntBuilder(sizeof(uint8_t), pool) {}
using ArrayBuilder::Advance;
using internal::AdaptiveIntBuilderBase::Reset;
/// Scalar append
Status Append(const int64_t val) { return AppendInternal(static_cast<uint64_t>(val)); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const int64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
std::shared_ptr<DataType> type() const override;
protected:
Status CommitPendingData() override;
Status ExpandIntSize(uint8_t new_int_size);
Status AppendValuesInternal(const int64_t* values, int64_t length,
const uint8_t* valid_bytes);
template <typename new_type>
Status ExpandIntSizeN();
};
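// --- Illustrative usage sketch (added for this document, not part of the
// upstream header) ---
// AdaptiveIntBuilder starts with 1-byte storage and widens as larger values
// arrive, so the finished array uses the smallest sufficient integer type
// (presumably int32 after the third append below). This belongs in user code
// with <arrow/api.h> included; the function name is hypothetical.
inline Result<std::shared_ptr<Array>> AdaptiveIntExample() {
  AdaptiveIntBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append(1));        // fits in one byte
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_RETURN_NOT_OK(builder.Append(1 << 20));  // forces a wider int size
  std::shared_ptr<Array> out;
  ARROW_RETURN_NOT_OK(builder.Finish(&out));
  return out;
}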
/// @}
} // namespace arrow

View File

@ -0,0 +1,350 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm> // IWYU pragma: keep
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/array_primitive.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \defgroup numeric-builders Concrete builder subclasses for numeric types
/// @{
/// @}
/// \defgroup temporal-builders Concrete builder subclasses for temporal types
/// @{
/// @}
/// \defgroup binary-builders Concrete builder subclasses for binary types
/// @{
/// @}
/// \defgroup nested-builders Concrete builder subclasses for nested types
/// @{
/// @}
/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
/// @{
/// @}
constexpr int64_t kMinBuilderCapacity = 1 << 5;
constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
/// Base class for all data array builders.
///
/// This class provides facilities for incrementally building the null bitmap
/// (see the Append methods) and, as a side effect, tracks the current number
/// of slots and the null count.
///
/// \note Users are expected to use builders as one of the concrete types below.
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
class ARROW_EXPORT ArrayBuilder {
public:
explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}
ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
virtual ~ArrayBuilder() = default;
/// For nested types. Since the objects are owned by this class instance, we
/// skip shared pointers and just return a raw pointer
ArrayBuilder* child(int i) { return children_[i].get(); }
const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
int num_children() const { return static_cast<int>(children_.size()); }
virtual int64_t length() const { return length_; }
int64_t null_count() const { return null_count_; }
int64_t capacity() const { return capacity_; }
/// \brief Ensure that enough memory has been allocated to fit the indicated
/// number of total elements in the builder, including any that have already
/// been appended. Does not account for reallocations that may be due to
/// variable size data, like binary values. To make space for incremental
/// appends, use Reserve instead.
///
/// \param[in] capacity the minimum number of total array values to
/// accommodate. Must be greater than the current capacity.
/// \return Status
virtual Status Resize(int64_t capacity);
/// \brief Ensure that there is enough space allocated to append the indicated
/// number of elements without any further reallocation. Overallocation is
/// used in order to minimize the impact of incremental Reserve() calls.
/// Note that additional_capacity is relative to the current number of elements
/// rather than to the current capacity, so calls to Reserve() which are not
/// interspersed with addition of new elements may not increase the capacity.
///
/// \param[in] additional_capacity the number of additional array values
/// \return Status
Status Reserve(int64_t additional_capacity) {
auto current_capacity = capacity();
auto min_capacity = length() + additional_capacity;
if (min_capacity <= current_capacity) return Status::OK();
// leave growth factor up to BufferBuilder
auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
return Resize(new_capacity);
}
/// Reset the builder.
virtual void Reset();
/// \brief Append a null value to builder
virtual Status AppendNull() = 0;
/// \brief Append a number of null values to builder
virtual Status AppendNulls(int64_t length) = 0;
/// \brief Append a non-null value to builder
///
/// The appended value is an implementation detail, but the corresponding
/// memory slot is guaranteed to be initialized.
/// This method is useful when appending a null value to a parent nested type.
virtual Status AppendEmptyValue() = 0;
/// \brief Append a number of non-null values to builder
///
/// The appended values are an implementation detail, but the corresponding
/// memory slot is guaranteed to be initialized.
/// This method is useful when appending null values to a parent nested type.
virtual Status AppendEmptyValues(int64_t length) = 0;
/// \brief Append a value from a scalar
Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
virtual Status AppendScalars(const ScalarVector& scalars);
/// \brief Append a range of values from an array.
///
/// The given array must be the same type as the builder.
virtual Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) {
return Status::NotImplemented("AppendArraySlice for builder for ", *type());
}
/// For cases where raw data was memcpy'd into the internal buffers, allows us
/// to advance the length of the builder. It is the caller's responsibility to
/// keep the buffers and the advanced length consistent.
ARROW_DEPRECATED(
"Deprecated in 6.0.0. ArrayBuilder::Advance is poorly supported and mostly "
"untested.\nFor low-level control over buffer construction, use BufferBuilder "
"or TypedBufferBuilder directly.")
Status Advance(int64_t elements);
/// \brief Return result of builder as an internal generic ArrayData
/// object. Resets builder except for dictionary builder
///
/// \param[out] out the finalized ArrayData object
/// \return Status
virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
/// \brief Return result of builder as an Array object.
///
/// The builder is reset except for DictionaryBuilder.
///
/// \param[out] out the finalized Array object
/// \return Status
Status Finish(std::shared_ptr<Array>* out);
/// \brief Return result of builder as an Array object.
///
/// The builder is reset except for DictionaryBuilder.
///
/// \return The finalized Array object
Result<std::shared_ptr<Array>> Finish();
/// \brief Return the type of the built Array
virtual std::shared_ptr<DataType> type() const = 0;
protected:
/// Append to null bitmap
Status AppendToBitmap(bool is_valid);
/// Vector append. Treat each zero byte as a null. If valid_bytes is null
/// assume all of length bits are valid.
Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
/// Uniform append. Append N times the same validity bit.
Status AppendToBitmap(int64_t num_bits, bool value);
/// Set the next length bits to not null (i.e. valid).
Status SetNotNull(int64_t length);
// Unsafe operations (don't check capacity/don't resize)
void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
// Append to null bitmap, update the length
void UnsafeAppendToBitmap(bool is_valid) {
null_bitmap_builder_.UnsafeAppend(is_valid);
++length_;
if (!is_valid) ++null_count_;
}
// Vector append. Treat each zero byte as a null. If valid_bytes is null,
// assume all of length bits are valid.
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
if (valid_bytes == NULLPTR) {
return UnsafeSetNotNull(length);
}
null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
length_ += length;
null_count_ = null_bitmap_builder_.false_count();
}
// Vector append. Copy from a given bitmap. If bitmap is null assume
// all of length bits are valid.
void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
if (bitmap == NULLPTR) {
return UnsafeSetNotNull(length);
}
null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
length_ += length;
null_count_ = null_bitmap_builder_.false_count();
}
// Append the same validity value a given number of times.
void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
if (value) {
UnsafeSetNotNull(num_bits);
} else {
UnsafeSetNull(num_bits);
}
}
void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
// Set the next validity bits to not null (i.e. valid).
void UnsafeSetNotNull(int64_t length);
// Set the next validity bits to null (i.e. invalid).
void UnsafeSetNull(int64_t length);
static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
/// \brief Finish to an array of the specified ArrayType
template <typename ArrayType>
Status FinishTyped(std::shared_ptr<ArrayType>* out) {
std::shared_ptr<Array> out_untyped;
ARROW_RETURN_NOT_OK(Finish(&out_untyped));
*out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
return Status::OK();
}
// Check the requested capacity for validity
Status CheckCapacity(int64_t new_capacity) {
if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
return Status::Invalid(
"Resize capacity must be positive (requested: ", new_capacity, ")");
}
if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
", current length: ", length_, ")");
}
return Status::OK();
}
// Check for array type
Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
const Array& array, const char* message);
Status CheckArrayType(Type::type expected_type, const Array& array,
const char* message);
MemoryPool* pool_;
TypedBufferBuilder<bool> null_bitmap_builder_;
int64_t null_count_ = 0;
// Array length, so far. Also, the index of the next element to be added
int64_t length_ = 0;
int64_t capacity_ = 0;
// Child value array builders. These are owned by this class
std::vector<std::shared_ptr<ArrayBuilder>> children_;
private:
ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};
/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the data type to create the builder for
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out);
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
std::unique_ptr<ArrayBuilder> out;
ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out));
return std::move(out);
}
/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type, where any top-level or nested dictionary builders return the
/// exact index type specified by the type.
ARROW_EXPORT
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out);
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
std::unique_ptr<ArrayBuilder> out;
ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out));
return std::move(out);
}
/// \brief Construct an empty DictionaryBuilder initialized optionally
/// with a pre-existing dictionary
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the dictionary type to create the builder for
/// \param[in] dictionary the initial dictionary, if any. May be nullptr
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
const std::shared_ptr<Array>& dictionary,
std::unique_ptr<ArrayBuilder>* out);
inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool()) {
std::unique_ptr<ArrayBuilder> out;
ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out));
return std::move(out);
}
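// --- Illustrative usage sketch (added for this document, not part of the
// upstream header) ---
// MakeBuilder is the generic entry point when the data type is only known at
// runtime; the returned ArrayBuilder is then driven through the type-erased
// Append*/Finish API. This belongs in user code with <arrow/api.h> included;
// the function name is hypothetical.
inline Result<std::shared_ptr<Array>> BuildAllNullExample(
    const std::shared_ptr<DataType>& type, int64_t length) {
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ArrayBuilder> builder, MakeBuilder(type));
  ARROW_RETURN_NOT_OK(builder->AppendNulls(length));
  return builder->Finish();
}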
} // namespace arrow

View File

@ -0,0 +1,703 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h" // IWYU pragma: export
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup binary-builders
///
/// @{
// ----------------------------------------------------------------------
// Binary and String
template <typename TYPE>
class BaseBinaryBuilder : public ArrayBuilder {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), offsets_builder_(pool), value_data_builder_(pool) {}
BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
: BaseBinaryBuilder(pool) {}
Status Append(const uint8_t* value, offset_type length) {
ARROW_RETURN_NOT_OK(Reserve(1));
ARROW_RETURN_NOT_OK(AppendNextOffset());
// Safety check for UBSAN.
if (ARROW_PREDICT_TRUE(length > 0)) {
ARROW_RETURN_NOT_OK(ValidateOverflow(length));
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
}
UnsafeAppendToBitmap(true);
return Status::OK();
}
Status Append(const char* value, offset_type length) {
return Append(reinterpret_cast<const uint8_t*>(value), length);
}
Status Append(util::string_view value) {
return Append(value.data(), static_cast<offset_type>(value.size()));
}
/// Extend the last appended value by appending more data at the end
///
/// Unlike Append, this does not create a new offset.
Status ExtendCurrent(const uint8_t* value, offset_type length) {
// Safety check for UBSAN.
if (ARROW_PREDICT_TRUE(length > 0)) {
ARROW_RETURN_NOT_OK(ValidateOverflow(length));
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
}
return Status::OK();
}
Status ExtendCurrent(util::string_view value) {
return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
static_cast<offset_type>(value.size()));
}
Status AppendNulls(int64_t length) final {
const int64_t num_bytes = value_data_builder_.length();
ARROW_RETURN_NOT_OK(Reserve(length));
for (int64_t i = 0; i < length; ++i) {
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
}
UnsafeAppendToBitmap(length, false);
return Status::OK();
}
Status AppendNull() final {
ARROW_RETURN_NOT_OK(AppendNextOffset());
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppendToBitmap(false);
return Status::OK();
}
Status AppendEmptyValue() final {
ARROW_RETURN_NOT_OK(AppendNextOffset());
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppendToBitmap(true);
return Status::OK();
}
Status AppendEmptyValues(int64_t length) final {
const int64_t num_bytes = value_data_builder_.length();
ARROW_RETURN_NOT_OK(Reserve(length));
for (int64_t i = 0; i < length; ++i) {
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
}
UnsafeAppendToBitmap(length, true);
return Status::OK();
}
/// \brief Append without checking capacity
///
/// Offsets and data should have been presized using Reserve() and
/// ReserveData(), respectively.
void UnsafeAppend(const uint8_t* value, offset_type length) {
UnsafeAppendNextOffset();
value_data_builder_.UnsafeAppend(value, length);
UnsafeAppendToBitmap(true);
}
void UnsafeAppend(const char* value, offset_type length) {
UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
}
void UnsafeAppend(const std::string& value) {
UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
}
void UnsafeAppend(util::string_view value) {
UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
}
/// Like ExtendCurrent, but do not check capacity
void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
value_data_builder_.UnsafeAppend(value, length);
}
void UnsafeExtendCurrent(util::string_view value) {
UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
static_cast<offset_type>(value.size()));
}
void UnsafeAppendNull() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
UnsafeAppendToBitmap(false);
}
void UnsafeAppendEmptyValue() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
UnsafeAppendToBitmap(true);
}
/// \brief Append a sequence of strings in one shot.
///
/// \param[in] values a vector of strings
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const std::vector<std::string>& values,
const uint8_t* valid_bytes = NULLPTR) {
std::size_t total_length = std::accumulate(
values.begin(), values.end(), 0ULL,
[](uint64_t sum, const std::string& str) { return sum + str.size(); });
ARROW_RETURN_NOT_OK(Reserve(values.size()));
ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));
if (valid_bytes != NULLPTR) {
for (std::size_t i = 0; i < values.size(); ++i) {
UnsafeAppendNextOffset();
if (valid_bytes[i]) {
value_data_builder_.UnsafeAppend(
reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
}
}
} else {
for (std::size_t i = 0; i < values.size(); ++i) {
UnsafeAppendNextOffset();
value_data_builder_.UnsafeAppend(
reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
}
}
UnsafeAppendToBitmap(valid_bytes, values.size());
return Status::OK();
}
/// \brief Append a sequence of nul-terminated strings in one shot.
/// If one of the values is NULL, it is processed as a null
/// value even if the corresponding valid_bytes entry is 1.
///
/// \param[in] values a contiguous C array of nul-terminated char *
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const char** values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
std::size_t total_length = 0;
std::vector<std::size_t> value_lengths(length);
bool have_null_value = false;
for (int64_t i = 0; i < length; ++i) {
if (values[i] != NULLPTR) {
auto value_length = strlen(values[i]);
value_lengths[i] = value_length;
total_length += value_length;
} else {
have_null_value = true;
}
}
ARROW_RETURN_NOT_OK(Reserve(length));
ARROW_RETURN_NOT_OK(ReserveData(total_length));
if (valid_bytes) {
int64_t valid_bytes_offset = 0;
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
if (valid_bytes[i]) {
if (values[i]) {
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
} else {
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
i - valid_bytes_offset);
UnsafeAppendToBitmap(false);
valid_bytes_offset = i + 1;
}
}
}
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
} else {
if (have_null_value) {
std::vector<uint8_t> valid_vector(length, 0);
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
if (values[i]) {
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
valid_vector[i] = 1;
}
}
UnsafeAppendToBitmap(valid_vector.data(), length);
} else {
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
}
UnsafeAppendToBitmap(NULLPTR, length);
}
}
return Status::OK();
}
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
auto bitmap = array.GetValues<uint8_t>(0, 0);
auto offsets = array.GetValues<offset_type>(1);
auto data = array.GetValues<uint8_t>(2, 0);
for (int64_t i = 0; i < length; i++) {
if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
const offset_type start = offsets[offset + i];
const offset_type end = offsets[offset + i + 1];
ARROW_RETURN_NOT_OK(Append(data + start, end - start));
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}
void Reset() override {
ArrayBuilder::Reset();
offsets_builder_.Reset();
value_data_builder_.Reset();
}
Status ValidateOverflow(int64_t new_bytes) {
auto new_size = value_data_builder_.length() + new_bytes;
if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
return Status::CapacityError("array cannot contain more than ", memory_limit(),
" bytes, have ", new_size);
} else {
return Status::OK();
}
}
Status Resize(int64_t capacity) override {
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
// One more than requested for offsets
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
return ArrayBuilder::Resize(capacity);
}
/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
Status ReserveData(int64_t elements) {
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
return value_data_builder_.Reserve(elements);
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
// Write final offset (values length)
ARROW_RETURN_NOT_OK(AppendNextOffset());
// These buffers' padding is zeroed by BufferBuilder
std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
*out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
null_count_, 0);
Reset();
return Status::OK();
}
/// \return data pointer of the value data builder
const uint8_t* value_data() const { return value_data_builder_.data(); }
/// \return size of values buffer so far
int64_t value_data_length() const { return value_data_builder_.length(); }
/// \return capacity of values buffer
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
/// \return pointer to the offsets data
const offset_type* offsets_data() const { return offsets_builder_.data(); }
/// Temporary access to a value.
///
/// This pointer becomes invalid on the next modifying operation.
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
const offset_type* offsets = offsets_builder_.data();
const auto offset = offsets[i];
if (i == (length_ - 1)) {
*out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
} else {
*out_length = offsets[i + 1] - offset;
}
return value_data_builder_.data() + offset;
}
offset_type offset(int64_t i) const { return offsets_data()[i]; }
/// Temporary access to a value.
///
/// This view becomes invalid on the next modifying operation.
util::string_view GetView(int64_t i) const {
offset_type value_length;
const uint8_t* value_data = GetValue(i, &value_length);
return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
}
// Cannot make this a static attribute because of linking issues
static constexpr int64_t memory_limit() {
return std::numeric_limits<offset_type>::max() - 1;
}
protected:
TypedBufferBuilder<offset_type> offsets_builder_;
TypedBufferBuilder<uint8_t> value_data_builder_;
Status AppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
}
void UnsafeAppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
}
};
/// \class BinaryBuilder
/// \brief Builder class for variable-length binary data
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
public:
using BaseBinaryBuilder::BaseBinaryBuilder;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
std::shared_ptr<DataType> type() const override { return binary(); }
};
/// \class StringBuilder
/// \brief Builder class for UTF8 strings
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
public:
using BinaryBuilder::BinaryBuilder;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
std::shared_ptr<DataType> type() const override { return utf8(); }
};
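// A minimal usage sketch for StringBuilder, assuming only the declarations
// above are in scope; the function name BuildStringArraySketch is hypothetical
// and the snippet is illustrative only. Values are appended one by one and the
// builder is consumed by Finish().
inline Status BuildStringArraySketch(std::shared_ptr<StringArray>* out) {
  StringBuilder builder;                         // uses default_memory_pool()
  ARROW_RETURN_NOT_OK(builder.Append("alpha"));  // one UTF8 value
  ARROW_RETURN_NOT_OK(builder.AppendNull());     // one null slot
  ARROW_RETURN_NOT_OK(builder.Append("gamma"));
  return builder.Finish(out);                    // ["alpha", null, "gamma"]
}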
/// \class LargeBinaryBuilder
/// \brief Builder class for large variable-length binary data
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
public:
using BaseBinaryBuilder::BaseBinaryBuilder;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
std::shared_ptr<DataType> type() const override { return large_binary(); }
};
/// \class LargeStringBuilder
/// \brief Builder class for large UTF8 strings
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
public:
using LargeBinaryBuilder::LargeBinaryBuilder;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
std::shared_ptr<DataType> type() const override { return large_utf8(); }
};
// ----------------------------------------------------------------------
// FixedSizeBinaryBuilder
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
public:
using TypeClass = FixedSizeBinaryType;
explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
Status Append(const uint8_t* value) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(value);
return Status::OK();
}
Status Append(const char* value) {
return Append(reinterpret_cast<const uint8_t*>(value));
}
Status Append(const util::string_view& view) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(view);
return Status::OK();
}
Status Append(const std::string& s) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(s);
return Status::OK();
}
Status Append(const Buffer& s) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(util::string_view(s));
return Status::OK();
}
Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
template <size_t NBYTES>
Status Append(const std::array<uint8_t, NBYTES>& value) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(
util::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
return Status::OK();
}
Status AppendValues(const uint8_t* data, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
int64_t bitmap_offset);
Status AppendNull() final;
Status AppendNulls(int64_t length) final;
Status AppendEmptyValue() final;
Status AppendEmptyValues(int64_t length) final;
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
return AppendValues(
array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
array.GetValues<uint8_t>(0, 0), array.offset + offset);
}
void UnsafeAppend(const uint8_t* value) {
UnsafeAppendToBitmap(true);
if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
byte_builder_.UnsafeAppend(value, byte_width_);
}
}
void UnsafeAppend(const char* value) {
UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
}
void UnsafeAppend(util::string_view value) {
#ifndef NDEBUG
CheckValueSize(static_cast<size_t>(value.size()));
#endif
UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
}
void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }
void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
void UnsafeAppendNull() {
UnsafeAppendToBitmap(false);
byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
}
Status ValidateOverflow(int64_t new_bytes) const {
auto new_size = byte_builder_.length() + new_bytes;
if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
return Status::CapacityError("array cannot contain more than ", memory_limit(),
" bytes, have ", new_size);
} else {
return Status::OK();
}
}
/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
Status ReserveData(int64_t elements) {
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
return byte_builder_.Reserve(elements);
}
void Reset() override;
Status Resize(int64_t capacity) override;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }
/// \return size of values buffer so far
int64_t value_data_length() const { return byte_builder_.length(); }
int32_t byte_width() const { return byte_width_; }
/// Temporary access to a value.
///
/// This pointer becomes invalid on the next modifying operation.
const uint8_t* GetValue(int64_t i) const;
/// Temporary access to a value.
///
/// This view becomes invalid on the next modifying operation.
util::string_view GetView(int64_t i) const;
static constexpr int64_t memory_limit() {
return std::numeric_limits<int64_t>::max() - 1;
}
std::shared_ptr<DataType> type() const override {
return fixed_size_binary(byte_width_);
}
protected:
int32_t byte_width_;
BufferBuilder byte_builder_;
/// Temporary access to a value.
///
/// This pointer becomes invalid on the next modifying operation.
uint8_t* GetMutableValue(int64_t i) {
uint8_t* data_ptr = byte_builder_.mutable_data();
return data_ptr + i * byte_width_;
}
void CheckValueSize(int64_t size);
};
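// A minimal usage sketch for FixedSizeBinaryBuilder, assuming only the
// declarations above; the function name BuildFixedSizeBinarySketch is
// hypothetical and illustrative only. Every appended value occupies exactly
// byte_width() bytes.
inline Status BuildFixedSizeBinarySketch(std::shared_ptr<FixedSizeBinaryArray>* out) {
  FixedSizeBinaryBuilder builder(fixed_size_binary(4));  // 4 bytes per value
  ARROW_RETURN_NOT_OK(builder.Append("abcd"));           // exactly 4 bytes are read
  ARROW_RETURN_NOT_OK(builder.AppendNull());             // null slot (4 zero bytes of storage)
  ARROW_RETURN_NOT_OK(builder.Append("wxyz"));
  return builder.Finish(out);
}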
/// @}
// ----------------------------------------------------------------------
// Chunked builders: build a sequence of BinaryArray or StringArray that are
// limited to a particular size (to the upper limit of 2GB)
namespace internal {
class ARROW_EXPORT ChunkedBinaryBuilder {
public:
explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
MemoryPool* pool = default_memory_pool());
ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
MemoryPool* pool = default_memory_pool());
virtual ~ChunkedBinaryBuilder() = default;
Status Append(const uint8_t* value, int32_t length) {
if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
max_chunk_value_length_)) {
if (builder_->value_data_length() == 0) {
        // The current item is larger than max_chunk_value_length_;
        // this chunk will be oversized and hold *only* this item
ARROW_RETURN_NOT_OK(builder_->Append(value, length));
return NextChunk();
}
      // The current item would cause builder_->value_data_length() to exceed
      // max_chunk_value_length_, so finish this chunk and append the current item to
      // the next chunk
ARROW_RETURN_NOT_OK(NextChunk());
return Append(value, length);
}
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
// The current item would cause builder_->length() to exceed max_chunk_length_, so
// finish this chunk and append the current item to the next chunk
ARROW_RETURN_NOT_OK(NextChunk());
}
return builder_->Append(value, length);
}
Status Append(const util::string_view& value) {
return Append(reinterpret_cast<const uint8_t*>(value.data()),
static_cast<int32_t>(value.size()));
}
Status AppendNull() {
if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
ARROW_RETURN_NOT_OK(NextChunk());
}
return builder_->AppendNull();
}
Status Reserve(int64_t values);
virtual Status Finish(ArrayVector* out);
protected:
Status NextChunk();
// maximum total character data size per chunk
int64_t max_chunk_value_length_;
// maximum elements allowed per chunk
int64_t max_chunk_length_ = kListMaximumElements;
// when Reserve() would cause builder_ to exceed its max_chunk_length_,
// add to extra_capacity_ instead and wait to reserve until the next chunk
int64_t extra_capacity_ = 0;
std::unique_ptr<BinaryBuilder> builder_;
std::vector<std::shared_ptr<Array>> chunks_;
};
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
public:
using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
Status Finish(ArrayVector* out) override;
};
} // namespace internal
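// A minimal usage sketch for the internal chunked builders, assuming only the
// declarations above; the function name BuildChunkedStringsSketch is
// hypothetical and illustrative only. Appended values are split into multiple
// StringArray chunks whose value data stays below the configured byte limit.
inline Status BuildChunkedStringsSketch(ArrayVector* out_chunks) {
  // Keep at most ~1 MB of character data per chunk.
  internal::ChunkedStringBuilder builder(/*max_chunk_value_length=*/1 << 20);
  for (int i = 0; i < 1000; ++i) {
    ARROW_RETURN_NOT_OK(builder.Append("some value"));
  }
  // Finish() emits one StringArray per accumulated chunk.
  return builder.Finish(out_chunks);
}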
} // namespace arrow

View File

@ -0,0 +1,100 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/array/array_decimal.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/data.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup numeric-builders
///
/// @{
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal128Type;
using ValueType = Decimal128;
explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
using FixedSizeBinaryBuilder::Append;
using FixedSizeBinaryBuilder::AppendValues;
using FixedSizeBinaryBuilder::Reset;
Status Append(Decimal128 val);
void UnsafeAppend(Decimal128 val);
void UnsafeAppend(util::string_view val);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }
std::shared_ptr<DataType> type() const override { return decimal_type_; }
protected:
std::shared_ptr<Decimal128Type> decimal_type_;
};
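// A minimal usage sketch for Decimal128Builder, assuming the Decimal128
// definition from arrow/util/decimal.h is visible at this point; the function
// name BuildDecimalSketch is hypothetical and illustrative only.
inline Status BuildDecimalSketch(std::shared_ptr<Decimal128Array>* out) {
  // decimal128(10, 2): up to 10 significant digits, 2 of them after the point.
  Decimal128Builder builder(decimal128(/*precision=*/10, /*scale=*/2));
  ARROW_RETURN_NOT_OK(builder.Append(Decimal128(12345)));  // represents 123.45
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  return builder.Finish(out);
}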
class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal256Type;
using ValueType = Decimal256;
explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
using FixedSizeBinaryBuilder::Append;
using FixedSizeBinaryBuilder::AppendValues;
using FixedSizeBinaryBuilder::Reset;
Status Append(const Decimal256& val);
void UnsafeAppend(const Decimal256& val);
void UnsafeAppend(util::string_view val);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
std::shared_ptr<DataType> type() const override { return decimal_type_; }
protected:
std::shared_ptr<Decimal256Type> decimal_type_;
};
using DecimalBuilder = Decimal128Builder;
/// @}
} // namespace arrow

View File

@ -0,0 +1,722 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>
#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
#include "arrow/array/builder_base.h" // IWYU pragma: export
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
// ----------------------------------------------------------------------
// Dictionary builder
namespace internal {
template <typename T, typename Enable = void>
struct DictionaryValue {
using type = typename T::c_type;
using PhysicalType = T;
};
template <typename T>
struct DictionaryValue<T, enable_if_base_binary<T>> {
using type = util::string_view;
using PhysicalType =
typename std::conditional<std::is_same<typename T::offset_type, int32_t>::value,
BinaryType, LargeBinaryType>::type;
};
template <typename T>
struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
using type = util::string_view;
using PhysicalType = BinaryType;
};
class ARROW_EXPORT DictionaryMemoTable {
public:
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
~DictionaryMemoTable();
Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);
/// \brief Insert new memo values
Status InsertValues(const Array& values);
int32_t size() const;
template <typename T>
Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
// We want to keep the DictionaryMemoTable implementation private, also we can't
// use extern template classes because of compiler issues (MinGW?). Instead,
// we expose explicit function overrides for each supported physical type.
const typename DictionaryValue<T>::PhysicalType* physical_type = NULLPTR;
return GetOrInsert(physical_type, value, out);
}
private:
Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
Status GetOrInsert(const MonthDayNanoIntervalType*,
MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
Status GetOrInsert(const DayTimeIntervalType*,
DayTimeIntervalType::DayMilliseconds value, int32_t* out);
Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
Status GetOrInsert(const FloatType*, float value, int32_t* out);
Status GetOrInsert(const DoubleType*, double value, int32_t* out);
Status GetOrInsert(const BinaryType*, util::string_view value, int32_t* out);
Status GetOrInsert(const LargeBinaryType*, util::string_view value, int32_t* out);
class DictionaryMemoTableImpl;
std::unique_ptr<DictionaryMemoTableImpl> impl_;
};
} // namespace internal
/// \addtogroup dictionary-builders
///
/// @{
namespace internal {
/// \brief Array builder for creating a dictionary-encoded DictionaryArray
/// from dense array data
///
/// Unlike other builders, dictionary builder does not completely
/// reset the state on Finish calls.
template <typename BuilderType, typename T>
class DictionaryBuilderBase : public ArrayBuilder {
public:
using TypeClass = DictionaryType;
using Value = typename DictionaryValue<T>::type;
// WARNING: the type given below is the value type, not the DictionaryType.
// The DictionaryType is instantiated on the Finish() call.
template <typename B = BuilderType, typename T1 = T>
DictionaryBuilderBase(uint8_t start_int_size,
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
!is_fixed_size_binary_type<T1>::value,
const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(start_int_size, pool),
value_type_(value_type) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool),
value_type_(value_type) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
const std::shared_ptr<DataType>& index_type,
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(index_type, pool),
value_type_(value_type) {}
template <typename B = BuilderType, typename T1 = T>
DictionaryBuilderBase(uint8_t start_int_size,
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
is_fixed_size_binary_type<T1>::value,
const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(start_int_size, pool),
value_type_(value_type) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(pool),
value_type_(value_type) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
const std::shared_ptr<DataType>& index_type,
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(index_type, pool),
value_type_(value_type) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
: DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
// This constructor doesn't check for errors. Use InsertMemoValues instead.
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool),
memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool),
value_type_(dictionary->type()) {}
~DictionaryBuilderBase() override = default;
/// \brief The current number of entries in the dictionary
int64_t dictionary_length() const { return memo_table_->size(); }
/// \brief The value byte width (for FixedSizeBinaryType)
template <typename T1 = T>
enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
return byte_width_;
}
/// \brief Append a scalar value
Status Append(Value value) {
ARROW_RETURN_NOT_OK(Reserve(1));
int32_t memo_index;
ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &memo_index));
ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index));
length_ += 1;
return Status::OK();
}
/// \brief Append a fixed-width string (only for FixedSizeBinaryType)
template <typename T1 = T>
enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
return Append(util::string_view(reinterpret_cast<const char*>(value), byte_width_));
}
/// \brief Append a fixed-width string (only for FixedSizeBinaryType)
template <typename T1 = T>
enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
return Append(util::string_view(value, byte_width_));
}
/// \brief Append a string (only for binary types)
template <typename T1 = T>
enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
return Append(reinterpret_cast<const char*>(value), length);
}
/// \brief Append a string (only for binary types)
template <typename T1 = T>
enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
return Append(util::string_view(value, length));
}
/// \brief Append a string (only for string types)
template <typename T1 = T>
enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
return Append(util::string_view(value, length));
}
/// \brief Append a decimal (only for Decimal128Type)
template <typename T1 = T>
enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
uint8_t data[16];
value.ToBytes(data);
return Append(data, 16);
}
/// \brief Append a decimal (only for Decimal128Type)
template <typename T1 = T>
enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
uint8_t data[32];
value.ToBytes(data);
return Append(data, 32);
}
/// \brief Append a scalar null value
Status AppendNull() final {
length_ += 1;
null_count_ += 1;
return indices_builder_.AppendNull();
}
Status AppendNulls(int64_t length) final {
length_ += length;
null_count_ += length;
return indices_builder_.AppendNulls(length);
}
Status AppendEmptyValue() final {
length_ += 1;
return indices_builder_.AppendEmptyValue();
}
Status AppendEmptyValues(int64_t length) final {
length_ += length;
return indices_builder_.AppendEmptyValues(length);
}
Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
if (!scalar.is_valid) return AppendNulls(n_repeats);
const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*scalar.type);
const DictionaryScalar& dict_scalar =
internal::checked_cast<const DictionaryScalar&>(scalar);
const auto& dict = internal::checked_cast<const typename TypeTraits<T>::ArrayType&>(
*dict_scalar.value.dictionary);
ARROW_RETURN_NOT_OK(Reserve(n_repeats));
switch (dict_ty.index_type()->id()) {
case Type::UINT8:
return AppendScalarImpl<UInt8Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::INT8:
return AppendScalarImpl<Int8Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::UINT16:
return AppendScalarImpl<UInt16Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::INT16:
return AppendScalarImpl<Int16Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::UINT32:
return AppendScalarImpl<UInt32Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::INT32:
return AppendScalarImpl<Int32Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::UINT64:
return AppendScalarImpl<UInt64Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::INT64:
return AppendScalarImpl<Int64Type>(dict, *dict_scalar.value.index, n_repeats);
default:
return Status::TypeError("Invalid index type: ", dict_ty);
}
return Status::OK();
}
Status AppendScalars(const ScalarVector& scalars) override {
for (const auto& scalar : scalars) {
ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
}
return Status::OK();
}
Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
// Visit the indices and insert the unpacked values.
const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
const typename TypeTraits<T>::ArrayType dict(array.dictionary);
ARROW_RETURN_NOT_OK(Reserve(length));
switch (dict_ty.index_type()->id()) {
case Type::UINT8:
return AppendArraySliceImpl<uint8_t>(dict, array, offset, length);
case Type::INT8:
return AppendArraySliceImpl<int8_t>(dict, array, offset, length);
case Type::UINT16:
return AppendArraySliceImpl<uint16_t>(dict, array, offset, length);
case Type::INT16:
return AppendArraySliceImpl<int16_t>(dict, array, offset, length);
case Type::UINT32:
return AppendArraySliceImpl<uint32_t>(dict, array, offset, length);
case Type::INT32:
return AppendArraySliceImpl<int32_t>(dict, array, offset, length);
case Type::UINT64:
return AppendArraySliceImpl<uint64_t>(dict, array, offset, length);
case Type::INT64:
return AppendArraySliceImpl<int64_t>(dict, array, offset, length);
default:
return Status::TypeError("Invalid index type: ", dict_ty);
}
return Status::OK();
}
/// \brief Insert values into the dictionary's memo, but do not append any
/// indices. Can be used to initialize a new builder with known dictionary
/// values
/// \param[in] values dictionary values to add to memo. Type must match
/// builder type
Status InsertMemoValues(const Array& values) {
return memo_table_->InsertValues(values);
}
/// \brief Append a whole dense array to the builder
template <typename T1 = T>
enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
const Array& array) {
using ArrayType = typename TypeTraits<T>::ArrayType;
#ifndef NDEBUG
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
value_type_, array, "Wrong value type of array to be appended"));
#endif
const auto& concrete_array = static_cast<const ArrayType&>(array);
for (int64_t i = 0; i < array.length(); i++) {
if (array.IsNull(i)) {
ARROW_RETURN_NOT_OK(AppendNull());
} else {
ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
}
}
return Status::OK();
}
template <typename T1 = T>
enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
#ifndef NDEBUG
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
value_type_, array, "Wrong value type of array to be appended"));
#endif
const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
for (int64_t i = 0; i < array.length(); i++) {
if (array.IsNull(i)) {
ARROW_RETURN_NOT_OK(AppendNull());
} else {
ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
}
}
return Status::OK();
}
void Reset() override {
// Perform a partial reset. Call ResetFull to also reset the accumulated
// dictionary values
ArrayBuilder::Reset();
indices_builder_.Reset();
}
/// \brief Reset and also clear accumulated dictionary values in memo table
void ResetFull() {
Reset();
memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_));
}
Status Resize(int64_t capacity) override {
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
capacity = std::max(capacity, kMinBuilderCapacity);
ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
capacity_ = indices_builder_.capacity();
return Status::OK();
}
/// \brief Return dictionary indices and a delta dictionary since the last
/// time that Finish or FinishDelta were called, and reset state of builder
/// (except the memo table)
Status FinishDelta(std::shared_ptr<Array>* out_indices,
std::shared_ptr<Array>* out_delta) {
std::shared_ptr<ArrayData> indices_data;
std::shared_ptr<ArrayData> delta_data;
ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data));
*out_indices = MakeArray(indices_data);
*out_delta = MakeArray(delta_data);
return Status::OK();
}
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
std::shared_ptr<DataType> type() const override {
return ::arrow::dictionary(indices_builder_.type(), value_type_);
}
protected:
template <typename c_type>
Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
const ArrayData& array, int64_t offset, int64_t length) {
const c_type* values = array.GetValues<c_type>(1) + offset;
return VisitBitBlocks(
array.buffers[0], array.offset + offset, length,
[&](const int64_t position) {
const int64_t index = static_cast<int64_t>(values[position]);
if (dict.IsValid(index)) {
return Append(dict.GetView(index));
}
return AppendNull();
},
[&]() { return AppendNull(); });
}
template <typename IndexType>
Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
const Scalar& index_scalar, int64_t n_repeats) {
using ScalarType = typename TypeTraits<IndexType>::ScalarType;
const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
if (index_scalar.is_valid && dict.IsValid(index)) {
const auto& value = dict.GetView(index);
for (int64_t i = 0; i < n_repeats; i++) {
ARROW_RETURN_NOT_OK(Append(value));
}
return Status::OK();
}
return AppendNulls(n_repeats);
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
std::shared_ptr<ArrayData> dictionary;
ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary));
// Set type of array data to the right dictionary type
(*out)->type = type();
(*out)->dictionary = dictionary;
return Status::OK();
}
Status FinishWithDictOffset(int64_t dict_offset,
std::shared_ptr<ArrayData>* out_indices,
std::shared_ptr<ArrayData>* out_dictionary) {
// Finalize indices array
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));
// Generate dictionary array from hash table contents
ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
delta_offset_ = memo_table_->size();
// Update internals for further uses of this DictionaryBuilder
ArrayBuilder::Reset();
return Status::OK();
}
std::unique_ptr<DictionaryMemoTable> memo_table_;
// The size of the dictionary memo at last invocation of Finish, to use in
// FinishDelta for computing dictionary deltas
int32_t delta_offset_;
// Only used for FixedSizeBinaryType
int32_t byte_width_;
BuilderType indices_builder_;
std::shared_ptr<DataType> value_type_;
};
template <typename BuilderType>
class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
public:
template <typename B = BuilderType>
DictionaryBuilderBase(
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
start_int_size,
const std::shared_ptr<DataType>& value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(pool) {}
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
const std::shared_ptr<DataType>& value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(index_type, pool) {}
template <typename B = BuilderType>
explicit DictionaryBuilderBase(
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
start_int_size,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(pool) {}
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(pool) {}
/// \brief Append a scalar null value
Status AppendNull() final {
length_ += 1;
null_count_ += 1;
return indices_builder_.AppendNull();
}
Status AppendNulls(int64_t length) final {
length_ += length;
null_count_ += length;
return indices_builder_.AppendNulls(length);
}
Status AppendEmptyValue() final {
length_ += 1;
return indices_builder_.AppendEmptyValue();
}
Status AppendEmptyValues(int64_t length) final {
length_ += length;
return indices_builder_.AppendEmptyValues(length);
}
/// \brief Append a whole dense array to the builder
Status AppendArray(const Array& array) {
#ifndef NDEBUG
ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
Type::NA, array, "Wrong value type of array to be appended"));
#endif
for (int64_t i = 0; i < array.length(); i++) {
ARROW_RETURN_NOT_OK(AppendNull());
}
return Status::OK();
}
Status Resize(int64_t capacity) override {
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
capacity = std::max(capacity, kMinBuilderCapacity);
ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
capacity_ = indices_builder_.capacity();
return Status::OK();
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
(*out)->type = dictionary((*out)->type, null());
(*out)->dictionary = NullArray(0).data();
return Status::OK();
}
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
std::shared_ptr<DataType> type() const override {
return ::arrow::dictionary(indices_builder_.type(), null());
}
protected:
BuilderType indices_builder_;
};
} // namespace internal
/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
/// smallest index size that can accommodate the dictionary indices
template <typename T>
class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
public:
using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
using BASE::BASE;
/// \brief Append dictionary indices directly without modifying memo
///
/// NOTE: Experimental API
Status AppendIndices(const int64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
int64_t null_count_before = this->indices_builder_.null_count();
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
this->capacity_ = this->indices_builder_.capacity();
this->length_ += length;
this->null_count_ += this->indices_builder_.null_count() - null_count_before;
return Status::OK();
}
};
/// \brief A DictionaryArray builder that always returns int32 dictionary
/// indices so that data cast to dictionary form will have a consistent index
/// type, e.g. for creating a ChunkedArray
template <typename T>
class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
public:
using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
using BASE::BASE;
/// \brief Append dictionary indices directly without modifying memo
///
/// NOTE: Experimental API
Status AppendIndices(const int32_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
int64_t null_count_before = this->indices_builder_.null_count();
ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
this->capacity_ = this->indices_builder_.capacity();
this->length_ += length;
this->null_count_ += this->indices_builder_.null_count() - null_count_before;
return Status::OK();
}
};
// ----------------------------------------------------------------------
// Binary / Unicode builders
// (compatibility aliases; those used to be derived classes with additional
// Append() overloads, but they have been folded into DictionaryBuilderBase)
using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
using StringDictionary32Builder = Dictionary32Builder<StringType>;
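// A minimal usage sketch for the dictionary builders above, assuming only the
// declarations in this header; the function name BuildDictionarySketch is
// hypothetical and illustrative only. Repeated values share a single
// dictionary entry, and FinishDelta() returns plain Array handles for the
// indices and the (delta) dictionary.
inline Status BuildDictionarySketch(std::shared_ptr<Array>* out_indices,
                                    std::shared_ptr<Array>* out_dictionary) {
  StringDictionaryBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append("red"));
  ARROW_RETURN_NOT_OK(builder.Append("green"));
  ARROW_RETURN_NOT_OK(builder.Append("red"));  // reuses the entry for "red"
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  // On the first call the delta dictionary is the whole dictionary:
  // ["red", "green"], with indices [0, 1, 0, null].
  return builder.FinishDelta(out_indices, out_dictionary);
}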
/// @}
} // namespace arrow

View File

@ -0,0 +1,561 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-builders
///
/// @{
// ----------------------------------------------------------------------
// List builder
template <typename TYPE>
class BaseListBuilder : public ArrayBuilder {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
/// Use this constructor to incrementally build the value array along with offsets and
/// null bitmap.
BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
const std::shared_ptr<DataType>& type)
: ArrayBuilder(pool),
offsets_builder_(pool),
value_builder_(value_builder),
value_field_(type->field(0)->WithType(NULLPTR)) {}
BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder)
: BaseListBuilder(pool, value_builder, list(value_builder->type())) {}
Status Resize(int64_t capacity) override {
if (capacity > maximum_elements()) {
return Status::CapacityError("List array cannot reserve space for more than ",
maximum_elements(), " got ", capacity);
}
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
// One more than requested for offsets
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
return ArrayBuilder::Resize(capacity);
}
void Reset() override {
ArrayBuilder::Reset();
offsets_builder_.Reset();
value_builder_->Reset();
}
/// \brief Vector append
///
/// If passed, valid_bytes is of equal length to values, and any zero byte
/// will be considered as a null for that slot
Status AppendValues(const offset_type* offsets, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
ARROW_RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(valid_bytes, length);
offsets_builder_.UnsafeAppend(offsets, length);
return Status::OK();
}
/// \brief Start a new variable-length list slot
///
/// This function should be called before beginning to append elements to the
/// value builder
Status Append(bool is_valid = true) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppendToBitmap(is_valid);
return AppendNextOffset();
}
Status AppendNull() final { return Append(false); }
Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
UnsafeAppendToBitmap(length, false);
const int64_t num_values = value_builder_->length();
for (int64_t i = 0; i < length; ++i) {
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
}
return Status::OK();
}
Status AppendEmptyValue() final { return Append(true); }
Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
UnsafeAppendToBitmap(length, true);
const int64_t num_values = value_builder_->length();
for (int64_t i = 0; i < length; ++i) {
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
}
return Status::OK();
}
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
const offset_type* offsets = array.GetValues<offset_type>(1);
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
ARROW_RETURN_NOT_OK(Append());
int64_t slot_length = offsets[row + 1] - offsets[row];
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0],
offsets[row], slot_length));
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(AppendNextOffset());
// Offset padding zeroed by BufferBuilder
std::shared_ptr<Buffer> offsets, null_bitmap;
ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
if (value_builder_->length() == 0) {
// Try to make sure we get a non-null values buffer (ARROW-2744)
ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
}
std::shared_ptr<ArrayData> items;
ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));
*out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)},
null_count_);
Reset();
return Status::OK();
}
Status ValidateOverflow(int64_t new_elements) const {
auto new_length = value_builder_->length() + new_elements;
if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
return Status::CapacityError("List array cannot contain more than ",
maximum_elements(), " elements, have ", new_elements);
} else {
return Status::OK();
}
}
ArrayBuilder* value_builder() const { return value_builder_.get(); }
// Cannot make this a static attribute because of linking issues
static constexpr int64_t maximum_elements() {
return std::numeric_limits<offset_type>::max() - 1;
}
std::shared_ptr<DataType> type() const override {
return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
}
protected:
TypedBufferBuilder<offset_type> offsets_builder_;
std::shared_ptr<ArrayBuilder> value_builder_;
std::shared_ptr<Field> value_field_;
Status AppendNextOffset() {
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
const int64_t num_values = value_builder_->length();
return offsets_builder_.Append(static_cast<offset_type>(num_values));
}
};
/// \class ListBuilder
/// \brief Builder class for variable-length list array value types
///
/// To use this class, you must append values to the child array builder and use
/// the Append function to delimit each distinct list value (once the values
/// have been appended to the child array) or use the bulk API to append
/// a sequence of offsets and null values.
///
/// A note on types. Per arrow/type.h all types in the c++ implementation are
/// logical, so even though this class always builds a list array, it can
/// represent multiple different logical types. If no logical type is provided
/// at construction time, the class defaults to List<T> where T is taken from the
/// value_builder/values that the object is constructed with.
class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
public:
using BaseListBuilder::BaseListBuilder;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
};
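// A minimal usage sketch for ListBuilder, assuming Int32Builder from
// arrow/array/builder_primitive.h is also visible here; the function name
// BuildListSketch is hypothetical and illustrative only. Each call to Append()
// (or AppendNull()) delimits one list slot; the values themselves go through
// the shared child builder.
inline Status BuildListSketch(std::shared_ptr<ListArray>* out) {
  auto pool = default_memory_pool();
  auto value_builder = std::make_shared<Int32Builder>(pool);
  ListBuilder list_builder(pool, value_builder);
  // list 0: [1, 2]
  ARROW_RETURN_NOT_OK(list_builder.Append());
  ARROW_RETURN_NOT_OK(value_builder->Append(1));
  ARROW_RETURN_NOT_OK(value_builder->Append(2));
  // list 1: null
  ARROW_RETURN_NOT_OK(list_builder.AppendNull());
  // list 2: [] (valid but empty)
  ARROW_RETURN_NOT_OK(list_builder.Append());
  return list_builder.Finish(out);  // [[1, 2], null, []]
}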
/// \class LargeListBuilder
/// \brief Builder class for large variable-length list array value types
///
/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
public:
using BaseListBuilder::BaseListBuilder;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
};
// ----------------------------------------------------------------------
// Map builder
/// \class MapBuilder
/// \brief Builder class for arrays of variable-size maps
///
/// To use this class, you must append values to the key and item array builders
/// and use the Append function to delimit each distinct map (once the keys and items
/// have been appended) or use the bulk API to append a sequence of offsets and null
/// maps.
///
/// Key uniqueness and ordering are not validated.
class ARROW_EXPORT MapBuilder : public ArrayBuilder {
public:
/// Use this constructor to define the built array's type explicitly. If key_builder
/// or item_builder has indeterminate type, this builder will also.
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
const std::shared_ptr<ArrayBuilder>& item_builder,
const std::shared_ptr<DataType>& type);
/// Use this constructor to infer the built array's type. If key_builder or
/// item_builder has indeterminate type, this builder will also.
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
const std::shared_ptr<DataType>& type);
Status Resize(int64_t capacity) override;
void Reset() override;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }
/// \brief Vector append
///
/// If passed, valid_bytes is of equal length to values, and any zero byte
/// will be considered as a null for that slot
Status AppendValues(const int32_t* offsets, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
/// \brief Start a new variable-length map slot
///
/// This function should be called before beginning to append elements to the
/// key and item builders
Status Append();
Status AppendNull() final;
Status AppendNulls(int64_t length) final;
Status AppendEmptyValue() final;
Status AppendEmptyValues(int64_t length) final;
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
const int32_t* offsets = array.GetValues<int32_t>(1);
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
ARROW_RETURN_NOT_OK(Append());
const int64_t slot_length = offsets[row + 1] - offsets[row];
ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
*array.child_data[0]->child_data[0], offsets[row], slot_length));
ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(
*array.child_data[0]->child_data[1], offsets[row], slot_length));
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}
/// \brief Get builder to append keys.
///
  /// Appending a key with this builder should be followed by appending
/// an item or null value with item_builder().
ArrayBuilder* key_builder() const { return key_builder_.get(); }
/// \brief Get builder to append items
///
/// Appending an item with this builder should have been preceded
/// by appending a key with key_builder().
ArrayBuilder* item_builder() const { return item_builder_.get(); }
/// \brief Get builder to add Map entries as struct values.
///
/// This is used instead of key_builder()/item_builder() and allows
/// the Map to be built as a list of struct values.
ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }
std::shared_ptr<DataType> type() const override {
// Key and Item builder may update types, but they don't contain the field names,
// so we need to reconstruct the type. (See ARROW-13735.)
return std::make_shared<MapType>(
field(entries_name_,
struct_({field(key_name_, key_builder_->type(), false),
field(item_name_, item_builder_->type(), item_nullable_)}),
false),
keys_sorted_);
}
Status ValidateOverflow(int64_t new_elements) {
return list_builder_->ValidateOverflow(new_elements);
}
protected:
inline Status AdjustStructBuilderLength();
protected:
bool keys_sorted_ = false;
bool item_nullable_ = false;
std::string entries_name_;
std::string key_name_;
std::string item_name_;
std::shared_ptr<ListBuilder> list_builder_;
std::shared_ptr<ArrayBuilder> key_builder_;
std::shared_ptr<ArrayBuilder> item_builder_;
};
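// A minimal usage sketch for MapBuilder, assuming StringBuilder and
// Int32Builder from the binary and primitive builder headers are also visible
// here; the function name BuildMapSketch is hypothetical and illustrative
// only. Append() delimits one map; keys and items are appended pairwise to the
// child builders.
inline Status BuildMapSketch(std::shared_ptr<MapArray>* out) {
  auto pool = default_memory_pool();
  auto key_builder = std::make_shared<StringBuilder>(pool);
  auto item_builder = std::make_shared<Int32Builder>(pool);
  MapBuilder map_builder(pool, key_builder, item_builder);
  // map 0: {"a": 1, "b": 2}
  ARROW_RETURN_NOT_OK(map_builder.Append());
  ARROW_RETURN_NOT_OK(key_builder->Append("a"));
  ARROW_RETURN_NOT_OK(item_builder->Append(1));
  ARROW_RETURN_NOT_OK(key_builder->Append("b"));
  ARROW_RETURN_NOT_OK(item_builder->Append(2));
  // map 1: null
  ARROW_RETURN_NOT_OK(map_builder.AppendNull());
  return map_builder.Finish(out);
}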
// ----------------------------------------------------------------------
// FixedSizeList builder
/// \class FixedSizeListBuilder
/// \brief Builder class for fixed-length list array value types
class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
public:
/// Use this constructor to define the built array's type explicitly. If value_builder
/// has indeterminate type, this builder will also.
FixedSizeListBuilder(MemoryPool* pool,
std::shared_ptr<ArrayBuilder> const& value_builder,
int32_t list_size);
/// Use this constructor to infer the built array's type. If value_builder has
/// indeterminate type, this builder will also.
FixedSizeListBuilder(MemoryPool* pool,
std::shared_ptr<ArrayBuilder> const& value_builder,
const std::shared_ptr<DataType>& type);
Status Resize(int64_t capacity) override;
void Reset() override;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }
/// \brief Append a valid fixed length list.
///
/// This function affects only the validity bitmap; the child values must be appended
/// using the child array builder.
Status Append();
/// \brief Vector append
///
  /// If passed, valid_bytes will be read and any zero byte
/// will cause the corresponding slot to be null
///
/// This function affects only the validity bitmap; the child values must be appended
/// using the child array builder. This includes appending nulls for null lists.
/// XXX this restriction is confusing, should this method be omitted?
Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);
/// \brief Append a null fixed length list.
///
/// The child array builder will have the appropriate number of nulls appended
/// automatically.
Status AppendNull() final;
/// \brief Append length null fixed length lists.
///
/// The child array builder will have the appropriate number of nulls appended
/// automatically.
Status AppendNulls(int64_t length) final;
Status ValidateOverflow(int64_t new_elements);
Status AppendEmptyValue() final;
Status AppendEmptyValues(int64_t length) final;
Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
*array.child_data[0], list_size_ * (array.offset + row), list_size_));
ARROW_RETURN_NOT_OK(Append());
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}
ArrayBuilder* value_builder() const { return value_builder_.get(); }
std::shared_ptr<DataType> type() const override {
return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
}
// Cannot make this a static attribute because of linking issues
static constexpr int64_t maximum_elements() {
return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
}
protected:
std::shared_ptr<Field> value_field_;
const int32_t list_size_;
std::shared_ptr<ArrayBuilder> value_builder_;
};
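// A minimal usage sketch for FixedSizeListBuilder, assuming FloatBuilder from
// arrow/array/builder_primitive.h is also visible here; the function name
// BuildFixedSizeListSketch is hypothetical and illustrative only. Append()
// marks one list slot; exactly list_size child values belong to each valid
// slot, and AppendNull() pads the child builder automatically.
inline Status BuildFixedSizeListSketch(std::shared_ptr<FixedSizeListArray>* out) {
  auto pool = default_memory_pool();
  auto value_builder = std::make_shared<FloatBuilder>(pool);
  FixedSizeListBuilder list_builder(pool, value_builder, /*list_size=*/3);
  // slot 0: [1.0, 2.0, 3.0]
  ARROW_RETURN_NOT_OK(list_builder.Append());
  ARROW_RETURN_NOT_OK(value_builder->Append(1.0f));
  ARROW_RETURN_NOT_OK(value_builder->Append(2.0f));
  ARROW_RETURN_NOT_OK(value_builder->Append(3.0f));
  // slot 1: null (three child nulls are appended automatically)
  ARROW_RETURN_NOT_OK(list_builder.AppendNull());
  return list_builder.Finish(out);
}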
// ----------------------------------------------------------------------
// Struct
// ---------------------------------------------------------------------------------
// StructArray builder
/// The Append, Resize and Reserve methods act on the StructBuilder itself.
/// Please make sure the corresponding methods of all child builders are called
/// consistently to maintain data-structure consistency.
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
public:
/// If any of field_builders has indeterminate type, this builder will also
StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }
  /// The null bitmap is of equal length to every child field, and any zero byte
  /// will be considered as a null for that field. However, users must call the
  /// append or advance methods of the child builders independently to insert
  /// the actual data.
Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
ARROW_RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(valid_bytes, length);
return Status::OK();
}
/// Append an element to the Struct. All child-builders' Append method must
/// be called independently to maintain data-structure consistency.
Status Append(bool is_valid = true) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppendToBitmap(is_valid);
return Status::OK();
}
/// \brief Append a null value. Automatically appends an empty value to each child
/// builder.
Status AppendNull() final {
for (const auto& field : children_) {
ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
}
return Append(false);
}
/// \brief Append multiple null values. Automatically appends empty values to each
/// child builder.
Status AppendNulls(int64_t length) final {
for (const auto& field : children_) {
ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
}
ARROW_RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(length, false);
return Status::OK();
}
Status AppendEmptyValue() final {
for (const auto& field : children_) {
ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
}
return Append(true);
}
Status AppendEmptyValues(int64_t length) final {
for (const auto& field : children_) {
ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
}
ARROW_RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(length, true);
return Status::OK();
}
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) {
ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i],
array.offset + offset, length));
}
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
ARROW_RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(validity, array.offset + offset, length);
return Status::OK();
}
void Reset() override;
ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
int num_fields() const { return static_cast<int>(children_.size()); }
std::shared_ptr<DataType> type() const override;
private:
std::shared_ptr<DataType> type_;
};
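// A minimal usage sketch for StructBuilder, assuming Int32Builder and
// StringBuilder from the primitive and binary builder headers are also visible
// here; the function name BuildStructSketch is hypothetical and illustrative
// only. Each Append() adds one struct slot; the field values must be appended
// to the child builders independently.
inline Status BuildStructSketch(std::shared_ptr<StructArray>* out) {
  auto pool = default_memory_pool();
  auto id_builder = std::make_shared<Int32Builder>(pool);
  auto name_builder = std::make_shared<StringBuilder>(pool);
  auto struct_type = struct_({field("id", int32()), field("name", utf8())});
  StructBuilder struct_builder(struct_type, pool, {id_builder, name_builder});
  // row 0: {id: 1, name: "a"}
  ARROW_RETURN_NOT_OK(struct_builder.Append());
  ARROW_RETURN_NOT_OK(id_builder->Append(1));
  ARROW_RETURN_NOT_OK(name_builder->Append("a"));
  // row 1: null (empty values are appended to each child automatically)
  ARROW_RETURN_NOT_OK(struct_builder.AppendNull());
  return struct_builder.Finish(out);
}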
/// @}
} // namespace arrow

View File

@ -0,0 +1,539 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <vector>
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
namespace arrow {
class ARROW_EXPORT NullBuilder : public ArrayBuilder {
public:
explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {}
explicit NullBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool())
: NullBuilder(pool) {}
/// \brief Append the specified number of null elements
Status AppendNulls(int64_t length) final {
    if (length < 0) return Status::Invalid("length must be non-negative");
null_count_ += length;
length_ += length;
return Status::OK();
}
/// \brief Append a single null element
Status AppendNull() final { return AppendNulls(1); }
Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }
Status AppendEmptyValue() final { return AppendEmptyValues(1); }
Status Append(std::nullptr_t) { return AppendNull(); }
Status AppendArraySlice(const ArrayData&, int64_t, int64_t length) override {
return AppendNulls(length);
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
std::shared_ptr<DataType> type() const override { return null(); }
Status Finish(std::shared_ptr<NullArray>* out) { return FinishTyped(out); }
};
/// \addtogroup numeric-builders
///
/// @{
/// Base class for all Builders that emit an Array of a scalar numerical type.
template <typename T>
class NumericBuilder : public ArrayBuilder {
public:
using TypeClass = T;
using value_type = typename T::c_type;
using ArrayType = typename TypeTraits<T>::ArrayType;
template <typename T1 = T>
explicit NumericBuilder(
enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
: ArrayBuilder(pool), type_(TypeTraits<T>::type_singleton()), data_builder_(pool) {}
NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
: ArrayBuilder(pool), type_(type), data_builder_(pool) {}
/// Append a single scalar and increase the size if necessary.
Status Append(const value_type val) {
ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
UnsafeAppend(val);
return Status::OK();
}
  /// \brief Append the specified number of null elements
  ///
  /// The memory at the corresponding data slots is set to 0 to prevent
  /// uninitialized memory access
Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, value_type{}); // zero
UnsafeSetNull(length);
return Status::OK();
}
/// \brief Append a single null element
Status AppendNull() final {
ARROW_RETURN_NOT_OK(Reserve(1));
data_builder_.UnsafeAppend(value_type{}); // zero
UnsafeAppendToBitmap(false);
return Status::OK();
}
  /// \brief Append an empty element
Status AppendEmptyValue() final {
ARROW_RETURN_NOT_OK(Reserve(1));
data_builder_.UnsafeAppend(value_type{}); // zero
UnsafeAppendToBitmap(true);
return Status::OK();
}
/// \brief Append several empty elements
Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, value_type{}); // zero
UnsafeSetNotNull(length);
return Status::OK();
}
value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }
void Reset() override { data_builder_.Reset(); }
Status Resize(int64_t capacity) override {
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
capacity = std::max(capacity, kMinBuilderCapacity);
ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
return ArrayBuilder::Resize(capacity);
}
value_type operator[](int64_t index) const { return GetValue(index); }
value_type& operator[](int64_t index) {
return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index];
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const value_type* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this call
ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
return Status::OK();
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] bitmap a validity bitmap to copy (may be null)
/// \param[in] bitmap_offset an offset into the validity bitmap
/// \return Status
Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap,
int64_t bitmap_offset) {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this call
ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length);
return Status::OK();
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const value_type* values, int64_t length,
const std::vector<bool>& is_valid) {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this call
ArrayBuilder::UnsafeAppendToBitmap(is_valid);
return Status::OK();
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of values
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const std::vector<value_type>& values,
const std::vector<bool>& is_valid) {
return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
}
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of values
/// \return Status
Status AppendValues(const std::vector<value_type>& values) {
return AppendValues(values.data(), static_cast<int64_t>(values.size()));
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
null_bitmap_builder_.FinishWithLength(length_));
ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
*out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
capacity_ = length_ = null_count_ = 0;
return Status::OK();
}
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values_begin InputIterator to the beginning of the values
/// \param[in] values_end InputIterator pointing to the end of the values
/// \return Status
template <typename ValuesIter>
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(values_begin, values_end);
// this updates the length_
UnsafeSetNotNull(length);
return Status::OK();
}
/// \brief Append a sequence of elements in one shot, with a specified nullmap
/// \param[in] values_begin InputIterator to the beginning of the values
/// \param[in] values_end InputIterator pointing to the end of the values
  /// \param[in] valid_begin InputIterator with elements indicating valid (1)
  /// or null (0) values.
/// \return Status
template <typename ValuesIter, typename ValidIter>
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
static_assert(!internal::is_null_pointer<ValidIter>::value,
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
"version instead");
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(values_begin, values_end);
null_bitmap_builder_.UnsafeAppend<true>(
length, [&valid_begin]() -> bool { return *valid_begin++; });
length_ = null_bitmap_builder_.length();
null_count_ = null_bitmap_builder_.false_count();
return Status::OK();
}
// Same as above, with a pointer type ValidIter
template <typename ValuesIter, typename ValidIter>
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(values_begin, values_end);
// this updates the length_
if (valid_begin == NULLPTR) {
UnsafeSetNotNull(length);
} else {
null_bitmap_builder_.UnsafeAppend<true>(
length, [&valid_begin]() -> bool { return *valid_begin++; });
length_ = null_bitmap_builder_.length();
null_count_ = null_bitmap_builder_.false_count();
}
return Status::OK();
}
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
return AppendValues(array.GetValues<value_type>(1) + offset, length,
array.GetValues<uint8_t>(0, 0), array.offset + offset);
}
/// Append a single scalar under the assumption that the underlying Buffer is
/// large enough.
///
/// This method does not capacity-check; make sure to call Reserve
/// beforehand.
void UnsafeAppend(const value_type val) {
ArrayBuilder::UnsafeAppendToBitmap(true);
data_builder_.UnsafeAppend(val);
}
void UnsafeAppendNull() {
ArrayBuilder::UnsafeAppendToBitmap(false);
data_builder_.UnsafeAppend(value_type{}); // zero
}
std::shared_ptr<DataType> type() const override { return type_; }
protected:
std::shared_ptr<DataType> type_;
TypedBufferBuilder<value_type> data_builder_;
};
// Builders
using UInt8Builder = NumericBuilder<UInt8Type>;
using UInt16Builder = NumericBuilder<UInt16Type>;
using UInt32Builder = NumericBuilder<UInt32Type>;
using UInt64Builder = NumericBuilder<UInt64Type>;
using Int8Builder = NumericBuilder<Int8Type>;
using Int16Builder = NumericBuilder<Int16Type>;
using Int32Builder = NumericBuilder<Int32Type>;
using Int64Builder = NumericBuilder<Int64Type>;
using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
using FloatBuilder = NumericBuilder<FloatType>;
using DoubleBuilder = NumericBuilder<DoubleType>;
/// @}
/// \addtogroup temporal-builders
///
/// @{
using Date32Builder = NumericBuilder<Date32Type>;
using Date64Builder = NumericBuilder<Date64Type>;
using Time32Builder = NumericBuilder<Time32Type>;
using Time64Builder = NumericBuilder<Time64Type>;
using TimestampBuilder = NumericBuilder<TimestampType>;
using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
using DurationBuilder = NumericBuilder<DurationType>;
/// @}
class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
public:
using TypeClass = BooleanType;
using value_type = bool;
explicit BooleanBuilder(MemoryPool* pool = default_memory_pool());
BooleanBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
  /// \brief Append the specified number of null elements
Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, false);
UnsafeSetNull(length);
return Status::OK();
}
Status AppendNull() final {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppendNull();
return Status::OK();
}
Status AppendEmptyValue() final {
ARROW_RETURN_NOT_OK(Reserve(1));
data_builder_.UnsafeAppend(false);
UnsafeSetNotNull(1);
return Status::OK();
}
Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, false);
UnsafeSetNotNull(length);
return Status::OK();
}
/// Scalar append
Status Append(const bool val) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(val);
return Status::OK();
}
Status Append(const uint8_t val) { return Append(val != 0); }
/// Scalar append, without checking for capacity
void UnsafeAppend(const bool val) {
data_builder_.UnsafeAppend(val);
UnsafeAppendToBitmap(true);
}
void UnsafeAppendNull() {
data_builder_.UnsafeAppend(false);
UnsafeAppendToBitmap(false);
}
void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous array of bytes (non-zero is 1)
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const uint8_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR);
/// \brief Append a sequence of elements in one shot
/// \param[in] values a bitmap of values
/// \param[in] length the number of values to append
/// \param[in] validity a validity bitmap to copy (may be null)
/// \param[in] offset an offset into the values and validity bitmaps
/// \return Status
Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity,
int64_t offset);
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const uint8_t* values, int64_t length,
const std::vector<bool>& is_valid);
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of bytes
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const std::vector<uint8_t>& values,
const std::vector<bool>& is_valid);
/// \brief Append a sequence of elements in one shot
/// \param[in] values a std::vector of bytes
/// \return Status
Status AppendValues(const std::vector<uint8_t>& values);
/// \brief Append a sequence of elements in one shot
/// \param[in] values an std::vector<bool> indicating true (1) or false
/// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
/// (0). Equal in length to values
/// \return Status
Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
/// \brief Append a sequence of elements in one shot
/// \param[in] values an std::vector<bool> indicating true (1) or false
/// \return Status
Status AppendValues(const std::vector<bool>& values);
/// \brief Append a sequence of elements in one shot
/// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
/// \return Status
template <typename ValuesIter>
Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend<false>(
length, [&values_begin]() -> bool { return *values_begin++; });
// this updates length_
UnsafeSetNotNull(length);
return Status::OK();
}
/// \brief Append a sequence of elements in one shot, with a specified nullmap
/// \param[in] values_begin InputIterator to the beginning of the values
/// \param[in] values_end InputIterator pointing to the end of the values
  /// \param[in] valid_begin InputIterator with elements indicating valid (1)
  /// or null (0) values
/// \return Status
template <typename ValuesIter, typename ValidIter>
enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
static_assert(!internal::is_null_pointer<ValidIter>::value,
"Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
"version instead");
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend<false>(
length, [&values_begin]() -> bool { return *values_begin++; });
null_bitmap_builder_.UnsafeAppend<true>(
length, [&valid_begin]() -> bool { return *valid_begin++; });
length_ = null_bitmap_builder_.length();
null_count_ = null_bitmap_builder_.false_count();
return Status::OK();
}
// Same as above, for a pointer type ValidIter
template <typename ValuesIter, typename ValidIter>
enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend<false>(
length, [&values_begin]() -> bool { return *values_begin++; });
if (valid_begin == NULLPTR) {
UnsafeSetNotNull(length);
} else {
null_bitmap_builder_.UnsafeAppend<true>(
length, [&valid_begin]() -> bool { return *valid_begin++; });
}
length_ = null_bitmap_builder_.length();
null_count_ = null_bitmap_builder_.false_count();
return Status::OK();
}
Status AppendValues(int64_t length, bool value);
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
return AppendValues(array.GetValues<uint8_t>(1, 0), length,
array.GetValues<uint8_t>(0, 0), array.offset + offset);
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<BooleanArray>* out) { return FinishTyped(out); }
void Reset() override;
Status Resize(int64_t capacity) override;
std::shared_ptr<DataType> type() const override { return boolean(); }
protected:
TypedBufferBuilder<bool> data_builder_;
};
} // namespace arrow
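
A brief usage sketch of the numeric and boolean builders declared above (illustrative only; values are arbitrary). It uses only the Append, AppendValues, AppendNull, and Finish overloads shown in this header.

#include <memory>
#include <vector>
#include "arrow/api.h"

arrow::Status BuildPrimitiveExample() {
  // Int64Builder: bulk append with a validity vector, then a trailing null.
  arrow::Int64Builder int_builder;
  std::vector<int64_t> values = {1, 2, 3};
  std::vector<bool> is_valid = {true, false, true};
  ARROW_RETURN_NOT_OK(int_builder.AppendValues(values, is_valid));
  ARROW_RETURN_NOT_OK(int_builder.AppendNull());
  std::shared_ptr<arrow::Int64Array> ints;
  ARROW_RETURN_NOT_OK(int_builder.Finish(&ints));

  // BooleanBuilder: scalar appends.
  arrow::BooleanBuilder bool_builder;
  ARROW_RETURN_NOT_OK(bool_builder.Append(true));
  ARROW_RETURN_NOT_OK(bool_builder.AppendNull());
  std::shared_ptr<arrow::BooleanArray> bools;
  return bool_builder.Finish(&bools);
}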

View File

@ -0,0 +1,62 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Contains declarations of time related Arrow builder types.
#pragma once
#include <memory>
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_primitive.h"
namespace arrow {
/// \addtogroup temporal-builders
///
/// @{
// TODO(ARROW-7938): this class is untested
class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
public:
using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;
explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool())
: DayTimeIntervalBuilder(day_time_interval(), pool) {}
explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
MemoryPool* pool = default_memory_pool())
: NumericBuilder<DayTimeIntervalType>(type, pool) {}
};
class ARROW_EXPORT MonthDayNanoIntervalBuilder
: public NumericBuilder<MonthDayNanoIntervalType> {
public:
using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos;
explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool())
: MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool) {}
explicit MonthDayNanoIntervalBuilder(std::shared_ptr<DataType> type,
MemoryPool* pool = default_memory_pool())
: NumericBuilder<MonthDayNanoIntervalType>(type, pool) {}
};
/// @}
} // namespace arrow
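
A short illustrative sketch of the interval builders above. It assumes DayMilliseconds is an aggregate of {days, milliseconds}; the rest uses the NumericBuilder API from the previous header.

#include <memory>
#include "arrow/api.h"

arrow::Status BuildIntervalExample(std::shared_ptr<arrow::Array>* out) {
  arrow::DayTimeIntervalBuilder builder;
  // Assumed aggregate layout: {days, milliseconds}.
  ARROW_RETURN_NOT_OK(
      builder.Append(arrow::DayTimeIntervalType::DayMilliseconds{3, 250}));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  return builder.Finish(out);
}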

View File

@ -0,0 +1,248 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer_builder.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \addtogroup nested-builders
///
/// @{
/// \brief Base class for union array builders.
///
/// Note that while we subclass ArrayBuilder, as union types do not have a
/// validity bitmap, the bitmap builder member of ArrayBuilder is not used.
class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
public:
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
/// \brief Make a new child builder available to the UnionArray
///
/// \param[in] new_child the child builder
/// \param[in] field_name the name of the field in the union array type
/// if type inference is used
/// \return child index, which is the "type" argument that needs
/// to be passed to the "Append" method to add a new element to
/// the union array.
int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
const std::string& field_name = "");
std::shared_ptr<DataType> type() const override;
int64_t length() const override { return types_builder_.length(); }
protected:
BasicUnionBuilder(MemoryPool* pool,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type);
int8_t NextTypeId();
std::vector<std::shared_ptr<Field>> child_fields_;
std::vector<int8_t> type_codes_;
UnionMode::type mode_;
std::vector<ArrayBuilder*> type_id_to_children_;
std::vector<int> type_id_to_child_id_;
// for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
int8_t dense_type_id_ = 0;
TypedBufferBuilder<int8_t> types_builder_;
};
/// \class DenseUnionBuilder
///
/// This API is EXPERIMENTAL.
class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
public:
/// Use this constructor to initialize the UnionBuilder with no child builders,
/// allowing type to be inferred. You will need to call AppendChild for each of the
/// children builders you want to use.
explicit DenseUnionBuilder(MemoryPool* pool)
: BasicUnionBuilder(pool, {}, dense_union(FieldVector{})), offsets_builder_(pool) {}
/// Use this constructor to specify the type explicitly.
/// You can still add child builders to the union after using this constructor
DenseUnionBuilder(MemoryPool* pool,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type)
: BasicUnionBuilder(pool, children, type), offsets_builder_(pool) {}
Status AppendNull() final {
const int8_t first_child_code = type_codes_[0];
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
ARROW_RETURN_NOT_OK(
offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
// Append a null arbitrarily to the first child
return child_builder->AppendNull();
}
Status AppendNulls(int64_t length) final {
const int8_t first_child_code = type_codes_[0];
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
ARROW_RETURN_NOT_OK(
offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
// Append just a single null to the first child
return child_builder->AppendNull();
}
Status AppendEmptyValue() final {
const int8_t first_child_code = type_codes_[0];
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
ARROW_RETURN_NOT_OK(
offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
// Append an empty value arbitrarily to the first child
return child_builder->AppendEmptyValue();
}
Status AppendEmptyValues(int64_t length) final {
const int8_t first_child_code = type_codes_[0];
ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
ARROW_RETURN_NOT_OK(
offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
// Append just a single empty value to the first child
return child_builder->AppendEmptyValue();
}
/// \brief Append an element to the UnionArray. This must be followed
/// by an append to the appropriate child builder.
///
/// \param[in] next_type type_id of the child to which the next value will be appended.
///
/// The corresponding child builder must be appended to independently after this method
/// is called.
Status Append(int8_t next_type) {
ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
if (type_id_to_children_[next_type]->length() == kListMaximumElements) {
return Status::CapacityError(
"a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
"child");
}
auto offset = static_cast<int32_t>(type_id_to_children_[next_type]->length());
return offsets_builder_.Append(offset);
}
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
private:
TypedBufferBuilder<int32_t> offsets_builder_;
};
/// \class SparseUnionBuilder
///
/// This API is EXPERIMENTAL.
class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
public:
/// Use this constructor to initialize the UnionBuilder with no child builders,
/// allowing type to be inferred. You will need to call AppendChild for each of the
/// children builders you want to use.
explicit SparseUnionBuilder(MemoryPool* pool)
: BasicUnionBuilder(pool, {}, sparse_union(FieldVector{})) {}
/// Use this constructor to specify the type explicitly.
/// You can still add child builders to the union after using this constructor
SparseUnionBuilder(MemoryPool* pool,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type)
: BasicUnionBuilder(pool, children, type) {}
/// \brief Append a null value.
///
/// A null is appended to the first child, empty values to the other children.
Status AppendNull() final {
const auto first_child_code = type_codes_[0];
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
}
return Status::OK();
}
/// \brief Append multiple null values.
///
/// Nulls are appended to the first child, empty values to the other children.
Status AppendNulls(int64_t length) final {
const auto first_child_code = type_codes_[0];
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
ARROW_RETURN_NOT_OK(
type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
}
return Status::OK();
}
Status AppendEmptyValue() final {
ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
for (int8_t code : type_codes_) {
ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
}
return Status::OK();
}
Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
for (int8_t code : type_codes_) {
ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
}
return Status::OK();
}
/// \brief Append an element to the UnionArray. This must be followed
/// by an append to the appropriate child builder.
///
/// \param[in] next_type type_id of the child to which the next value will be appended.
///
/// The corresponding child builder must be appended to independently after this method
/// is called, and all other child builders must have null or empty value appended.
Status Append(int8_t next_type) { return types_builder_.Append(next_type); }
Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override;
};
/// @}
} // namespace arrow
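
A minimal dense-union sketch based on the builders above (illustrative; field names are arbitrary). Each Append(type_id) is followed by an append to the corresponding child, as the comments require.

#include <memory>
#include "arrow/api.h"

arrow::Status BuildDenseUnionExample(std::shared_ptr<arrow::UnionArray>* out) {
  arrow::DenseUnionBuilder builder(arrow::default_memory_pool());
  auto ints = std::make_shared<arrow::Int32Builder>();
  auto strs = std::make_shared<arrow::StringBuilder>();
  const int8_t int_code = builder.AppendChild(ints, "i");
  const int8_t str_code = builder.AppendChild(strs, "s");
  // Record which child receives the next value, then append to that child.
  ARROW_RETURN_NOT_OK(builder.Append(int_code));
  ARROW_RETURN_NOT_OK(ints->Append(42));
  ARROW_RETURN_NOT_OK(builder.Append(str_code));
  ARROW_RETURN_NOT_OK(strs->Append("x"));
  return builder.Finish(out);
}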

View File

@ -0,0 +1,37 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \brief Concatenate arrays
///
/// \param[in] arrays a vector of arrays to be concatenated
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// \return the concatenated array
ARROW_EXPORT
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
MemoryPool* pool = default_memory_pool());
} // namespace arrow
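
An illustrative sketch of Concatenate; it assumes two same-typed arrays built elsewhere and simply forwards them.

#include <memory>
#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::Array>> ConcatExample(
    const std::shared_ptr<arrow::Array>& first,
    const std::shared_ptr<arrow::Array>& second) {
  // Both inputs must have the same type; the result is one contiguous array.
  return arrow::Concatenate({first, second});
}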

View File

@ -0,0 +1,258 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <atomic> // IWYU pragma: export
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
// When slicing, we do not know the null count of the sliced range without
// doing some computation. To avoid doing this eagerly, we set the null count
// to -1 (any negative number will do). When Array::null_count is called the
// first time, the null count will be computed. See ARROW-33
constexpr int64_t kUnknownNullCount = -1;
// ----------------------------------------------------------------------
// Generic array data container
/// \class ArrayData
/// \brief Mutable container for generic Arrow array data
///
/// This data structure is a self-contained representation of the memory and
/// metadata inside an Arrow array data structure (called vectors in Java). The
/// classes arrow::Array and its subclasses provide strongly-typed accessors
/// with support for the visitor pattern and other affordances.
///
/// This class is designed for easy internal data manipulation, analytical data
/// processing, and data transport to and from IPC messages. For example, we
/// could cast from int64 to float64 like so:
///
/// Int64Array arr = GetMyData();
/// auto new_data = arr.data()->Copy();
/// new_data->type = arrow::float64();
/// DoubleArray double_arr(new_data);
///
/// This object is also useful in an analytics setting where memory may be
/// reused. For example, if we had a group of operations all returning doubles,
/// say:
///
/// Log(Sqrt(Expr(arr)))
///
/// Then the low-level implementations of each of these functions could have
/// the signatures
///
/// void Log(const ArrayData& values, ArrayData* out);
///
/// As another example a function may consume one or more memory buffers in an
/// input array and replace them with newly-allocated data, changing the output
/// data type as well.
struct ARROW_EXPORT ArrayData {
ArrayData() = default;
ArrayData(std::shared_ptr<DataType> type, int64_t length,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
ArrayData(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: ArrayData(std::move(type), length, null_count, offset) {
this->buffers = std::move(buffers);
}
ArrayData(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: ArrayData(std::move(type), length, null_count, offset) {
this->buffers = std::move(buffers);
this->child_data = std::move(child_data);
}
static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
static std::shared_ptr<ArrayData> Make(
std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
static std::shared_ptr<ArrayData> Make(
std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
// Move constructor
ArrayData(ArrayData&& other) noexcept
: type(std::move(other.type)),
length(other.length),
offset(other.offset),
buffers(std::move(other.buffers)),
child_data(std::move(other.child_data)),
dictionary(std::move(other.dictionary)) {
SetNullCount(other.null_count);
}
// Copy constructor
ArrayData(const ArrayData& other) noexcept
: type(other.type),
length(other.length),
offset(other.offset),
buffers(other.buffers),
child_data(other.child_data),
dictionary(other.dictionary) {
SetNullCount(other.null_count);
}
// Move assignment
ArrayData& operator=(ArrayData&& other) {
type = std::move(other.type);
length = other.length;
SetNullCount(other.null_count);
offset = other.offset;
buffers = std::move(other.buffers);
child_data = std::move(other.child_data);
dictionary = std::move(other.dictionary);
return *this;
}
// Copy assignment
ArrayData& operator=(const ArrayData& other) {
type = other.type;
length = other.length;
SetNullCount(other.null_count);
offset = other.offset;
buffers = other.buffers;
child_data = other.child_data;
dictionary = other.dictionary;
return *this;
}
std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
// Access a buffer's data as a typed C pointer
template <typename T>
inline const T* GetValues(int i, int64_t absolute_offset) const {
if (buffers[i]) {
return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
} else {
return NULLPTR;
}
}
template <typename T>
inline const T* GetValues(int i) const {
return GetValues<T>(i, offset);
}
// Like GetValues, but returns NULLPTR instead of aborting if the underlying
// buffer is not a CPU buffer.
template <typename T>
inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
if (buffers[i] && buffers[i]->is_cpu()) {
return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
} else {
return NULLPTR;
}
}
template <typename T>
inline const T* GetValuesSafe(int i) const {
return GetValuesSafe<T>(i, offset);
}
// Access a buffer's data as a typed C pointer
template <typename T>
inline T* GetMutableValues(int i, int64_t absolute_offset) {
if (buffers[i]) {
return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
} else {
return NULLPTR;
}
}
template <typename T>
inline T* GetMutableValues(int i) {
return GetMutableValues<T>(i, offset);
}
/// \brief Construct a zero-copy slice of the data with the given offset and length
std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
/// \brief Input-checking variant of Slice
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
/// Note that unlike Slice, `length` isn't clamped to the available buffer size.
Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;
void SetNullCount(int64_t v) { null_count.store(v); }
/// \brief Return null count, or compute and set it if it's not known
int64_t GetNullCount() const;
bool MayHaveNulls() const {
// If an ArrayData is slightly malformed it may have kUnknownNullCount set
// but no buffer
return null_count.load() != 0 && buffers[0] != NULLPTR;
}
std::shared_ptr<DataType> type;
int64_t length = 0;
mutable std::atomic<int64_t> null_count{0};
// The logical start point into the physical buffers (in values, not bytes).
// Note that, for child data, this must be *added* to the child data's own offset.
int64_t offset = 0;
std::vector<std::shared_ptr<Buffer>> buffers;
std::vector<std::shared_ptr<ArrayData>> child_data;
// The dictionary for this Array, if any. Only used for dictionary type
std::shared_ptr<ArrayData> dictionary;
};
namespace internal {
/// Construct a zero-copy view of this ArrayData with the given type.
///
/// This method checks if the types are layout-compatible.
/// Nested types are traversed in depth-first order. Data buffers must have
/// the same item sizes, even though the logical types may be different.
/// An error is returned if the types are not layout-compatible.
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
const std::shared_ptr<DataType>& type);
} // namespace internal
} // namespace arrow
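
A small sketch expanding the docstring's int64-to-float64 example above. Note this is a zero-copy reinterpretation of the shared buffers, not a value cast (illustrative only).

#include <memory>
#include <vector>
#include "arrow/api.h"

arrow::Status ReinterpretExample() {
  arrow::Int64Builder builder;
  std::vector<int64_t> values = {1, 2, 3};
  ARROW_RETURN_NOT_OK(builder.AppendValues(values));
  std::shared_ptr<arrow::Int64Array> ints;
  ARROW_RETURN_NOT_OK(builder.Finish(&ints));
  // Copy the ArrayData descriptor (buffers are shared), swap the logical type,
  // and wrap the same memory as a DoubleArray. int64 and float64 share an
  // 8-byte fixed-width layout, so the view is layout-compatible.
  std::shared_ptr<arrow::ArrayData> data = ints->data()->Copy();
  data->type = arrow::float64();
  arrow::DoubleArray doubles(data);
  (void)doubles;
  return arrow::Status::OK();
}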

View File

@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <functional>
#include <iosfwd>
#include <memory>
#include "arrow/array/array_base.h"
#include "arrow/array/array_nested.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \brief Compare two arrays, returning an edit script which expresses the difference
/// between them
///
/// An edit script is an array of struct(insert: bool, run_length: int64_t).
/// Each element of "insert" determines whether an element was inserted into (true)
/// or deleted from (false) base. Each insertion or deletion is followed by a run of
/// elements which are unchanged from base to target; the length of this run is stored
/// in "run_length". (Note that the edit script begins and ends with a run of shared
/// elements but both fields of the struct must have the same length. To accommodate this
/// the first element of "insert" should be ignored.)
///
/// For example for base "hlloo" and target "hello", the edit script would be
/// [
/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
/// {"insert": false, "run_length": 0} // delete("o") then an empty run
/// ]
///
/// Diffing arrays containing nulls is not currently supported.
///
/// \param[in] base baseline for comparison
/// \param[in] target an array of identical type to base whose elements differ from base's
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// \return an edit script array which can be applied to base to produce target
ARROW_EXPORT
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
MemoryPool* pool = default_memory_pool());
/// \brief visitor interface for easy traversal of an edit script
///
/// visitor will be called for each hunk of insertions and deletions.
ARROW_EXPORT Status VisitEditScript(
const Array& edits,
const std::function<Status(int64_t delete_begin, int64_t delete_end,
int64_t insert_begin, int64_t insert_end)>& visitor);
/// \brief return a function which will format an edit script in unified
/// diff format to os, given base and target arrays of type
ARROW_EXPORT Result<
std::function<Status(const Array& edits, const Array& base, const Array& target)>>
MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
} // namespace arrow
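
A hedged sketch of Diff together with the unified-diff formatter declared above; the include path for this header is assumed, and the base/target arrays are supplied by the caller.

#include <iostream>
#include <memory>
#include "arrow/api.h"
#include "arrow/array/diff.h"  // assumed include path for Diff / MakeUnifiedDiffFormatter

arrow::Status PrintDiffExample(const arrow::Array& base, const arrow::Array& target) {
  // Compute the edit script, then print it in unified diff format to stdout.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::StructArray> edits,
                        arrow::Diff(base, target));
  ARROW_ASSIGN_OR_RAISE(auto format,
                        arrow::MakeUnifiedDiffFormatter(*base.type(), &std::cout));
  return format(*edits, base, target);
}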

View File

@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <vector>
#include "arrow/array/data.h"
#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \brief Create a strongly-typed Array instance from generic ArrayData
/// \param[in] data the array contents
/// \return the resulting Array instance
ARROW_EXPORT
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
/// \brief Create a strongly-typed Array instance with all elements null
/// \param[in] type the array type
/// \param[in] length the array length
/// \param[in] pool the memory pool to allocate memory from
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
int64_t length,
MemoryPool* pool = default_memory_pool());
/// \brief Create an Array instance whose slots are the given scalar
/// \param[in] scalar the value with which to fill the array
/// \param[in] length the array length
/// \param[in] pool the memory pool to allocate memory from
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeArrayFromScalar(
const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
/// \brief Create an empty Array of a given type
///
/// The output Array will be of the given type.
///
/// \param[in] type the data type of the empty Array
/// \param[in] pool the memory pool to allocate memory from
/// \return the resulting Array
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
MemoryPool* pool = default_memory_pool());
namespace internal {
/// \brief Swap endian of each element in a generic ArrayData
///
/// As dictionaries are often shared between different arrays, dictionaries
/// are not swapped by this function and should be handled separately.
///
/// \param[in] data the array contents
/// \return the resulting ArrayData whose elements were swapped
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
const std::shared_ptr<ArrayData>& data);
/// Given a number of ArrayVectors, treat each ArrayVector as the
/// chunks of a chunked array. Then rechunk each ArrayVector such that
/// all ArrayVectors are chunked identically. It is mandatory that
/// all ArrayVectors contain the same total number of elements.
ARROW_EXPORT
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);
} // namespace internal
} // namespace arrow
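
Illustrative use of the array factory helpers declared above (values are arbitrary).

#include <memory>
#include "arrow/api.h"

arrow::Status FactoryExample() {
  // Five null int32 slots.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> nulls,
                        arrow::MakeArrayOfNull(arrow::int32(), 5));
  // Five copies of the scalar 7.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> sevens,
                        arrow::MakeArrayFromScalar(arrow::Int32Scalar(7), 5));
  // An empty float64 array.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> empty,
                        arrow::MakeEmptyArray(arrow::float64()));
  (void)nulls;
  (void)sevens;
  (void)empty;
  return arrow::Status::OK();
}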

View File

@ -0,0 +1,56 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
// Internal functions implementing Array::Validate() and friends.
// O(1) array metadata validation
ARROW_EXPORT
Status ValidateArray(const Array& array);
ARROW_EXPORT
Status ValidateArray(const ArrayData& data);
// O(N) array data validation.
// Note that, starting from 7.0.0, "full" routines also validate metadata.
// Before, ValidateArray() needed to be called before ValidateArrayFull()
// to ensure metadata correctness, otherwise invalid memory accesses
// may occur.
ARROW_EXPORT
Status ValidateArrayFull(const Array& array);
ARROW_EXPORT
Status ValidateArrayFull(const ArrayData& data);
ARROW_EXPORT
Status ValidateUTF8(const Array& array);
ARROW_EXPORT
Status ValidateUTF8(const ArrayData& data);
} // namespace internal
} // namespace arrow
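
Per the comment above, these are internal entry points; user code would normally go through the corresponding public Array methods. A short hedged sketch:

#include "arrow/api.h"

arrow::Status ValidateExample(const arrow::Array& array) {
  // O(1) metadata checks.
  ARROW_RETURN_NOT_OK(array.Validate());
  // O(N) data checks (which, per the note above, also cover metadata since 7.0.0).
  return array.ValidateFull();
}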

View File

@ -0,0 +1,506 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/device.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h"
#include "arrow/util/visibility.h"
namespace arrow {
// ----------------------------------------------------------------------
// Buffer classes
/// \class Buffer
/// \brief Object containing a pointer to a piece of contiguous memory with a
/// particular size.
///
/// Buffers have two related notions of length: size and capacity. Size is
/// the number of bytes that might have valid data. Capacity is the number
/// of bytes that were allocated for the buffer in total.
///
/// The Buffer base class does not own its memory, but subclasses often do.
///
/// The following invariant is always true: Size <= Capacity
class ARROW_EXPORT Buffer {
public:
/// \brief Construct from buffer and size without copying memory
///
/// \param[in] data a memory buffer
/// \param[in] size buffer size
///
/// \note The passed memory must be kept alive through some other means
Buffer(const uint8_t* data, int64_t size)
: is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) {
SetMemoryManager(default_cpu_memory_manager());
}
Buffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
std::shared_ptr<Buffer> parent = NULLPTR)
: is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) {
SetMemoryManager(std::move(mm));
}
Buffer(uintptr_t address, int64_t size, std::shared_ptr<MemoryManager> mm,
std::shared_ptr<Buffer> parent = NULLPTR)
: Buffer(reinterpret_cast<const uint8_t*>(address), size, std::move(mm),
std::move(parent)) {}
/// \brief Construct from string_view without copying memory
///
/// \param[in] data a string_view object
///
/// \note The memory viewed by data must not be deallocated in the lifetime of the
/// Buffer; temporary rvalue strings must be stored in an lvalue somewhere
explicit Buffer(util::string_view data)
: Buffer(reinterpret_cast<const uint8_t*>(data.data()),
static_cast<int64_t>(data.size())) {}
virtual ~Buffer() = default;
/// An offset into data that is owned by another buffer, but we want to be
/// able to retain a valid pointer to it even after other shared_ptr's to the
/// parent buffer have been destroyed
///
/// This method makes no assertions about alignment or padding of the buffer but
  /// in general we expect buffers to be aligned and padded to 64 bytes. In the future
/// we might add utility methods to help determine if a buffer satisfies this contract.
Buffer(const std::shared_ptr<Buffer>& parent, const int64_t offset, const int64_t size)
: Buffer(parent->data_ + offset, size) {
parent_ = parent;
SetMemoryManager(parent->memory_manager_);
}
uint8_t operator[](std::size_t i) const { return data_[i]; }
/// \brief Construct a new std::string with a hexadecimal representation of the buffer.
/// \return std::string
std::string ToHexString();
/// Return true if both buffers are the same size and contain the same bytes
/// up to the number of compared bytes
bool Equals(const Buffer& other, int64_t nbytes) const;
/// Return true if both buffers are the same size and contain the same bytes
bool Equals(const Buffer& other) const;
/// Copy a section of the buffer into a new Buffer.
Result<std::shared_ptr<Buffer>> CopySlice(
const int64_t start, const int64_t nbytes,
MemoryPool* pool = default_memory_pool()) const;
/// Zero bytes in padding, i.e. bytes between size_ and capacity_.
void ZeroPadding() {
#ifndef NDEBUG
CheckMutable();
#endif
// A zero-capacity buffer can have a null data pointer
if (capacity_ != 0) {
memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
}
}
/// \brief Construct an immutable buffer that takes ownership of the contents
/// of an std::string (without copying it).
///
/// \param[in] data a string to own
/// \return a new Buffer instance
static std::shared_ptr<Buffer> FromString(std::string data);
/// \brief Create buffer referencing typed memory with some length without
/// copying
/// \param[in] data the typed memory as C array
/// \param[in] length the number of values in the array
/// \return a new shared_ptr<Buffer>
template <typename T, typename SizeType = int64_t>
static std::shared_ptr<Buffer> Wrap(const T* data, SizeType length) {
return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data),
static_cast<int64_t>(sizeof(T) * length));
}
/// \brief Create buffer referencing std::vector with some length without
/// copying
/// \param[in] data the vector to be referenced. If this vector is changed,
/// the buffer may become invalid
/// \return a new shared_ptr<Buffer>
template <typename T>
static std::shared_ptr<Buffer> Wrap(const std::vector<T>& data) {
return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data.data()),
static_cast<int64_t>(sizeof(T) * data.size()));
}
/// \brief Copy buffer contents into a new std::string
/// \return std::string
/// \note Can throw std::bad_alloc if buffer is large
std::string ToString() const;
/// \brief View buffer contents as a util::string_view
/// \return util::string_view
explicit operator util::string_view() const {
return util::string_view(reinterpret_cast<const char*>(data_), size_);
}
/// \brief View buffer contents as a util::bytes_view
/// \return util::bytes_view
explicit operator util::bytes_view() const { return util::bytes_view(data_, size_); }
/// \brief Return a pointer to the buffer's data
///
/// The buffer has to be a CPU buffer (`is_cpu()` is true).
/// Otherwise, an assertion may be thrown or a null pointer may be returned.
///
/// To get the buffer's data address regardless of its device, call `address()`.
const uint8_t* data() const {
#ifndef NDEBUG
CheckCPU();
#endif
return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
}
/// \brief Return a writable pointer to the buffer's data
///
/// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
/// are true). Otherwise, an assertion may be thrown or a null pointer may
/// be returned.
///
/// To get the buffer's mutable data address regardless of its device, call
/// `mutable_address()`.
uint8_t* mutable_data() {
#ifndef NDEBUG
CheckCPU();
CheckMutable();
#endif
return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
: NULLPTR;
}
/// \brief Return the device address of the buffer's data
uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }
/// \brief Return a writable device address to the buffer's data
///
/// The buffer has to be a mutable buffer (`is_mutable()` is true).
/// Otherwise, an assertion may be thrown or 0 may be returned.
uintptr_t mutable_address() const {
#ifndef NDEBUG
CheckMutable();
#endif
return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
}
/// \brief Return the buffer's size in bytes
int64_t size() const { return size_; }
/// \brief Return the buffer's capacity (number of allocated bytes)
int64_t capacity() const { return capacity_; }
/// \brief Whether the buffer is directly CPU-accessible
///
/// If this function returns true, you can read directly from the buffer's
/// `data()` pointer. Otherwise, you'll have to `View()` or `Copy()` it.
bool is_cpu() const { return is_cpu_; }
/// \brief Whether the buffer is mutable
///
/// If this function returns true, you are allowed to modify buffer contents
/// using the pointer returned by `mutable_data()` or `mutable_address()`.
bool is_mutable() const { return is_mutable_; }
const std::shared_ptr<Device>& device() const { return memory_manager_->device(); }
const std::shared_ptr<MemoryManager>& memory_manager() const { return memory_manager_; }
std::shared_ptr<Buffer> parent() const { return parent_; }
/// \brief Get a RandomAccessFile for reading a buffer
///
/// The returned file object reads from this buffer's underlying memory.
static Result<std::shared_ptr<io::RandomAccessFile>> GetReader(std::shared_ptr<Buffer>);
/// \brief Get a OutputStream for writing to a buffer
///
/// The buffer must be mutable. The returned stream object writes into the buffer's
/// underlying memory (but it won't resize it).
static Result<std::shared_ptr<io::OutputStream>> GetWriter(std::shared_ptr<Buffer>);
/// \brief Copy buffer
///
/// The buffer contents will be copied into a new buffer allocated by the
/// given MemoryManager. This function supports cross-device copies.
static Result<std::shared_ptr<Buffer>> Copy(std::shared_ptr<Buffer> source,
const std::shared_ptr<MemoryManager>& to);
/// \brief Copy a non-owned buffer
///
/// This is useful for cases where the source memory area is externally managed
/// (its lifetime not tied to the source Buffer), otherwise please use Copy().
static Result<std::unique_ptr<Buffer>> CopyNonOwned(
const Buffer& source, const std::shared_ptr<MemoryManager>& to);
/// \brief View buffer
///
/// Return a Buffer that reflects this buffer, seen potentially from another
/// device, without making an explicit copy of the contents. The underlying
/// mechanism is typically implemented by the kernel or device driver, and may
/// involve lazy caching of parts of the buffer contents on the destination
/// device's memory.
///
/// If a non-copy view is unsupported for the buffer on the given device,
/// nullptr is returned. An error can be returned if some low-level
/// operation fails (such as an out-of-memory condition).
static Result<std::shared_ptr<Buffer>> View(std::shared_ptr<Buffer> source,
const std::shared_ptr<MemoryManager>& to);
/// \brief View or copy buffer
///
/// Try to view buffer contents on the given MemoryManager's device, but
/// fall back to copying if a no-copy view isn't supported.
static Result<std::shared_ptr<Buffer>> ViewOrCopy(
std::shared_ptr<Buffer> source, const std::shared_ptr<MemoryManager>& to);
protected:
bool is_mutable_;
bool is_cpu_;
const uint8_t* data_;
int64_t size_;
int64_t capacity_;
// null by default, but may be set
std::shared_ptr<Buffer> parent_;
private:
// private so that subclasses are forced to call SetMemoryManager()
std::shared_ptr<MemoryManager> memory_manager_;
protected:
void CheckMutable() const;
void CheckCPU() const;
void SetMemoryManager(std::shared_ptr<MemoryManager> mm) {
memory_manager_ = std::move(mm);
is_cpu_ = memory_manager_->is_cpu();
}
private:
Buffer() = delete;
ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);
};
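// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original header): reading a Buffer that may
// live on a non-CPU device. `default_cpu_memory_manager()` is assumed to be
// declared in arrow/device.h; everything else is declared above.
//
//   arrow::Status ReadAnywhere(std::shared_ptr<arrow::Buffer> buf) {
//     if (!buf->is_cpu()) {
//       // View the contents on the CPU, falling back to a copy if a
//       // zero-copy view is not supported for the source device.
//       ARROW_ASSIGN_OR_RAISE(
//           buf, arrow::Buffer::ViewOrCopy(buf, arrow::default_cpu_memory_manager()));
//     }
//     const uint8_t* bytes = buf->data();  // valid: buf is now a CPU buffer
//     int64_t n = buf->size();
//     (void)bytes;
//     (void)n;
//     return arrow::Status::OK();
//   }
// ---------------------------------------------------------------------------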
/// \defgroup buffer-slicing-functions Functions for slicing buffers
///
/// @{
/// \brief Construct a view on a buffer at the given offset and length.
///
/// This function cannot fail and does not check for errors (except in debug builds)
static inline std::shared_ptr<Buffer> SliceBuffer(const std::shared_ptr<Buffer>& buffer,
const int64_t offset,
const int64_t length) {
return std::make_shared<Buffer>(buffer, offset, length);
}
/// \brief Construct a view on a buffer at the given offset, up to the buffer's end.
///
/// This function cannot fail and does not check for errors (except in debug builds)
static inline std::shared_ptr<Buffer> SliceBuffer(const std::shared_ptr<Buffer>& buffer,
const int64_t offset) {
int64_t length = buffer->size() - offset;
return SliceBuffer(buffer, offset, length);
}
/// \brief Input-checking version of SliceBuffer
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SliceBufferSafe(const std::shared_ptr<Buffer>& buffer,
int64_t offset);
/// \brief Input-checking version of SliceBuffer
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SliceBufferSafe(const std::shared_ptr<Buffer>& buffer,
int64_t offset, int64_t length);
/// \brief Like SliceBuffer, but construct a mutable buffer slice.
///
/// If the parent buffer is not mutable, behavior is undefined (it may abort
/// in debug builds).
ARROW_EXPORT
std::shared_ptr<Buffer> SliceMutableBuffer(const std::shared_ptr<Buffer>& buffer,
const int64_t offset, const int64_t length);
/// \brief Like SliceBuffer, but construct a mutable buffer slice.
///
/// If the parent buffer is not mutable, behavior is undefined (it may abort
/// in debug builds).
static inline std::shared_ptr<Buffer> SliceMutableBuffer(
const std::shared_ptr<Buffer>& buffer, const int64_t offset) {
int64_t length = buffer->size() - offset;
return SliceMutableBuffer(buffer, offset, length);
}
/// \brief Input-checking version of SliceMutableBuffer
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(
const std::shared_ptr<Buffer>& buffer, int64_t offset);
/// \brief Input-checking version of SliceMutableBuffer
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(
const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length);
/// @}
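// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only): slicing a buffer with and without
// bounds checking. `Buffer::FromString` is assumed to be declared in the part
// of this header not shown here; the slicing helpers are the ones above.
//
//   arrow::Status SliceDemo() {
//     std::shared_ptr<arrow::Buffer> buf = arrow::Buffer::FromString("hello world");
//     // Zero-copy view of the last five bytes ("world"); no bounds checking.
//     std::shared_ptr<arrow::Buffer> tail = arrow::SliceBuffer(buf, 6);
//     // Bounds-checked variant: returns an Invalid Status for out-of-range slices.
//     ARROW_ASSIGN_OR_RAISE(auto head, arrow::SliceBufferSafe(buf, 0, 5));  // "hello"
//     return arrow::Status::OK();
//   }
// ---------------------------------------------------------------------------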
/// \class MutableBuffer
/// \brief A Buffer whose contents can be mutated. May or may not own its data.
class ARROW_EXPORT MutableBuffer : public Buffer {
public:
MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) {
is_mutable_ = true;
}
MutableBuffer(uint8_t* data, const int64_t size, std::shared_ptr<MemoryManager> mm)
: Buffer(data, size, std::move(mm)) {
is_mutable_ = true;
}
MutableBuffer(const std::shared_ptr<Buffer>& parent, const int64_t offset,
const int64_t size);
/// \brief Create buffer referencing typed memory with some length
/// \param[in] data the typed memory as C array
/// \param[in] length the number of values in the array
/// \return a new shared_ptr<Buffer>
template <typename T, typename SizeType = int64_t>
static std::shared_ptr<Buffer> Wrap(T* data, SizeType length) {
return std::make_shared<MutableBuffer>(reinterpret_cast<uint8_t*>(data),
static_cast<int64_t>(sizeof(T) * length));
}
protected:
MutableBuffer() : Buffer(NULLPTR, 0) {}
};
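// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only): wrapping caller-owned typed memory with
// MutableBuffer::Wrap. Assumes <vector> is available; no data is copied, so
// `values` must outlive the returned buffer.
//
//   std::vector<int32_t> values = {1, 2, 3, 4};
//   std::shared_ptr<arrow::Buffer> wrapped =
//       arrow::MutableBuffer::Wrap(values.data(), values.size());
//   // wrapped->size() == 4 * sizeof(int32_t) == 16 bytes
// ---------------------------------------------------------------------------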
/// \class ResizableBuffer
/// \brief A mutable buffer that can be resized
class ARROW_EXPORT ResizableBuffer : public MutableBuffer {
public:
/// Change the buffer's reported size to the indicated size, allocating memory
/// if necessary. This ensures that the capacity of the buffer is a multiple
/// of 64 bytes, as defined in Layout.md.
/// Consider calling ZeroPadding afterwards to conform to the Arrow layout
/// specification.
///
/// @param new_size The new size for the buffer.
/// @param shrink_to_fit Whether to shrink the capacity if new size < current size
virtual Status Resize(const int64_t new_size, bool shrink_to_fit) = 0;
Status Resize(const int64_t new_size) {
return Resize(new_size, /*shrink_to_fit=*/true);
}
/// Ensure that the buffer has enough memory allocated to fit the indicated
/// capacity (and meets the 64-byte padding requirement in Layout.md).
/// This does not change the buffer's reported size and does not zero the padding.
virtual Status Reserve(const int64_t new_capacity) = 0;
template <class T>
Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) {
return Resize(sizeof(T) * new_nb_elements, shrink_to_fit);
}
template <class T>
Status TypedReserve(const int64_t new_nb_elements) {
return Reserve(sizeof(T) * new_nb_elements);
}
protected:
ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {}
ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm)
: MutableBuffer(data, size, std::move(mm)) {}
};
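// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only): the difference between Reserve and
// Resize, using AllocateResizableBuffer, which is declared just below.
//
//   arrow::Status GrowDemo() {
//     ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ResizableBuffer> buf,
//                           arrow::AllocateResizableBuffer(64));
//     ARROW_RETURN_NOT_OK(buf->Reserve(1024));  // grows capacity, size stays 64
//     ARROW_RETURN_NOT_OK(buf->Resize(256));    // changes the reported size
//     return arrow::Status::OK();
//   }
// ---------------------------------------------------------------------------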
/// \defgroup buffer-allocation-functions Functions for allocating buffers
///
/// @{
/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding.
///
/// \param[in] size size of buffer to allocate
/// \param[in] pool a memory pool
ARROW_EXPORT
Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size,
MemoryPool* pool = NULLPTR);
/// \brief Allocate a resizable buffer from a memory pool, zero its padding.
///
/// \param[in] size size of buffer to allocate
/// \param[in] pool a memory pool
ARROW_EXPORT
Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(
const int64_t size, MemoryPool* pool = NULLPTR);
/// \brief Allocate a bitmap buffer from a memory pool
///
/// No guarantee is provided on the initial values of the bits.
///
/// \param[in] length size in bits of bitmap to allocate
/// \param[in] pool memory pool to allocate memory from
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> AllocateBitmap(int64_t length,
MemoryPool* pool = NULLPTR);
ARROW_EXPORT
Status AllocateBitmap(MemoryPool* pool, int64_t length, std::shared_ptr<Buffer>* out);
/// \brief Allocate a zero-initialized bitmap buffer from a memory pool
///
/// \param[in] length size in bits of bitmap to allocate
/// \param[in] pool memory pool to allocate memory from
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length,
MemoryPool* pool = NULLPTR);
/// \brief Concatenate multiple buffers into a single buffer
///
/// \param[in] buffers the buffers to be concatenated
/// \param[in] pool memory pool to allocate the new buffer from
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> ConcatenateBuffers(const BufferVector& buffers,
MemoryPool* pool = NULLPTR);
ARROW_EXPORT
Status ConcatenateBuffers(const BufferVector& buffers, MemoryPool* pool,
std::shared_ptr<Buffer>* out);
/// @}
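// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only): allocating buffers and bitmaps from a
// memory pool and concatenating them. Only declarations from this group are
// used.
//
//   arrow::Status AllocDemo(arrow::MemoryPool* pool) {
//     ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::Buffer> data,
//                           arrow::AllocateBuffer(128, pool));
//     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> bits,
//                           arrow::AllocateEmptyBitmap(100, pool));  // 100 bits, all zero
//     std::shared_ptr<arrow::Buffer> data_shared = std::move(data);
//     // Concatenate into a single contiguous buffer allocated from `pool`.
//     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> combined,
//                           arrow::ConcatenateBuffers({data_shared, bits}, pool));
//     return arrow::Status::OK();
//   }
// ---------------------------------------------------------------------------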
} // namespace arrow

View File

@ -0,0 +1,459 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_generate.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
#include "arrow/util/visibility.h"
namespace arrow {
// ----------------------------------------------------------------------
// Buffer builder classes
/// \class BufferBuilder
/// \brief A class for incrementally building a contiguous chunk of in-memory
/// data
class ARROW_EXPORT BufferBuilder {
public:
explicit BufferBuilder(MemoryPool* pool = default_memory_pool())
: pool_(pool),
data_(/*ensure never null to make ubsan happy and avoid check penalties below*/
util::MakeNonNull<uint8_t>()),
capacity_(0),
size_(0) {}
/// \brief Constructs new Builder that will start using
/// the provided buffer until Finish/Reset are called.
/// The buffer is not resized.
explicit BufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
MemoryPool* pool = default_memory_pool())
: buffer_(std::move(buffer)),
pool_(pool),
data_(buffer_->mutable_data()),
capacity_(buffer_->capacity()),
size_(buffer_->size()) {}
/// \brief Resize the buffer to the nearest multiple of 64 bytes
///
/// \param new_capacity the new capacity of the builder. Will be rounded
/// up to a multiple of 64 bytes for padding.
/// \param shrink_to_fit if the new capacity is smaller than the existing
/// capacity, reallocate the internal buffer. Set to false to avoid
/// reallocations when shrinking the builder.
/// \return Status
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
if (buffer_ == NULLPTR) {
ARROW_ASSIGN_OR_RAISE(buffer_, AllocateResizableBuffer(new_capacity, pool_));
} else {
ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
}
capacity_ = buffer_->capacity();
data_ = buffer_->mutable_data();
return Status::OK();
}
/// \brief Ensure that the builder can accommodate the given number of
/// additional bytes without the need to perform further allocations
///
/// \param[in] additional_bytes number of additional bytes to make space for
/// \return Status
Status Reserve(const int64_t additional_bytes) {
auto min_capacity = size_ + additional_bytes;
if (min_capacity <= capacity_) {
return Status::OK();
}
return Resize(GrowByFactor(capacity_, min_capacity), false);
}
/// \brief Return a capacity expanded by the desired growth factor
static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) {
// Doubling capacity except for large Reserve requests. 2x growth strategy
// (versus 1.5x) seems to have slightly better performance when using
// jemalloc, but significantly better performance when using the system
// allocator. See ARROW-6450 for further discussion
return std::max(new_capacity, current_capacity * 2);
}
/// \brief Append the given data to the buffer
///
/// The buffer is automatically expanded if necessary.
Status Append(const void* data, const int64_t length) {
if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) {
ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false));
}
UnsafeAppend(data, length);
return Status::OK();
}
/// \brief Append copies of a value to the buffer
///
/// The buffer is automatically expanded if necessary.
Status Append(const int64_t num_copies, uint8_t value) {
ARROW_RETURN_NOT_OK(Reserve(num_copies));
UnsafeAppend(num_copies, value);
return Status::OK();
}
// Advance pointer and zero out memory
Status Advance(const int64_t length) { return Append(length, 0); }
// Advance pointer, but don't allocate or zero memory
void UnsafeAdvance(const int64_t length) { size_ += length; }
// Unsafe methods don't check existing size
void UnsafeAppend(const void* data, const int64_t length) {
memcpy(data_ + size_, data, static_cast<size_t>(length));
size_ += length;
}
void UnsafeAppend(const int64_t num_copies, uint8_t value) {
memset(data_ + size_, value, static_cast<size_t>(num_copies));
size_ += num_copies;
}
/// \brief Return result of builder as a Buffer object.
///
/// The builder is reset and can be reused afterwards.
///
/// \param[out] out the finalized Buffer object
/// \param shrink_to_fit if the buffer size is smaller than its capacity,
/// reallocate to fit more tightly in memory. Set to false to avoid
/// a reallocation, at the expense of potentially more memory consumption.
/// \return Status
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
if (size_ != 0) buffer_->ZeroPadding();
*out = buffer_;
if (*out == NULLPTR) {
ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, pool_));
}
Reset();
return Status::OK();
}
Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
std::shared_ptr<Buffer> out;
ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
return out;
}
/// \brief Like Finish, but override the final buffer size
///
/// This is useful after writing data directly into the builder memory
/// without calling the Append methods (basically, when using BufferBuilder
/// mostly for memory allocation).
Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
bool shrink_to_fit = true) {
size_ = final_length;
return Finish(shrink_to_fit);
}
void Reset() {
buffer_ = NULLPTR;
capacity_ = size_ = 0;
}
/// \brief Set the size to a smaller value without modifying the builder's
/// contents, so that a BufferBuilder instance can be reused
/// \param[in] position must be non-negative and less than or equal
/// to the current length()
void Rewind(int64_t position) { size_ = position; }
int64_t capacity() const { return capacity_; }
int64_t length() const { return size_; }
const uint8_t* data() const { return data_; }
uint8_t* mutable_data() { return data_; }
private:
std::shared_ptr<ResizableBuffer> buffer_;
MemoryPool* pool_;
uint8_t* data_;
int64_t capacity_;
int64_t size_;
};
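// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only): incrementally building a buffer of raw
// bytes. Uses only the BufferBuilder methods declared above.
//
//   arrow::Result<std::shared_ptr<arrow::Buffer>> BuildGreeting() {
//     arrow::BufferBuilder builder;
//     ARROW_RETURN_NOT_OK(builder.Append("hello ", 6));
//     ARROW_RETURN_NOT_OK(builder.Append("world", 5));
//     // Finish() zero-pads the buffer, returns it, and resets the builder.
//     return builder.Finish();
//   }
// ---------------------------------------------------------------------------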
template <typename T, typename Enable = void>
class TypedBufferBuilder;
/// \brief A BufferBuilder for building a buffer of arithmetic elements
template <typename T>
class TypedBufferBuilder<
T, typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_standard_layout<T>::value>::type> {
public:
explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool())
: bytes_builder_(pool) {}
explicit TypedBufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
MemoryPool* pool = default_memory_pool())
: bytes_builder_(std::move(buffer), pool) {}
explicit TypedBufferBuilder(BufferBuilder builder)
: bytes_builder_(std::move(builder)) {}
BufferBuilder* bytes_builder() { return &bytes_builder_; }
Status Append(T value) {
return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
}
Status Append(const T* values, int64_t num_elements) {
return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values),
num_elements * sizeof(T));
}
Status Append(const int64_t num_copies, T value) {
ARROW_RETURN_NOT_OK(Reserve(num_copies + length()));
UnsafeAppend(num_copies, value);
return Status::OK();
}
void UnsafeAppend(T value) {
bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T));
}
void UnsafeAppend(const T* values, int64_t num_elements) {
bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values),
num_elements * sizeof(T));
}
template <typename Iter>
void UnsafeAppend(Iter values_begin, Iter values_end) {
int64_t num_elements = static_cast<int64_t>(std::distance(values_begin, values_end));
auto data = mutable_data() + length();
bytes_builder_.UnsafeAdvance(num_elements * sizeof(T));
std::copy(values_begin, values_end, data);
}
void UnsafeAppend(const int64_t num_copies, T value) {
auto data = mutable_data() + length();
bytes_builder_.UnsafeAdvance(num_copies * sizeof(T));
std::fill(data, data + num_copies, value);
}
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit);
}
Status Reserve(const int64_t additional_elements) {
return bytes_builder_.Reserve(additional_elements * sizeof(T));
}
Status Advance(const int64_t length) {
return bytes_builder_.Advance(length * sizeof(T));
}
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
return bytes_builder_.Finish(out, shrink_to_fit);
}
Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
std::shared_ptr<Buffer> out;
ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
return out;
}
/// \brief Like Finish, but override the final buffer size
///
/// This is useful after writing data directly into the builder memory
/// without calling the Append methods (basically, when using TypedBufferBuilder
/// only for memory allocation).
Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
bool shrink_to_fit = true) {
return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
}
void Reset() { bytes_builder_.Reset(); }
int64_t length() const { return bytes_builder_.length() / sizeof(T); }
int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); }
const T* data() const { return reinterpret_cast<const T*>(bytes_builder_.data()); }
T* mutable_data() { return reinterpret_cast<T*>(bytes_builder_.mutable_data()); }
private:
BufferBuilder bytes_builder_;
};
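// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only): building a buffer of int32_t values.
// Reserve/length/capacity are expressed in elements, not bytes.
//
//   arrow::Result<std::shared_ptr<arrow::Buffer>> BuildInt32s() {
//     arrow::TypedBufferBuilder<int32_t> builder;
//     ARROW_RETURN_NOT_OK(builder.Reserve(3));  // room for 3 elements
//     builder.UnsafeAppend(1);
//     builder.UnsafeAppend(2);
//     builder.UnsafeAppend(3);
//     // builder.length() == 3 elements; the finished buffer holds 12 bytes.
//     return builder.Finish();
//   }
// ---------------------------------------------------------------------------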
/// \brief A BufferBuilder for building a buffer containing a bitmap
template <>
class TypedBufferBuilder<bool> {
public:
explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool())
: bytes_builder_(pool) {}
explicit TypedBufferBuilder(BufferBuilder builder)
: bytes_builder_(std::move(builder)) {}
BufferBuilder* bytes_builder() { return &bytes_builder_; }
Status Append(bool value) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(value);
return Status::OK();
}
Status Append(const uint8_t* valid_bytes, int64_t num_elements) {
ARROW_RETURN_NOT_OK(Reserve(num_elements));
UnsafeAppend(valid_bytes, num_elements);
return Status::OK();
}
Status Append(const int64_t num_copies, bool value) {
ARROW_RETURN_NOT_OK(Reserve(num_copies));
UnsafeAppend(num_copies, value);
return Status::OK();
}
void UnsafeAppend(bool value) {
bit_util::SetBitTo(mutable_data(), bit_length_, value);
if (!value) {
++false_count_;
}
++bit_length_;
}
/// \brief Append bits from an array of bytes (one value per byte)
void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) {
if (num_elements == 0) return;
int64_t i = 0;
internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
bool value = bytes[i++];
false_count_ += !value;
return value;
});
bit_length_ += num_elements;
}
/// \brief Append bits from a packed bitmap
void UnsafeAppend(const uint8_t* bitmap, int64_t offset, int64_t num_elements) {
if (num_elements == 0) return;
internal::CopyBitmap(bitmap, offset, num_elements, mutable_data(), bit_length_);
false_count_ += num_elements - internal::CountSetBits(bitmap, offset, num_elements);
bit_length_ += num_elements;
}
void UnsafeAppend(const int64_t num_copies, bool value) {
bit_util::SetBitsTo(mutable_data(), bit_length_, num_copies, value);
false_count_ += num_copies * !value;
bit_length_ += num_copies;
}
template <bool count_falses, typename Generator>
void UnsafeAppend(const int64_t num_elements, Generator&& gen) {
if (num_elements == 0) return;
if (count_falses) {
internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
bool value = gen();
false_count_ += !value;
return value;
});
} else {
internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements,
std::forward<Generator>(gen));
}
bit_length_ += num_elements;
}
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
const int64_t old_byte_capacity = bytes_builder_.capacity();
ARROW_RETURN_NOT_OK(
bytes_builder_.Resize(bit_util::BytesForBits(new_capacity), shrink_to_fit));
// Resize() may have chosen a larger capacity (e.g. for padding),
// so ask it again before calling memset().
const int64_t new_byte_capacity = bytes_builder_.capacity();
if (new_byte_capacity > old_byte_capacity) {
// The additional buffer space is 0-initialized for convenience,
// so that other methods can simply bump the length.
memset(mutable_data() + old_byte_capacity, 0,
static_cast<size_t>(new_byte_capacity - old_byte_capacity));
}
return Status::OK();
}
Status Reserve(const int64_t additional_elements) {
return Resize(
BufferBuilder::GrowByFactor(bit_length_, bit_length_ + additional_elements),
false);
}
Status Advance(const int64_t length) {
ARROW_RETURN_NOT_OK(Reserve(length));
bit_length_ += length;
false_count_ += length;
return Status::OK();
}
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
// set bytes_builder_.size_ == byte size of data
bytes_builder_.UnsafeAdvance(bit_util::BytesForBits(bit_length_) -
bytes_builder_.length());
bit_length_ = false_count_ = 0;
return bytes_builder_.Finish(out, shrink_to_fit);
}
Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
std::shared_ptr<Buffer> out;
ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
return out;
}
/// \brief Like Finish, but override the final buffer size
///
/// This is useful after writing data directly into the builder memory
/// without calling the Append methods (basically, when using TypedBufferBuilder
/// only for memory allocation).
Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
bool shrink_to_fit = true) {
const auto final_byte_length = bit_util::BytesForBits(final_length);
bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
bit_length_ = false_count_ = 0;
return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
}
void Reset() {
bytes_builder_.Reset();
bit_length_ = false_count_ = 0;
}
int64_t length() const { return bit_length_; }
int64_t capacity() const { return bytes_builder_.capacity() * 8; }
const uint8_t* data() const { return bytes_builder_.data(); }
uint8_t* mutable_data() { return bytes_builder_.mutable_data(); }
int64_t false_count() const { return false_count_; }
private:
BufferBuilder bytes_builder_;
int64_t bit_length_ = 0;
int64_t false_count_ = 0;
};
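// ---------------------------------------------------------------------------
// Editor's sketch (illustrative only): building a validity bitmap and tracking
// the number of false (null) bits with false_count().
//
//   arrow::Result<std::shared_ptr<arrow::Buffer>> BuildValidityBitmap() {
//     arrow::TypedBufferBuilder<bool> builder;
//     ARROW_RETURN_NOT_OK(builder.Append(/*num_copies=*/5, true));  // 5 valid bits
//     ARROW_RETURN_NOT_OK(builder.Append(false));                   // 1 null bit
//     // builder.length() == 6 bits, builder.false_count() == 1
//     return builder.Finish();
//   }
// ---------------------------------------------------------------------------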
} // namespace arrow

View File

@ -0,0 +1,32 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/array/builder_adaptive.h" // IWYU pragma: keep
#include "arrow/array/builder_base.h" // IWYU pragma: keep
#include "arrow/array/builder_binary.h" // IWYU pragma: keep
#include "arrow/array/builder_decimal.h" // IWYU pragma: keep
#include "arrow/array/builder_dict.h" // IWYU pragma: keep
#include "arrow/array/builder_nested.h" // IWYU pragma: keep
#include "arrow/array/builder_primitive.h" // IWYU pragma: keep
#include "arrow/array/builder_time.h" // IWYU pragma: keep
#include "arrow/array/builder_union.h" // IWYU pragma: keep
#include "arrow/status.h"
#include "arrow/util/visibility.h"

View File

@ -0,0 +1,103 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#define ARROW_FLAG_DICTIONARY_ORDERED 1
#define ARROW_FLAG_NULLABLE 2
#define ARROW_FLAG_MAP_KEYS_SORTED 4
struct ArrowSchema {
// Array type description
const char* format;
const char* name;
const char* metadata;
int64_t flags;
int64_t n_children;
struct ArrowSchema** children;
struct ArrowSchema* dictionary;
// Release callback
void (*release)(struct ArrowSchema*);
// Opaque producer-specific data
void* private_data;
};
struct ArrowArray {
// Array data description
int64_t length;
int64_t null_count;
int64_t offset;
int64_t n_buffers;
int64_t n_children;
const void** buffers;
struct ArrowArray** children;
struct ArrowArray* dictionary;
// Release callback
void (*release)(struct ArrowArray*);
// Opaque producer-specific data
void* private_data;
};
// EXPERIMENTAL: C stream interface
struct ArrowArrayStream {
// Callback to get the stream type
// (will be the same for all arrays in the stream).
//
// Return value: 0 if successful, an `errno`-compatible error code otherwise.
//
// If successful, the ArrowSchema must be released independently from the stream.
int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
// Callback to get the next array
// (if no error and the array is released, the stream has ended)
//
// Return value: 0 if successful, an `errno`-compatible error code otherwise.
//
// If successful, the ArrowArray must be released independently from the stream.
int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
// Callback to get optional detailed error information.
// This must only be called if the last stream operation failed
// with a non-0 return code.
//
// Return value: pointer to a null-terminated character array describing
// the last error, or NULL if no description is available.
//
// The returned pointer is only valid until the next operation on this stream
// (including release).
const char* (*get_last_error)(struct ArrowArrayStream*);
// Release callback: release the stream's own resources.
// Note that arrays returned by `get_next` must be individually released.
void (*release)(struct ArrowArrayStream*);
// Opaque producer-specific data
void* private_data;
};
#ifdef __cplusplus
}
#endif
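// Editor's sketch (C++, not part of the original header): a consumer draining
// an ArrowArrayStream according to the protocol documented above. The stream
// itself is assumed to come from some producer library.
//
//   int DrainStream(struct ArrowArrayStream* stream) {
//     struct ArrowSchema schema;
//     int rc = stream->get_schema(stream, &schema);
//     if (rc != 0) return rc;                  // stream->get_last_error(stream) has details
//     while (true) {
//       struct ArrowArray array;
//       rc = stream->get_next(stream, &array);
//       if (rc != 0) break;
//       if (array.release == nullptr) break;   // released array => end of stream
//       // ... consume array.length, array.buffers, array.children using `schema` ...
//       array.release(&array);                 // hand the memory back to the producer
//     }
//     schema.release(&schema);                 // the schema is owned independently
//     stream->release(stream);
//     return rc;
//   }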

Some files were not shown because too many files have changed in this diff.