mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-21 10:15:45 +00:00
557 lines
19 KiB
Cython
557 lines
19 KiB
Cython
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from cpython.ref cimport PyObject
|
|
|
|
import warnings
|
|
|
|
|
|
def _deprecate_serialization(name):
    """
    Emit the FutureWarning announcing that ``pyarrow.<name>`` is deprecated.

    Parameters
    ----------
    name : str
        Name of the deprecated pyarrow function, interpolated into the
        warning message.
    """
    # stacklevel=3 skips this helper and the deprecated function calling it,
    # so the warning is attributed to that function's caller.
    warnings.warn(
        f"'pyarrow.{name}' is deprecated as of 2.0.0 and will be removed in a "
        "future version. Use pickle or the pyarrow IPC functionality instead.",
        FutureWarning,
        stacklevel=3,
    )
|
|
|
|
|
|
def is_named_tuple(cls):
    """
    Tell whether ``cls`` looks like a class created by ``namedtuple``.

    A class qualifies when ``tuple`` is its one and only direct base and it
    carries a ``_fields`` attribute that is a tuple made only of strings.

    Parameters
    ----------
    cls : type
        The class to inspect.

    Returns
    -------
    bool
        True if cls is a namedtuple and False otherwise.
    """
    bases = cls.__bases__
    if len(bases) != 1 or bases[0] != tuple:
        return False
    field_names = getattr(cls, "_fields", None)
    return isinstance(field_names, tuple) and all(
        isinstance(field_name, str) for field_name in field_names)
|
|
|
|
|
|
class SerializationCallbackError(ArrowSerializationError):
    """
    Error raised when an object cannot be turned into a serializable dict.

    Attributes
    ----------
    example_object : object
        The object that could not be serialized.
    """

    def __init__(self, message, example_object):
        super().__init__(message)
        self.example_object = example_object
|
|
|
|
|
|
class DeserializationCallbackError(ArrowSerializationError):
    """
    Error raised when a serialized dict cannot be turned back into an object.

    Attributes
    ----------
    type_id : str
        The type ID found in the serialized data that could not be handled.
    """

    def __init__(self, message, type_id):
        super().__init__(message)
        self.type_id = type_id
|
|
|
|
|
|
cdef class SerializationContext(_Weakrefable):
    """
    Registry of the Python types that can be serialized, together with the
    callbacks that convert instances to and from plain dicts.
    """
    cdef:
        # type -> type_id, filled by register_type
        object type_to_type_id
        # type_id -> type, filled by register_type
        object whitelisted_types
        # set of type_ids whose instances are pickled
        object types_to_pickle
        # type_id -> callable turning an instance into serializable data
        object custom_serializers
        # type_id -> callable rebuilding an instance from that data
        object custom_deserializers
        # callables used for the types registered with pickle=True
        object pickle_serializer
        object pickle_deserializer

    def __init__(self):
        # Types with special serialization handlers
        self.type_to_type_id = dict()
        self.whitelisted_types = dict()
        self.types_to_pickle = set()
        self.custom_serializers = dict()
        self.custom_deserializers = dict()
        self.pickle_serializer = pickle.dumps
        self.pickle_deserializer = pickle.loads

    def set_pickle(self, serializer, deserializer):
        """
        Set the serializer and deserializer to use for objects that are to be
        pickled.

        Parameters
        ----------
        serializer : callable
            The serializer to use (e.g., pickle.dumps or cloudpickle.dumps).
        deserializer : callable
            The deserializer to use (e.g., pickle.loads or cloudpickle.loads).
        """
        self.pickle_serializer = serializer
        self.pickle_deserializer = deserializer

    def clone(self):
        """
        Return copy of this SerializationContext.

        The registries are shallow-copied, so later registrations on either
        context do not affect the other one.

        Returns
        -------
        clone : SerializationContext
        """
        result = SerializationContext()
        result.type_to_type_id = self.type_to_type_id.copy()
        result.whitelisted_types = self.whitelisted_types.copy()
        result.types_to_pickle = self.types_to_pickle.copy()
        result.custom_serializers = self.custom_serializers.copy()
        result.custom_deserializers = self.custom_deserializers.copy()
        result.pickle_serializer = self.pickle_serializer
        result.pickle_deserializer = self.pickle_deserializer

        return result

    def register_type(self, type_, type_id, pickle=False,
                      custom_serializer=None, custom_deserializer=None):
        r"""
        EXPERIMENTAL: Add type to the list of types we can serialize.

        Parameters
        ----------
        type\_ : type
            The type that we can serialize.
        type_id : string
            A string used to identify the type.
        pickle : bool
            True if the serialization should be done with pickle.
            False if it should be done efficiently with Arrow.
        custom_serializer : callable
            This argument is optional, but can be provided to
            serialize objects of the class in a particular way.
        custom_deserializer : callable
            This argument is optional, but can be provided to
            deserialize objects of the class in a particular way.
            It is only recorded when custom_serializer is also given.

        Raises
        ------
        TypeError
            If type_id is not a string.
        """
        if not isinstance(type_id, str):
            raise TypeError("The type_id argument must be a string. The value "
                            "passed in has type {}.".format(type(type_id)))

        self.type_to_type_id[type_] = type_id
        self.whitelisted_types[type_id] = type_
        if pickle:
            self.types_to_pickle.add(type_id)
        if custom_serializer is not None:
            # NOTE(review): the deserializer is stored as-is, even when it is
            # None; _deserialize_callback would then try to call None for
            # this type_id. Kept unchanged for backward compatibility.
            self.custom_serializers[type_id] = custom_serializer
            self.custom_deserializers[type_id] = custom_deserializer

    def _serialize_callback(self, obj):
        """
        Convert ``obj`` into a dict tagged with its registered type ID.

        Parameters
        ----------
        obj : object
            Instance of a registered type, or of a subclass of one.

        Returns
        -------
        dict
            The serializable state of obj plus a "_pytype_" entry holding
            the type ID.

        Raises
        ------
        SerializationCallbackError
            If no class in the MRO of type(obj) is registered, or if the
            object is neither a namedtuple nor has a ``__dict__``.
        """
        # Walk the MRO so that the closest registered ancestor wins.
        for type_ in type(obj).__mro__:
            if type_ in self.type_to_type_id:
                break
        else:
            raise SerializationCallbackError(
                "pyarrow does not know how to "
                "serialize objects of type {}.".format(type(obj)), obj
            )

        # use the closest match to type(obj)
        type_id = self.type_to_type_id[type_]
        if type_id in self.types_to_pickle:
            serialized_obj = {"data": self.pickle_serializer(obj),
                              "pickle": True}
        elif type_id in self.custom_serializers:
            serialized_obj = {"data": self.custom_serializers[type_id](obj)}
        elif is_named_tuple(type_):
            serialized_obj = {"_pa_getnewargs_": obj.__getnewargs__()}
        elif hasattr(obj, "__dict__"):
            serialized_obj = obj.__dict__
        else:
            msg = "We do not know how to serialize " \
                  "the object '{}'".format(obj)
            raise SerializationCallbackError(msg, obj)
        # dict(...) copies, so obj.__dict__ itself is never modified.
        return dict(serialized_obj, _pytype_=type_id)

    def _deserialize_callback(self, serialized_obj):
        """
        Rebuild a Python object from a dict made by _serialize_callback.

        The input dict is left unmodified.

        Parameters
        ----------
        serialized_obj : dict
            Dict holding a "_pytype_" entry and the serialized state.

        Returns
        -------
        object
            The reconstructed object.

        Raises
        ------
        DeserializationCallbackError
            If the type ID is not registered in this context.
        """
        type_id = serialized_obj["_pytype_"]
        if isinstance(type_id, bytes):
            # ARROW-4675: Python 2 serialized, read in Python 3
            type_id = frombytes(type_id)

        if "pickle" in serialized_obj:
            # The object was pickled, so unpickle it.
            # SECURITY: this runs the configured pickle deserializer
            # (pickle.loads unless set_pickle replaced it) on data taken from
            # the payload; only deserialize payloads from trusted sources.
            return self.pickle_deserializer(serialized_obj["data"])

        assert type_id not in self.types_to_pickle
        if type_id not in self.whitelisted_types:
            msg = "Type ID " + type_id + " not registered in " \
                  "deserialization callback"
            raise DeserializationCallbackError(msg, type_id)
        type_ = self.whitelisted_types[type_id]

        if type_id in self.custom_deserializers:
            return self.custom_deserializers[type_id](serialized_obj["data"])

        if "_pa_getnewargs_" in serialized_obj:
            # namedtuple: rebuild from the saved constructor arguments
            return type_.__new__(type_, *serialized_obj["_pa_getnewargs_"])

        # In this case, serialized_obj should just be the __dict__ field
        # plus the "_pytype_" tag, which must not end up on the instance.
        obj = type_.__new__(type_)
        state = {key: value for key, value in serialized_obj.items()
                 if key != "_pytype_"}
        obj.__dict__.update(state)
        return obj

    def serialize(self, obj):
        """
        Call pyarrow.serialize and pass this SerializationContext.
        """
        return serialize(obj, context=self)

    def serialize_to(self, object value, sink):
        """
        Call pyarrow.serialize_to and pass this SerializationContext.
        """
        return serialize_to(value, sink, context=self)

    def deserialize(self, what):
        """
        Call pyarrow.deserialize and pass this SerializationContext.
        """
        return deserialize(what, context=self)

    def deserialize_components(self, what):
        """
        Call pyarrow.deserialize_components and pass this SerializationContext.
        """
        return deserialize_components(what, context=self)
|
|
|
|
|
|
# Module-wide context used whenever a caller does not pass one explicitly.
_default_serialization_context = SerializationContext()
# Flips to True once the default handlers have been registered on it.
_default_context_initialized = False


def _get_default_context():
    """
    Return the module-wide default SerializationContext.

    On the first call the default handlers are registered on it through
    ``pyarrow.serialization._register_default_serialization_handlers``.

    Returns
    -------
    SerializationContext
    """
    global _default_context_initialized
    from pyarrow.serialization import _register_default_serialization_handlers
    if _default_context_initialized:
        return _default_serialization_context
    _register_default_serialization_handlers(_default_serialization_context)
    _default_context_initialized = True
    return _default_serialization_context
|
|
|
|
|
|
cdef class SerializedPyObject(_Weakrefable):
    """
    Arrow-serialized representation of Python object.
    """
    cdef:
        # C-level serialized payload
        CSerializedPyObject data

    cdef readonly:
        # Python object handed to DeserializeObject as base (see deserialize)
        object base

    @property
    def total_bytes(self):
        """
        Number of bytes reported after writing the payload to a
        CMockOutputStream.
        """
        cdef CMockOutputStream counting_stream
        with nogil:
            check_status(self.data.WriteTo(&counting_stream))

        return counting_stream.GetExtentBytesWritten()

    def write_to(self, sink):
        """
        Write serialized object to a sink.

        Parameters
        ----------
        sink : object
            Destination accepted by ``get_writer``.
        """
        cdef shared_ptr[COutputStream] out_stream
        get_writer(sink, &out_stream)
        self._write_to(out_stream.get())

    cdef _write_to(self, COutputStream* stream):
        # Write the payload to an already-opened C output stream.
        with nogil:
            check_status(self.data.WriteTo(stream))

    def deserialize(self, SerializationContext context=None):
        """
        Convert back to Python object.

        Parameters
        ----------
        context : SerializationContext, default None
            Context to use; the default context is used when None.

        Returns
        -------
        object
            The deserialized Python object.
        """
        cdef PyObject* raw_result

        if context is None:
            context = _get_default_context()

        with nogil:
            check_status(DeserializeObject(context, self.data,
                                           <PyObject*> self.base, &raw_result))

        # PyObject_to_object is necessary to avoid a memory leak;
        # also unpack the list the object was wrapped in in serialize
        unwrapped = PyObject_to_object(raw_result)
        return unwrapped[0]

    def to_buffer(self, nthreads=1):
        """
        Write serialized data as Buffer.

        Parameters
        ----------
        nthreads : int, default 1
            When greater than 1, passed to the writer's
            ``set_memcopy_threads``.

        Returns
        -------
        Buffer
            Buffer of ``total_bytes`` bytes that the payload was written to.
        """
        cdef Buffer target = allocate_buffer(self.total_bytes)
        writer = FixedSizeBufferWriter(target)
        if nthreads > 1:
            writer.set_memcopy_threads(nthreads)
        self.write_to(writer)
        return target

    @staticmethod
    def from_components(components):
        """
        Reconstruct SerializedPyObject from output of
        SerializedPyObject.to_components.

        Parameters
        ----------
        components : dict
            Mapping with the keys 'num_tensors', 'num_ndarrays',
            'num_buffers', 'data' and 'num_sparse_tensors' (itself a mapping
            with 'coo', 'csr', 'csc', 'csf' and 'ndim_csf').

        Returns
        -------
        SerializedPyObject
        """
        cdef:
            int num_tensors = components['num_tensors']
            int num_ndarrays = components['num_ndarrays']
            int num_buffers = components['num_buffers']
            list buffers = components['data']
            SparseTensorCounts num_sparse_tensors = SparseTensorCounts()
            SerializedPyObject result = SerializedPyObject()

        sparse_counts = components['num_sparse_tensors']
        num_sparse_tensors.coo = sparse_counts['coo']
        num_sparse_tensors.csr = sparse_counts['csr']
        num_sparse_tensors.csc = sparse_counts['csc']
        num_sparse_tensors.csf = sparse_counts['csf']
        num_sparse_tensors.ndim_csf = sparse_counts['ndim_csf']

        with nogil:
            check_status(GetSerializedFromComponents(num_tensors,
                                                     num_sparse_tensors,
                                                     num_ndarrays,
                                                     num_buffers,
                                                     buffers, &result.data))

        return result

    def to_components(self, memory_pool=None):
        """
        Return the decomposed dict representation of the serialized object
        containing a collection of Buffer objects which maximize opportunities
        for zero-copy.

        Parameters
        ----------
        memory_pool : MemoryPool default None
            Pool to use for necessary allocations.

        Returns
        -------
        object
            The components, in the form accepted by
            SerializedPyObject.from_components.
        """
        cdef PyObject* raw_components
        cdef CMemoryPool* c_pool = maybe_unbox_memory_pool(memory_pool)

        with nogil:
            check_status(self.data.GetComponents(c_pool, &raw_components))

        return PyObject_to_object(raw_components)
|
|
|
|
|
|
def serialize(object value, SerializationContext context=None):
    """
    DEPRECATED: Serialize a general Python sequence for transient storage
    and transport.

    .. deprecated:: 2.0
        The custom serialization functionality is deprecated in pyarrow 2.0,
        and will be removed in a future version. Use the standard library
        ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
        more).

    Parameters
    ----------
    value : object
        Python object for the sequence that is to be serialized.
    context : SerializationContext
        Custom serialization and deserialization context, uses a default
        context with some standard type handlers if not specified.

    Returns
    -------
    serialized : SerializedPyObject

    Notes
    -----
    The data produced here is not compatible with the standard Arrow IPC
    binary protocol, so it cannot be read with ipc.open_stream or
    ipc.open_file. Read it back with deserialize, deserialize_from, or
    deserialize_components.
    """
    _deprecate_serialization("serialize")
    serialized = _serialize(value, context)
    return serialized
|
|
|
|
|
|
def _serialize(object value, SerializationContext context=None):
    """
    Serialize ``value`` without emitting the deprecation warning.

    Parameters
    ----------
    value : object
        Python object to serialize.
    context : SerializationContext, default None
        Context to use; the default context is used when None.

    Returns
    -------
    SerializedPyObject
    """
    cdef SerializedPyObject result = SerializedPyObject()
    # The value travels inside a one-element list;
    # SerializedPyObject.deserialize unpacks it again.
    boxed = [value]

    if context is None:
        context = _get_default_context()

    with nogil:
        check_status(SerializeObject(context, boxed, &result.data))
    return result
|
|
|
|
|
|
def serialize_to(object value, sink, SerializationContext context=None):
    """
    DEPRECATED: Serialize a Python sequence to a file.

    .. deprecated:: 2.0
        The custom serialization functionality is deprecated in pyarrow 2.0,
        and will be removed in a future version. Use the standard library
        ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
        more).

    Parameters
    ----------
    value : object
        Python object for the sequence that is to be serialized.
    sink : NativeFile or file-like
        File the sequence will be written to.
    context : SerializationContext
        Custom serialization and deserialization context, uses a default
        context with some standard type handlers if not specified.
    """
    _deprecate_serialization("serialize_to")
    _serialize(value, context).write_to(sink)
|
|
|
|
|
|
def read_serialized(source, base=None):
    """
    DEPRECATED: Read serialized Python sequence from file-like object.

    .. deprecated:: 2.0
        The custom serialization functionality is deprecated in pyarrow 2.0,
        and will be removed in a future version. Use the standard library
        ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
        more).

    Parameters
    ----------
    source : NativeFile
        File to read the sequence from.
    base : object
        This object will be the base object of all the numpy arrays
        contained in the sequence.

    Returns
    -------
    serialized : the serialized data
    """
    _deprecate_serialization("read_serialized")
    serialized = _read_serialized(source, base=base)
    return serialized
|
|
|
|
|
|
def _read_serialized(source, base=None):
    """
    Read a SerializedPyObject from ``source`` without emitting the
    deprecation warning.

    Parameters
    ----------
    source : object
        Input accepted by ``get_reader``.
    base : object, default None
        Stored as the ``base`` attribute of the returned object.

    Returns
    -------
    SerializedPyObject
    """
    cdef:
        shared_ptr[CRandomAccessFile] in_stream
        SerializedPyObject result

    get_reader(source, True, &in_stream)

    result = SerializedPyObject()
    result.base = base
    with nogil:
        check_status(ReadSerializedObject(in_stream.get(), &result.data))

    return result
|
|
|
|
|
|
def deserialize_from(source, object base, SerializationContext context=None):
    """
    DEPRECATED: Deserialize a Python sequence from a file.

    .. deprecated:: 2.0
        The custom serialization functionality is deprecated in pyarrow 2.0,
        and will be removed in a future version. Use the standard library
        ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
        more).

    Only data produced by pyarrow.serialize or pyarrow.serialize_to can be
    read with this function.

    Parameters
    ----------
    source : NativeFile
        File to read the sequence from.
    base : object
        This object will be the base object of all the numpy arrays
        contained in the sequence.
    context : SerializationContext
        Custom serialization and deserialization context.

    Returns
    -------
    object
        Python object for the deserialized sequence.
    """
    _deprecate_serialization("deserialize_from")
    return _read_serialized(source, base=base).deserialize(context)
|
|
|
|
|
|
def deserialize_components(components, SerializationContext context=None):
    """
    DEPRECATED: Reconstruct Python object from output of
    SerializedPyObject.to_components.

    .. deprecated:: 2.0
        The custom serialization functionality is deprecated in pyarrow 2.0,
        and will be removed in a future version. Use the standard library
        ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
        more).

    Parameters
    ----------
    components : dict
        Output of SerializedPyObject.to_components
    context : SerializationContext, default None
        Custom serialization and deserialization context; the default
        context is used when None.

    Returns
    -------
    object : the Python object that was originally serialized
    """
    _deprecate_serialization("deserialize_components")
    rebuilt = SerializedPyObject.from_components(components)
    return rebuilt.deserialize(context)
|
|
|
|
|
|
def deserialize(obj, SerializationContext context=None):
    """
    DEPRECATED: Deserialize Python object from Buffer or other Python
    object supporting the buffer protocol.

    .. deprecated:: 2.0
        The custom serialization functionality is deprecated in pyarrow 2.0,
        and will be removed in a future version. Use the standard library
        ``pickle`` or the IPC functionality of pyarrow (see :ref:`ipc` for
        more).

    Only data produced by pyarrow.serialize or pyarrow.serialize_to can be
    read with this function.

    Parameters
    ----------
    obj : pyarrow.Buffer or Python object supporting buffer protocol
    context : SerializationContext
        Custom serialization and deserialization context.

    Returns
    -------
    deserialized : object
    """
    _deprecate_serialization("deserialize")
    deserialized = _deserialize(obj, context=context)
    return deserialized
|
|
|
|
|
|
def _deserialize(obj, SerializationContext context=None):
    """
    Deserialize ``obj`` without emitting the deprecation warning.

    Parameters
    ----------
    obj : object
        Input wrapped in a BufferReader; it is also used as the ``base`` of
        the intermediate SerializedPyObject.
    context : SerializationContext, default None
        Context handed to SerializedPyObject.deserialize.

    Returns
    -------
    object
        The deserialized Python object.
    """
    return _read_serialized(BufferReader(obj), base=obj).deserialize(context)
|