first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions


@@ -0,0 +1,41 @@
from pandas.core.arrays.base import (
ExtensionArray,
ExtensionOpsMixin,
ExtensionScalarOpsMixin,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.datetimes import DatetimeArray
from pandas.core.arrays.floating import FloatingArray
from pandas.core.arrays.integer import IntegerArray
from pandas.core.arrays.interval import IntervalArray
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.numpy_ import PandasArray
from pandas.core.arrays.period import (
PeriodArray,
period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray
__all__ = [
"ExtensionArray",
"ExtensionOpsMixin",
"ExtensionScalarOpsMixin",
"ArrowStringArray",
"BaseMaskedArray",
"BooleanArray",
"Categorical",
"DatetimeArray",
"FloatingArray",
"IntegerArray",
"IntervalArray",
"PandasArray",
"PeriodArray",
"period_array",
"SparseArray",
"StringArray",
"TimedeltaArray",
]
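
The arrays exported here are normally constructed through pd.array rather than instantiated directly. A minimal usage sketch (not part of the diff; assumes a standard pandas install):

import pandas as pd

bools = pd.array([True, None], dtype="boolean")      # BooleanArray
ints = pd.array([1, None, 3], dtype="Int64")         # IntegerArray
cats = pd.array(["a", "b", "a"], dtype="category")   # Categorical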


@@ -0,0 +1,141 @@
import json
import numpy as np
import pyarrow
from pandas.core.arrays.interval import VALID_CLOSED
def pyarrow_array_to_numpy_and_mask(arr, dtype):
"""
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.
At the moment pyarrow.BooleanArray is not supported.
Parameters
----------
arr : pyarrow.Array
dtype : numpy.dtype
Returns
-------
(data, mask)
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)
buflist = arr.buffers()
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
mask = np.asarray(mask)
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask
class ArrowPeriodType(pyarrow.ExtensionType):
def __init__(self, freq):
# attributes need to be set first before calling
# super init (as that calls serialize)
self._freq = freq
pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
@property
def freq(self):
return self._freq
def __arrow_ext_serialize__(self):
metadata = {"freq": self.freq}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
metadata = json.loads(serialized.decode())
return ArrowPeriodType(metadata["freq"])
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return type(self) == type(other) and self.freq == other.freq
else:
return NotImplemented
def __hash__(self):
return hash((str(self), self.freq))
def to_pandas_dtype(self):
import pandas as pd
return pd.PeriodDtype(freq=self.freq)
# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)
class ArrowIntervalType(pyarrow.ExtensionType):
def __init__(self, subtype, closed):
# attributes need to be set first before calling
# super init (as that calls serialize)
assert closed in VALID_CLOSED
self._closed = closed
if not isinstance(subtype, pyarrow.DataType):
subtype = pyarrow.type_for_alias(str(subtype))
self._subtype = subtype
storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
@property
def subtype(self):
return self._subtype
@property
def closed(self):
return self._closed
def __arrow_ext_serialize__(self):
metadata = {"subtype": str(self.subtype), "closed": self.closed}
return json.dumps(metadata).encode()
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
metadata = json.loads(serialized.decode())
subtype = pyarrow.type_for_alias(metadata["subtype"])
closed = metadata["closed"]
return ArrowIntervalType(subtype, closed)
def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return (
type(self) == type(other)
and self.subtype == other.subtype
and self.closed == other.closed
)
else:
return NotImplemented
def __hash__(self):
return hash((str(self), str(self.subtype), self.closed))
def to_pandas_dtype(self):
import pandas as pd
return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)
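
For illustration, a sketch of the mask-splitting helper above (assuming pyarrow is installed and the function is in scope; the slot under a null holds whatever bytes sit in the Arrow buffer):

import numpy as np
import pyarrow as pa

arr = pa.array([1, None, 3], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, np.int64)
# mask is the validity mask: array([ True, False,  True])
# data has dtype int64; data[1] is unspecified because that slot is null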


@@ -0,0 +1,496 @@
from __future__ import annotations
from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Literal,
Sequence,
TypeVar,
cast,
overload,
)
import numpy as np
from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._typing import (
ArrayLike,
Dtype,
F,
PositionalIndexer2D,
PositionalIndexerTuple,
ScalarIndexer,
SequenceIndexer,
Shape,
TakeIndexer,
npt,
type_t,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_insert_loc,
)
from pandas.core.dtypes.common import (
is_dtype_equal,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
PeriodDtype,
)
from pandas.core.dtypes.missing import array_equivalent
from pandas.core import missing
from pandas.core.algorithms import (
take,
unique,
value_counts,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays.base import ExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.sorting import nargminmax
NDArrayBackedExtensionArrayT = TypeVar(
"NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray"
)
if TYPE_CHECKING:
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)
def ravel_compat(meth: F) -> F:
"""
Decorator to ravel a 2D array before passing it to a cython operation,
then reshape the result to our own shape.
"""
@wraps(meth)
def method(self, *args, **kwargs):
if self.ndim == 1:
return meth(self, *args, **kwargs)
flags = self._ndarray.flags
flat = self.ravel("K")
result = meth(flat, *args, **kwargs)
order = "F" if flags.f_contiguous else "C"
return result.reshape(self.shape, order=order)
return cast(F, method)
class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
"""
ExtensionArray that is backed by a single NumPy ndarray.
"""
_ndarray: np.ndarray
def _box_func(self, x):
"""
Wrap numpy type in our dtype.type if necessary.
"""
return x
def _validate_scalar(self, value):
# used by NDArrayBackedExtensionIndex.insert
raise AbstractMethodError(self)
# ------------------------------------------------------------------------
def view(self, dtype: Dtype | None = None) -> ArrayLike:
# We handle datetime64, datetime64tz, timedelta64, and period
# dtypes here. Everything else we pass through to the underlying
# ndarray.
if dtype is None or dtype is self.dtype:
return self._from_backing_data(self._ndarray)
if isinstance(dtype, type):
# we sometimes pass non-dtype objects, e.g np.ndarray;
# pass those through to the underlying ndarray
return self._ndarray.view(dtype)
dtype = pandas_dtype(dtype)
arr = self._ndarray
if isinstance(dtype, (PeriodDtype, DatetimeTZDtype)):
cls = dtype.construct_array_type()
return cls(arr.view("i8"), dtype=dtype)
elif dtype == "M8[ns]":
from pandas.core.arrays import DatetimeArray
return DatetimeArray(arr.view("i8"), dtype=dtype)
elif dtype == "m8[ns]":
from pandas.core.arrays import TimedeltaArray
return TimedeltaArray(arr.view("i8"), dtype=dtype)
# error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
# type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
# type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
# Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
return arr.view(dtype=dtype) # type: ignore[arg-type]
def take(
self: NDArrayBackedExtensionArrayT,
indices: TakeIndexer,
*,
allow_fill: bool = False,
fill_value: Any = None,
axis: int = 0,
) -> NDArrayBackedExtensionArrayT:
if allow_fill:
fill_value = self._validate_scalar(fill_value)
new_data = take(
self._ndarray,
indices,
allow_fill=allow_fill,
fill_value=fill_value,
axis=axis,
)
return self._from_backing_data(new_data)
# ------------------------------------------------------------------------
def equals(self, other) -> bool:
if type(self) is not type(other):
return False
if not is_dtype_equal(self.dtype, other.dtype):
return False
return bool(array_equivalent(self._ndarray, other._ndarray))
def _values_for_argsort(self) -> np.ndarray:
return self._ndarray
# Signature of "argmin" incompatible with supertype "ExtensionArray"
def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self.isna().any():
raise NotImplementedError
return nargminmax(self, "argmin", axis=axis)
# Signature of "argmax" incompatible with supertype "ExtensionArray"
def argmax(self, axis: int = 0, skipna: bool = True): # type:ignore[override]
# override base class by adding axis keyword
validate_bool_kwarg(skipna, "skipna")
if not skipna and self.isna().any():
raise NotImplementedError
return nargminmax(self, "argmax", axis=axis)
def unique(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT:
new_data = unique(self._ndarray)
return self._from_backing_data(new_data)
@classmethod
@doc(ExtensionArray._concat_same_type)
def _concat_same_type(
cls: type[NDArrayBackedExtensionArrayT],
to_concat: Sequence[NDArrayBackedExtensionArrayT],
axis: int = 0,
) -> NDArrayBackedExtensionArrayT:
dtypes = {str(x.dtype) for x in to_concat}
if len(dtypes) != 1:
raise ValueError("to_concat must have the same dtype (tz)", dtypes)
new_values = [x._ndarray for x in to_concat]
new_values = np.concatenate(new_values, axis=axis)
# error: Argument 1 to "_from_backing_data" of "NDArrayBackedExtensionArray" has
# incompatible type "List[ndarray]"; expected "ndarray"
return to_concat[0]._from_backing_data(new_values) # type: ignore[arg-type]
@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
npvalue = self._validate_searchsorted_value(value)
return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)
def _validate_searchsorted_value(
self, value: NumpyValueArrayLike | ExtensionArray
) -> NumpyValueArrayLike:
if isinstance(value, ExtensionArray):
return value.to_numpy()
else:
return value
@doc(ExtensionArray.shift)
def shift(self, periods=1, fill_value=None, axis=0):
fill_value = self._validate_shift_value(fill_value)
new_values = shift(self._ndarray, periods, axis, fill_value)
return self._from_backing_data(new_values)
def _validate_shift_value(self, fill_value):
# TODO(2.0): after deprecation in datetimelikearraymixin is enforced,
# we can remove this and use validate_fill_value directly
return self._validate_scalar(fill_value)
def __setitem__(self, key, value):
key = check_array_indexer(self, key)
value = self._validate_setitem_value(value)
self._ndarray[key] = value
def _validate_setitem_value(self, value):
return value
@overload
def __getitem__(self, key: ScalarIndexer) -> Any:
...
@overload
def __getitem__(
self: NDArrayBackedExtensionArrayT,
key: SequenceIndexer | PositionalIndexerTuple,
) -> NDArrayBackedExtensionArrayT:
...
def __getitem__(
self: NDArrayBackedExtensionArrayT,
key: PositionalIndexer2D,
) -> NDArrayBackedExtensionArrayT | Any:
if lib.is_integer(key):
# fast-path
result = self._ndarray[key]
if self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# error: Incompatible types in assignment (expression has type "ExtensionArray",
# variable has type "Union[int, slice, ndarray]")
key = extract_array(key, extract_numpy=True) # type: ignore[assignment]
key = check_array_indexer(self, key)
result = self._ndarray[key]
if lib.is_scalar(result):
return self._box_func(result)
result = self._from_backing_data(result)
return result
def _fill_mask_inplace(
self, method: str, limit, mask: npt.NDArray[np.bool_]
) -> None:
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
func(self._ndarray.T, limit=limit, mask=mask.T)
return
@doc(ExtensionArray.fillna)
def fillna(
self: NDArrayBackedExtensionArrayT, value=None, method=None, limit=None
) -> NDArrayBackedExtensionArrayT:
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
mask = self.isna()
# error: Argument 2 to "check_value_size" has incompatible type
# "ExtensionArray"; expected "ndarray"
value = missing.check_value_size(
value, mask, len(self) # type: ignore[arg-type]
)
if mask.any():
if method is not None:
# TODO: check value is None
# (for now) when self.ndim == 2, we assume axis=0
func = missing.get_fill_func(method, ndim=self.ndim)
new_values, _ = func(self._ndarray.T.copy(), limit=limit, mask=mask.T)
new_values = new_values.T
                # TODO: PandasArray didn't use to copy; need tests for this
new_values = self._from_backing_data(new_values)
else:
# fill with value
new_values = self.copy()
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)
new_values = self.copy()
return new_values
# ------------------------------------------------------------------------
# Reductions
def _wrap_reduction_result(self, axis: int | None, result):
if axis is None or self.ndim == 1:
return self._box_func(result)
return self._from_backing_data(result)
# ------------------------------------------------------------------------
# __array_function__ methods
def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
"""
Analogue to np.putmask(self, mask, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
np.putmask(self._ndarray, mask, value)
def _where(
self: NDArrayBackedExtensionArrayT, mask: np.ndarray, value
) -> NDArrayBackedExtensionArrayT:
"""
Analogue to np.where(mask, self, value)
Parameters
----------
mask : np.ndarray[bool]
value : scalar or listlike
Raises
------
TypeError
If value cannot be cast to self.dtype.
"""
value = self._validate_setitem_value(value)
res_values = np.where(mask, self._ndarray, value)
return self._from_backing_data(res_values)
# ------------------------------------------------------------------------
# Index compat methods
def insert(
self: NDArrayBackedExtensionArrayT, loc: int, item
) -> NDArrayBackedExtensionArrayT:
"""
Make new ExtensionArray inserting new item at location. Follows
Python list.append semantics for negative values.
Parameters
----------
loc : int
item : object
Returns
-------
type(self)
"""
loc = validate_insert_loc(loc, len(self))
code = self._validate_scalar(item)
new_vals = np.concatenate(
(
self._ndarray[:loc],
np.asarray([code], dtype=self._ndarray.dtype),
self._ndarray[loc:],
)
)
return self._from_backing_data(new_vals)
# ------------------------------------------------------------------------
# Additional array methods
# These are not part of the EA API, but we implement them because
# pandas assumes they're there.
def value_counts(self, dropna: bool = True):
"""
Return a Series containing counts of unique values.
Parameters
----------
dropna : bool, default True
Don't include counts of NA values.
Returns
-------
Series
"""
if self.ndim != 1:
raise NotImplementedError
from pandas import (
Index,
Series,
)
if dropna:
# error: Unsupported operand type for ~ ("ExtensionArray")
values = self[~self.isna()]._ndarray # type: ignore[operator]
else:
values = self._ndarray
result = value_counts(values, sort=False, dropna=dropna)
index_arr = self._from_backing_data(np.asarray(result.index._data))
index = Index(index_arr, name=result.index.name)
return Series(result._values, index=index, name=result.name)
def _quantile(
self: NDArrayBackedExtensionArrayT,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> NDArrayBackedExtensionArrayT:
# TODO: disable for Categorical if not ordered?
# asarray needed for Sparse, see GH#24600
mask = np.asarray(self.isna())
mask = np.atleast_2d(mask)
arr = np.atleast_2d(self._ndarray)
# TODO: something NDArrayBacked-specific instead of _values_for_factorize[1]?
fill_value = self._values_for_factorize()[1]
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
result = type(self)._from_factorized(res_values, self)
if self.ndim == 1:
assert result.shape == (1, len(qs)), result.shape
result = result[0]
return result
# ------------------------------------------------------------------------
# numpy-like methods
@classmethod
def _empty(
cls: type_t[NDArrayBackedExtensionArrayT], shape: Shape, dtype: ExtensionDtype
) -> NDArrayBackedExtensionArrayT:
"""
Analogous to np.empty(shape, dtype=dtype)
Parameters
----------
shape : tuple[int]
dtype : ExtensionDtype
"""
# The base implementation uses a naive approach to find the dtype
# for the backing ndarray
arr = cls._from_sequence([], dtype=dtype)
backing = np.empty(shape, dtype=arr._ndarray.dtype)
return arr._from_backing_data(backing)
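
Concrete subclasses such as DatetimeArray and Categorical inherit these methods. A short sketch of the take/insert semantics defined above (a usage illustration, not part of the commit):

import pandas as pd

dta = pd.array(pd.date_range("2022-05-23", periods=3))   # a DatetimeArray
# allow_fill=True routes fill_value through _validate_scalar before the take
dta.take([0, -1], allow_fill=True, fill_value=pd.NaT)
# insert() validates the scalar, then concatenates the backing ndarray
dta.insert(1, pd.Timestamp("2022-05-24"))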


@@ -0,0 +1,193 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from __future__ import annotations
import numpy as np
from pandas._libs.lib import i8max
from pandas._libs.tslibs import (
BaseOffset,
OutOfBoundsDatetime,
Timedelta,
Timestamp,
iNaT,
)
def generate_regular_range(
start: Timestamp | Timedelta,
end: Timestamp | Timedelta,
periods: int,
freq: BaseOffset,
):
"""
Generate a range of dates or timestamps with the spans between dates
described by the given `freq` DateOffset.
Parameters
----------
start : Timedelta, Timestamp or None
First point of produced date range.
end : Timedelta, Timestamp or None
Last point of produced date range.
periods : int
Number of periods in produced date range.
freq : Tick
Describes space between dates in produced date range.
Returns
-------
ndarray[np.int64] Representing nanoseconds.
"""
istart = start.value if start is not None else None
iend = end.value if end is not None else None
stride = freq.nanos
if periods is None:
b = istart
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = b + (iend - b) // stride * stride + stride // 2 + 1
elif istart is not None:
b = istart
e = _generate_range_overflow_safe(b, periods, stride, side="start")
elif iend is not None:
e = iend + stride
b = _generate_range_overflow_safe(e, periods, stride, side="end")
else:
raise ValueError(
"at least 'start' or 'end' should be specified if a 'period' is given."
)
with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)
return values
def _generate_range_overflow_safe(
endpoint: int, periods: int, stride: int, side: str = "start"
) -> int:
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
as OutOfBoundsDatetime.
Parameters
----------
endpoint : int
nanosecond timestamp of the known endpoint of the desired range
periods : int
number of periods in the desired range
stride : int
nanoseconds between periods in the desired range
side : {'start', 'end'}
which end of the range `endpoint` refers to
Returns
-------
other_end : int
Raises
------
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ["start", "end"]
i64max = np.uint64(i8max)
msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
with np.errstate(over="raise"):
# if periods * strides cannot be multiplied within the *uint64* bounds,
# we cannot salvage the operation by recursing, so raise
try:
addend = np.uint64(periods) * np.uint64(np.abs(stride))
except FloatingPointError as err:
raise OutOfBoundsDatetime(msg) from err
if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
elif (endpoint > 0 and side == "start" and stride > 0) or (
endpoint < 0 and side == "end" and stride > 0
):
# no chance of not-overflowing
raise OutOfBoundsDatetime(msg)
elif side == "end" and endpoint > i64max and endpoint - stride <= i64max:
        # in generate_regular_range we added `stride`, thereby overflowing
# the bounds. Adjust to fix this.
return _generate_range_overflow_safe(
endpoint - stride, periods - 1, stride, side
)
# split into smaller pieces
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side)
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
def _generate_range_overflow_safe_signed(
endpoint: int, periods: int, stride: int, side: str
) -> int:
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
assert side in ["start", "end"]
if side == "end":
stride *= -1
with np.errstate(over="raise"):
addend = np.int64(periods) * np.int64(stride)
try:
# easy case with no overflows
result = np.int64(endpoint) + addend
if result == iNaT:
# Putting this into a DatetimeArray/TimedeltaArray
# would incorrectly be interpreted as NaT
raise OverflowError
# error: Incompatible return value type (got "signedinteger[_64Bit]",
# expected "int")
return result # type: ignore[return-value]
except (FloatingPointError, OverflowError):
# with endpoint negative and addend positive we risk
# FloatingPointError; with reversed signed we risk OverflowError
pass
# if stride and endpoint had opposite signs, then endpoint + addend
# should never overflow. so they must have the same signs
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
if stride > 0:
# watch out for very special case in which we just slightly
# exceed implementation bounds, but when passing the result to
# np.arange will get a result slightly within the bounds
# error: Incompatible types in assignment (expression has type
# "unsignedinteger[_64Bit]", variable has type "signedinteger[_64Bit]")
result = np.uint64(endpoint) + np.uint64(addend) # type: ignore[assignment]
i64max = np.uint64(i8max)
assert result > i64max
if result <= i64max + np.uint64(stride):
# error: Incompatible return value type (got "unsignedinteger", expected
# "int")
return result # type: ignore[return-value]
raise OutOfBoundsDatetime(
f"Cannot generate range with {side}={endpoint} and periods={periods}"
)
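
A worked sketch of generate_regular_range (assuming the function is in scope and that freq is a Tick offset exposing .nanos):

import pandas as pd
from pandas.tseries.frequencies import to_offset

start = pd.Timestamp("2022-05-23")
freq = to_offset("D")   # Day is a Tick, so freq.nanos is the stride in ns
vals = generate_regular_range(start, None, 3, freq)
# -> three int64 nanosecond positions spaced 86_400_000_000_000 ns apart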

File diff suppressed because it is too large


@@ -0,0 +1,535 @@
from __future__ import annotations
import numbers
from typing import (
TYPE_CHECKING,
overload,
)
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._typing import (
ArrayLike,
AstypeArg,
Dtype,
DtypeObj,
npt,
type_t,
)
from pandas.core.dtypes.common import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.missing import isna
from pandas.core import ops
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
if TYPE_CHECKING:
import pyarrow
@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
"""
Extension dtype for boolean data.
.. versionadded:: 1.0.0
.. warning::
BooleanDtype is considered experimental. The implementation and
parts of the API may change without warning.
Attributes
----------
None
Methods
-------
None
Examples
--------
>>> pd.BooleanDtype()
BooleanDtype
"""
name = "boolean"
# https://github.com/python/mypy/issues/4125
# error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
@property
def type(self) -> type: # type: ignore[override]
return np.bool_
@property
def kind(self) -> str:
return "b"
@property
def numpy_dtype(self) -> np.dtype:
return np.dtype("bool")
@classmethod
def construct_array_type(cls) -> type_t[BooleanArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return BooleanArray
def __repr__(self) -> str:
return "BooleanDtype"
@property
def _is_boolean(self) -> bool:
return True
@property
def _is_numeric(self) -> bool:
return True
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BooleanArray:
"""
Construct BooleanArray from pyarrow Array/ChunkedArray.
"""
import pyarrow
if array.type != pyarrow.bool_():
raise TypeError(f"Expected array of boolean type, got {array.type} instead")
if isinstance(array, pyarrow.Array):
chunks = [array]
else:
# pyarrow.ChunkedArray
chunks = array.chunks
results = []
for arr in chunks:
buflist = arr.buffers()
data = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[1]], offset=arr.offset
).to_numpy(zero_copy_only=False)
if arr.null_count != 0:
mask = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[0]], offset=arr.offset
).to_numpy(zero_copy_only=False)
mask = ~mask
else:
mask = np.zeros(len(arr), dtype=bool)
bool_arr = BooleanArray(data, mask)
results.append(bool_arr)
if not results:
return BooleanArray(
np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
)
else:
return BooleanArray._concat_same_type(results)
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
# Handle only boolean + np.bool_ -> boolean, since other cases like
# Int64 + boolean -> Int64 will be handled by the other type
if all(
isinstance(t, BooleanDtype)
or (isinstance(t, np.dtype) and (np.issubdtype(t, np.bool_)))
for t in dtypes
):
return BooleanDtype()
else:
return None
def coerce_to_array(
values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask.
Parameters
----------
values : 1D list-like
mask : bool 1D array, optional
copy : bool, default False
if True, copy the input
Returns
-------
tuple of (values, mask)
"""
if isinstance(values, BooleanArray):
if mask is not None:
raise ValueError("cannot pass mask for BooleanArray input")
values, mask = values._data, values._mask
if copy:
values = values.copy()
mask = mask.copy()
return values, mask
mask_values = None
if isinstance(values, np.ndarray) and values.dtype == np.bool_:
if copy:
values = values.copy()
elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype):
mask_values = isna(values)
values_bool = np.zeros(len(values), dtype=bool)
values_bool[~mask_values] = values[~mask_values].astype(bool)
if not np.all(
values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
):
raise TypeError("Need to pass bool-like values")
values = values_bool
else:
values_object = np.asarray(values, dtype=object)
inferred_dtype = lib.infer_dtype(values_object, skipna=True)
integer_like = ("floating", "integer", "mixed-integer-float")
if inferred_dtype not in ("boolean", "empty") + integer_like:
raise TypeError("Need to pass bool-like values")
mask_values = isna(values_object)
values = np.zeros(len(values), dtype=bool)
values[~mask_values] = values_object[~mask_values].astype(bool)
        # if the values were integer-like, validate they were actually 0/1's
if (inferred_dtype in integer_like) and not (
np.all(
values[~mask_values].astype(float)
== values_object[~mask_values].astype(float)
)
):
raise TypeError("Need to pass bool-like values")
if mask is None and mask_values is None:
mask = np.zeros(len(values), dtype=bool)
elif mask is None:
mask = mask_values
else:
if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
if mask_values is not None:
mask = mask | mask_values
else:
if copy:
mask = mask.copy()
else:
mask = np.array(mask, dtype=bool)
if mask_values is not None:
mask = mask | mask_values
if values.shape != mask.shape:
raise ValueError("values.shape and mask.shape must match")
return values, mask
class BooleanArray(BaseMaskedArray):
"""
Array of boolean (True/False) data with missing values.
This is a pandas Extension array for boolean data, under the hood
represented by 2 numpy arrays: a boolean array with the data and
a boolean array with the mask (True indicating missing).
BooleanArray implements Kleene logic (sometimes called three-value
logic) for logical operations. See :ref:`boolean.kleene` for more.
    To construct a BooleanArray from generic array-like input, use
:func:`pandas.array` specifying ``dtype="boolean"`` (see examples
below).
.. versionadded:: 1.0.0
.. warning::
BooleanArray is considered experimental. The implementation and
parts of the API may change without warning.
Parameters
----------
values : numpy.ndarray
A 1-d boolean-dtype array with the data.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values (True
indicates missing).
copy : bool, default False
Whether to copy the `values` and `mask` arrays.
Attributes
----------
None
Methods
-------
None
Returns
-------
BooleanArray
Examples
--------
    Create a BooleanArray with :func:`pandas.array`:
>>> pd.array([True, False, None], dtype="boolean")
<BooleanArray>
[True, False, <NA>]
Length: 3, dtype: boolean
"""
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False
# Fill values used for any/all
_truthy_value = True
_falsey_value = False
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
raise TypeError(
"values should be boolean numpy array. Use "
"the 'pd.array' function instead"
)
self._dtype = BooleanDtype()
super().__init__(values, mask, copy=copy)
@property
def dtype(self) -> BooleanDtype:
return self._dtype
@classmethod
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> BooleanArray:
if dtype:
assert dtype == "boolean"
values, mask = coerce_to_array(scalars, copy=copy)
return BooleanArray(values, mask)
@classmethod
def _from_sequence_of_strings(
cls,
strings: list[str],
*,
dtype: Dtype | None = None,
copy: bool = False,
true_values: list[str] | None = None,
false_values: list[str] | None = None,
) -> BooleanArray:
true_values_union = cls._TRUE_VALUES.union(true_values or [])
false_values_union = cls._FALSE_VALUES.union(false_values or [])
def map_string(s):
if isna(s):
return s
elif s in true_values_union:
return True
elif s in false_values_union:
return False
else:
raise ValueError(f"{s} cannot be cast to bool")
scalars = [map_string(x) for x in strings]
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]:
return coerce_to_array(value)
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
...
@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
...
@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
...
def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
"""
Cast to a NumPy array or ExtensionArray with 'dtype'.
Parameters
----------
dtype : str or dtype
Typecode or data-type to which the array is cast.
copy : bool, default True
Whether to copy the data, even if not necessary. If False,
a copy is made only if the old dtype does not match the
new dtype.
Returns
-------
ndarray or ExtensionArray
NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.
Raises
------
TypeError
            if incompatible type with a BooleanDtype, equivalent of same_kind
casting
"""
dtype = pandas_dtype(dtype)
if isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy)
if is_bool_dtype(dtype):
# astype_nansafe converts np.nan to True
if self._hasna:
raise ValueError("cannot convert float NaN to bool")
else:
return self._data.astype(dtype, copy=copy)
# for integer, error if there are missing values
if is_integer_dtype(dtype) and self._hasna:
raise ValueError("cannot convert NA to integer")
# for float dtype, ensure we use np.nan before casting (numpy cannot
# deal with pd.NA)
na_value = self._na_value
if is_float_dtype(dtype):
na_value = np.nan
# coerce
return self.to_numpy(dtype=dtype, na_value=na_value, copy=False)
def _values_for_argsort(self) -> np.ndarray:
"""
Return values for sorting.
Returns
-------
ndarray
The transformed values should maintain the ordering between values
within the array.
See Also
--------
ExtensionArray.argsort : Return the indices that would sort this array.
"""
data = self._data.copy()
data[self._mask] = -1
return data
def _logical_method(self, other, op):
assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
other_is_booleanarray = isinstance(other, BooleanArray)
other_is_scalar = lib.is_scalar(other)
mask = None
if other_is_booleanarray:
other, mask = other._data, other._mask
elif is_list_like(other):
other = np.asarray(other, dtype="bool")
if other.ndim > 1:
raise NotImplementedError("can only perform ops with 1-d structures")
other, mask = coerce_to_array(other, copy=False)
elif isinstance(other, np.bool_):
other = other.item()
if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
raise TypeError(
"'other' should be pandas.NA or a bool. "
f"Got {type(other).__name__} instead."
)
if not other_is_scalar and len(self) != len(other):
raise ValueError("Lengths must match to compare")
if op.__name__ in {"or_", "ror_"}:
result, mask = ops.kleene_or(self._data, other, self._mask, mask)
elif op.__name__ in {"and_", "rand_"}:
result, mask = ops.kleene_and(self._data, other, self._mask, mask)
elif op.__name__ in {"xor", "rxor"}:
result, mask = ops.kleene_xor(self._data, other, self._mask, mask)
# error: Argument 2 to "BooleanArray" has incompatible type "Optional[Any]";
# expected "ndarray"
return BooleanArray(result, mask) # type: ignore[arg-type]
def _arith_method(self, other, op):
mask = None
op_name = op.__name__
if isinstance(other, BooleanArray):
other, mask = other._data, other._mask
elif is_list_like(other):
other = np.asarray(other)
if other.ndim > 1:
raise NotImplementedError("can only perform ops with 1-d structures")
if len(self) != len(other):
raise ValueError("Lengths must match")
# nans propagate
if mask is None:
mask = self._mask
if other is libmissing.NA:
mask |= True
else:
mask = self._mask | mask
if other is libmissing.NA:
# if other is NA, the result will be all NA and we can't run the
# actual op, so we need to choose the resulting dtype manually
if op_name in {"floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"}:
dtype = "int8"
elif op_name in {"truediv", "rtruediv"}:
dtype = "float64"
else:
dtype = "bool"
result = np.zeros(len(self._data), dtype=dtype)
else:
if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
# Avoid DeprecationWarning: In future, it will be an error
# for 'np.bool_' scalars to be interpreted as an index
other = bool(other)
with np.errstate(all="ignore"):
result = op(self._data, other)
# divmod returns a tuple
if op_name == "divmod":
div, mod = result
return (
self._maybe_mask_result(div, mask, other, "floordiv"),
self._maybe_mask_result(mod, mask, other, "mod"),
)
return self._maybe_mask_result(result, mask, other, op_name)
def __abs__(self):
return self.copy()
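
The Kleene logic wired up in _logical_method can be seen directly (a brief sketch):

import pandas as pd

a = pd.array([True, False, None], dtype="boolean")
a | True    # [True, True, True]    -- NA | True is True
a & True    # [True, False, <NA>]   -- NA & True stays NA
a & False   # [False, False, False] -- NA & False is False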

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,375 @@
from __future__ import annotations
from typing import overload
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._typing import (
ArrayLike,
AstypeArg,
DtypeObj,
npt,
)
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
is_float_dtype,
is_integer_dtype,
is_object_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.numeric import (
NumericArray,
NumericDtype,
)
from pandas.core.tools.numeric import to_numeric
class FloatingDtype(NumericDtype):
"""
An ExtensionDtype to hold a single size of floating dtype.
These specific implementations are subclasses of the non-public
    FloatingDtype. For example, we have Float32Dtype to represent float32.
The attributes name & type are set when these subclasses are created.
"""
def __repr__(self) -> str:
return f"{self.name}Dtype()"
@property
def _is_numeric(self) -> bool:
return True
@classmethod
def construct_array_type(cls) -> type[FloatingArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return FloatingArray
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
# for now only handle other floating types
if not all(isinstance(t, FloatingDtype) for t in dtypes):
return None
np_dtype = np.find_common_type(
# error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype]" has no
# attribute "numpy_dtype"
[t.numpy_dtype for t in dtypes], # type: ignore[union-attr]
[],
)
if np.issubdtype(np_dtype, np.floating):
return FLOAT_STR_TO_DTYPE[str(np_dtype)]
return None
def coerce_to_array(
values, dtype=None, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask.
Parameters
----------
values : 1D list-like
dtype : float dtype
mask : bool 1D array, optional
copy : bool, default False
if True, copy the input
Returns
-------
tuple of (values, mask)
"""
# if values is floating numpy array, preserve its dtype
if dtype is None and hasattr(values, "dtype"):
if is_float_dtype(values.dtype):
dtype = values.dtype
if dtype is not None:
if isinstance(dtype, str) and dtype.startswith("Float"):
# Avoid DeprecationWarning from NumPy about np.dtype("Float64")
# https://github.com/numpy/numpy/pull/7476
dtype = dtype.lower()
if not issubclass(type(dtype), FloatingDtype):
try:
dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))]
except KeyError as err:
raise ValueError(f"invalid dtype specified {dtype}") from err
if isinstance(values, FloatingArray):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)
if copy:
values = values.copy()
mask = mask.copy()
return values, mask
values = np.array(values, copy=copy)
if is_object_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
pass
elif inferred_type not in [
"floating",
"integer",
"mixed-integer",
"integer-na",
"mixed-integer-float",
]:
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")
elif is_bool_dtype(values) and is_float_dtype(dtype):
values = np.array(values, dtype=float, copy=copy)
elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")
if values.ndim != 1:
raise TypeError("values must be a 1D list-like")
if mask is None:
mask = libmissing.is_numeric_na(values)
else:
assert len(mask) == len(values)
if not mask.ndim == 1:
raise TypeError("mask must be a 1D list-like")
# infer dtype if needed
if dtype is None:
dtype = np.dtype("float64")
else:
dtype = dtype.type
# if we are float, let's make sure that we can
# safely cast
# we copy as need to coerce here
# TODO should this be a safe cast?
if mask.any():
values = values.copy()
values[mask] = np.nan
values = values.astype(dtype, copy=False) # , casting="safe")
return values, mask
class FloatingArray(NumericArray):
"""
Array of floating (optional missing) values.
.. versionadded:: 1.2.0
.. warning::
FloatingArray is currently experimental, and its API or internal
implementation may change without warning. Especially the behaviour
regarding NaN (distinct from NA missing values) is subject to change.
We represent a FloatingArray with 2 numpy arrays:
- data: contains a numpy float array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
    To construct a FloatingArray from generic array-like input, use
:func:`pandas.array` with one of the float dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d float-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
FloatingArray
Examples
--------
    Create a FloatingArray with :func:`pandas.array`:
>>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
<FloatingArray>
[0.1, <NA>, 0.3]
Length: 3, dtype: Float32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([0.1, None, 0.3], dtype="Float32")
<FloatingArray>
[0.1, <NA>, 0.3]
Length: 3, dtype: Float32
"""
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 0.0
# Fill values used for any/all
_truthy_value = 1.0
_falsey_value = 0.0
@cache_readonly
def dtype(self) -> FloatingDtype:
return FLOAT_STR_TO_DTYPE[str(self._data.dtype)]
def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"):
raise TypeError(
"values should be floating numpy array. Use "
"the 'pd.array' function instead"
)
if values.dtype == np.float16:
# If we don't raise here, then accessing self.dtype would raise
raise TypeError("FloatingArray does not support np.float16 dtype.")
super().__init__(values, mask, copy=copy)
@classmethod
def _from_sequence(
cls, scalars, *, dtype=None, copy: bool = False
) -> FloatingArray:
values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
return FloatingArray(values, mask)
@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype=None, copy: bool = False
) -> FloatingArray:
scalars = to_numeric(strings, errors="raise")
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]:
return coerce_to_array(value, dtype=self.dtype)
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
...
@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
...
@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
...
def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
"""
Cast to a NumPy array or ExtensionArray with 'dtype'.
Parameters
----------
dtype : str or dtype
Typecode or data-type to which the array is cast.
copy : bool, default True
Whether to copy the data, even if not necessary. If False,
a copy is made only if the old dtype does not match the
new dtype.
Returns
-------
ndarray or ExtensionArray
NumPy ndarray, or BooleanArray, IntegerArray or FloatingArray with
'dtype' for its dtype.
Raises
------
TypeError
            if incompatible type with a FloatingDtype, equivalent of same_kind
casting
"""
dtype = pandas_dtype(dtype)
if isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)
# coerce
if is_float_dtype(dtype):
# In astype, we consider dtype=float to also mean na_value=np.nan
kwargs = {"na_value": np.nan}
elif is_datetime64_dtype(dtype):
# error: Dict entry 0 has incompatible type "str": "datetime64"; expected
# "str": "float"
kwargs = {"na_value": np.datetime64("NaT")} # type: ignore[dict-item]
else:
kwargs = {}
# error: Argument 2 to "to_numpy" of "BaseMaskedArray" has incompatible
# type "**Dict[str, float]"; expected "bool"
data = self.to_numpy(dtype=dtype, **kwargs) # type: ignore[arg-type]
return astype_nansafe(data, dtype, copy=False)
def _values_for_argsort(self) -> np.ndarray:
return self._data
_dtype_docstring = """
An ExtensionDtype for {dtype} data.
This dtype uses ``pd.NA`` as missing value indicator.
Attributes
----------
None
Methods
-------
None
"""
# create the Dtype
@register_extension_dtype
class Float32Dtype(FloatingDtype):
type = np.float32
name = "Float32"
__doc__ = _dtype_docstring.format(dtype="float32")
@register_extension_dtype
class Float64Dtype(FloatingDtype):
type = np.float64
name = "Float64"
__doc__ = _dtype_docstring.format(dtype="float64")
FLOAT_STR_TO_DTYPE = {
"float32": Float32Dtype(),
"float64": Float64Dtype(),
}
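
A short sketch of the two astype paths above: ExtensionDtype targets stay masked, while plain float targets substitute np.nan for pd.NA:

import pandas as pd

arr = pd.array([0.1, None, 0.3], dtype="Float64")
arr.astype("Float32")   # masked FloatingArray path; <NA> is preserved
arr.astype("float64")   # plain ndarray path; the <NA> slot becomes np.nan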


@@ -0,0 +1,497 @@
from __future__ import annotations
from typing import overload
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._typing import (
ArrayLike,
AstypeArg,
Dtype,
DtypeObj,
npt,
)
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.base import (
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
is_float_dtype,
is_integer_dtype,
is_object_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.masked import BaseMaskedDtype
from pandas.core.arrays.numeric import (
NumericArray,
NumericDtype,
)
from pandas.core.tools.numeric import to_numeric
class _IntegerDtype(NumericDtype):
"""
An ExtensionDtype to hold a single size & kind of integer dtype.
These specific implementations are subclasses of the non-public
    _IntegerDtype. For example, we have Int8Dtype to represent signed 8-bit integers.
The attributes name & type are set when these subclasses are created.
"""
def __repr__(self) -> str:
sign = "U" if self.is_unsigned_integer else ""
return f"{sign}Int{8 * self.itemsize}Dtype()"
@cache_readonly
def is_signed_integer(self) -> bool:
return self.kind == "i"
@cache_readonly
def is_unsigned_integer(self) -> bool:
return self.kind == "u"
@property
def _is_numeric(self) -> bool:
return True
@classmethod
def construct_array_type(cls) -> type[IntegerArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return IntegerArray
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
# we only handle nullable EA dtypes and numeric numpy dtypes
if not all(
isinstance(t, BaseMaskedDtype)
or (
isinstance(t, np.dtype)
and (np.issubdtype(t, np.number) or np.issubdtype(t, np.bool_))
)
for t in dtypes
):
return None
np_dtype = np.find_common_type(
# error: List comprehension has incompatible type List[Union[Any,
# dtype, ExtensionDtype]]; expected List[Union[dtype, None, type,
# _SupportsDtype, str, Tuple[Any, Union[int, Sequence[int]]],
# List[Any], _DtypeDict, Tuple[Any, Any]]]
[
t.numpy_dtype # type: ignore[misc]
if isinstance(t, BaseMaskedDtype)
else t
for t in dtypes
],
[],
)
if np.issubdtype(np_dtype, np.integer):
return INT_STR_TO_DTYPE[str(np_dtype)]
elif np.issubdtype(np_dtype, np.floating):
from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
return FLOAT_STR_TO_DTYPE[str(np_dtype)]
return None
def safe_cast(values, dtype, copy: bool):
"""
Safely cast the values to the dtype if they
are equivalent, meaning floats must be equivalent to the
ints.
"""
try:
return values.astype(dtype, casting="safe", copy=copy)
except TypeError as err:
casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted
raise TypeError(
f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
) from err
def coerce_to_array(
values, dtype, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask.
Parameters
----------
values : 1D list-like
dtype : integer dtype
mask : bool 1D array, optional
copy : bool, default False
if True, copy the input
Returns
-------
tuple of (values, mask)
"""
# if values is integer numpy array, preserve its dtype
if dtype is None and hasattr(values, "dtype"):
if is_integer_dtype(values.dtype):
dtype = values.dtype
if dtype is not None:
if isinstance(dtype, str) and (
dtype.startswith("Int") or dtype.startswith("UInt")
):
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
# https://github.com/numpy/numpy/pull/7476
dtype = dtype.lower()
if not issubclass(type(dtype), _IntegerDtype):
try:
dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))]
except KeyError as err:
raise ValueError(f"invalid dtype specified {dtype}") from err
if isinstance(values, IntegerArray):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)
if copy:
values = values.copy()
mask = mask.copy()
return values, mask
values = np.array(values, copy=copy)
inferred_type = None
if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
pass
elif inferred_type not in [
"floating",
"integer",
"mixed-integer",
"integer-na",
"mixed-integer-float",
"string",
"unicode",
]:
raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
elif is_bool_dtype(values) and is_integer_dtype(dtype):
values = np.array(values, dtype=int, copy=copy)
elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
if values.ndim != 1:
raise TypeError("values must be a 1D list-like")
if mask is None:
mask = libmissing.is_numeric_na(values)
else:
assert len(mask) == len(values)
if mask.ndim != 1:
raise TypeError("mask must be a 1D list-like")
# infer dtype if needed
if dtype is None:
dtype = np.dtype("int64")
else:
dtype = dtype.type
# if we are float, let's make sure that we can
# safely cast
# we copy as need to coerce here
if mask.any():
values = values.copy()
values[mask] = 1
if inferred_type in ("string", "unicode"):
# casts from str are always safe since they raise
# a ValueError if the str cannot be parsed into an int
values = values.astype(dtype, copy=copy)
else:
values = safe_cast(values, dtype, copy=False)
return values, mask
class IntegerArray(NumericArray):
"""
Array of integer (optional missing) values.
.. versionchanged:: 1.0.0
Now uses :attr:`pandas.NA` as the missing value rather
than :attr:`numpy.nan`.
.. warning::
IntegerArray is currently experimental, and its API or internal
implementation may change without warning.
We represent an IntegerArray with 2 numpy arrays:
- data: contains a numpy integer array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
To construct an IntegerArray from generic array-like input, use
:func:`pandas.array` with one of the integer dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d integer-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
IntegerArray
Examples
--------
Create an IntegerArray with :func:`pandas.array`.
>>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
>>> int_array
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: Int32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([1, None, 3], dtype='Int32')
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: Int32
>>> pd.array([1, None, 3], dtype='UInt16')
<IntegerArray>
[1, <NA>, 3]
Length: 3, dtype: UInt16
"""
# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 1
# Fill values used for any/all
_truthy_value = 1
_falsey_value = 0
@cache_readonly
def dtype(self) -> _IntegerDtype:
return INT_STR_TO_DTYPE[str(self._data.dtype)]
def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]):
raise TypeError(
"values should be integer numpy array. Use "
"the 'pd.array' function instead"
)
super().__init__(values, mask, copy=copy)
@classmethod
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> IntegerArray:
values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
return IntegerArray(values, mask)
@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype: Dtype | None = None, copy: bool = False
) -> IntegerArray:
scalars = to_numeric(strings, errors="raise")
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]:
return coerce_to_array(value, dtype=self.dtype)
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
...
@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
...
@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
...
def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
"""
Cast to a NumPy array or ExtensionArray with 'dtype'.
Parameters
----------
dtype : str or dtype
Typecode or data-type to which the array is cast.
copy : bool, default True
Whether to copy the data, even if not necessary. If False,
a copy is made only if the old dtype does not match the
new dtype.
Returns
-------
ndarray or ExtensionArray
NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.
Raises
------
TypeError
if incompatible type with an IntegerDtype, equivalent of same_kind
casting
"""
dtype = pandas_dtype(dtype)
if isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)
na_value: float | np.datetime64 | lib.NoDefault
# coerce
if is_float_dtype(dtype):
# In astype, we consider dtype=float to also mean na_value=np.nan
na_value = np.nan
elif is_datetime64_dtype(dtype):
na_value = np.datetime64("NaT")
else:
na_value = lib.no_default
return self.to_numpy(dtype=dtype, na_value=na_value, copy=False)
def _values_for_argsort(self) -> np.ndarray:
"""
Return values for sorting.
Returns
-------
ndarray
The transformed values should maintain the ordering between values
within the array.
See Also
--------
ExtensionArray.argsort : Return the indices that would sort this array.
"""
data = self._data.copy()
if self._mask.any():
data[self._mask] = data.min() - 1
return data
_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.
.. versionchanged:: 1.0.0
Now uses :attr:`pandas.NA` as its missing value,
rather than :attr:`numpy.nan`.
Attributes
----------
None
Methods
-------
None
"""
# create the Dtype
@register_extension_dtype
class Int8Dtype(_IntegerDtype):
type = np.int8
name = "Int8"
__doc__ = _dtype_docstring.format(dtype="int8")
@register_extension_dtype
class Int16Dtype(_IntegerDtype):
type = np.int16
name = "Int16"
__doc__ = _dtype_docstring.format(dtype="int16")
@register_extension_dtype
class Int32Dtype(_IntegerDtype):
type = np.int32
name = "Int32"
__doc__ = _dtype_docstring.format(dtype="int32")
@register_extension_dtype
class Int64Dtype(_IntegerDtype):
type = np.int64
name = "Int64"
__doc__ = _dtype_docstring.format(dtype="int64")
@register_extension_dtype
class UInt8Dtype(_IntegerDtype):
type = np.uint8
name = "UInt8"
__doc__ = _dtype_docstring.format(dtype="uint8")
@register_extension_dtype
class UInt16Dtype(_IntegerDtype):
type = np.uint16
name = "UInt16"
__doc__ = _dtype_docstring.format(dtype="uint16")
@register_extension_dtype
class UInt32Dtype(_IntegerDtype):
type = np.uint32
name = "UInt32"
__doc__ = _dtype_docstring.format(dtype="uint32")
@register_extension_dtype
class UInt64Dtype(_IntegerDtype):
type = np.uint64
name = "UInt64"
__doc__ = _dtype_docstring.format(dtype="uint64")
INT_STR_TO_DTYPE: dict[str, _IntegerDtype] = {
"int8": Int8Dtype(),
"int16": Int16Dtype(),
"int32": Int32Dtype(),
"int64": Int64Dtype(),
"uint8": UInt8Dtype(),
"uint16": UInt16Dtype(),
"uint32": UInt32Dtype(),
"uint64": UInt64Dtype(),
}
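
coerce_to_array and safe_cast above only accept values that cast losslessly to the target integer dtype; a sketch:

import pandas as pd

pd.array([1, 2, None], dtype="Int64")   # IntegerArray, missing value is <NA>
pd.array([1.0, 2.0], dtype="Int64")     # equivalent floats are safely cast
# pd.array([1.5], dtype="Int64")        # TypeError: cannot safely cast non-equivalent ...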

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,202 @@
from __future__ import annotations
import datetime
import numbers
from typing import (
TYPE_CHECKING,
TypeVar,
)
import numpy as np
from pandas._libs import (
Timedelta,
missing as libmissing,
)
from pandas.compat.numpy import function as nv
from pandas.core.dtypes.common import (
is_float,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
pandas_dtype,
)
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
if TYPE_CHECKING:
import pyarrow
T = TypeVar("T", bound="NumericArray")
class NumericDtype(BaseMaskedDtype):
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BaseMaskedArray:
"""
Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
"""
import pyarrow
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
array_class = self.construct_array_type()
pyarrow_type = pyarrow.from_numpy_dtype(self.type)
if not array.type.equals(pyarrow_type):
# test_from_arrow_type_error raise for string, but allow
# through itemsize conversion GH#31896
rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
if rt_dtype.kind not in ["i", "u", "f"]:
# Could allow "c" or potentially disallow float<->int conversion,
# but at the moment we specifically test that uint<->int works
raise TypeError(
f"Expected array of {self} type, got {array.type} instead"
)
array = array.cast(pyarrow_type)
if isinstance(array, pyarrow.Array):
chunks = [array]
else:
# pyarrow.ChunkedArray
chunks = array.chunks
results = []
for arr in chunks:
data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
num_arr = array_class(data.copy(), ~mask, copy=False)
results.append(num_arr)
if not results:
return array_class(
np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
)
elif len(results) == 1:
# avoid additional copy in _concat_same_type
return results[0]
else:
return array_class._concat_same_type(results)
class NumericArray(BaseMaskedArray):
"""
Base class for IntegerArray and FloatingArray.
"""
def _arith_method(self, other, op):
op_name = op.__name__
omask = None
if getattr(other, "ndim", 0) > 1:
raise NotImplementedError("can only perform ops with 1-d structures")
if isinstance(other, NumericArray):
other, omask = other._data, other._mask
elif is_list_like(other):
other = np.asarray(other)
if other.ndim > 1:
raise NotImplementedError("can only perform ops with 1-d structures")
if len(self) != len(other):
raise ValueError("Lengths must match")
if not (is_float_dtype(other) or is_integer_dtype(other)):
raise TypeError("can only perform ops with numeric values")
elif isinstance(other, (datetime.timedelta, np.timedelta64)):
other = Timedelta(other)
else:
if not (is_float(other) or is_integer(other) or other is libmissing.NA):
raise TypeError("can only perform ops with numeric values")
if omask is None:
mask = self._mask.copy()
if other is libmissing.NA:
mask |= True
else:
mask = self._mask | omask
if op_name == "pow":
# 1 ** x is 1.
mask = np.where((self._data == 1) & ~self._mask, False, mask)
# x ** 0 is 1.
if omask is not None:
mask = np.where((other == 0) & ~omask, False, mask)
elif other is not libmissing.NA:
mask = np.where(other == 0, False, mask)
elif op_name == "rpow":
# 1 ** x is 1.
if omask is not None:
mask = np.where((other == 1) & ~omask, False, mask)
elif other is not libmissing.NA:
mask = np.where(other == 1, False, mask)
# x ** 0 is 1.
mask = np.where((self._data == 0) & ~self._mask, False, mask)
if other is libmissing.NA:
result = np.ones_like(self._data)
if "truediv" in op_name and self.dtype.kind != "f":
# The actual data here doesn't matter since the mask
# will be all-True, but since this is division, we want
# to end up with floating dtype.
result = result.astype(np.float64)
else:
with np.errstate(all="ignore"):
result = op(self._data, other)
# divmod returns a tuple
if op_name == "divmod":
div, mod = result
return (
self._maybe_mask_result(div, mask, other, "floordiv"),
self._maybe_mask_result(mod, mask, other, "mod"),
)
return self._maybe_mask_result(result, mask, other, op_name)
_HANDLED_TYPES = (np.ndarray, numbers.Number)
def __neg__(self):
return type(self)(-self._data, self._mask.copy())
def __pos__(self):
return self.copy()
def __abs__(self):
return type(self)(abs(self._data), self._mask.copy())
def round(self: T, decimals: int = 0, *args, **kwargs) -> T:
"""
Round each value in the array to the given number of decimals.
Parameters
----------
decimals : int, default 0
Number of decimal places to round to. If decimals is negative,
it specifies the number of positions to the left of the decimal point.
*args, **kwargs
Additional arguments and keywords have no effect but might be
accepted for compatibility with NumPy.
Returns
-------
NumericArray
Rounded values of the NumericArray.
See Also
--------
numpy.around : Round values of an np.array.
DataFrame.round : Round values of a DataFrame.
Series.round : Round values of a Series.
"""
nv.validate_round(args, kwargs)
values = np.round(self._data, decimals=decimals, **kwargs)
return type(self)(values, self._mask.copy())
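The pow/rpow special cases in _arith_method are observable from the public API; a small sketch, assuming nullable integer support is available:

import pandas as pd

base = pd.array([1, 2, None], dtype="Int64")

# 1 ** x is 1 even for missing x, so the first slot is unmasked;
# the other slots stay missing because the right operand is NA.
print(base ** pd.NA)   # [1, <NA>, <NA>]

# x ** 0 is 1 regardless of the mask on x.
print(base ** 0)       # [1, 1, 1]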

View File

@@ -0,0 +1,428 @@
from __future__ import annotations
import numpy as np
from pandas._libs import lib
from pandas._typing import (
Dtype,
NpDtype,
Scalar,
npt,
)
from pandas.compat.numpy import function as nv
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.dtypes import PandasDtype
from pandas.core.dtypes.missing import isna
from pandas.core import (
arraylike,
nanops,
ops,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.strings.object_array import ObjectStringArrayMixin
class PandasArray(
OpsMixin,
NDArrayBackedExtensionArray,
ObjectStringArrayMixin,
):
"""
A pandas ExtensionArray for NumPy data.
This is mostly for internal compatibility, and is not especially
useful on its own.
Parameters
----------
values : ndarray
The NumPy ndarray to wrap. Must be 1-dimensional.
copy : bool, default False
Whether to copy `values`.
Attributes
----------
None
Methods
-------
None
"""
# If you're wondering why pd.Series(cls) doesn't put the array in an
# ExtensionBlock, search for `ABCPandasArray`. We check for
# that _typ to ensure that users don't unnecessarily use EAs inside
# pandas internals, which turns off things like block consolidation.
_typ = "npy_extension"
__array_priority__ = 1000
_ndarray: np.ndarray
_dtype: PandasDtype
# ------------------------------------------------------------------------
# Constructors
def __init__(self, values: np.ndarray | PandasArray, copy: bool = False):
if isinstance(values, type(self)):
values = values._ndarray
if not isinstance(values, np.ndarray):
raise ValueError(
f"'values' must be a NumPy array, not {type(values).__name__}"
)
if values.ndim == 0:
# Technically we support 2, but do not advertise that fact.
raise ValueError("PandasArray must be 1-dimensional.")
if copy:
values = values.copy()
dtype = PandasDtype(values.dtype)
super().__init__(values, dtype)
@classmethod
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> PandasArray:
if isinstance(dtype, PandasDtype):
dtype = dtype._dtype
# error: Argument "dtype" to "asarray" has incompatible type
# "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object],
# None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
# Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
# _DTypeDict, Tuple[Any, Any]]]"
result = np.asarray(scalars, dtype=dtype) # type: ignore[arg-type]
if (
result.ndim > 1
and not hasattr(scalars, "dtype")
and (dtype is None or dtype == object)
):
# e.g. list-of-tuples
result = construct_1d_object_array_from_listlike(scalars)
if copy and result is scalars:
result = result.copy()
return cls(result)
@classmethod
def _from_factorized(cls, values, original) -> PandasArray:
return cls(values)
def _from_backing_data(self, arr: np.ndarray) -> PandasArray:
return type(self)(arr)
# ------------------------------------------------------------------------
# Data
@property
def dtype(self) -> PandasDtype:
return self._dtype
# ------------------------------------------------------------------------
# NumPy Array Interface
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
return np.asarray(self._ndarray, dtype=dtype)
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# Lightly modified version of
# https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html
# The primary modification is not boxing scalar return values
# in PandasArray, since pandas' ExtensionArrays are 1-d.
out = kwargs.get("out", ())
result = ops.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
return result
if method == "reduce":
result = arraylike.dispatch_reduction_ufunc(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
# e.g. tests.series.test_ufunc.TestNumpyReductions
return result
# Defer to the implementation of the ufunc on unwrapped values.
inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs)
if out:
kwargs["out"] = tuple(
x._ndarray if isinstance(x, PandasArray) else x for x in out
)
result = getattr(ufunc, method)(*inputs, **kwargs)
if ufunc.nout > 1:
# multiple return values; re-box array-like results
return tuple(type(self)(x) for x in result)
elif method == "at":
# no return value
return None
elif method == "reduce":
if isinstance(result, np.ndarray):
# e.g. test_np_reduce_2d
return type(self)(result)
# e.g. test_np_max_nested_tuples
return result
else:
# one return value; re-box array-like results
return type(self)(result)
# ------------------------------------------------------------------------
# Pandas ExtensionArray Interface
def isna(self) -> np.ndarray:
return isna(self._ndarray)
def _validate_scalar(self, fill_value):
if fill_value is None:
# Primarily for subclasses
fill_value = self.dtype.na_value
return fill_value
def _values_for_factorize(self) -> tuple[np.ndarray, int]:
return self._ndarray, -1
# ------------------------------------------------------------------------
# Reductions
def any(
self,
*,
axis: int | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_any((), {"out": out, "keepdims": keepdims})
result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def all(
self,
*,
axis: int | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_all((), {"out": out, "keepdims": keepdims})
result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def min(self, *, axis: int | None = None, skipna: bool = True, **kwargs) -> Scalar:
nv.validate_min((), kwargs)
result = nanops.nanmin(
values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
)
return self._wrap_reduction_result(axis, result)
def max(self, *, axis: int | None = None, skipna: bool = True, **kwargs) -> Scalar:
nv.validate_max((), kwargs)
result = nanops.nanmax(
values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
)
return self._wrap_reduction_result(axis, result)
def sum(
self, *, axis: int | None = None, skipna: bool = True, min_count=0, **kwargs
) -> Scalar:
nv.validate_sum((), kwargs)
result = nanops.nansum(
self._ndarray, axis=axis, skipna=skipna, min_count=min_count
)
return self._wrap_reduction_result(axis, result)
def prod(
self, *, axis: int | None = None, skipna: bool = True, min_count=0, **kwargs
) -> Scalar:
nv.validate_prod((), kwargs)
result = nanops.nanprod(
self._ndarray, axis=axis, skipna=skipna, min_count=min_count
)
return self._wrap_reduction_result(axis, result)
def mean(
self,
*,
axis: int | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def median(
self,
*,
axis: int | None = None,
out=None,
overwrite_input: bool = False,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_median(
(), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
)
result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def std(
self,
*,
axis: int | None = None,
dtype: NpDtype | None = None,
out=None,
ddof=1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
)
result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def var(
self,
*,
axis: int | None = None,
dtype: NpDtype | None = None,
out=None,
ddof=1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
)
result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def sem(
self,
*,
axis: int | None = None,
dtype: NpDtype | None = None,
out=None,
ddof=1,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
)
result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def kurt(
self,
*,
axis: int | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
)
result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def skew(
self,
*,
axis: int | None = None,
dtype: NpDtype | None = None,
out=None,
keepdims: bool = False,
skipna: bool = True,
):
nv.validate_stat_ddof_func(
(), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
)
result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
# ------------------------------------------------------------------------
# Additional Methods
def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
copy: bool = False,
na_value=lib.no_default,
) -> np.ndarray:
result = np.asarray(self._ndarray, dtype=dtype)
if (copy or na_value is not lib.no_default) and result is self._ndarray:
result = result.copy()
if na_value is not lib.no_default:
result[self.isna()] = na_value
return result
# ------------------------------------------------------------------------
# Ops
def __invert__(self) -> PandasArray:
return type(self)(~self._ndarray)
def __neg__(self) -> PandasArray:
return type(self)(-self._ndarray)
def __pos__(self) -> PandasArray:
return type(self)(+self._ndarray)
def __abs__(self) -> PandasArray:
return type(self)(abs(self._ndarray))
def _cmp_method(self, other, op):
if isinstance(other, PandasArray):
other = other._ndarray
other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
pd_op = ops.get_array_op(op)
other = ensure_wrapped_if_datetimelike(other)
with np.errstate(all="ignore"):
result = pd_op(self._ndarray, other)
if op is divmod or op is ops.rdivmod:
a, b = result
if isinstance(a, np.ndarray):
# for e.g. op vs TimedeltaArray, we may already
# have an ExtensionArray, in which case we do not wrap
return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b)
return a, b
if isinstance(result, np.ndarray):
# for e.g. multiplication vs TimedeltaArray, we may already
# have an ExtensionArray, in which case we do not wrap
return self._wrap_ndarray_result(result)
return result
_arith_method = _cmp_method
def _wrap_ndarray_result(self, result: np.ndarray):
# If we have timedelta64[ns] result, return a TimedeltaArray instead
# of a PandasArray
if result.dtype == "timedelta64[ns]":
from pandas.core.arrays import TimedeltaArray
return TimedeltaArray._simple_new(result)
return type(self)(result)
# ------------------------------------------------------------------------
# String methods interface
_str_na_value = np.nan
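A short sketch of how PandasArray round-trips through NumPy ufuncs and the nanops-backed reductions above, assuming pd.arrays.PandasArray is the public alias for this class:

import numpy as np
import pandas as pd

arr = pd.arrays.PandasArray(np.array([1.0, 2.0, 3.0]))

# Element-wise ufuncs re-box ndarray results in PandasArray.
print(np.add(arr, 1))   # <PandasArray> [2.0, 3.0, 4.0]

# Reductions return plain scalars via _wrap_reduction_result.
print(arr.sum())        # 6.0
print(arr.mean())       # 2.0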

File diff suppressed because it is too large

View File

@@ -0,0 +1,13 @@
# flake8: noqa: F401
from pandas.core.arrays.sparse.accessor import (
SparseAccessor,
SparseFrameAccessor,
)
from pandas.core.arrays.sparse.array import (
BlockIndex,
IntIndex,
SparseArray,
make_sparse_index,
)
from pandas.core.arrays.sparse.dtype import SparseDtype
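A minimal sketch of the public surface re-exported here, assuming the pd.arrays namespace is available:

import pandas as pd

sp = pd.arrays.SparseArray([0, 0, 1, 2])

# Only the non-fill values are physically stored.
print(sp.dtype)       # Sparse[int64, 0]
print(sp.sp_values)   # [1 2]
print(sp.density)     # 0.5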

View File

@@ -0,0 +1,386 @@
"""Sparse accessor"""
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.cast import find_common_type
from pandas.core.accessor import (
PandasDelegate,
delegate_names,
)
from pandas.core.arrays.sparse.array import SparseArray
from pandas.core.arrays.sparse.dtype import SparseDtype
class BaseAccessor:
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
def __init__(self, data=None):
self._parent = data
self._validate(data)
def _validate(self, data):
raise NotImplementedError
@delegate_names(
SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
"""
Accessor for Series with sparse data, including conversions from other sparse matrix data types.
"""
def _validate(self, data):
if not isinstance(data.dtype, SparseDtype):
raise AttributeError(self._validation_msg)
def _delegate_property_get(self, name, *args, **kwargs):
return getattr(self._parent.array, name)
def _delegate_method(self, name, *args, **kwargs):
if name == "from_coo":
return self.from_coo(*args, **kwargs)
elif name == "to_coo":
return self.to_coo(*args, **kwargs)
else:
raise ValueError
@classmethod
def from_coo(cls, A, dense_index=False):
"""
Create a Series with sparse values from a scipy.sparse.coo_matrix.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
If False (default), the index consists of only the
coords of the non-null entries of the original coo_matrix.
If True, the index consists of the full sorted
(row, col) coordinates of the coo_matrix.
Returns
-------
s : Series
A Series with sparse values.
Examples
--------
>>> from scipy import sparse
>>> A = sparse.coo_matrix(
... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
... )
>>> A
<3x4 sparse matrix of type '<class 'numpy.float64'>'
with 3 stored elements in COOrdinate format>
>>> A.todense()
matrix([[0., 0., 1., 2.],
[3., 0., 0., 0.],
[0., 0., 0., 0.]])
>>> ss = pd.Series.sparse.from_coo(A)
>>> ss
0 2 1.0
3 2.0
1 0 3.0
dtype: Sparse[float64, nan]
"""
from pandas import Series
from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series
result = coo_to_sparse_series(A, dense_index=dense_index)
result = Series(result.array, index=result.index, copy=False)
return result
def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
"""
Create a scipy.sparse.coo_matrix from a Series with MultiIndex.
Use row_levels and column_levels to determine the row and column
coordinates respectively. row_levels and column_levels are the names
(labels) or numbers of the levels. {row_levels, column_levels} must be
a partition of the MultiIndex level names (or numbers).
Parameters
----------
row_levels : tuple/list
column_levels : tuple/list
sort_labels : bool, default False
Sort the row and column labels before forming the sparse matrix.
When `row_levels` and/or `column_levels` refer to a single level,
set to `True` for a faster execution.
Returns
-------
y : scipy.sparse.coo_matrix
rows : list (row labels)
columns : list (column labels)
Examples
--------
>>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
>>> s.index = pd.MultiIndex.from_tuples(
... [
... (1, 2, "a", 0),
... (1, 2, "a", 1),
... (1, 1, "b", 0),
... (1, 1, "b", 1),
... (2, 1, "b", 0),
... (2, 1, "b", 1)
... ],
... names=["A", "B", "C", "D"],
... )
>>> s
A B C D
1 2 a 0 3.0
1 NaN
1 b 0 1.0
1 3.0
2 1 b 0 NaN
1 NaN
dtype: float64
>>> ss = s.astype("Sparse")
>>> ss
A B C D
1 2 a 0 3.0
1 NaN
1 b 0 1.0
1 3.0
2 1 b 0 NaN
1 NaN
dtype: Sparse[float64, nan]
>>> A, rows, columns = ss.sparse.to_coo(
... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
... )
>>> A
<3x4 sparse matrix of type '<class 'numpy.float64'>'
with 3 stored elements in COOrdinate format>
>>> A.todense()
matrix([[0., 0., 1., 3.],
[3., 0., 0., 0.],
[0., 0., 0., 0.]])
>>> rows
[(1, 1), (1, 2), (2, 1)]
>>> columns
[('a', 0), ('a', 1), ('b', 0), ('b', 1)]
"""
from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo
A, rows, columns = sparse_series_to_coo(
self._parent, row_levels, column_levels, sort_labels=sort_labels
)
return A, rows, columns
def to_dense(self):
"""
Convert a Series from sparse values to dense.
.. versionadded:: 0.25.0
Returns
-------
Series
A Series with the same values, stored as a dense array.
Examples
--------
>>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
>>> series
0 0
1 1
2 0
dtype: Sparse[int64, 0]
>>> series.sparse.to_dense()
0 0
1 1
2 0
dtype: int64
"""
from pandas import Series
return Series(
self._parent.array.to_dense(),
index=self._parent.index,
name=self._parent.name,
)
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
"""
DataFrame accessor for sparse data.
.. versionadded:: 0.25.0
"""
def _validate(self, data):
dtypes = data.dtypes
if not all(isinstance(t, SparseDtype) for t in dtypes):
raise AttributeError(self._validation_msg)
@classmethod
def from_spmatrix(cls, data, index=None, columns=None):
"""
Create a new DataFrame from a scipy sparse matrix.
.. versionadded:: 0.25.0
Parameters
----------
data : scipy.sparse.spmatrix
Must be convertible to csc format.
index, columns : Index, optional
Row and column labels to use for the resulting DataFrame.
Defaults to a RangeIndex.
Returns
-------
DataFrame
Each column of the DataFrame is stored as a
:class:`arrays.SparseArray`.
Examples
--------
>>> import scipy.sparse
>>> mat = scipy.sparse.eye(3)
>>> pd.DataFrame.sparse.from_spmatrix(mat)
0 1 2
0 1.0 0.0 0.0
1 0.0 1.0 0.0
2 0.0 0.0 1.0
"""
from pandas._libs.sparse import IntIndex
from pandas import DataFrame
data = data.tocsc()
index, columns = cls._prep_index(data, index, columns)
n_rows, n_columns = data.shape
# We need to make sure indices are sorted, as we create
# IntIndex with no input validation (i.e. check_integrity=False).
# Indices may already be sorted in scipy in which case this adds
# a small overhead.
data.sort_indices()
indices = data.indices
indptr = data.indptr
array_data = data.data
dtype = SparseDtype(array_data.dtype, 0)
arrays = []
for i in range(n_columns):
sl = slice(indptr[i], indptr[i + 1])
idx = IntIndex(n_rows, indices[sl], check_integrity=False)
arr = SparseArray._simple_new(array_data[sl], idx, dtype)
arrays.append(arr)
return DataFrame._from_arrays(
arrays, columns=columns, index=index, verify_integrity=False
)
def to_dense(self):
"""
Convert a DataFrame with sparse values to dense.
.. versionadded:: 0.25.0
Returns
-------
DataFrame
A DataFrame with the same values stored as dense arrays.
Examples
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
>>> df.sparse.to_dense()
A
0 0
1 1
2 0
"""
from pandas import DataFrame
data = {k: v.array.to_dense() for k, v in self._parent.items()}
return DataFrame(data, index=self._parent.index, columns=self._parent.columns)
def to_coo(self):
"""
Return the contents of the frame as a sparse SciPy COO matrix.
.. versionadded:: 0.25.0
Returns
-------
coo_matrix : scipy.sparse.spmatrix
If the caller is heterogeneous and contains booleans or objects,
the result will be of dtype=object. See Notes.
Notes
-----
The dtype will be the lowest-common-denominator type (implicit
upcasting); that is to say if the dtypes (even of numeric types)
are mixed, the one that accommodates all will be chosen.
e.g. If the dtypes are float16 and float32, dtype will be upcast to
float32. By numpy.find_common_type convention, mixing int64 and
uint64 will result in a float64 dtype.
"""
import_optional_dependency("scipy")
from scipy.sparse import coo_matrix
dtype = find_common_type(self._parent.dtypes.to_list())
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype
cols, rows, data = [], [], []
for col, (_, ser) in enumerate(self._parent.iteritems()):
sp_arr = ser.array
if sp_arr.fill_value != 0:
raise ValueError("fill value must be 0 when converting to COO matrix")
row = sp_arr.sp_index.indices
cols.append(np.repeat(col, len(row)))
rows.append(row)
data.append(sp_arr.sp_values.astype(dtype, copy=False))
cols = np.concatenate(cols)
rows = np.concatenate(rows)
data = np.concatenate(data)
return coo_matrix((data, (rows, cols)), shape=self._parent.shape)
@property
def density(self) -> float:
"""
Ratio of non-sparse points to total (dense) data points.
"""
tmp = np.mean([column.array.density for _, column in self._parent.items()])
return tmp
@staticmethod
def _prep_index(data, index, columns):
from pandas.core.indexes.api import (
default_index,
ensure_index,
)
N, K = data.shape
if index is None:
index = default_index(N)
else:
index = ensure_index(index)
if columns is None:
columns = default_index(K)
else:
columns = ensure_index(columns)
if len(columns) != K:
raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
if len(index) != N:
raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
return index, columns
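A round-trip sketch for the DataFrame accessor defined above, assuming scipy is installed:

import scipy.sparse
import pandas as pd

mat = scipy.sparse.eye(3)
df = pd.DataFrame.sparse.from_spmatrix(mat)

# density averages the per-column densities.
print(df.sparse.density)    # 0.3333333333333333

# to_coo requires fill_value == 0 for every column, which holds here.
coo = df.sparse.to_coo()
print(coo.shape, coo.nnz)   # (3, 3) 3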

File diff suppressed because it is too large

View File

@@ -0,0 +1,414 @@
"""Sparse Dtype"""
from __future__ import annotations
import re
from typing import (
TYPE_CHECKING,
Any,
)
import warnings
import numpy as np
from pandas._typing import (
Dtype,
DtypeObj,
type_t,
)
from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.base import (
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import (
isna,
na_value_for_dtype,
)
if TYPE_CHECKING:
from pandas.core.arrays.sparse.array import SparseArray
@register_extension_dtype
class SparseDtype(ExtensionDtype):
"""
Dtype for data stored in :class:`SparseArray`.
This dtype implements the pandas ExtensionDtype interface.
Parameters
----------
dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
The dtype of the underlying array storing the non-fill value values.
fill_value : scalar, optional
The scalar value not stored in the SparseArray. By default, this
depends on `dtype`.
=========== ==========
dtype na_value
=========== ==========
float ``np.nan``
int ``0``
bool ``False``
datetime64 ``pd.NaT``
timedelta64 ``pd.NaT``
=========== ==========
The default value may be overridden by specifying a `fill_value`.
Attributes
----------
None
Methods
-------
None
"""
# We include `_is_na_fill_value` in the metadata to avoid hash collisions
# between SparseDtype(float, 0.0) and SparseDtype(float, nan).
# Without is_na_fill_value in the comparison, those would be equal since
# hash(nan) is (sometimes?) 0.
_metadata = ("_dtype", "_fill_value", "_is_na_fill_value")
def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
if isinstance(dtype, type(self)):
if fill_value is None:
fill_value = dtype.fill_value
dtype = dtype.subtype
dtype = pandas_dtype(dtype)
if is_string_dtype(dtype):
dtype = np.dtype("object")
if fill_value is None:
fill_value = na_value_for_dtype(dtype)
self._dtype = dtype
self._fill_value = fill_value
self._check_fill_value()
def __hash__(self):
# Python3 doesn't inherit __hash__ when a base class overrides
# __eq__, so we explicitly do it here.
return super().__hash__()
def __eq__(self, other: Any) -> bool:
# We have to override __eq__ to handle NA values in _metadata.
# The base class does simple == checks, which fail for NA.
if isinstance(other, str):
try:
other = self.construct_from_string(other)
except TypeError:
return False
if isinstance(other, type(self)):
subtype = self.subtype == other.subtype
if self._is_na_fill_value:
# this case is complicated by two things:
# SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
# SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
# i.e. we want to treat any floating-point NaN as equal, but
# not a floating-point NaN and a datetime NaT.
fill_value = (
other._is_na_fill_value
and isinstance(self.fill_value, type(other.fill_value))
or isinstance(other.fill_value, type(self.fill_value))
)
else:
fill_value = self.fill_value == other.fill_value
return subtype and fill_value
return False
@property
def fill_value(self):
"""
The fill value of the array.
Converting the SparseArray to a dense ndarray will fill the
array with this value.
.. warning::
It's possible to end up with a SparseArray that has ``fill_value``
values in ``sp_values``. This can occur, for example, when setting
``SparseArray.fill_value`` directly.
"""
return self._fill_value
def _check_fill_value(self):
if not is_scalar(self._fill_value):
raise ValueError(
f"fill_value must be a scalar. Got {self._fill_value} instead"
)
# TODO: Right now we can use a Sparse boolean array
# with any fill_value. Here was an attempt
# to allow only three values: True, False or nan,
# but plenty of tests failed.
# see pull 44955
# if self._is_boolean and not (
# is_bool(self._fill_value) or isna(self._fill_value)
# ):
# raise ValueError(
# "fill_value must be True, False or nan "
# f"for boolean type. Got {self._fill_value} instead"
# )
@property
def _is_na_fill_value(self) -> bool:
return isna(self.fill_value)
@property
def _is_numeric(self) -> bool:
return not is_object_dtype(self.subtype)
@property
def _is_boolean(self) -> bool:
return is_bool_dtype(self.subtype)
@property
def kind(self):
"""
The sparse kind. Either 'integer' or 'block'.
"""
return self.subtype.kind
@property
def type(self):
return self.subtype.type
@property
def subtype(self):
return self._dtype
@property
def name(self):
return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"
def __repr__(self) -> str:
return self.name
@classmethod
def construct_array_type(cls) -> type_t[SparseArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
from pandas.core.arrays.sparse.array import SparseArray
return SparseArray
@classmethod
def construct_from_string(cls, string: str) -> SparseDtype:
"""
Construct a SparseDtype from a string form.
Parameters
----------
string : str
Can take the following forms.
string dtype
================ ============================
'int' SparseDtype[np.int64, 0]
'Sparse' SparseDtype[np.float64, nan]
'Sparse[int]' SparseDtype[np.int64, 0]
'Sparse[int, 0]' SparseDtype[np.int64, 0]
================ ============================
It is not possible to specify non-default fill values
with a string. An argument like ``'Sparse[int, 1]'``
will raise a ``TypeError`` because the default fill value
for integers is 0.
Returns
-------
SparseDtype
"""
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
)
msg = f"Cannot construct a 'SparseDtype' from '{string}'"
if string.startswith("Sparse"):
try:
sub_type, has_fill_value = cls._parse_subtype(string)
except ValueError as err:
raise TypeError(msg) from err
else:
result = SparseDtype(sub_type)
msg = (
f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
"looks like the fill_value in the string is not "
"the default for the dtype. Non-default fill_values "
"are not supported. Use the 'SparseDtype()' "
"constructor instead."
)
if has_fill_value and str(result) != string:
raise TypeError(msg)
return result
else:
raise TypeError(msg)
@staticmethod
def _parse_subtype(dtype: str) -> tuple[str, bool]:
"""
Parse a string to get the subtype
Parameters
----------
dtype : str
A string like
* Sparse[subtype]
* Sparse[subtype, fill_value]
Returns
-------
subtype : str
Raises
------
ValueError
When the subtype cannot be extracted.
"""
xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
m = xpr.match(dtype)
has_fill_value = False
if m:
subtype = m.groupdict()["subtype"]
has_fill_value = bool(m.groupdict()["fill_value"])
elif dtype == "Sparse":
subtype = "float64"
else:
raise ValueError(f"Cannot parse {dtype}")
return subtype, has_fill_value
@classmethod
def is_dtype(cls, dtype: object) -> bool:
dtype = getattr(dtype, "dtype", dtype)
if isinstance(dtype, str) and dtype.startswith("Sparse"):
sub_type, _ = cls._parse_subtype(dtype)
dtype = np.dtype(sub_type)
elif isinstance(dtype, cls):
return True
return isinstance(dtype, np.dtype) or dtype == "Sparse"
def update_dtype(self, dtype) -> SparseDtype:
"""
Convert the SparseDtype to a new dtype.
This takes care of converting the ``fill_value``.
Parameters
----------
dtype : Union[str, numpy.dtype, SparseDtype]
The new dtype to use.
* For a SparseDtype, it is simply returned
* For a NumPy dtype (or str), the current fill value
is converted to the new dtype, and a SparseDtype
with `dtype` and the new fill value is returned.
Returns
-------
SparseDtype
A new SparseDtype with the correct `dtype` and fill value
for that `dtype`.
Raises
------
ValueError
When the current fill value cannot be converted to the
new `dtype` (e.g. trying to convert ``np.nan`` to an
integer dtype).
Examples
--------
>>> SparseDtype(int, 0).update_dtype(float)
Sparse[float64, 0.0]
>>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
Sparse[float64, nan]
"""
cls = type(self)
dtype = pandas_dtype(dtype)
if not isinstance(dtype, cls):
if not isinstance(dtype, np.dtype):
raise TypeError("sparse arrays of extension dtypes not supported")
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
dtype = cls(dtype, fill_value=fill_value)
return dtype
@property
def _subtype_with_str(self):
"""
Whether the SparseDtype's subtype should be considered ``str``.
Typically, pandas will store string data in an object-dtype array.
When converting values to a dtype, e.g. in ``.astype``, we need to
be more specific, we need the actual underlying type.
Returns
-------
type or numpy.dtype
Examples
--------
>>> SparseDtype(int, 1)._subtype_with_str
dtype('int64')
>>> SparseDtype(object, 1)._subtype_with_str
dtype('O')
>>> dtype = SparseDtype(str, '')
>>> dtype.subtype
dtype('O')
>>> dtype._subtype_with_str
<class 'str'>
"""
if isinstance(self.fill_value, str):
return type(self.fill_value)
return self.subtype
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
# TODO for now only handle SparseDtypes and numpy dtypes => extend
# with other compatible extension dtypes
if any(
isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
for x in dtypes
):
return None
fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
fill_value = fill_values[0]
# np.nan isn't a singleton, so we may end up with multiple
# NaNs here, so we ignore the all NA case too.
if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
warnings.warn(
"Concatenating sparse arrays with multiple fill "
f"values: '{fill_values}'. Picking the first and "
"converting the rest.",
PerformanceWarning,
stacklevel=find_stack_level(),
)
np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
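A sketch of the string-parsing rules implemented by construct_from_string and _parse_subtype, using only public names:

import numpy as np
from pandas import SparseDtype

# Strings resolve to a subtype with its default fill value.
print(SparseDtype.construct_from_string("Sparse[int]"))   # Sparse[int64, 0]

# Non-default fill values cannot be spelled as strings...
try:
    SparseDtype.construct_from_string("Sparse[int, 1]")
except TypeError:
    print("non-default fill_value rejected")

# ...but the constructor accepts them directly.
print(SparseDtype(np.int64, fill_value=1))   # Sparse[int64, 1]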

View File

@@ -0,0 +1,211 @@
"""
Interaction with scipy.sparse matrices.
Currently only includes to_coo helpers.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Iterable,
)
import numpy as np
from pandas._libs import lib
from pandas._typing import (
IndexLabel,
npt,
)
from pandas.core.dtypes.missing import notna
from pandas.core.algorithms import factorize
from pandas.core.indexes.api import MultiIndex
from pandas.core.series import Series
if TYPE_CHECKING:
import scipy.sparse
def _check_is_partition(parts: Iterable, whole: Iterable):
whole = set(whole)
parts = [set(x) for x in parts]
if set.intersection(*parts) != set():
raise ValueError("Is not a partition because intersection is not null.")
if set.union(*parts) != whole:
raise ValueError("Is not a partition because union is not the whole.")
def _levels_to_axis(
ss,
levels: tuple[int] | list[int],
valid_ilocs: npt.NDArray[np.intp],
sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
"""
For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
where `ax_coords` are the coordinates along one of the two axes of the
destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
which correspond to these coordinates.
Parameters
----------
ss : Series
levels : tuple/list
valid_ilocs : numpy.ndarray
Array of integer positions of valid values for the sparse matrix in ss.
sort_labels : bool, default False
Sort the axis labels before forming the sparse matrix. When `levels`
refers to a single level, set to True for a faster execution.
Returns
-------
ax_coords : numpy.ndarray (axis coordinates)
ax_labels : list (axis labels)
"""
# Since the labels are sorted in `Index.levels`, when we wish to sort and
# there is only one level of the MultiIndex for this axis, the desired
# output can be obtained in the following simpler, more efficient way.
if sort_labels and len(levels) == 1:
ax_coords = ss.index.codes[levels[0]][valid_ilocs]
ax_labels = ss.index.levels[levels[0]]
else:
levels_values = lib.fast_zip(
[ss.index.get_level_values(lvl).values for lvl in levels]
)
codes, ax_labels = factorize(levels_values, sort=sort_labels)
ax_coords = codes[valid_ilocs]
ax_labels = ax_labels.tolist()
return ax_coords, ax_labels
def _to_ijv(
ss,
row_levels: tuple[int] | list[int] = (0,),
column_levels: tuple[int] | list[int] = (1,),
sort_labels: bool = False,
) -> tuple[
np.ndarray,
npt.NDArray[np.intp],
npt.NDArray[np.intp],
list[IndexLabel],
list[IndexLabel],
]:
"""
For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
constructor, and ilabels and jlabels are the row and column labels
respectively.
Parameters
----------
ss : Series
row_levels : tuple/list
column_levels : tuple/list
sort_labels : bool, default False
Sort the row and column labels before forming the sparse matrix.
When `row_levels` and/or `column_levels` refer to a single level,
set to `True` for a faster execution.
Returns
-------
values : numpy.ndarray
Valid values to populate a sparse matrix, extracted from
ss.
i_coords : numpy.ndarray (row coordinates of the values)
j_coords : numpy.ndarray (column coordinates of the values)
i_labels : list (row labels)
j_labels : list (column labels)
"""
# index and column levels must be a partition of the index
_check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
# From the sparse Series, get the integer indices and data for valid sparse
# entries.
sp_vals = ss.array.sp_values
na_mask = notna(sp_vals)
values = sp_vals[na_mask]
valid_ilocs = ss.array.sp_index.indices[na_mask]
i_coords, i_labels = _levels_to_axis(
ss, row_levels, valid_ilocs, sort_labels=sort_labels
)
j_coords, j_labels = _levels_to_axis(
ss, column_levels, valid_ilocs, sort_labels=sort_labels
)
return values, i_coords, j_coords, i_labels, j_labels
def sparse_series_to_coo(
ss: Series,
row_levels: Iterable[int] = (0,),
column_levels: Iterable[int] = (1,),
sort_labels: bool = False,
) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
"""
Convert a sparse Series to a scipy.sparse.coo_matrix using index
levels row_levels, column_levels as the row and column
labels respectively. Returns the sparse_matrix, row and column labels.
"""
import scipy.sparse
if ss.index.nlevels < 2:
raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
if not ss.index.is_unique:
raise ValueError(
"Duplicate index entries are not allowed in to_coo transformation."
)
# to keep things simple, only rely on integer indexing (not labels)
row_levels = [ss.index._get_level_number(x) for x in row_levels]
column_levels = [ss.index._get_level_number(x) for x in column_levels]
v, i, j, rows, columns = _to_ijv(
ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
)
sparse_matrix = scipy.sparse.coo_matrix(
(v, (i, j)), shape=(len(rows), len(columns))
)
return sparse_matrix, rows, columns
def coo_to_sparse_series(
A: scipy.sparse.coo_matrix, dense_index: bool = False
) -> Series:
"""
Convert a scipy.sparse.coo_matrix to a Series with sparse values.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
Returns
-------
Series
Raises
------
TypeError
If A is not a coo_matrix.
"""
from pandas import SparseDtype
try:
ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
except AttributeError as err:
raise TypeError(
f"Expected coo_matrix. Got {type(A).__name__} instead."
) from err
ser = ser.sort_index()
ser = ser.astype(SparseDtype(ser.dtype))
if dense_index:
# is there a better constructor method to use here?
i = range(A.shape[0])
j = range(A.shape[1])
ind = MultiIndex.from_product([i, j])
ser = ser.reindex(ind)
return ser
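The dense_index switch above is reachable through the public Series.sparse.from_coo accessor; a small sketch, assuming scipy is installed:

import scipy.sparse
import pandas as pd

A = scipy.sparse.coo_matrix(([1.0, 2.0], ([0, 1], [1, 0])), shape=(2, 3))

# Default: the index holds only the coordinates of stored values.
print(len(pd.Series.sparse.from_coo(A)))                    # 2

# dense_index=True reindexes to the full (row, col) product.
print(len(pd.Series.sparse.from_coo(A, dense_index=True)))  # 6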

View File

@@ -0,0 +1,572 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas._config import get_option
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._typing import (
Dtype,
Scalar,
type_t,
)
from pandas.compat import pa_version_under1p01
from pandas.compat.numpy import function as nv
from pandas.core.dtypes.base import (
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_dtype_equal,
is_integer_dtype,
is_object_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays import (
FloatingArray,
IntegerArray,
PandasArray,
)
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import FloatingDtype
from pandas.core.arrays.integer import _IntegerDtype
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import isna
if TYPE_CHECKING:
import pyarrow
@register_extension_dtype
class StringDtype(ExtensionDtype):
"""
Extension dtype for string data.
.. versionadded:: 1.0.0
.. warning::
StringDtype is considered experimental. The implementation and
parts of the API may change without warning.
In particular, StringDtype.na_value may change to no longer be
``numpy.nan``.
Parameters
----------
storage : {"python", "pyarrow"}, optional
If not given, the value of ``pd.options.mode.string_storage``.
Attributes
----------
None
Methods
-------
None
Examples
--------
>>> pd.StringDtype()
string[python]
>>> pd.StringDtype(storage="pyarrow")
string[pyarrow]
"""
name = "string"
#: StringDtype.na_value uses pandas.NA
na_value = libmissing.NA
_metadata = ("storage",)
def __init__(self, storage=None):
if storage is None:
storage = get_option("mode.string_storage")
if storage not in {"python", "pyarrow"}:
raise ValueError(
f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
)
if storage == "pyarrow" and pa_version_under1p01:
raise ImportError(
"pyarrow>=1.0.0 is required for PyArrow backed StringArray."
)
self.storage = storage
@property
def type(self) -> type[str]:
return str
@classmethod
def construct_from_string(cls, string):
"""
Construct a StringDtype from a string.
Parameters
----------
string : str
The type of the name. The storage type will be taken from `string`.
Valid options and their storage types are
========================== ==============================================
string result storage
========================== ==============================================
``'string'`` pd.options.mode.string_storage, default python
``'string[python]'`` python
``'string[pyarrow]'`` pyarrow
========================== ==============================================
Returns
-------
StringDtype
Raises
------
TypeError
If the string is not a valid option.
"""
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
)
if string == "string":
return cls()
elif string == "string[python]":
return cls(storage="python")
elif string == "string[pyarrow]":
return cls(storage="pyarrow")
else:
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
def __eq__(self, other: Any) -> bool:
if isinstance(other, str) and other == "string":
return True
return super().__eq__(other)
def __hash__(self) -> int:
# custom __eq__ so have to override __hash__
return super().__hash__()
# https://github.com/pandas-dev/pandas/issues/36126
# error: Signature of "construct_array_type" incompatible with supertype
# "ExtensionDtype"
def construct_array_type( # type: ignore[override]
self,
) -> type_t[BaseStringArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
from pandas.core.arrays.string_arrow import ArrowStringArray
if self.storage == "python":
return StringArray
else:
return ArrowStringArray
def __repr__(self):
return f"string[{self.storage}]"
def __str__(self):
return self.name
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BaseStringArray:
"""
Construct StringArray from pyarrow Array/ChunkedArray.
"""
if self.storage == "pyarrow":
from pandas.core.arrays.string_arrow import ArrowStringArray
return ArrowStringArray(array)
else:
import pyarrow
if isinstance(array, pyarrow.Array):
chunks = [array]
else:
# pyarrow.ChunkedArray
chunks = array.chunks
results = []
for arr in chunks:
# using _from_sequence to ensure None is converted to NA
str_arr = StringArray._from_sequence(np.array(arr))
results.append(str_arr)
if results:
return StringArray._concat_same_type(results)
else:
return StringArray(np.array([], dtype="object"))
class BaseStringArray(ExtensionArray):
pass
class StringArray(BaseStringArray, PandasArray):
"""
Extension array for string data.
.. versionadded:: 1.0.0
.. warning::
StringArray is considered experimental. The implementation and
parts of the API may change without warning.
Parameters
----------
values : array-like
The array of data.
.. warning::
Currently, this expects an object-dtype ndarray
where the elements are Python strings or :attr:`pandas.NA`.
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.
copy : bool, default False
Whether to copy the array of data.
Attributes
----------
None
Methods
-------
None
See Also
--------
array
The recommended function for creating a StringArray.
Series.str
The string methods are available on Series backed by
a StringArray.
Notes
-----
StringArray returns a BooleanArray for comparison methods.
Examples
--------
>>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
<StringArray>
['This is', 'some text', <NA>, 'data.']
Length: 4, dtype: string
Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
will convert the values to strings.
>>> pd.array(['1', 1], dtype="object")
<PandasArray>
['1', 1]
Length: 2, dtype: object
>>> pd.array(['1', 1], dtype="string")
<StringArray>
['1', '1']
Length: 2, dtype: string
However, instantiating StringArrays directly with non-strings will raise an error.
For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
>>> pd.array(["a", None, "c"], dtype="string") == "a"
<BooleanArray>
[True, <NA>, False]
Length: 3, dtype: boolean
"""
# undo the PandasArray hack
_typ = "extension"
def __init__(self, values, copy=False):
values = extract_array(values)
super().__init__(values, copy=copy)
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
if not isinstance(values, type(self)):
self._validate()
def _validate(self):
"""Validate that we only store NA or strings."""
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
if self._ndarray.dtype != "object":
raise ValueError(
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "python"
from pandas.core.arrays.masked import BaseMaskedArray
if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
result[na_values] = StringDtype.na_value
else:
# convert non-na-likes to str, and nan-likes to StringDtype.na_value
result = lib.ensure_string_array(
scalars, na_value=StringDtype.na_value, copy=copy
)
# Manually creating new array avoids the validation step in the __init__, so is
# faster. Refactor need for validation?
new_string_array = cls.__new__(cls)
NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))
return new_string_array
@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype: Dtype | None = None, copy=False
):
return cls._from_sequence(strings, dtype=dtype, copy=copy)
@classmethod
def _empty(cls, shape, dtype) -> StringArray:
values = np.empty(shape, dtype=object)
values[:] = libmissing.NA
return cls(values).astype(dtype, copy=False)
def __arrow_array__(self, type=None):
"""
Convert myself into a pyarrow Array.
"""
import pyarrow as pa
if type is None:
type = pa.string()
values = self._ndarray.copy()
values[self.isna()] = None
return pa.array(values, type=type, from_pandas=True)
def _values_for_factorize(self):
arr = self._ndarray.copy()
mask = self.isna()
arr[mask] = -1
return arr, -1
def __setitem__(self, key, value):
value = extract_array(value, extract_numpy=True)
if isinstance(value, type(self)):
# extract_array doesn't extract PandasArray subclasses
value = value._ndarray
key = check_array_indexer(self, key)
scalar_key = lib.is_scalar(key)
scalar_value = lib.is_scalar(value)
if scalar_key and not scalar_value:
raise ValueError("setting an array element with a sequence.")
# validate new items
if scalar_value:
if isna(value):
value = StringDtype.na_value
elif not isinstance(value, str):
raise ValueError(
f"Cannot set non-string value '{value}' into a StringArray."
)
else:
if not is_array_like(value):
value = np.asarray(value, dtype=object)
if len(value) and not lib.is_string_array(value, skipna=True):
raise ValueError("Must provide strings.")
super().__setitem__(key, value)
def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)
if is_dtype_equal(dtype, self.dtype):
if copy:
return self.copy()
return self
elif isinstance(dtype, _IntegerDtype):
arr = self._ndarray.copy()
mask = self.isna()
arr[mask] = 0
values = arr.astype(dtype.numpy_dtype)
return IntegerArray(values, mask, copy=False)
elif isinstance(dtype, FloatingDtype):
arr = self.copy()
mask = self.isna()
arr[mask] = "0"
values = arr.astype(dtype.numpy_dtype)
return FloatingArray(values, mask, copy=False)
elif isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)
elif np.issubdtype(dtype, np.floating):
arr = self._ndarray.copy()
mask = self.isna()
arr[mask] = 0
values = arr.astype(dtype)
values[mask] = np.nan
return values
return super().astype(dtype, copy)
def _reduce(
self, name: str, *, skipna: bool = True, axis: int | None = 0, **kwargs
):
if name in ["min", "max"]:
return getattr(self, name)(skipna=skipna, axis=axis)
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
nv.validate_min((), kwargs)
result = masked_reductions.min(
values=self.to_numpy(), mask=self.isna(), skipna=skipna
)
return self._wrap_reduction_result(axis, result)
def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
nv.validate_max((), kwargs)
result = masked_reductions.max(
values=self.to_numpy(), mask=self.isna(), skipna=skipna
)
return self._wrap_reduction_result(axis, result)
def value_counts(self, dropna: bool = True):
from pandas import value_counts
result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
result.index = result.index.astype(self.dtype)
return result
def memory_usage(self, deep: bool = False) -> int:
result = self._ndarray.nbytes
if deep:
return result + lib.memory_usage_of_objects(self._ndarray)
return result
def _cmp_method(self, other, op):
from pandas.arrays import BooleanArray
if isinstance(other, StringArray):
other = other._ndarray
mask = isna(self) | isna(other)
valid = ~mask
if not lib.is_scalar(other):
if len(other) != len(self):
# prevent improper broadcasting when other is 2D
raise ValueError(
f"Lengths of operands do not match: {len(self)} != {len(other)}"
)
other = np.asarray(other)
other = other[valid]
if op.__name__ in ops.ARITHMETIC_BINOPS:
result = np.empty_like(self._ndarray, dtype="object")
result[mask] = StringDtype.na_value
result[valid] = op(self._ndarray[valid], other)
return StringArray(result)
else:
# logical
result = np.zeros(len(self._ndarray), dtype="bool")
result[valid] = op(self._ndarray[valid], other)
return BooleanArray(result, mask)
_arith_method = _cmp_method
# ------------------------------------------------------------------------
# String methods interface
# error: Incompatible types in assignment (expression has type "NAType",
# base class "PandasArray" defined the type as "float")
_str_na_value = StringDtype.na_value # type: ignore[assignment]
def _str_map(
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
from pandas.arrays import BooleanArray
if dtype is None:
dtype = StringDtype(storage="python")
if na_value is None:
na_value = self.dtype.na_value
mask = isna(self)
arr = np.asarray(self)
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
constructor: type[IntegerArray] | type[BooleanArray]
if is_integer_dtype(dtype):
constructor = IntegerArray
else:
constructor = BooleanArray
na_value_is_na = isna(na_value)
if na_value_is_na:
na_value = 1
result = lib.map_infer_mask(
arr,
f,
mask.view("uint8"),
convert=False,
na_value=na_value,
# error: Argument 1 to "dtype" has incompatible type
# "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
# "Type[object]"
dtype=np.dtype(dtype), # type: ignore[arg-type]
)
if not na_value_is_na:
mask[:] = False
return constructor(result, mask)
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
# i.e. StringDtype
result = lib.map_infer_mask(
arr, f, mask.view("uint8"), convert=False, na_value=na_value
)
return StringArray(result)
else:
# This is when the result type is object. We reach this when
# -> We know the result type is truly object (e.g. .encode returns bytes
# or .findall returns a list).
# -> We don't know the result type. E.g. `.get` can return anything.
return lib.map_infer_mask(arr, f, mask.view("uint8"))
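A short sketch of the astype branches and the BooleanArray-returning comparisons described above, using the public string dtype:

import pandas as pd

s = pd.array(["1", "2", None], dtype="string")

# The _IntegerDtype branch returns a masked IntegerArray.
print(s.astype("Int64"))   # [1, 2, <NA>]

# Comparisons mask the NA positions and return a BooleanArray.
print(s == "1")            # [True, False, <NA>]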

View File

@@ -0,0 +1,862 @@
from __future__ import annotations
from collections.abc import Callable # noqa: PDF001
import re
from typing import (
TYPE_CHECKING,
Any,
Union,
cast,
overload,
)
import numpy as np
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._typing import (
Dtype,
NpDtype,
PositionalIndexer,
Scalar,
ScalarIndexer,
SequenceIndexer,
TakeIndexer,
npt,
)
from pandas.compat import (
pa_version_under1p01,
pa_version_under2p0,
pa_version_under3p0,
pa_version_under4p0,
)
from pandas.util._decorators import doc
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_dtype_equal,
is_integer,
is_integer_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import (
BaseStringArray,
StringDtype,
)
from pandas.core.indexers import (
check_array_indexer,
unpack_tuple_and_ellipses,
validate_indices,
)
from pandas.core.strings.object_array import ObjectStringArrayMixin
if not pa_version_under1p01:
import pyarrow as pa
import pyarrow.compute as pc
ARROW_CMP_FUNCS = {
"eq": pc.equal,
"ne": pc.not_equal,
"lt": pc.less,
"gt": pc.greater,
"le": pc.less_equal,
"ge": pc.greater_equal,
}
if TYPE_CHECKING:
from pandas import Series
ArrowStringScalarOrNAT = Union[str, libmissing.NAType]
def _chk_pyarrow_available() -> None:
if pa_version_under1p01:
msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray."
raise ImportError(msg)
# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
# ObjectStringArrayMixin because we want to have the object-dtype based methods as
# fallback for the ones that pyarrow doesn't yet support
class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin):
"""
Extension array for string data in a ``pyarrow.ChunkedArray``.
.. versionadded:: 1.2.0
.. warning::
ArrowStringArray is considered experimental. The implementation and
parts of the API may change without warning.
Parameters
----------
values : pyarrow.Array or pyarrow.ChunkedArray
The array of data.
Attributes
----------
None
Methods
-------
None
See Also
--------
    array
        The recommended function for creating an ArrowStringArray.
    Series.str
        The string methods are available on Series backed by
        an ArrowStringArray.
Notes
-----
ArrowStringArray returns a BooleanArray for comparison methods.
Examples
--------
>>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
<ArrowStringArray>
['This is', 'some text', <NA>, 'data.']
Length: 4, dtype: string
"""
def __init__(self, values):
self._dtype = StringDtype(storage="pyarrow")
if isinstance(values, pa.Array):
self._data = pa.chunked_array([values])
elif isinstance(values, pa.ChunkedArray):
self._data = values
else:
raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray")
if not pa.types.is_string(self._data.type):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
)
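    # Illustrative construction (a sketch; assumes pyarrow is installed):
    #
    #   import pyarrow as pa
    #   arr = ArrowStringArray(pa.array(["a", "b", None]))
    #   arr.dtype  # -> string[pyarrow]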
@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
from pandas.core.arrays.masked import BaseMaskedArray
_chk_pyarrow_available()
if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype in ensure_string_array and
# numerical issues with Float32Dtype
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
return cls(pa.array(result, mask=na_values, type=pa.string()))
# convert non-na-likes to str
result = lib.ensure_string_array(scalars, copy=copy)
return cls(pa.array(result, type=pa.string(), from_pandas=True))
@classmethod
def _from_sequence_of_strings(
cls, strings, dtype: Dtype | None = None, copy: bool = False
):
return cls._from_sequence(strings, dtype=dtype, copy=copy)
@property
def dtype(self) -> StringDtype:
"""
An instance of 'string[pyarrow]'.
"""
return self._dtype
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
return self.to_numpy(dtype=dtype)
def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow Array or ChunkedArray."""
return self._data
def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
copy: bool = False,
na_value=lib.no_default,
) -> np.ndarray:
"""
Convert to a NumPy ndarray.
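
        Examples
        --------
        A minimal illustration (assumes pyarrow is installed):

        >>> arr = pd.array(["a", "b", None], dtype="string[pyarrow]")
        >>> arr.to_numpy()
        array(['a', 'b', <NA>], dtype=object)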
"""
# TODO: copy argument is ignored
result = np.array(self._data, dtype=dtype)
if self._data.null_count > 0:
if na_value is lib.no_default:
if dtype and np.issubdtype(dtype, np.floating):
return result
na_value = self._dtype.na_value
mask = self.isna()
result[mask] = na_value
return result
def __len__(self) -> int:
"""
Length of this array.
Returns
-------
length : int
"""
return len(self._data)
@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
encoded = self._data.dictionary_encode()
indices = pa.chunked_array(
[c.indices for c in encoded.chunks], type=encoded.type.index_type
).to_pandas()
if indices.dtype.kind == "f":
indices[np.isnan(indices)] = na_sentinel
indices = indices.astype(np.int64, copy=False)
if encoded.num_chunks:
uniques = type(self)(encoded.chunk(0).dictionary)
else:
uniques = type(self)(pa.array([], type=encoded.type.value_type))
return indices.values, uniques
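    # Illustrative sketch: dictionary_encode() assigns dictionary values in
    # order of first appearance, so factorizing ["b", "a", "b"] gives
    #   codes   -> array([0, 1, 0])
    #   uniques -> <ArrowStringArray> ['b', 'a']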
@classmethod
def _concat_same_type(cls, to_concat) -> ArrowStringArray:
"""
Concatenate multiple ArrowStringArray.
Parameters
----------
to_concat : sequence of ArrowStringArray
Returns
-------
ArrowStringArray
"""
return cls(
pa.chunked_array(
[array for ea in to_concat for array in ea._data.iterchunks()]
)
)
@overload
def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT:
...
@overload
def __getitem__(self: ArrowStringArray, item: SequenceIndexer) -> ArrowStringArray:
...
def __getitem__(
self: ArrowStringArray, item: PositionalIndexer
) -> ArrowStringArray | ArrowStringScalarOrNAT:
"""Select a subset of self.
Parameters
----------
item : int, slice, or ndarray
* int: The position in 'self' to get.
* slice: A slice object, where 'start', 'stop', and 'step' are
integers or None
* ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
Returns
-------
item : scalar or ExtensionArray
Notes
-----
For scalar ``item``, return a scalar value suitable for the array's
type. This should be an instance of ``self.dtype.type``.
For slice ``key``, return an instance of ``ExtensionArray``, even
if the slice is length 0 or 1.
For a boolean mask, return an instance of ``ExtensionArray``, filtered
to the values where ``item`` is True.
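
        Examples
        --------
        A minimal illustration (assumes pyarrow is installed):

        >>> arr = pd.array(["a", "b", None], dtype="string[pyarrow]")
        >>> arr[0]
        'a'
        >>> arr[1:]
        <ArrowStringArray>
        ['b', <NA>]
        Length: 2, dtype: string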
"""
item = check_array_indexer(self, item)
if isinstance(item, np.ndarray):
if not len(item):
return type(self)(pa.chunked_array([], type=pa.string()))
elif is_integer_dtype(item.dtype):
return self.take(item)
elif is_bool_dtype(item.dtype):
return type(self)(self._data.filter(item))
else:
raise IndexError(
"Only integers, slices and integer or "
"boolean arrays are valid indices."
)
elif isinstance(item, tuple):
item = unpack_tuple_and_ellipses(item)
# error: Non-overlapping identity check (left operand type:
# "Union[Union[int, integer[Any]], Union[slice, List[int],
# ndarray[Any, Any]]]", right operand type: "ellipsis")
if item is Ellipsis: # type: ignore[comparison-overlap]
# TODO: should be handled by pyarrow?
item = slice(None)
if is_scalar(item) and not is_integer(item):
# e.g. "foo" or 2.5
# exception message copied from numpy
raise IndexError(
r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
r"(`None`) and integer or boolean arrays are valid indices"
)
# We are not an array indexer, so maybe e.g. a slice or integer
# indexer. We dispatch to pyarrow.
value = self._data[item]
if isinstance(value, pa.ChunkedArray):
return type(self)(value)
else:
return self._as_pandas_scalar(value)
def _as_pandas_scalar(self, arrow_scalar: pa.Scalar):
scalar = arrow_scalar.as_py()
if scalar is None:
return self._dtype.na_value
else:
return scalar
@property
def nbytes(self) -> int:
"""
The number of bytes needed to store this object in memory.
"""
return self._data.nbytes
def isna(self) -> np.ndarray:
"""
Boolean NumPy array indicating if each value is missing.
This should return a 1-D array the same length as 'self'.
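
        Examples
        --------
        A minimal illustration (assumes pyarrow is installed):

        >>> pd.array(["a", None], dtype="string[pyarrow]").isna()
        array([False,  True])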
"""
# TODO: Implement .to_numpy for ChunkedArray
return self._data.is_null().to_pandas().values
def copy(self) -> ArrowStringArray:
"""
Return a shallow copy of the array.
Underlying ChunkedArray is immutable, so a deep copy is unnecessary.
Returns
-------
ArrowStringArray
"""
return type(self)(self._data)
def _cmp_method(self, other, op):
from pandas.arrays import BooleanArray
pc_func = ARROW_CMP_FUNCS[op.__name__]
if isinstance(other, ArrowStringArray):
result = pc_func(self._data, other._data)
elif isinstance(other, (np.ndarray, list)):
result = pc_func(self._data, other)
elif is_scalar(other):
try:
result = pc_func(self._data, pa.scalar(other))
except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
mask = isna(self) | isna(other)
valid = ~mask
result = np.zeros(len(self), dtype="bool")
result[valid] = op(np.array(self)[valid], other)
return BooleanArray(result, mask)
else:
return NotImplemented
# TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray
return BooleanArray._from_sequence(result.to_pandas().values)
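    # Illustrative sketch of the comparison semantics: missing values propagate
    # into the resulting masked BooleanArray, e.g.
    #   pd.array(["a", None], dtype="string[pyarrow]") == "a"
    #   -> <BooleanArray> [True, <NA>]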
def insert(self, loc: int, item):
if not isinstance(item, str) and item is not libmissing.NA:
raise TypeError("Scalar must be NA or str")
return super().insert(loc, item)
def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
"""Set one or more values inplace.
Parameters
----------
key : int, ndarray, or slice
When called from, e.g. ``Series.__setitem__``, ``key`` will be
one of
* scalar int
* ndarray of integers.
* boolean ndarray
* slice object
value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.
Returns
-------
None
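
        Examples
        --------
        A minimal illustration (assumes pyarrow is installed):

        >>> arr = pd.array(["a", "b"], dtype="string[pyarrow]")
        >>> arr[0] = "c"
        >>> arr
        <ArrowStringArray>
        ['c', 'b']
        Length: 2, dtype: string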
"""
key = check_array_indexer(self, key)
if is_integer(key):
key = cast(int, key)
if not is_scalar(value):
raise ValueError("Must pass scalars with scalar indexer")
elif isna(value):
value = None
elif not isinstance(value, str):
raise ValueError("Scalar must be NA or str")
# Slice data and insert in-between
new_data = [
*self._data[0:key].chunks,
pa.array([value], type=pa.string()),
*self._data[(key + 1) :].chunks,
]
self._data = pa.chunked_array(new_data)
else:
# Convert to integer indices and iteratively assign.
# TODO: Make a faster variant of this in Arrow upstream.
# This is probably extremely slow.
# Convert all possible input key types to an array of integers
if isinstance(key, slice):
key_array = np.array(range(len(self))[key])
elif is_bool_dtype(key):
# TODO(ARROW-9430): Directly support setitem(booleans)
key_array = np.argwhere(key).flatten()
else:
# TODO(ARROW-9431): Directly support setitem(integers)
key_array = np.asanyarray(key)
if is_scalar(value):
value = np.broadcast_to(value, len(key_array))
else:
value = np.asarray(value)
if len(key_array) != len(value):
raise ValueError("Length of indexer and values mismatch")
for k, v in zip(key_array, value):
self[k] = v
def take(
self,
indices: TakeIndexer,
allow_fill: bool = False,
fill_value: Any = None,
):
"""
Take elements from an array.
Parameters
----------
indices : sequence of int or one-dimensional np.ndarray of int
Indices to be taken.
allow_fill : bool, default False
How to handle negative values in `indices`.
* False: negative values in `indices` indicate positional indices
from the right (the default). This is similar to
:func:`numpy.take`.
* True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.
fill_value : any, optional
Fill value to use for NA-indices when `allow_fill` is True.
This may be ``None``, in which case the default NA value for
the type, ``self.dtype.na_value``, is used.
For many ExtensionArrays, there will be two representations of
`fill_value`: a user-facing "boxed" scalar, and a low-level
physical NA value. `fill_value` should be the user-facing version,
and the implementation should handle translating that to the
physical version for processing the take if necessary.
Returns
-------
ExtensionArray
Raises
------
IndexError
When the indices are out of bounds for the array.
ValueError
When `indices` contains negative values other than ``-1``
and `allow_fill` is True.
See Also
--------
numpy.take
api.extensions.take
Notes
-----
ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
``iloc``, when `indices` is a sequence of values. Additionally,
it's called by :meth:`Series.reindex`, or any other method
that causes realignment, with a `fill_value`.
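
        Examples
        --------
        A minimal illustration (assumes pyarrow is installed):

        >>> arr = pd.array(["a", "b", "c"], dtype="string[pyarrow]")
        >>> arr.take([0, -1], allow_fill=True)
        <ArrowStringArray>
        ['a', <NA>]
        Length: 2, dtype: string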
"""
# TODO: Remove once we got rid of the (indices < 0) check
if not is_array_like(indices):
indices_array = np.asanyarray(indices)
else:
# error: Incompatible types in assignment (expression has type
# "Sequence[int]", variable has type "ndarray")
indices_array = indices # type: ignore[assignment]
if len(self._data) == 0 and (indices_array >= 0).any():
raise IndexError("cannot do a non-empty take")
if indices_array.size > 0 and indices_array.max() >= len(self._data):
raise IndexError("out of bounds value in 'indices'.")
if allow_fill:
fill_mask = indices_array < 0
if fill_mask.any():
validate_indices(indices_array, len(self._data))
# TODO(ARROW-9433): Treat negative indices as NULL
indices_array = pa.array(indices_array, mask=fill_mask)
result = self._data.take(indices_array)
if isna(fill_value):
return type(self)(result)
# TODO: ArrowNotImplementedError: Function fill_null has no
# kernel matching input types (array[string], scalar[string])
result = type(self)(result)
result[fill_mask] = fill_value
return result
# return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
else:
# Nothing to fill
return type(self)(self._data.take(indices))
else: # allow_fill=False
# TODO(ARROW-9432): Treat negative indices as indices from the right.
if (indices_array < 0).any():
# Don't modify in-place
indices_array = np.copy(indices_array)
indices_array[indices_array < 0] += len(self._data)
return type(self)(self._data.take(indices_array))
def isin(self, values):
if pa_version_under2p0:
return super().isin(values)
value_set = [
pa_scalar.as_py()
for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
if pa_scalar.type in (pa.string(), pa.null())
]
# for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
        # for null values, so we short-circuit to return an all-False array.
if not len(value_set):
return np.zeros(len(self), dtype=bool)
kwargs = {}
if pa_version_under3p0:
            # in pyarrow 2.0.0 skip_null is a required keyword (though ignored);
            # in pyarrow 3.0.0+ passing it raises "unexpected keyword argument"
kwargs["skip_null"] = True
result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
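    # Illustrative sketch: membership tests return a plain boolean ndarray and
    # missing values count as "not in", e.g.
    #   pd.array(["a", None], dtype="string[pyarrow]").isin(["a"])
    #   -> array([ True, False])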
def value_counts(self, dropna: bool = True) -> Series:
"""
Return a Series containing counts of each unique value.
Parameters
----------
dropna : bool, default True
Don't include counts of missing values.
Returns
-------
counts : Series
See Also
--------
Series.value_counts
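
        Examples
        --------
        A minimal illustration (assumes pyarrow is installed):

        >>> pd.array(["a", "a", None], dtype="string[pyarrow]").value_counts()
        a    2
        dtype: Int64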
"""
from pandas import (
Index,
Series,
)
vc = self._data.value_counts()
values = vc.field(0)
counts = vc.field(1)
if dropna and self._data.null_count > 0:
mask = values.is_valid()
values = values.filter(mask)
counts = counts.filter(mask)
# No missing values so we can adhere to the interface and return a numpy array.
counts = np.array(counts)
index = Index(type(self)(values))
return Series(counts, index=index).astype("Int64")
def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)
if is_dtype_equal(dtype, self.dtype):
if copy:
return self.copy()
return self
elif isinstance(dtype, NumericDtype):
data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
return dtype.__from_arrow__(data)
return super().astype(dtype, copy=copy)
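    # Illustrative sketch: casts to masked numeric dtypes stay in pyarrow rather
    # than taking an object-dtype detour, e.g.
    #   pd.array(["1", "2"], dtype="string[pyarrow]").astype("Int64")
    #   -> <IntegerArray> [1, 2]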
# ------------------------------------------------------------------------
# String methods interface
# error: Cannot determine type of 'na_value'
_str_na_value = StringDtype.na_value # type: ignore[has-type]
def _str_map(
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
        # TODO: de-duplicate with the StringArray method. This method is more or
        # less a copy-and-paste of that one.
from pandas.arrays import (
BooleanArray,
IntegerArray,
)
if dtype is None:
dtype = self.dtype
if na_value is None:
na_value = self.dtype.na_value
mask = isna(self)
arr = np.asarray(self)
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
constructor: type[IntegerArray] | type[BooleanArray]
if is_integer_dtype(dtype):
constructor = IntegerArray
else:
constructor = BooleanArray
na_value_is_na = isna(na_value)
if na_value_is_na:
na_value = 1
result = lib.map_infer_mask(
arr,
f,
mask.view("uint8"),
convert=False,
na_value=na_value,
# error: Argument 1 to "dtype" has incompatible type
# "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
# "Type[object]"
dtype=np.dtype(dtype), # type: ignore[arg-type]
)
if not na_value_is_na:
mask[:] = False
return constructor(result, mask)
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
# i.e. StringDtype
result = lib.map_infer_mask(
arr, f, mask.view("uint8"), convert=False, na_value=na_value
)
result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
return type(self)(result)
else:
# This is when the result type is object. We reach this when
# -> We know the result type is truly object (e.g. .encode returns bytes
# or .findall returns a list).
# -> We don't know the result type. E.g. `.get` can return anything.
return lib.map_infer_mask(arr, f, mask.view("uint8"))
def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
if flags:
return super()._str_contains(pat, case, flags, na, regex)
if regex:
if pa_version_under4p0 or case is False:
return super()._str_contains(pat, case, flags, na, regex)
else:
result = pc.match_substring_regex(self._data, pat)
else:
if case:
result = pc.match_substring(self._data, pat)
else:
result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
result = BooleanDtype().__from_arrow__(result)
if not isna(na):
result[isna(result)] = bool(na)
return result
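    # Illustrative sketch: with pyarrow>=4.0, case=True and no flags, regex
    # matching runs natively via pc.match_substring_regex; for values
    # ["ab", None], self._str_contains("a") -> <BooleanArray> [True, <NA>]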
def _str_startswith(self, pat: str, na=None):
if pa_version_under4p0:
return super()._str_startswith(pat, na)
pat = "^" + re.escape(pat)
return self._str_contains(pat, na=na, regex=True)
def _str_endswith(self, pat: str, na=None):
if pa_version_under4p0:
return super()._str_endswith(pat, na)
pat = re.escape(pat) + "$"
return self._str_contains(pat, na=na, regex=True)
def _str_replace(
self,
pat: str | re.Pattern,
repl: str | Callable,
n: int = -1,
case: bool = True,
flags: int = 0,
regex: bool = True,
):
if (
pa_version_under4p0
or isinstance(pat, re.Pattern)
or callable(repl)
or not case
or flags
):
return super()._str_replace(pat, repl, n, case, flags, regex)
func = pc.replace_substring_regex if regex else pc.replace_substring
result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
return type(self)(result)
def _str_match(
self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None
):
if pa_version_under4p0:
return super()._str_match(pat, case, flags, na)
if not pat.startswith("^"):
pat = "^" + pat
return self._str_contains(pat, case, flags, na, regex=True)
def _str_fullmatch(self, pat, case: bool = True, flags: int = 0, na: Scalar = None):
if pa_version_under4p0:
return super()._str_fullmatch(pat, case, flags, na)
        if not pat.endswith("$") or pat.endswith("\\$"):
            # not anchored yet, or the trailing "$" is an escaped literal "$",
            # so append the end-of-string anchor
            pat = pat + "$"
return self._str_match(pat, case, flags, na)
def _str_isalnum(self):
result = pc.utf8_is_alnum(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_isalpha(self):
result = pc.utf8_is_alpha(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_isdecimal(self):
result = pc.utf8_is_decimal(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_isdigit(self):
result = pc.utf8_is_digit(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_islower(self):
result = pc.utf8_is_lower(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_isnumeric(self):
result = pc.utf8_is_numeric(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_isspace(self):
if pa_version_under2p0:
return super()._str_isspace()
result = pc.utf8_is_space(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_istitle(self):
result = pc.utf8_is_title(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_isupper(self):
result = pc.utf8_is_upper(self._data)
return BooleanDtype().__from_arrow__(result)
def _str_len(self):
if pa_version_under4p0:
return super()._str_len()
result = pc.utf8_length(self._data)
return Int64Dtype().__from_arrow__(result)
def _str_lower(self):
return type(self)(pc.utf8_lower(self._data))
def _str_upper(self):
return type(self)(pc.utf8_upper(self._data))
def _str_strip(self, to_strip=None):
if pa_version_under4p0:
return super()._str_strip(to_strip)
if to_strip is None:
result = pc.utf8_trim_whitespace(self._data)
else:
result = pc.utf8_trim(self._data, characters=to_strip)
return type(self)(result)
def _str_lstrip(self, to_strip=None):
if pa_version_under4p0:
return super()._str_lstrip(to_strip)
if to_strip is None:
result = pc.utf8_ltrim_whitespace(self._data)
else:
result = pc.utf8_ltrim(self._data, characters=to_strip)
return type(self)(result)
def _str_rstrip(self, to_strip=None):
if pa_version_under4p0:
return super()._str_rstrip(to_strip)
if to_strip is None:
result = pc.utf8_rtrim_whitespace(self._data)
else:
result = pc.utf8_rtrim(self._data, characters=to_strip)
return type(self)(result)

File diff suppressed because it is too large