first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions


pandas/core/internals/__init__.py
@@ -0,0 +1,60 @@
from pandas.core.internals.api import make_block
from pandas.core.internals.array_manager import (
ArrayManager,
SingleArrayManager,
)
from pandas.core.internals.base import (
DataManager,
SingleDataManager,
)
from pandas.core.internals.blocks import ( # io.pytables, io.packers
Block,
DatetimeTZBlock,
ExtensionBlock,
NumericBlock,
ObjectBlock,
)
from pandas.core.internals.concat import concatenate_managers
from pandas.core.internals.managers import (
BlockManager,
SingleBlockManager,
create_block_manager_from_blocks,
)
__all__ = [
"Block",
"CategoricalBlock",
"NumericBlock",
"DatetimeTZBlock",
"ExtensionBlock",
"ObjectBlock",
"make_block",
"DataManager",
"ArrayManager",
"BlockManager",
"SingleDataManager",
"SingleBlockManager",
"SingleArrayManager",
"concatenate_managers",
# this is preserved here for downstream compatibility (GH-33892)
"create_block_manager_from_blocks",
]
def __getattr__(name: str):
import warnings
from pandas.util._exceptions import find_stack_level
if name == "CategoricalBlock":
warnings.warn(
"CategoricalBlock is deprecated and will be removed in a future version. "
"Use ExtensionBlock instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
from pandas.core.internals.blocks import CategoricalBlock
return CategoricalBlock
raise AttributeError(f"module 'pandas.core.internals' has no attribute '{name}'")
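
# Usage sketch (illustrative, not part of this file): the PEP 562 module
# __getattr__ above means the deprecated name still resolves, but a
# DeprecationWarning is emitted on access.
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from pandas.core.internals import CategoricalBlock  # noqa: F401

assert any(issubclass(w.category, DeprecationWarning) for w in caught)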


pandas/core/internals/api.py
@@ -0,0 +1,97 @@
"""
This is a pseudo-public API for downstream libraries. We ask that downstream
authors
1) Try to avoid using internals directly altogether, and failing that,
2) Use only functions exposed here (or in core.internals)
"""
from __future__ import annotations
import numpy as np
from pandas._libs.internals import BlockPlacement
from pandas._typing import Dtype
from pandas.core.dtypes.common import (
is_datetime64tz_dtype,
is_period_dtype,
pandas_dtype,
)
from pandas.core.arrays import DatetimeArray
from pandas.core.construction import extract_array
from pandas.core.internals.blocks import (
Block,
DatetimeTZBlock,
ExtensionBlock,
check_ndim,
ensure_block_shape,
extract_pandas_array,
get_block_type,
maybe_coerce_values,
)
def make_block(
values, placement, klass=None, ndim=None, dtype: Dtype | None = None
) -> Block:
"""
This is a pseudo-public analogue to blocks.new_block.
We ask that downstream libraries use this rather than any fully-internal
APIs, including but not limited to:
- core.internals.blocks.make_block
- Block.make_block
- Block.make_block_same_class
- Block.__init__
"""
if dtype is not None:
dtype = pandas_dtype(dtype)
values, dtype = extract_pandas_array(values, dtype, ndim)
if klass is ExtensionBlock and is_period_dtype(values.dtype):
# GH-44681 changed PeriodArray to be stored in the 2D
# NDArrayBackedExtensionBlock instead of ExtensionBlock
# -> still allow ExtensionBlock to be passed in this case for back compat
klass = None
if klass is None:
dtype = dtype or values.dtype
klass = get_block_type(dtype)
elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
# pyarrow calls get here
values = DatetimeArray._simple_new(values, dtype=dtype)
if not isinstance(placement, BlockPlacement):
placement = BlockPlacement(placement)
ndim = maybe_infer_ndim(values, placement, ndim)
if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
# GH#41168 ensure we can pass 1D dt64tz values
# More generally, any EA dtype that isn't is_1d_only_ea_dtype
values = extract_array(values, extract_numpy=True)
values = ensure_block_shape(values, ndim)
check_ndim(values, placement, ndim)
values = maybe_coerce_values(values)
return klass(values, ndim=ndim, placement=placement)
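
# Usage sketch of the pseudo-public helper above (values and placement are
# illustrative, not from the commit): a downstream library building a 2D
# float block without touching fully-internal constructors.
import numpy as np
from pandas.core.internals.api import make_block

values = np.arange(6, dtype="float64").reshape(2, 3)  # 2 columns x 3 rows
blk = make_block(values, placement=slice(0, 2))       # BlockPlacement is created for us
blk.dtype, blk.ndim, blk.shape                        # (dtype('float64'), 2, (2, 3))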
def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int:
"""
If `ndim` is not provided, infer it from placement and values.
"""
if ndim is None:
# GH#38134 Block constructor now assumes ndim is not None
if not isinstance(values.dtype, np.dtype):
if len(placement) != 1:
ndim = 1
else:
ndim = 2
else:
ndim = values.ndim
return ndim
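
# A sketch of the inference rule above: numpy-dtyped values report their own
# ndim, while extension arrays default to 2 only when the placement is a
# single column (these calls are illustrative).
import numpy as np
import pandas as pd
from pandas._libs.internals import BlockPlacement
from pandas.core.internals.api import maybe_infer_ndim

maybe_infer_ndim(np.zeros((2, 3)), BlockPlacement(slice(0, 2)), None)  # 2 (values.ndim)
ea = pd.array([1, 2], dtype="Int64")
maybe_infer_ndim(ea, BlockPlacement(np.array([0], dtype=np.intp)), None)     # 2 (one column)
maybe_infer_ndim(ea, BlockPlacement(np.array([0, 1], dtype=np.intp)), None)  # 1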

File diff suppressed because it is too large


pandas/core/internals/base.py
@@ -0,0 +1,226 @@
"""
Base class for the internal managers. Both BlockManager and ArrayManager
inherit from this class.
"""
from __future__ import annotations
from typing import (
TypeVar,
final,
)
import numpy as np
from pandas._typing import (
ArrayLike,
DtypeObj,
Shape,
)
from pandas.errors import AbstractMethodError
from pandas.core.dtypes.cast import (
find_common_type,
np_can_hold_element,
)
from pandas.core.base import PandasObject
from pandas.core.indexes.api import (
Index,
default_index,
)
T = TypeVar("T", bound="DataManager")
class DataManager(PandasObject):
# TODO share more methods/attributes
axes: list[Index]
@property
def items(self) -> Index:
raise AbstractMethodError(self)
@final
def __len__(self) -> int:
return len(self.items)
@property
def ndim(self) -> int:
return len(self.axes)
@property
def shape(self) -> Shape:
return tuple(len(ax) for ax in self.axes)
@final
def _validate_set_axis(self, axis: int, new_labels: Index) -> None:
# Caller is responsible for ensuring we have an Index object.
old_len = len(self.axes[axis])
new_len = len(new_labels)
if axis == 1 and len(self.items) == 0:
# If we are setting the index on a DataFrame with no columns,
# it is OK to change the length.
pass
elif new_len != old_len:
raise ValueError(
f"Length mismatch: Expected axis has {old_len} elements, new "
f"values have {new_len} elements"
)
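
# Observable behavior of the check above (a sketch): assigning labels of the
# wrong length surfaces as this exact ValueError on a DataFrame.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
try:
    df.columns = ["only_one"]
except ValueError as err:
    print(err)  # Length mismatch: Expected axis has 2 elements, new values have 1 elements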
def reindex_indexer(
self: T,
new_axis,
indexer,
axis: int,
fill_value=None,
allow_dups: bool = False,
copy: bool = True,
consolidate: bool = True,
only_slice: bool = False,
) -> T:
raise AbstractMethodError(self)
@final
def reindex_axis(
self: T,
new_index: Index,
axis: int,
fill_value=None,
consolidate: bool = True,
only_slice: bool = False,
) -> T:
"""
Conform data manager to new index.
"""
new_index, indexer = self.axes[axis].reindex(new_index)
return self.reindex_indexer(
new_index,
indexer,
axis=axis,
fill_value=fill_value,
copy=False,
consolidate=consolidate,
only_slice=only_slice,
)
def _equal_values(self: T, other: T) -> bool:
"""
To be implemented by the subclasses. Only check the column values
assuming shape and indexes have already been checked.
"""
raise AbstractMethodError(self)
@final
def equals(self, other: object) -> bool:
"""
Implementation for DataFrame.equals
"""
if not isinstance(other, DataManager):
return False
self_axes, other_axes = self.axes, other.axes
if len(self_axes) != len(other_axes):
return False
if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
return False
return self._equal_values(other)
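
# Sketch of the order of checks above: axes are compared before any values,
# so frames with reordered columns are unequal without touching the data.
import pandas as pd

left = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
left.equals(left[["b", "a"]])  # False: column Index differs
left.equals(left.copy())       # True: axes match, then _equal_values decides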
def apply(
self: T,
f,
align_keys: list[str] | None = None,
ignore_failures: bool = False,
**kwargs,
) -> T:
raise AbstractMethodError(self)
@final
def isna(self: T, func) -> T:
return self.apply("apply", func=func)
# --------------------------------------------------------------------
# Consolidation: No-ops for all but BlockManager
def is_consolidated(self) -> bool:
return True
def consolidate(self: T) -> T:
return self
def _consolidate_inplace(self) -> None:
return
class SingleDataManager(DataManager):
ndim = 1
@final
@property
def array(self) -> ArrayLike:
"""
Quick access to the backing array of the Block or SingleArrayManager.
"""
# error: "SingleDataManager" has no attribute "arrays"; maybe "array"
return self.arrays[0] # type: ignore[attr-defined]
def setitem_inplace(self, indexer, value) -> None:
"""
Set values with indexer.
For Single[Block/Array]Manager, this backs s[indexer] = value.
This is an inplace version of `setitem()`, mutating the manager/values
in place, not returning a new Manager (and Block), and thus never changing
the dtype.
"""
arr = self.array
# EAs will do this validation in their own __setitem__ methods.
if isinstance(arr, np.ndarray):
# Note: checking for ndarray instead of np.dtype means we exclude
# dt64/td64, which do their own validation.
value = np_can_hold_element(arr.dtype, value)
arr[indexer] = value
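
# Observable effect (a sketch, assuming this is the path ser[indexer] = value
# takes, per the docstring above): the backing int64 array is mutated and the
# dtype never changes.
import pandas as pd

ser = pd.Series([1, 2, 3])
ser[0] = 10                    # value validated by np_can_hold_element, set in place
assert ser.dtype == "int64"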
def grouped_reduce(self, func, ignore_failures: bool = False):
"""
ignore_failures : bool, default False
Not used; for compatibility with ArrayManager/BlockManager.
"""
arr = self.array
res = func(arr)
index = default_index(len(res))
mgr = type(self).from_array(res, index)
return mgr
@classmethod
def from_array(cls, arr: ArrayLike, index: Index):
raise AbstractMethodError(cls)
def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None:
"""
Find the common dtype for `dtypes`.
Parameters
----------
dtypes : List[DtypeObj]
Returns
-------
dtype : np.dtype, ExtensionDtype, or None
None is returned when `dtypes` is empty.
"""
if not len(dtypes):
return None
return find_common_type(dtypes)
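
# A couple of illustrative calls against the helper above:
import numpy as np
from pandas.core.internals.base import interleaved_dtype

interleaved_dtype([])                                        # None (empty input)
interleaved_dtype([np.dtype("int64"), np.dtype("float64")])  # dtype('float64')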

File diff suppressed because it is too large


pandas/core/internals/concat.py
@@ -0,0 +1,652 @@
from __future__ import annotations
import itertools
from typing import (
TYPE_CHECKING,
Sequence,
cast,
)
import numpy as np
from pandas._libs import (
NaT,
internals as libinternals,
)
from pandas._typing import (
ArrayLike,
DtypeObj,
Manager,
Shape,
)
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
)
from pandas.core.dtypes.common import (
is_1d_only_ea_dtype,
is_1d_only_ea_obj,
is_datetime64tz_dtype,
is_dtype_equal,
)
from pandas.core.dtypes.concat import (
cast_to_common_type,
concat_compat,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
)
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import (
ArrayManager,
NullArrayProxy,
)
from pandas.core.internals.blocks import (
ensure_block_shape,
new_block_2d,
)
from pandas.core.internals.managers import BlockManager
if TYPE_CHECKING:
from pandas import Index
from pandas.core.internals.blocks import Block
def _concatenate_array_managers(
mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool
) -> Manager:
"""
Concatenate array managers into one.
Parameters
----------
mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool
Returns
-------
ArrayManager
"""
# reindex all arrays
mgrs = []
for mgr, indexers in mgrs_indexers:
axis1_made_copy = False
for ax, indexer in indexers.items():
mgr = mgr.reindex_indexer(
axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True
)
if ax == 1 and indexer is not None:
axis1_made_copy = True
if copy and concat_axis == 0 and not axis1_made_copy:
# for concat_axis 1 we will always get a copy through concat_arrays
mgr = mgr.copy()
mgrs.append(mgr)
if concat_axis == 1:
# concatting along the rows -> concat the reindexed arrays
# TODO(ArrayManager) doesn't yet preserve the correct dtype
arrays = [
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
for j in range(len(mgrs[0].arrays))
]
else:
# concatting along the columns -> combine reindexed arrays in a single manager
assert concat_axis == 0
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
return new_mgr
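
# The ArrayManager code path above is reachable through the experimental
# data_manager option of this era (a sketch, assuming pandas ~1.4):
import pandas as pd

pd.set_option("mode.data_manager", "array")
df = pd.DataFrame({"a": [1, 2]})
type(df._mgr).__name__                       # 'ArrayManager'
pd.set_option("mode.data_manager", "block")  # restore the default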
def concat_arrays(to_concat: list) -> ArrayLike:
"""
Alternative for concat_compat but specialized for use in the ArrayManager.
Differences: only deals with 1D arrays (no axis keyword); assumes the inputs
are already passed through ensure_wrapped_if_datetimelike; and does not skip
empty arrays when determining the dtype.
In addition ensures that all NullArrayProxies get replaced with actual
arrays.
Parameters
----------
to_concat : list of arrays
Returns
-------
np.ndarray or ExtensionArray
"""
# ignore the all-NA proxies to determine the resulting dtype
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
dtypes = {x.dtype for x in to_concat_no_proxy}
single_dtype = len(dtypes) == 1
if single_dtype:
target_dtype = to_concat_no_proxy[0].dtype
elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes):
# GH#42092
target_dtype = np.find_common_type(list(dtypes), [])
else:
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
if target_dtype.kind in ["m", "M"]:
# for datetimelike use DatetimeArray/TimedeltaArray concatenation
# don't use arr.astype(target_dtype, copy=False), because that doesn't
# work for DatetimeArray/TimedeltaArray (returns ndarray)
to_concat = [
arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr
for arr in to_concat
]
return type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=0)
to_concat = [
arr.to_array(target_dtype)
if isinstance(arr, NullArrayProxy)
else cast_to_common_type(arr, target_dtype)
for arr in to_concat
]
if isinstance(to_concat[0], ExtensionArray):
cls = type(to_concat[0])
return cls._concat_same_type(to_concat)
result = np.concatenate(to_concat)
# TODO decide on exact behaviour (we shouldn't do this only for empty result)
# see https://github.com/pandas-dev/pandas/issues/39817
if len(result) == 0:
# all empties -> check for bool to not coerce to float
kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
if len(kinds) != 1:
if "b" in kinds:
result = result.astype(object)
return result
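
# The GH#42092 branch above deliberately keeps numpy promotion for all-numpy
# integer/bool inputs, where pandas' find_common_type would fall back to
# object; a comparison sketch (numpy of this era still has find_common_type):
import numpy as np
from pandas.core.dtypes.cast import find_common_type

np.find_common_type([np.dtype("bool"), np.dtype("int64")], [])  # dtype('int64')
find_common_type([np.dtype("bool"), np.dtype("int64")])         # dtype('O')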
def concatenate_managers(
mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool
) -> Manager:
"""
Concatenate block managers into one.
Parameters
----------
mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool
Returns
-------
BlockManager
"""
# TODO(ArrayManager) this assumes that all managers are of the same type
if isinstance(mgrs_indexers[0][0], ArrayManager):
return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
# Assertions disabled for performance
# for tup in mgrs_indexers:
# # caller is responsible for ensuring this
# indexers = tup[1]
# assert concat_axis not in indexers
if concat_axis == 0:
return _concat_managers_axis0(mgrs_indexers, axes, copy)
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
# Assertion disabled for performance
# assert all(not x[1] for x in mgrs_indexers)
concat_plans = [_get_mgr_concatenation_plan(mgr) for mgr, _ in mgrs_indexers]
concat_plan = _combine_concat_plans(concat_plans)
blocks = []
for placement, join_units in concat_plan:
unit = join_units[0]
blk = unit.block
if len(join_units) == 1:
values = blk.values
if copy:
values = values.copy()
else:
values = values.view()
fastpath = True
elif _is_uniform_join_units(join_units):
vals = [ju.block.values for ju in join_units]
if not blk.is_extension:
# _is_uniform_join_units ensures a single dtype, so
# we can use np.concatenate, which is more performant
# than concat_compat
values = np.concatenate(vals, axis=1)
else:
# TODO(EA2D): special-casing not needed with 2D EAs
values = concat_compat(vals, axis=1)
values = ensure_block_shape(values, ndim=2)
values = ensure_wrapped_if_datetimelike(values)
fastpath = blk.values.dtype == values.dtype
else:
values = _concatenate_join_units(join_units, copy=copy)
fastpath = False
if fastpath:
b = blk.make_block_same_class(values, placement=placement)
else:
b = new_block_2d(values, placement=placement)
blocks.append(b)
return BlockManager(tuple(blocks), axes)
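
# Observable effect of the uniform fast path above (a sketch): same-dtype
# inputs concatenate back into a single consolidated block.
import pandas as pd

df1 = pd.DataFrame({"a": [1.0], "b": [2.0]})
df2 = pd.DataFrame({"a": [3.0], "b": [4.0]})
pd.concat([df1, df2])._mgr.nblocks  # 1: uniform float64 units hit np.concatenate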
def _concat_managers_axis0(
mgrs_indexers, axes: list[Index], copy: bool
) -> BlockManager:
"""
concat_managers specialized to concat_axis=0, with reindexing already
having been done in _maybe_reindex_columns_na_proxy.
"""
had_reindexers = {
i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers))
}
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
mgrs = [x[0] for x in mgrs_indexers]
offset = 0
blocks = []
for i, mgr in enumerate(mgrs):
# If we already reindexed, then we definitely don't need another copy
made_copy = had_reindexers[i]
for blk in mgr.blocks:
if made_copy:
nb = blk.copy(deep=False)
elif copy:
nb = blk.copy()
else:
# by slicing instead of copy(deep=False), we get a new array
# object, see test_concat_copy
nb = blk.getitem_block(slice(None))
nb._mgr_locs = nb._mgr_locs.add(offset)
blocks.append(nb)
offset += len(mgr.items)
return BlockManager(tuple(blocks), axes)
def _maybe_reindex_columns_na_proxy(
axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
"""
Reindex along columns so that all of the BlockManagers being concatenated
have matching columns.
Columns added in this reindexing have dtype=np.void, indicating they
should be ignored when choosing a column's final dtype.
"""
new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = []
for mgr, indexers in mgrs_indexers:
# For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
# is a cheap reindexing.
for i, indexer in indexers.items():
mgr = mgr.reindex_indexer(
axes[i],
indexers[i],
axis=i,
copy=False,
only_slice=True, # only relevant for i==0
allow_dups=True,
use_na_proxy=True, # only relevant for i==0
)
new_mgrs_indexers.append((mgr, {}))
return new_mgrs_indexers
def _get_mgr_concatenation_plan(mgr: BlockManager):
"""
Construct concatenation plan for given block manager.
Parameters
----------
mgr : BlockManager
Returns
-------
plan : list of (BlockPlacement, JoinUnit) tuples
"""
# Calculate post-reindex shape, save for item axis which will be separate
# for each block anyway.
mgr_shape_list = list(mgr.shape)
mgr_shape = tuple(mgr_shape_list)
if mgr.is_single_block:
blk = mgr.blocks[0]
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape))]
blknos = mgr.blknos
blklocs = mgr.blklocs
plan = []
for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):
assert placements.is_slice_like
assert blkno != -1
shape_list = list(mgr_shape)
shape_list[0] = len(placements)
shape = tuple(shape_list)
blk = mgr.blocks[blkno]
ax0_blk_indexer = blklocs[placements.indexer]
unit_no_ax0_reindexing = (
len(placements) == len(blk.mgr_locs)
and
# Fastpath detection of join unit not
# needing to reindex its block: no ax0
# reindexing took place and block
# placement was sequential before.
(
(blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1)
or
# Slow-ish detection: all indexer locs
# are sequential (and length match is
# checked above).
(np.diff(ax0_blk_indexer) == 1).all()
)
)
if not unit_no_ax0_reindexing:
# create block from subset of columns
blk = blk.getitem_block(ax0_blk_indexer)
# Assertions disabled for performance
# assert blk._mgr_locs.as_slice == placements.as_slice
# assert blk.shape[0] == shape[0]
unit = JoinUnit(blk, shape)
plan.append((placements, unit))
return plan
class JoinUnit:
def __init__(self, block: Block, shape: Shape):
# Passing shape explicitly is required for cases when block is None.
self.block = block
self.shape = shape
def __repr__(self) -> str:
return f"{type(self).__name__}({repr(self.block)})"
@cache_readonly
def is_na(self) -> bool:
blk = self.block
if blk.dtype.kind == "V":
return True
return False
def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
values: ArrayLike
if self.is_na:
return make_na_array(empty_dtype, self.shape)
else:
if not self.block._can_consolidate:
# preserve these for validation in concat_compat
return self.block.values
# No dtype upcasting is done here, it will be performed during
# concatenation itself.
values = self.block.values
return values
def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
"""
Construct an np.ndarray or ExtensionArray of the given dtype and shape
holding all-NA values.
"""
if is_datetime64tz_dtype(dtype):
# NaT here is analogous to dtype.na_value below
i8values = np.full(shape, NaT.value)
return DatetimeArray(i8values, dtype=dtype)
elif is_1d_only_ea_dtype(dtype):
dtype = cast(ExtensionDtype, dtype)
cls = dtype.construct_array_type()
missing_arr = cls._from_sequence([], dtype=dtype)
nrows = shape[-1]
taker = -1 * np.ones((nrows,), dtype=np.intp)
return missing_arr.take(taker, allow_fill=True, fill_value=dtype.na_value)
elif isinstance(dtype, ExtensionDtype):
# TODO: no tests get here, a handful would if we disabled
# the dt64tz special-case above (which is faster)
cls = dtype.construct_array_type()
missing_arr = cls._empty(shape=shape, dtype=dtype)
missing_arr[:] = dtype.na_value
return missing_arr
else:
# NB: we should never get here with dtype integer or bool;
# if we did, the missing_arr.fill would cast to gibberish
missing_arr = np.empty(shape, dtype=dtype)
fill_value = _dtype_to_na_value(dtype)
missing_arr.fill(fill_value)
return missing_arr
def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
"""
Concatenate values from several join units along axis=1.
"""
empty_dtype = _get_empty_dtype(join_units)
to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype) for ju in join_units]
if len(to_concat) == 1:
# Only one block, nothing to concatenate.
concat_values = to_concat[0]
if copy:
if isinstance(concat_values, np.ndarray):
# non-reindexed (=not yet copied) arrays are made into a view
# in JoinUnit.get_reindexed_values
if concat_values.base is not None:
concat_values = concat_values.copy()
else:
concat_values = concat_values.copy()
elif any(is_1d_only_ea_obj(t) for t in to_concat):
# TODO(EA2D): special case not needed if all EAs used HybridBlocks
# NB: we are still assuming here that Hybrid blocks have shape (1, N)
# concatting with at least one EA means we are concatting a single column
# the non-EA values are 2D arrays with shape (1, n)
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[int, slice]"
to_concat = [
t if is_1d_only_ea_obj(t) else t[0, :] # type: ignore[call-overload]
for t in to_concat
]
concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
concat_values = ensure_block_shape(concat_values, 2)
else:
concat_values = concat_compat(to_concat, axis=1)
return concat_values
def _dtype_to_na_value(dtype: DtypeObj):
"""
Find the NA value to go with this dtype.
"""
if isinstance(dtype, ExtensionDtype):
return dtype.na_value
elif dtype.kind in ["m", "M"]:
return dtype.type("NaT")
elif dtype.kind in ["f", "c"]:
return dtype.type("NaN")
elif dtype.kind == "b":
# different from missing.na_value_for_dtype
return None
elif dtype.kind in ["i", "u"]:
return np.nan
elif dtype.kind == "O":
return np.nan
raise NotImplementedError
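
# The kind-based NA values above, spelled out with plain numpy:
import numpy as np

np.dtype("M8[ns]").type("NaT")  # numpy.datetime64('NaT') for kind "M"
np.dtype("m8[ns]").type("NaT")  # numpy.timedelta64('NaT') for kind "m"
np.dtype("f8").type("NaN")      # nan for kind "f"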
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
"""
Return the dtype to use when concatenating the specified units.
Returns
-------
dtype
"""
if len(join_units) == 1:
blk = join_units[0].block
return blk.dtype
if _is_uniform_reindex(join_units):
empty_dtype = join_units[0].block.dtype
return empty_dtype
needs_can_hold_na = any(unit.is_na for unit in join_units)
dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
dtype = find_common_type(dtypes)
if needs_can_hold_na:
dtype = ensure_dtype_can_hold_na(dtype)
return dtype
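
# Visible consequence of needs_can_hold_na above (a sketch): a join unit that
# is all-NA forces a dtype that can hold NA, so int64 columns upcast.
import pandas as pd

a = pd.DataFrame({"x": [1, 2]})
b = pd.DataFrame({"y": [3.0]})
pd.concat([a, b])["x"].dtype  # float64: b contributes an all-NA unit for 'x'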
def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
"""
Check if the join units consist of blocks of uniform type that can
be concatenated using Block.concat_same_type instead of the generic
_concatenate_join_units (which uses `concat_compat`).
"""
first = join_units[0].block
if first.dtype.kind == "V":
return False
return (
# exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
all(type(ju.block) is type(first) for ju in join_units)
and
# e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
all(
is_dtype_equal(ju.block.dtype, first.dtype)
# GH#42092 we only want the dtype_equal check for non-numeric blocks
# (for now, may change but that would need a deprecation)
or ju.block.dtype.kind in ["b", "i", "u"]
for ju in join_units
)
and
# no blocks that would get missing values (can lead to type upcasts)
# unless we're an extension dtype.
all(not ju.is_na or ju.block.is_extension for ju in join_units)
and
# only use this path when there is something to concatenate
len(join_units) > 1
)
def _is_uniform_reindex(join_units) -> bool:
return (
# TODO: should this be ju.block._can_hold_na?
all(ju.block.is_extension for ju in join_units)
and len({ju.block.dtype.name for ju in join_units}) == 1
)
def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
"""
Reduce join_unit's shape along item axis to length.
Extra items that didn't fit are returned as a separate block.
"""
extra_block = join_unit.block.getitem_block(slice(length, None))
join_unit.block = join_unit.block.getitem_block(slice(length))
extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
join_unit.shape = (length,) + join_unit.shape[1:]
return JoinUnit(block=extra_block, shape=extra_shape)
def _combine_concat_plans(plans):
"""
Combine multiple concatenation plans into one, yielding
(placement, [join_units]) pairs; input join units may be trimmed in place.
"""
if len(plans) == 1:
for p in plans[0]:
yield p[0], [p[1]]
else:
# singleton list so we can modify it as a side-effect within _next_or_none
num_ended = [0]
def _next_or_none(seq):
retval = next(seq, None)
if retval is None:
num_ended[0] += 1
return retval
plans = list(map(iter, plans))
next_items = list(map(_next_or_none, plans))
while num_ended[0] != len(next_items):
if num_ended[0] > 0:
raise ValueError("Plan shapes are not aligned")
placements, units = zip(*next_items)
lengths = list(map(len, placements))
min_len, max_len = min(lengths), max(lengths)
if min_len == max_len:
yield placements[0], units
next_items[:] = map(_next_or_none, plans)
else:
yielded_placement = None
yielded_units = [None] * len(next_items)
for i, (plc, unit) in enumerate(next_items):
yielded_units[i] = unit
if len(plc) > min_len:
# _trim_join_unit updates unit in place, so only
# placement needs to be sliced to skip min_len.
next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len))
else:
yielded_placement = plc
next_items[i] = _next_or_none(plans[i])
yield yielded_placement, yielded_units

File diff suppressed because it is too large

File diff suppressed because it is too large


pandas/core/internals/ops.py
@@ -0,0 +1,147 @@
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Iterator,
NamedTuple,
)
from pandas._typing import ArrayLike
if TYPE_CHECKING:
from pandas._libs.internals import BlockPlacement
from pandas.core.internals.blocks import Block
from pandas.core.internals.managers import BlockManager
class BlockPairInfo(NamedTuple):
lvals: ArrayLike
rvals: ArrayLike
locs: BlockPlacement
left_ea: bool
right_ea: bool
rblk: Block
def _iter_block_pairs(
left: BlockManager, right: BlockManager
) -> Iterator[BlockPairInfo]:
# At this point we have already checked the parent DataFrames for
# assert rframe._indexed_same(lframe)
for blk in left.blocks:
locs = blk.mgr_locs
blk_vals = blk.values
left_ea = blk_vals.ndim == 1
rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True)
# Assertions are disabled for performance, but should hold:
# if left_ea:
# assert len(locs) == 1, locs
# assert len(rblks) == 1, rblks
# assert rblks[0].shape[0] == 1, rblks[0].shape
for rblk in rblks:
right_ea = rblk.values.ndim == 1
lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk)
yield info
def operate_blockwise(
left: BlockManager, right: BlockManager, array_op
) -> BlockManager:
# At this point we have already checked the parent DataFrames for
# assert rframe._indexed_same(lframe)
res_blks: list[Block] = []
for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right):
res_values = array_op(lvals, rvals)
if left_ea and not right_ea and hasattr(res_values, "reshape"):
res_values = res_values.reshape(1, -1)
nbs = rblk._split_op_result(res_values)
# Assertions are disabled for performance, but should hold:
# if right_ea or left_ea:
# assert len(nbs) == 1
# else:
# assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)
_reset_block_mgr_locs(nbs, locs)
res_blks.extend(nbs)
# Assertions are disabled for performance, but should hold:
# slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array}
# nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks)
# assert nlocs == len(left.items), (nlocs, len(left.items))
# assert len(slocs) == nlocs, (len(slocs), nlocs)
# assert slocs == set(range(nlocs)), slocs
new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False)
return new_mgr
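
# Frame-with-frame arithmetic on already-aligned inputs is the caller of the
# function above (a sketch of the observable behavior):
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
df2 = pd.DataFrame({"a": [10, 20], "b": [0.5, 0.5]})
(df1 + df2).dtypes.tolist()  # [dtype('int64'), dtype('float64')], block by block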
def _reset_block_mgr_locs(nbs: list[Block], locs):
"""
Reset mgr_locs to correspond to our original DataFrame.
"""
for nb in nbs:
nblocs = locs[nb.mgr_locs.indexer]
nb.mgr_locs = nblocs
# Assertions are disabled for performance, but should hold:
# assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape)
# assert all(x in locs.as_array for x in nb.mgr_locs.as_array)
def _get_same_shape_values(
lblk: Block, rblk: Block, left_ea: bool, right_ea: bool
) -> tuple[ArrayLike, ArrayLike]:
"""
Slice lblk.values to align with rblk. Squeeze if we have EAs.
"""
lvals = lblk.values
rvals = rblk.values
# Require that the indexing into lvals be slice-like
assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs
# TODO(EA2D): with 2D EAs only this first clause would be needed
if not (left_ea or right_ea):
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[Union[ndarray, slice], slice]"
lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload]
assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
elif left_ea and right_ea:
assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
elif right_ea:
# lvals are 2D, rvals are 1D
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[Union[ndarray, slice], slice]"
lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload]
assert lvals.shape[0] == 1, lvals.shape
lvals = lvals[0, :]
else:
# lvals are 1D, rvals are 2D
assert rvals.shape[0] == 1, rvals.shape
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[int, slice]"
rvals = rvals[0, :] # type: ignore[call-overload]
return lvals, rvals
def blockwise_all(left: BlockManager, right: BlockManager, op) -> bool:
"""
Blockwise `all` reduction.
"""
for info in _iter_block_pairs(left, right):
res = op(info.lvals, info.rvals)
if not res:
return False
return True
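
# DataFrame.equals reduces blockwise with array_equivalent through this helper
# (as this version wires it up); NaNs in matching positions compare equal:
import numpy as np
import pandas as pd

left = pd.DataFrame({"a": [1.0, np.nan]})
right = pd.DataFrame({"a": [1.0, np.nan]})
left.equals(right)  # True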