mirror of https://github.com/aykhans/AzSuicideDataVisualization.git, synced 2025-07-03 22:57:06 +00:00
first commit
15
.venv/Lib/site-packages/pandas/core/groupby/__init__.py
Normal file
@@ -0,0 +1,15 @@
from pandas.core.groupby.generic import (
    DataFrameGroupBy,
    NamedAgg,
    SeriesGroupBy,
)
from pandas.core.groupby.groupby import GroupBy
from pandas.core.groupby.grouper import Grouper

__all__ = [
    "DataFrameGroupBy",
    "NamedAgg",
    "SeriesGroupBy",
    "GroupBy",
    "Grouper",
]
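Editorial note (not part of the commit): this initializer only re-exports the public groupby entry points, so downstream code can import them from one place. A minimal, hypothetical consumer import under this vendored pandas version:

# assumes this vendored pandas layout
from pandas.core.groupby import DataFrameGroupBy, GroupBy, Grouper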
163
.venv/Lib/site-packages/pandas/core/groupby/base.py
Normal file
@@ -0,0 +1,163 @@
"""
Provide basic components for groupby. These definitions
hold the allowlist of methods that are exposed on the
SeriesGroupBy and the DataFrameGroupBy objects.
"""
from __future__ import annotations

import dataclasses
from typing import Hashable


@dataclasses.dataclass(order=True, frozen=True)
class OutputKey:
    label: Hashable
    position: int


# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
plotting_methods = frozenset(["plot", "hist"])

common_apply_allowlist = (
    frozenset(
        [
            "quantile",
            "fillna",
            "mad",
            "take",
            "idxmax",
            "idxmin",
            "tshift",
            "skew",
            "corr",
            "cov",
            "diff",
        ]
    )
    | plotting_methods
)

series_apply_allowlist: frozenset[str] = (
    common_apply_allowlist
    | frozenset(
        {"nlargest", "nsmallest", "is_monotonic_increasing", "is_monotonic_decreasing"}
    )
) | frozenset(["dtype", "unique"])

dataframe_apply_allowlist: frozenset[str] = common_apply_allowlist | frozenset(
    ["dtypes", "corrwith"]
)

# cythonized transformations or canned "agg+broadcast", which do not
# require postprocessing of the result by transform.
cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"])

# List of aggregation/reduction functions.
# These map each group to a single numeric value
reduction_kernels = frozenset(
    [
        "all",
        "any",
        "corrwith",
        "count",
        "first",
        "idxmax",
        "idxmin",
        "last",
        "mad",
        "max",
        "mean",
        "median",
        "min",
        "ngroup",
        "nth",
        "nunique",
        "prod",
        # as long as `quantile`'s signature accepts only
        # a single quantile value, it's a reduction.
        # GH#27526 might change that.
        "quantile",
        "sem",
        "size",
        "skew",
        "std",
        "sum",
        "var",
    ]
)

# List of transformation functions.
# a transformation is a function that, for each group,
# produces a result that has the same shape as the group.


# TODO(2.0) Remove after pad/backfill deprecation enforced
def maybe_normalize_deprecated_kernels(kernel):
    if kernel == "backfill":
        kernel = "bfill"
    elif kernel == "pad":
        kernel = "ffill"
    return kernel


transformation_kernels = frozenset(
    [
        "backfill",
        "bfill",
        "cumcount",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "diff",
        "ffill",
        "fillna",
        "pad",
        "pct_change",
        "rank",
        "shift",
        "tshift",
    ]
)

# these are all the public methods on Grouper which don't belong
# in either of the above lists
groupby_other_methods = frozenset(
    [
        "agg",
        "aggregate",
        "apply",
        "boxplot",
        # corr and cov return ngroups*ncolumns rows, so they
        # are neither a transformation nor a reduction
        "corr",
        "cov",
        "describe",
        "dtypes",
        "expanding",
        "ewm",
        "filter",
        "get_group",
        "groups",
        "head",
        "hist",
        "indices",
        "ndim",
        "ngroups",
        "ohlc",
        "pipe",
        "plot",
        "resample",
        "rolling",
        "tail",
        "take",
        "transform",
        "sample",
        "value_counts",
    ]
)
# Valid values of `name` for `groupby.transform(name)`
# NOTE: do NOT edit this directly. New additions should be inserted
# into the appropriate list above.
transform_kernel_allowlist = reduction_kernels | transformation_kernels
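Editorial note (not part of the commit): the kernel sets above are what pandas consults when validating string arguments to ``groupby.transform``, and ``maybe_normalize_deprecated_kernels`` maps the deprecated spellings onto their replacements. A minimal sketch, assuming this vendored pandas version:

# Illustrative only: how the allowlist above is typically consulted.
from pandas.core.groupby.base import (
    maybe_normalize_deprecated_kernels,
    transform_kernel_allowlist,
)

kernel = maybe_normalize_deprecated_kernels("pad")  # deprecated spelling
assert kernel == "ffill"
assert kernel in transform_kernel_allowlist  # valid name for groupby.transform(name)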
117
.venv/Lib/site-packages/pandas/core/groupby/categorical.py
Normal file
@@ -0,0 +1,117 @@
from __future__ import annotations

import numpy as np

from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
    Categorical,
    CategoricalDtype,
    recode_for_categories,
)
from pandas.core.indexes.api import CategoricalIndex


def recode_for_groupby(
    c: Categorical, sort: bool, observed: bool
) -> tuple[Categorical, Categorical | None]:
    """
    Code the categories to ensure we can groupby for categoricals.

    If observed=True, we return a new Categorical with the observed
    categories only.

    If sort=False, return a copy of self, coded with categories as
    returned by .unique(), followed by any categories not appearing in
    the data. If sort=True, return self.

    This method is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : bool
        The value of the sort parameter groupby was called with.
    observed : bool
        Account only for the observed values

    Returns
    -------
    Categorical
        If sort=False, the new categories are set to the order of
        appearance in codes (unless ordered=True, in which case the
        original order is preserved), followed by any unrepresented
        categories in the original order.
    Categorical or None
        If we are observed, return the original categorical, otherwise None
    """
    # we only care about observed values
    if observed:
        # In cases with c.ordered, this is equivalent to
        #  return c.remove_unused_categories(), c

        unique_codes = unique1d(c.codes)

        take_codes = unique_codes[unique_codes != -1]
        if c.ordered:
            take_codes = np.sort(take_codes)

        # we recode according to the uniques
        categories = c.categories.take(take_codes)
        codes = recode_for_categories(c.codes, c.categories, categories)

        # return a new categorical that maps our new codes
        # and categories
        dtype = CategoricalDtype(categories, ordered=c.ordered)
        return Categorical(codes, dtype=dtype, fastpath=True), c

    # Already sorted according to c.categories; all is fine
    if sort:
        return c, None

    # sort=False should order groups in as-encountered order (GH-8868)
    cat = c.unique()

    # See GH-38140 for block below
    # exclude nan from indexer for categories
    take_codes = cat.codes[cat.codes != -1]
    if cat.ordered:
        take_codes = np.sort(take_codes)
    cat = cat.set_categories(cat.categories.take(take_codes))

    # But for groupby to work, all categories should be present,
    # including those missing from the data (GH-13179), which .unique()
    # above dropped
    cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)])

    return c.reorder_categories(cat.categories), None


def recode_from_groupby(
    c: Categorical, sort: bool, ci: CategoricalIndex
) -> CategoricalIndex:
    """
    Reverse the codes_to_groupby to account for sort / observed.

    Parameters
    ----------
    c : Categorical
    sort : bool
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The codes / categories to recode

    Returns
    -------
    CategoricalIndex
    """
    # we re-order to the original category orderings
    if sort:
        # error: "CategoricalIndex" has no attribute "set_categories"
        return ci.set_categories(c.categories)  # type: ignore[attr-defined]

    # we are not sorting, so add unobserved to the end
    new_cats = c.categories[~c.categories.isin(ci.categories)]
    # error: "CategoricalIndex" has no attribute "add_categories"
    return ci.add_categories(new_cats)  # type: ignore[attr-defined]
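Editorial note (not part of the commit): the order-of-appearance behavior that ``recode_for_groupby`` implements is observable through the public API. A minimal sketch, assuming a standard pandas install imported as ``pd``:

# With sort=False, groups follow encounter order ("b" before "a"),
# and the unobserved category "c" is kept at the end (GH-8868, GH-13179).
import pandas as pd

cat = pd.Categorical(["b", "a", "b"], categories=["a", "b", "c"])
s = pd.Series([1, 2, 3])
print(s.groupby(cat, sort=False).sum())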
1785
.venv/Lib/site-packages/pandas/core/groupby/generic.py
Normal file
File diff suppressed because it is too large
3901
.venv/Lib/site-packages/pandas/core/groupby/groupby.py
Normal file
File diff suppressed because it is too large
991
.venv/Lib/site-packages/pandas/core/groupby/grouper.py
Normal file
@@ -0,0 +1,991 @@
"""
Provide user facing operators for doing the split part of the
split-apply-combine paradigm.
"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    Hashable,
    final,
)
import warnings

import numpy as np

from pandas._typing import (
    ArrayLike,
    NDFrameT,
    npt,
)
from pandas.errors import InvalidIndexError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import sanitize_to_nanoseconds
from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_list_like,
    is_scalar,
)

import pandas.core.algorithms as algorithms
from pandas.core.arrays import (
    Categorical,
    ExtensionArray,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import ops
from pandas.core.groupby.categorical import (
    recode_for_groupby,
    recode_from_groupby,
)
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
)
from pandas.core.series import Series

from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from pandas.core.generic import NDFrame


class Grouper:
    """
    A Grouper allows the user to specify a groupby instruction for an object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    If `axis` and/or `level` are passed as keywords to both `Grouper` and
    `groupby`, the values passed to `Grouper` take precedence.

    Parameters
    ----------
    key : str, defaults to None
        Groupby key, which selects the grouping column of the target.
    level : name/number, defaults to None
        The level for the target index.
    freq : str / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, defaults to 0
        Number/name of the axis.
    sort : bool, default to False
        Whether to sort the resulting labels.
    closed : {'left' or 'right'}
        Closed end of interval. Only when `freq` parameter is passed.
    label : {'left' or 'right'}
        Interval boundary to use for labeling.
        Only when `freq` parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex and `freq` parameter is passed.
    base : int, default 0
        Only when `freq` parameter is passed.
        For frequencies that evenly subdivide 1 day, the "origin" of the
        aggregated intervals. For example, for '5min' frequency, base could
        range from 0 through 4. Defaults to 0.

        .. deprecated:: 1.1.0
            The new arguments that you should use are 'offset' or 'origin'.

    loffset : str, DateOffset, timedelta object
        Only when `freq` parameter is passed.

        .. deprecated:: 1.1.0
            loffset is only working for ``.resample(...)`` and not for
            Grouper (:issue:`28302`).
            However, loffset is also deprecated for ``.resample(...)``
            See: :class:`DataFrame.resample`

    origin : Timestamp or str, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index.
        If string, must be one of the following:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries

        .. versionadded:: 1.1.0

        - 'end': `origin` is the last value of the timeseries
        - 'end_day': `origin` is the ceiling midnight of the last day

        .. versionadded:: 1.3.0

    offset : Timedelta or str, default is None
        An offset timedelta added to the origin.

        .. versionadded:: 1.1.0

    dropna : bool, default True
        If True, and if group keys contain NA values, NA values together with
        row/column will be dropped. If False, NA values will also be treated as
        the key in groups.

        .. versionadded:: 1.2.0

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    Syntactic sugar for ``df.groupby('A')``

    >>> df = pd.DataFrame(
    ...     {
    ...         "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
    ...         "Speed": [100, 5, 200, 300, 15],
    ...     }
    ... )
    >>> df
       Animal  Speed
    0  Falcon    100
    1  Parrot      5
    2  Falcon    200
    3  Falcon    300
    4  Parrot     15
    >>> df.groupby(pd.Grouper(key="Animal")).mean()
            Speed
    Animal
    Falcon  200.0
    Parrot   10.0

    Specify a resample operation on the column 'Publish date'

    >>> df = pd.DataFrame(
    ...     {
    ...         "Publish date": [
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-09"),
    ...             pd.Timestamp("2000-01-16")
    ...         ],
    ...         "ID": [0, 1, 2, 3],
    ...         "Price": [10, 20, 30, 40]
    ...     }
    ... )
    >>> df
      Publish date  ID  Price
    0   2000-01-02   0     10
    1   2000-01-02   1     20
    2   2000-01-09   2     30
    3   2000-01-16   3     40
    >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
                   ID  Price
    Publish date
    2000-01-02    0.5   15.0
    2000-01-09    2.0   30.0
    2000-01-16    3.0   40.0

    If you want to adjust the start of the bins based on a fixed timestamp:

    >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
    >>> rng = pd.date_range(start, end, freq='7min')
    >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
    >>> ts
    2000-10-01 23:30:00     0
    2000-10-01 23:37:00     3
    2000-10-01 23:44:00     6
    2000-10-01 23:51:00     9
    2000-10-01 23:58:00    12
    2000-10-02 00:05:00    15
    2000-10-02 00:12:00    18
    2000-10-02 00:19:00    21
    2000-10-02 00:26:00    24
    Freq: 7T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min')).sum()
    2000-10-01 23:14:00     0
    2000-10-01 23:31:00     9
    2000-10-01 23:48:00    21
    2000-10-02 00:05:00    54
    2000-10-02 00:22:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
    2000-10-01 23:18:00     0
    2000-10-01 23:35:00    18
    2000-10-01 23:52:00    27
    2000-10-02 00:09:00    39
    2000-10-02 00:26:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
    2000-10-01 23:24:00     3
    2000-10-01 23:41:00    15
    2000-10-01 23:58:00    45
    2000-10-02 00:15:00    45
    Freq: 17T, dtype: int64

    If you want to adjust the start of the bins with an `offset` Timedelta, the two
    following lines are equivalent:

    >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    To replace the use of the deprecated `base` argument, you can now use `offset`,
    in this example it is equivalent to have `base=2`:

    >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
    2000-10-01 23:16:00     0
    2000-10-01 23:33:00     9
    2000-10-01 23:50:00    36
    2000-10-02 00:07:00    39
    2000-10-02 00:24:00    24
    Freq: 17T, dtype: int64
    """

    axis: int
    sort: bool
    dropna: bool
    _gpr_index: Index | None
    _grouper: Index | None

    _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort")

    def __new__(cls, *args, **kwargs):
        if kwargs.get("freq") is not None:
            from pandas.core.resample import TimeGrouper

            _check_deprecated_resample_kwargs(kwargs, origin=cls)
            cls = TimeGrouper
        return super().__new__(cls)

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: int = 0,
        sort: bool = False,
        dropna: bool = True,
    ):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort

        self.grouper = None
        self._gpr_index = None
        self.obj = None
        self.indexer = None
        self.binner = None
        self._grouper = None
        self._indexer = None
        self.dropna = dropna

    @final
    @property
    def ax(self) -> Index:
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[Any, ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """
        self._set_grouper(obj)
        # error: Value of type variable "NDFrameT" of "get_grouper" cannot be
        # "Optional[Any]"
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self.grouper, _, self.obj = get_grouper(  # type: ignore[type-var,assignment]
            self.obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )

        # error: Incompatible return value type (got "Tuple[None, None, None]",
        # expected "Tuple[Any, BaseGrouper, NDFrameT]")
        return self.binner, self.grouper, self.obj  # type: ignore[return-value]

    @final
    def _set_grouper(self, obj: NDFrame, sort: bool = False):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self.grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = self._gpr_index
            self._indexer = self.indexer

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(self._gpr_index, "name", None) == key and isinstance(
                obj, Series
            ):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self.indexer = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self.obj = obj  # type: ignore[assignment]
        self._gpr_index = ax
        return self._gpr_index

    @final
    @property
    def groups(self):
        # error: "None" has no attribute "groups"
        return self.grouper.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"


@final
class Grouping:
    """
    Holds the grouping information for a single key

    Parameters
    ----------
    index : Index
    grouper :
    obj : DataFrame or Series
    name : Label
    level :
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among
        Groupby.exclusions list

    Returns
    -------
    **Attributes**:
      * indices : dict of {group -> index_list}
      * codes : ndarray, group codes
      * group_index : unique groups
      * groups : dict of {group -> label_list}
    """

    _codes: np.ndarray | None = None
    _group_index: Index | None = None
    _passed_categorical: bool
    _all_grouper: Categorical | None
    _index: Index

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: NDFrame | None = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
    ):
        self.level = level
        self._orig_grouper = grouper
        self.grouping_vector = _convert_grouper(index, grouper)
        self._all_grouper = None
        self._index = index
        self._sort = sort
        self.obj = obj
        self._observed = observed
        self.in_axis = in_axis
        self._dropna = dropna

        self._passed_categorical = False

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passing in level

        ilevel = self._ilevel
        if ilevel is not None:
            mapper = self.grouping_vector
            # In extant tests, the new self.grouping_vector matches
            # `index.get_level_values(ilevel)` whenever
            # mapper is None and isinstance(index, MultiIndex)
            (
                self.grouping_vector,  # Index
                self._codes,
                self._group_index,
            ) = index._get_grouper_for_level(mapper, level=ilevel)

        # a passed Grouper like, directly get the grouper in the same way
        # as single grouper groupby, use the group_info to get codes
        elif isinstance(self.grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            _, newgrouper, newobj = self.grouping_vector._get_grouper(
                self.obj, validate=False
            )
            self.obj = newobj

            ng = newgrouper._get_grouper()
            if isinstance(newgrouper, ops.BinGrouper):
                # in this case we have `ng is newgrouper`
                self.grouping_vector = ng
            else:
                # ops.BaseGrouper
                # use Index instead of ndarray so we can recover the name
                self.grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif is_categorical_dtype(self.grouping_vector):
            # a passed Categorical
            self._passed_categorical = True

            self.grouping_vector, self._all_grouper = recode_for_groupby(
                self.grouping_vector, sort, observed
            )

        elif not isinstance(
            self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(self.grouping_vector, "ndim", 1) != 1:
                t = self.name or str(type(self.grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            self.grouping_vector = index.map(self.grouping_vector)

            if not (
                hasattr(self.grouping_vector, "__len__")
                and len(self.grouping_vector) == len(index)
            ):
                grper = pprint_thing(self.grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                self.grouping_vector = None  # Try for sanity
                raise AssertionError(errmsg)

        if isinstance(self.grouping_vector, np.ndarray):
            # if we have a date/time-like grouper, make sure that we have
            # Timestamps like
            self.grouping_vector = sanitize_to_nanoseconds(self.grouping_vector)

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self):
        return iter(self.indices)

    @cache_readonly
    def name(self) -> Hashable:
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, converted index level name to index level position.
        """
        level = self.level
        if level is None:
            return None
        if not isinstance(level, int):
            index = self._index
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            return index.names.index(level)
        return level

    @property
    def ngroups(self) -> int:
        return len(self.group_index)

    @cache_readonly
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        # we have a list of groupers
        if isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.indices

        values = Categorical(self.grouping_vector)
        return values._reverse_indexer()

    @property
    def codes(self) -> np.ndarray:
        if self._codes is not None:
            # _codes is set in __init__ for MultiIndex cases
            return self._codes

        return self._codes_and_uniques[0]

    @cache_readonly
    def group_arraylike(self) -> ArrayLike:
        """
        Analogous to result_index, but holding an ArrayLike to ensure
        we can retain ExtensionDtypes.
        """
        if self._group_index is not None:
            # _group_index is set in __init__ for MultiIndex cases
            return self._group_index._values

        elif self._all_grouper is not None:
            # retain dtype for categories, including unobserved ones
            return self.result_index._values

        return self._codes_and_uniques[1]

    @cache_readonly
    def result_index(self) -> Index:
        # result_index retains dtype for categories, including unobserved ones,
        # which group_index does not
        if self._all_grouper is not None:
            group_idx = self.group_index
            assert isinstance(group_idx, CategoricalIndex)
            return recode_from_groupby(self._all_grouper, self._sort, group_idx)
        return self.group_index

    @cache_readonly
    def group_index(self) -> Index:
        if self._group_index is not None:
            # _group_index is set in __init__ for MultiIndex cases
            return self._group_index

        uniques = self._codes_and_uniques[1]
        return Index._with_infer(uniques, name=self.name)

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort or cat.ordered:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered
            )
            return cat.codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            uniques = self.grouping_vector.result_arraylike
        else:
            # GH35667, replace dropna=False with na_sentinel=None
            if not self._dropna:
                na_sentinel = None
            else:
                na_sentinel = -1
            codes, uniques = algorithms.factorize(
                self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel
            )
        return codes, uniques

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))


def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: int = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    mutated: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.

    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        #  MultiIndex instance check is removable, but it seems that there are
        #  some processes only for non-MultiIndex in else-block,
        #  eg. `obj.index.name != level`. We have to consider carefully whether
        #  these are applicable for MultiIndex. Even if these are applicable,
        #  we need to check if it makes no side effect to subsequent processes
        #  on the outside of this condition.
        #  (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. Int64Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            #  lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            #  e.g. DatetimeIndex.get_loc(tuple())
            return False

    for gpr, level in zip(keys, levels):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = False
        else:
            in_axis = False

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    elif len(groupings) == 0:
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(
        group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna
    )
    return grouper, frozenset(exclusions), obj


def _is_label_like(val) -> bool:
    return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))


def _convert_grouper(axis: Index, grouper):
    if isinstance(grouper, dict):
        return grouper.get
    elif isinstance(grouper, Series):
        if grouper.index.equals(axis):
            return grouper._values
        else:
            return grouper.reindex(axis)._values
    elif isinstance(grouper, MultiIndex):
        return grouper._values
    elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
        if len(grouper) != len(axis):
            raise ValueError("Grouper and axis must be same length")

        if isinstance(grouper, (list, tuple)):
            grouper = com.asarray_tuplesafe(grouper)
        return grouper
    else:
        return grouper


def _check_deprecated_resample_kwargs(kwargs, origin):
    """
    Check for use of deprecated parameters in ``resample`` and related functions.

    Raises the appropriate warnings if these parameters are detected.
    Only sets an approximate ``stacklevel`` for the warnings (see #37603, #36629).

    Parameters
    ----------
    kwargs : dict
        Dictionary of keyword arguments to check for deprecated parameters.
    origin : object
        From where this function is being called; either Grouper or TimeGrouper. Used
        to determine an approximate stacklevel.
    """
    # Deprecation warning of `base` and `loffset` since v1.1.0:
    # we are raising the warning here to be able to set the `stacklevel`
    # properly since we need to raise the `base` and `loffset` deprecation
    # warning from three different cases:
    #   core/generic.py::NDFrame.resample
    #   core/groupby/groupby.py::GroupBy.resample
    #   core/groupby/grouper.py::Grouper
    # raising these warnings from TimeGrouper directly would fail the test:
    #   tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base

    if kwargs.get("base", None) is not None:
        warnings.warn(
            "'base' in .resample() and in Grouper() is deprecated.\n"
            "The new arguments that you should use are 'offset' or 'origin'.\n"
            '\n>>> df.resample(freq="3s", base=2)\n'
            "\nbecomes:\n"
            '\n>>> df.resample(freq="3s", offset="2s")\n',
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    if kwargs.get("loffset", None) is not None:
        warnings.warn(
            "'loffset' in .resample() and in Grouper() is deprecated.\n"
            '\n>>> df.resample(freq="3s", loffset="8H")\n'
            "\nbecomes:\n"
            "\n>>> from pandas.tseries.frequencies import to_offset"
            '\n>>> df = df.resample(freq="3s").mean()'
            '\n>>> df.index = df.index.to_timestamp() + to_offset("8H")\n',
            FutureWarning,
            stacklevel=find_stack_level(),
        )
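Editorial note (not part of the commit): one normalization step above that is easy to miss is ``_convert_grouper`` turning a dict grouper into its ``.get`` method, which is then mapped over the index. A minimal public-API sketch of that behavior, assuming a standard pandas install:

# A dict key is treated as a mapping from index labels to group labels;
# internally the dict becomes mapping.get applied over the index.
import pandas as pd

s = pd.Series([1, 2, 3], index=["x", "y", "z"])
mapping = {"x": "odd", "y": "even", "z": "odd"}
print(s.groupby(mapping).sum())  # even -> 2, odd -> 4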
303
.venv/Lib/site-packages/pandas/core/groupby/indexing.py
Normal file
@@ -0,0 +1,303 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Iterable,
    Literal,
    cast,
)

import numpy as np

from pandas._typing import PositionalIndexer
from pandas.util._decorators import (
    cache_readonly,
    doc,
)

from pandas.core.dtypes.common import (
    is_integer,
    is_list_like,
)

if TYPE_CHECKING:
    from pandas import (
        DataFrame,
        Series,
    )
    from pandas.core.groupby import groupby


class GroupByIndexingMixin:
    """
    Mixin for adding ._positional_selector to GroupBy.
    """

    @cache_readonly
    def _positional_selector(self) -> GroupByPositionalSelector:
        """
        Return positional selection for each group.

        ``groupby._positional_selector[i:j]`` is similar to
        ``groupby.apply(lambda x: x.iloc[i:j])``
        but much faster and preserves the original index and order.

        ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head`
        and :meth:`~GroupBy.tail`. For example:

        - ``head(5)``
        - ``_positional_selector[5:-5]``
        - ``tail(5)``

        together return all the rows.

        Allowed inputs for the index are:

        - An integer valued iterable, e.g. ``range(2, 4)``.
        - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``.

        The output format is the same as :meth:`~GroupBy.head` and
        :meth:`~GroupBy.tail`, namely
        a subset of the ``DataFrame`` or ``Series`` with the index and order preserved.

        Returns
        -------
        Series
            The filtered subset of the original Series.
        DataFrame
            The filtered subset of the original DataFrame.

        See Also
        --------
        DataFrame.iloc : Purely integer-location based indexing for selection by
            position.
        GroupBy.head : Return first n rows of each group.
        GroupBy.tail : Return last n rows of each group.
        GroupBy.nth : Take the nth row from each group if n is an int, or a
            subset of rows, if n is a list of ints.

        Notes
        -----
        - The slice step cannot be negative.
        - If the index specification results in overlaps, the item is not duplicated.
        - If the index specification changes the order of items, then
          they are returned in their original order.
          By contrast, ``DataFrame.iloc`` can change the row order.
        - ``groupby()`` parameters such as as_index and dropna are ignored.

        The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth`
        with ``as_index=False`` are:

        - Input to ``_positional_selector`` can include
          one or more slices whereas ``nth``
          just handles an integer or a list of integers.
        - ``_positional_selector`` can accept a slice relative to the
          last row of each group.
        - ``_positional_selector`` does not have an equivalent to the
          ``nth()`` ``dropna`` parameter.

        Examples
        --------
        >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
        ...                   columns=["A", "B"])
        >>> df.groupby("A")._positional_selector[1:2]
           A  B
        1  a  2
        4  b  5

        >>> df.groupby("A")._positional_selector[1, -1]
           A  B
        1  a  2
        2  a  3
        4  b  5
        """
        if TYPE_CHECKING:
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return GroupByPositionalSelector(groupby_self)

    def _make_mask_from_positional_indexer(
        self,
        arg: PositionalIndexer | tuple,
    ) -> np.ndarray:
        if is_list_like(arg):
            if all(is_integer(i) for i in cast(Iterable, arg)):
                mask = self._make_mask_from_list(cast(Iterable[int], arg))
            else:
                mask = self._make_mask_from_tuple(cast(tuple, arg))

        elif isinstance(arg, slice):
            mask = self._make_mask_from_slice(arg)
        elif is_integer(arg):
            mask = self._make_mask_from_int(cast(int, arg))
        else:
            raise TypeError(
                f"Invalid index {type(arg)}. "
                "Must be integer, list-like, slice or a tuple of "
                "integers and slices"
            )

        if isinstance(mask, bool):
            if mask:
                mask = self._ascending_count >= 0
            else:
                mask = self._ascending_count < 0

        return cast(np.ndarray, mask)

    def _make_mask_from_int(self, arg: int) -> np.ndarray:
        if arg >= 0:
            return self._ascending_count == arg
        else:
            return self._descending_count == (-arg - 1)

    def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray:
        positive = [arg for arg in args if arg >= 0]
        negative = [-arg - 1 for arg in args if arg < 0]

        mask: bool | np.ndarray = False

        if positive:
            mask |= np.isin(self._ascending_count, positive)

        if negative:
            mask |= np.isin(self._descending_count, negative)

        return mask

    def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray:
        mask: bool | np.ndarray = False

        for arg in args:
            if is_integer(arg):
                mask |= self._make_mask_from_int(cast(int, arg))
            elif isinstance(arg, slice):
                mask |= self._make_mask_from_slice(arg)
            else:
                raise ValueError(
                    f"Invalid argument {type(arg)}. Should be int or slice."
                )

        return mask

    def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray:
        start = arg.start
        stop = arg.stop
        step = arg.step

        if step is not None and step < 0:
            raise ValueError(f"Invalid step {step}. Must be non-negative")

        mask: bool | np.ndarray = True

        if step is None:
            step = 1

        if start is None:
            if step > 1:
                mask &= self._ascending_count % step == 0

        elif start >= 0:
            mask &= self._ascending_count >= start

            if step > 1:
                mask &= (self._ascending_count - start) % step == 0

        else:
            mask &= self._descending_count < -start

            offset_array = self._descending_count + start + 1
            limit_array = (
                self._ascending_count + self._descending_count + (start + 1)
            ) < 0
            offset_array = np.where(limit_array, self._ascending_count, offset_array)

            mask &= offset_array % step == 0

        if stop is not None:
            if stop >= 0:
                mask &= self._ascending_count < stop
            else:
                mask &= self._descending_count >= -stop

        return mask

    @cache_readonly
    def _ascending_count(self) -> np.ndarray:
        if TYPE_CHECKING:
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return groupby_self._cumcount_array()

    @cache_readonly
    def _descending_count(self) -> np.ndarray:
        if TYPE_CHECKING:
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return groupby_self._cumcount_array(ascending=False)


@doc(GroupByIndexingMixin._positional_selector)
class GroupByPositionalSelector:
    def __init__(self, groupby_object: groupby.GroupBy):
        self.groupby_object = groupby_object

    def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series:
        """
        Select by positional index per group.

        Implements GroupBy._positional_selector

        Parameters
        ----------
        arg : PositionalIndexer | tuple
            Allowed values are:
            - int
            - int valued iterable such as list or range
            - slice with step either None or positive
            - tuple of integers and slices

        Returns
        -------
        Series
            The filtered subset of the original groupby Series.
        DataFrame
            The filtered subset of the original groupby DataFrame.

        See Also
        --------
        DataFrame.iloc : Integer-location based indexing for selection by position.
        GroupBy.head : Return first n rows of each group.
        GroupBy.tail : Return last n rows of each group.
        GroupBy._positional_selector : Return positional selection for each group.
        GroupBy.nth : Take the nth row from each group if n is an int, or a
            subset of rows, if n is a list of ints.
        """
        self.groupby_object._reset_group_selection()
        mask = self.groupby_object._make_mask_from_positional_indexer(arg)
        return self.groupby_object._mask_selected_obj(mask)


class GroupByNthSelector:
    """
    Dynamically substituted for GroupBy.nth to enable both call and index
    """

    def __init__(self, groupby_object: groupby.GroupBy):
        self.groupby_object = groupby_object

    def __call__(
        self,
        n: PositionalIndexer | tuple,
        dropna: Literal["any", "all", None] = None,
    ) -> DataFrame | Series:
        return self.groupby_object.nth_actual(n, dropna)

    def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series:
        return self.groupby_object.nth_actual(n)
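Editorial note (not part of the commit): a small sketch of the head/tail/_positional_selector complementarity described in the Notes above. It uses the private selector this file defines, under this vendored pandas version, with group sizes large enough that the three selections do not overlap:

# head(5), [5:-5], and tail(5) together cover every row exactly once
# when each group has at least 10 rows.
import pandas as pd

df = pd.DataFrame({"A": ["a"] * 12 + ["b"] * 12, "B": range(24)})
g = df.groupby("A")

parts = [g.head(5), g._positional_selector[5:-5], g.tail(5)]
recombined = pd.concat(parts).sort_index()
assert recombined.equals(df)  # together they return all the rows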
184
.venv/Lib/site-packages/pandas/core/groupby/numba_.py
Normal file
@@ -0,0 +1,184 @@
"""Common utilities for Numba operations with groupby ops"""
from __future__ import annotations

import inspect
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

import numpy as np

from pandas._typing import Scalar
from pandas.compat._optional import import_optional_dependency

from pandas.core.util.numba_ import (
    NUMBA_FUNC_CACHE,
    NumbaUtilError,
    get_jit_arguments,
    jit_user_function,
)


def validate_udf(func: Callable) -> None:
    """
    Validate user defined function for ops when using Numba with groupby ops.

    The first signature arguments should include:

    def f(values, index, ...):
        ...

    Parameters
    ----------
    func : function, default False
        user defined function

    Returns
    -------
    None

    Raises
    ------
    NumbaUtilError
    """
    udf_signature = list(inspect.signature(func).parameters.keys())
    expected_args = ["values", "index"]
    min_number_args = len(expected_args)
    if (
        len(udf_signature) < min_number_args
        or udf_signature[:min_number_args] != expected_args
    ):
        raise NumbaUtilError(
            f"The first {min_number_args} arguments to {func.__name__} must be "
            f"{expected_args}"
        )


def generate_numba_agg_func(
    kwargs: dict[str, Any],
    func: Callable[..., Scalar],
    engine_kwargs: dict[str, bool] | None,
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
    """
    Generate a numba jitted agg function specified by values from engine_kwargs.

    1. jit the user's function
    2. Return a groupby agg function with the jitted function inline

    Configurations specified in engine_kwargs apply to both the user's
    function _AND_ the groupby evaluation loop.

    Parameters
    ----------
    kwargs : dict
        **kwargs to be passed into the function
    func : function
        function to be applied to each window and will be JITed
    engine_kwargs : dict
        dictionary of arguments to be passed into numba.jit

    Returns
    -------
    Numba function
    """
    nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs)

    validate_udf(func)
    cache_key = (func, "groupby_agg")
    if cache_key in NUMBA_FUNC_CACHE:
        return NUMBA_FUNC_CACHE[cache_key]

    numba_func = jit_user_function(func, nopython, nogil, parallel)
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def group_agg(
        values: np.ndarray,
        index: np.ndarray,
        begin: np.ndarray,
        end: np.ndarray,
        num_columns: int,
        *args: Any,
    ) -> np.ndarray:

        assert len(begin) == len(end)
        num_groups = len(begin)

        result = np.empty((num_groups, num_columns))
        for i in numba.prange(num_groups):
            group_index = index[begin[i] : end[i]]
            for j in numba.prange(num_columns):
                group = values[begin[i] : end[i], j]
                result[i, j] = numba_func(group, group_index, *args)
        return result

    return group_agg


def generate_numba_transform_func(
    kwargs: dict[str, Any],
    func: Callable[..., np.ndarray],
    engine_kwargs: dict[str, bool] | None,
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
    """
    Generate a numba jitted transform function specified by values from engine_kwargs.

    1. jit the user's function
    2. Return a groupby transform function with the jitted function inline

    Configurations specified in engine_kwargs apply to both the user's
    function _AND_ the groupby evaluation loop.

    Parameters
    ----------
    kwargs : dict
        **kwargs to be passed into the function
    func : function
        function to be applied to each window and will be JITed
    engine_kwargs : dict
        dictionary of arguments to be passed into numba.jit

    Returns
    -------
    Numba function
    """
    nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs)

    validate_udf(func)
    cache_key = (func, "groupby_transform")
    if cache_key in NUMBA_FUNC_CACHE:
        return NUMBA_FUNC_CACHE[cache_key]

    numba_func = jit_user_function(func, nopython, nogil, parallel)
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def group_transform(
        values: np.ndarray,
        index: np.ndarray,
        begin: np.ndarray,
        end: np.ndarray,
        num_columns: int,
        *args: Any,
    ) -> np.ndarray:

        assert len(begin) == len(end)
        num_groups = len(begin)

        result = np.empty((len(values), num_columns))
        for i in numba.prange(num_groups):
            group_index = index[begin[i] : end[i]]
            for j in numba.prange(num_columns):
                group = values[begin[i] : end[i], j]
                result[begin[i] : end[i], j] = numba_func(group, group_index, *args)
        return result

    return group_transform
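Editorial note (not part of the commit): ``validate_udf`` above is why a numba-engine UDF must name its first two parameters ``values`` and ``index``. A minimal sketch through the public engine="numba" path, assuming a standard pandas install with numba available:

# The first two arguments must be named exactly "values" and "index",
# or validate_udf raises NumbaUtilError.
import pandas as pd

def group_mean(values, index):
    return values.mean()

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, 2.0, 3.0]})
print(df.groupby("key").agg(group_mean, engine="numba"))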
1272
.venv/Lib/site-packages/pandas/core/groupby/ops.py
Normal file
File diff suppressed because it is too large