first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

pandas/core/window/__init__.py

@@ -0,0 +1,13 @@
from pandas.core.window.ewm import (  # noqa:F401
    ExponentialMovingWindow,
    ExponentialMovingWindowGroupby,
)
from pandas.core.window.expanding import (  # noqa:F401
    Expanding,
    ExpandingGroupby,
)
from pandas.core.window.rolling import (  # noqa:F401
    Rolling,
    RollingGroupby,
    Window,
)
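
These re-exports define the public surface of ``pandas.core.window``; the Series/DataFrame accessors return instances of them. A minimal sketch of how they surface (not part of the diff; assumes a standard pandas 1.x install, with SciPy available for the ``win_type`` case):

import numpy as np
import pandas as pd
from pandas.core.window import ExponentialMovingWindow, Rolling, Window

s = pd.Series([0.0, 1.0, 2.0, np.nan, 4.0])
assert isinstance(s.rolling(2), Rolling)                    # unweighted rolling window
assert isinstance(s.rolling(2, win_type="triang"), Window)  # SciPy-weighted window
assert isinstance(s.ewm(alpha=0.5), ExponentialMovingWindow)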

pandas/core/window/common.py

@@ -0,0 +1,167 @@
"""Common utility functions for rolling operations"""
from collections import defaultdict
from typing import cast

import numpy as np

from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.indexes.api import MultiIndex


def flex_binary_moment(arg1, arg2, f, pairwise=False):
    if isinstance(arg1, ABCSeries) and isinstance(arg2, ABCSeries):
        X, Y = prep_binary(arg1, arg2)
        return f(X, Y)

    elif isinstance(arg1, ABCDataFrame):
        from pandas import DataFrame

        def dataframe_from_int_dict(data, frame_template):
            result = DataFrame(data, index=frame_template.index)
            if len(result.columns) > 0:
                result.columns = frame_template.columns[result.columns]
            return result

        results = {}
        if isinstance(arg2, ABCDataFrame):
            if pairwise is False:
                if arg1 is arg2:
                    # special case in order to handle duplicate column names
                    for i in range(len(arg1.columns)):
                        results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
                    return dataframe_from_int_dict(results, arg1)
                else:
                    if not arg1.columns.is_unique:
                        raise ValueError("'arg1' columns are not unique")
                    if not arg2.columns.is_unique:
                        raise ValueError("'arg2' columns are not unique")
                    X, Y = arg1.align(arg2, join="outer")
                    X, Y = prep_binary(X, Y)
                    res_columns = arg1.columns.union(arg2.columns)
                    for col in res_columns:
                        if col in X and col in Y:
                            results[col] = f(X[col], Y[col])
                    return DataFrame(results, index=X.index, columns=res_columns)
            elif pairwise is True:
                results = defaultdict(dict)
                for i in range(len(arg1.columns)):
                    for j in range(len(arg2.columns)):
                        if j < i and arg2 is arg1:
                            # Symmetric case
                            results[i][j] = results[j][i]
                        else:
                            results[i][j] = f(
                                *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])
                            )

                from pandas import concat

                result_index = arg1.index.union(arg2.index)
                if len(result_index):
                    # construct result frame
                    result = concat(
                        [
                            concat(
                                [results[i][j] for j in range(len(arg2.columns))],
                                ignore_index=True,
                            )
                            for i in range(len(arg1.columns))
                        ],
                        ignore_index=True,
                        axis=1,
                    )
                    result.columns = arg1.columns

                    # set the index and reorder
                    if arg2.columns.nlevels > 1:
                        # mypy needs to know columns is a MultiIndex, Index doesn't
                        # have levels attribute
                        arg2.columns = cast(MultiIndex, arg2.columns)
                        # GH 21157: Equivalent to MultiIndex.from_product(
                        #  [result_index], <unique combinations of arg2.columns.levels>,
                        # )
                        # A normal MultiIndex.from_product will produce too many
                        # combinations.
                        result_level = np.tile(
                            result_index, len(result) // len(result_index)
                        )
                        arg2_levels = (
                            np.repeat(
                                arg2.columns.get_level_values(i),
                                len(result) // len(arg2.columns),
                            )
                            for i in range(arg2.columns.nlevels)
                        )
                        result_names = list(arg2.columns.names) + [result_index.name]
                        result.index = MultiIndex.from_arrays(
                            [*arg2_levels, result_level], names=result_names
                        )
                        # GH 34440
                        num_levels = len(result.index.levels)
                        new_order = [num_levels - 1] + list(range(num_levels - 1))
                        result = result.reorder_levels(new_order).sort_index()
                    else:
                        result.index = MultiIndex.from_product(
                            [range(len(arg2.columns)), range(len(result_index))]
                        )
                        result = result.swaplevel(1, 0).sort_index()
                        result.index = MultiIndex.from_product(
                            [result_index] + [arg2.columns]
                        )
                else:
                    # empty result
                    result = DataFrame(
                        index=MultiIndex(
                            levels=[arg1.index, arg2.columns], codes=[[], []]
                        ),
                        columns=arg2.columns,
                        dtype="float64",
                    )

                # reset our index names to arg1 names
                # reset our column names to arg2 names
                # careful not to mutate the original names
                result.columns = result.columns.set_names(arg1.columns.names)
                result.index = result.index.set_names(
                    result_index.names + arg2.columns.names
                )
                return result

        else:
            results = {
                i: f(*prep_binary(arg1.iloc[:, i], arg2))
                for i in range(len(arg1.columns))
            }
            return dataframe_from_int_dict(results, arg1)

    else:
        return flex_binary_moment(arg2, arg1, f)


def zsqrt(x):
    with np.errstate(all="ignore"):
        result = np.sqrt(x)
        mask = x < 0

    if isinstance(x, ABCDataFrame):
        if mask._values.any():
            result[mask] = 0
    else:
        if mask.any():
            result[mask] = 0

    return result


def prep_binary(arg1, arg2):
    # mask out values, this also makes a common index...
    X = arg1 + 0 * arg2
    Y = arg2 + 0 * arg1
    return X, Y
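
The three helpers above underpin pandas' pairwise ``cov``/``corr`` machinery. A short usage sketch (assuming pandas 1.x, where this module is importable as ``pandas.core.window.common``; it is internal API, so subject to change):

import numpy as np
import pandas as pd
from pandas.core.window.common import flex_binary_moment, prep_binary, zsqrt

a = pd.Series([1.0, 2.0, 3.0], index=[0, 1, 2])
b = pd.Series([2.0, 4.0, 6.0], index=[1, 2, 3])

# prep_binary aligns both inputs on the union index and masks each with the
# other's missing labels: only labels 1 and 2 keep values on both sides.
X, Y = prep_binary(a, b)

# flex_binary_moment applies a binary function to the aligned pair; DataFrame
# inputs recurse into the column-wise and pairwise branches above.
print(flex_binary_moment(a, b, lambda x, y: x.cov(y)))  # covariance over labels 1-2

# zsqrt is a sqrt that clamps negative inputs to 0 instead of returning NaN.
print(zsqrt(pd.Series([4.0, -1.0])))  # [2.0, 0.0]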

pandas/core/window/doc.py

@@ -0,0 +1,125 @@
"""Any shareable docstring components for rolling/expanding/ewm"""
from textwrap import dedent

from pandas.core.shared_docs import _shared_docs

_shared_docs = dict(**_shared_docs)


def create_section_header(header: str) -> str:
    """Create numpydoc section header"""
    return "\n".join((header, "-" * len(header))) + "\n"


template_header = "\nCalculate the {window_method} {aggregation_description}.\n\n"

template_returns = dedent(
    """
    Series or DataFrame
        Return type is the same as the original object with ``np.float64`` dtype.\n
    """
).replace("\n", "", 1)

template_see_also = dedent(
    """
    pandas.Series.{window_method} : Calling {window_method} with Series data.
    pandas.DataFrame.{window_method} : Calling {window_method} with DataFrames.
    pandas.Series.{agg_method} : Aggregating {agg_method} for Series.
    pandas.DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n
    """
).replace("\n", "", 1)

args_compat = dedent(
    """
    *args
        For NumPy compatibility and will not have an effect on the result.\n
    """
).replace("\n", "", 1)

kwargs_compat = dedent(
    """
    **kwargs
        For NumPy compatibility and will not have an effect on the result.\n
    """
).replace("\n", "", 1)

kwargs_scipy = dedent(
    """
    **kwargs
        Keyword arguments to configure the ``SciPy`` weighted window type.\n
    """
).replace("\n", "", 1)

window_apply_parameters = dedent(
    """
    func : function
        Must produce a single value from an ndarray input if ``raw=True``
        or a single value from a Series if ``raw=False``. Can also accept a
        Numba JIT function with ``engine='numba'`` specified.

        .. versionchanged:: 1.0.0

    raw : bool, default False
        * ``False`` : passes each row or column as a Series to the
          function.
        * ``True`` : the passed function will receive ndarray
          objects instead.
          If you are just applying a NumPy reduction function this will
          achieve much better performance.

    engine : str, default None
        * ``'cython'`` : Runs rolling apply through C-extensions from cython.
        * ``'numba'`` : Runs rolling apply through JIT compiled code from numba.
          Only available when ``raw`` is set to ``True``.
        * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``

          .. versionadded:: 1.0.0

    engine_kwargs : dict, default None
        * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
        * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
          and ``parallel`` dictionary keys. The values must either be ``True`` or
          ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
          ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
          applied to both the ``func`` and the ``apply`` rolling aggregation.

          .. versionadded:: 1.0.0

    args : tuple, default None
        Positional arguments to be passed into func.

    kwargs : dict, default None
        Keyword arguments to be passed into func.\n
    """
).replace("\n", "", 1)

numba_notes = (
    "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for "
    "extended documentation and performance considerations for the Numba engine.\n\n"
)


def window_agg_numba_parameters(version: str = "1.3") -> str:
    return (
        dedent(
            """
        engine : str, default None
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``

              .. versionadded:: {version}.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``

              .. versionadded:: {version}.0\n
        """
        )
        .replace("\n", "", 1)
        .replace("{version}", version)
    )
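
The ``@doc`` decorator (used throughout expanding.py below) concatenates these components and substitutes the keyword placeholders. A sketch of the equivalent manual assembly, using only names defined in this file:

from pandas.core.window.doc import (
    create_section_header,
    template_header,
    template_returns,
    template_see_also,
)

docstring = (
    template_header
    + create_section_header("Returns")   # "Returns\n-------\n"
    + template_returns
    + create_section_header("See Also")
    + template_see_also
).format(
    window_method="expanding",
    aggregation_description="sum",
    agg_method="sum",
)
print(docstring)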

pandas/core/window/ewm.py (file diff suppressed because it is too large)

pandas/core/window/expanding.py

@@ -0,0 +1,807 @@
from __future__ import annotations

from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

from pandas._typing import (
    Axis,
    WindowingRankType,
)

if TYPE_CHECKING:
    from pandas import DataFrame, Series
    from pandas.core.generic import NDFrame

from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

from pandas.core.indexers.objects import (
    BaseIndexer,
    ExpandingIndexer,
    GroupbyIndexer,
)
from pandas.core.window.doc import (
    _shared_docs,
    args_compat,
    create_section_header,
    kwargs_compat,
    numba_notes,
    template_header,
    template_returns,
    template_see_also,
    window_agg_numba_parameters,
    window_apply_parameters,
)
from pandas.core.window.rolling import (
    BaseWindowGroupby,
    RollingAndExpandingMixin,
)


class Expanding(RollingAndExpandingMixin):
    """
    Provide expanding window calculations.

    Parameters
    ----------
    min_periods : int, default 1
        Minimum number of observations in window required to have a value;
        otherwise, result is ``np.nan``.
    center : bool, default False
        If False, set the window labels as the right edge of the window index.

        If True, set the window labels as the center of the window index.

        .. deprecated:: 1.1.0

    axis : int or str, default 0
        If ``0`` or ``'index'``, roll across the rows.

        If ``1`` or ``'columns'``, roll across the columns.
    method : str {'single', 'table'}, default 'single'
        Execute the rolling operation per single column or row (``'single'``)
        or over the entire object (``'table'``).

        This argument is only implemented when specifying ``engine='numba'``
        in the method call.

        .. versionadded:: 1.3.0

    Returns
    -------
    ``Expanding`` subclass

    See Also
    --------
    rolling : Provides rolling window calculations.
    ewm : Provides exponential weighted functions.

    Notes
    -----
    See :ref:`Windowing Operations <window.expanding>` for further usage details
    and examples.

    Examples
    --------
    >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
    >>> df
         B
    0  0.0
    1  1.0
    2  2.0
    3  NaN
    4  4.0

    **min_periods**

    Expanding sum with 1 vs 3 observations needed to calculate a value.

    >>> df.expanding(1).sum()
         B
    0  0.0
    1  1.0
    2  3.0
    3  3.0
    4  7.0
    >>> df.expanding(3).sum()
         B
    0  NaN
    1  NaN
    2  3.0
    3  3.0
    4  7.0
    """

    _attributes: list[str] = ["min_periods", "center", "axis", "method"]

    def __init__(
        self,
        obj: NDFrame,
        min_periods: int = 1,
        center=None,
        axis: Axis = 0,
        method: str = "single",
        selection=None,
    ):
        super().__init__(
            obj=obj,
            min_periods=min_periods,
            center=center,
            axis=axis,
            method=method,
            selection=selection,
        )

    def _get_window_indexer(self) -> BaseIndexer:
        """
        Return an indexer class that will compute the window start and end bounds
        """
        return ExpandingIndexer()

    @doc(
        _shared_docs["aggregate"],
        see_also=dedent(
            """
        See Also
        --------
        pandas.DataFrame.aggregate : Similar DataFrame method.
        pandas.Series.aggregate : Similar Series method.
        """
        ),
        examples=dedent(
            """
        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
        >>> df
           A  B  C
        0  1  4  7
        1  2  5  8
        2  3  6  9

        >>> df.ewm(alpha=0.5).mean()
                  A         B         C
        0  1.000000  4.000000  7.000000
        1  1.666667  4.666667  7.666667
        2  2.428571  5.428571  8.428571
        """
        ),
        klass="Series/DataFrame",
        axis="",
    )
    def aggregate(self, func, *args, **kwargs):
        return super().aggregate(func, *args, **kwargs)

    agg = aggregate

    @doc(
        template_header,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="count of non NaN observations",
        agg_method="count",
    )
    def count(self):
        return super().count()

    @doc(
        template_header,
        create_section_header("Parameters"),
        window_apply_parameters,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="custom aggregation function",
        agg_method="apply",
    )
    def apply(
        self,
        func: Callable[..., Any],
        raw: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        args: tuple[Any, ...] | None = None,
        kwargs: dict[str, Any] | None = None,
    ):
        return super().apply(
            func,
            raw=raw,
            engine=engine,
            engine_kwargs=engine_kwargs,
            args=args,
            kwargs=kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        args_compat,
        window_agg_numba_parameters(),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="sum",
        agg_method="sum",
    )
    def sum(
        self,
        *args,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        **kwargs,
    ):
        nv.validate_expanding_func("sum", args, kwargs)
        return super().sum(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        args_compat,
        window_agg_numba_parameters(),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="maximum",
        agg_method="max",
    )
    def max(
        self,
        *args,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        **kwargs,
    ):
        nv.validate_expanding_func("max", args, kwargs)
        return super().max(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        args_compat,
        window_agg_numba_parameters(),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="minimum",
        agg_method="min",
    )
    def min(
        self,
        *args,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        **kwargs,
    ):
        nv.validate_expanding_func("min", args, kwargs)
        return super().min(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        args_compat,
        window_agg_numba_parameters(),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="mean",
        agg_method="mean",
    )
    def mean(
        self,
        *args,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        **kwargs,
    ):
        nv.validate_expanding_func("mean", args, kwargs)
        return super().mean(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        window_agg_numba_parameters(),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="median",
        agg_method="median",
    )
    def median(
        self,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        **kwargs,
    ):
        return super().median(engine=engine, engine_kwargs=engine_kwargs, **kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        args_compat,
        window_agg_numba_parameters("1.4"),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "numpy.std : Equivalent method for NumPy array.\n",
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        The default ``ddof`` of 1 used in :meth:`Series.std` is different
        than the default ``ddof`` of 0 in :func:`numpy.std`.

        A minimum of one period is required for the calculation.\n
        """
        ).replace("\n", "", 1),
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])

        >>> s.expanding(3).std()
        0         NaN
        1         NaN
        2    0.577350
        3    0.957427
        4    0.894427
        5    0.836660
        6    0.786796
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="standard deviation",
        agg_method="std",
    )
    def std(
        self,
        ddof: int = 1,
        *args,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        **kwargs,
    ):
        nv.validate_expanding_func("std", args, kwargs)
        return super().std(
            ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        args_compat,
        window_agg_numba_parameters("1.4"),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "numpy.var : Equivalent method for NumPy array.\n",
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        The default ``ddof`` of 1 used in :meth:`Series.var` is different
        than the default ``ddof`` of 0 in :func:`numpy.var`.

        A minimum of one period is required for the calculation.\n
        """
        ).replace("\n", "", 1),
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])

        >>> s.expanding(3).var()
        0         NaN
        1         NaN
        2    0.333333
        3    0.916667
        4    0.800000
        5    0.700000
        6    0.619048
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="variance",
        agg_method="var",
    )
    def var(
        self,
        ddof: int = 1,
        *args,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        **kwargs,
    ):
        nv.validate_expanding_func("var", args, kwargs)
        return super().var(
            ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        args_compat,
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        "A minimum of one period is required for the calculation.\n\n",
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([0, 1, 2, 3])

        >>> s.expanding().sem()
        0         NaN
        1    0.707107
        2    0.707107
        3    0.745356
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="standard error of mean",
        agg_method="sem",
    )
    def sem(self, ddof: int = 1, *args, **kwargs):
        return super().sem(ddof=ddof, **kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "scipy.stats.skew : Third moment of a probability density.\n",
        template_see_also,
        create_section_header("Notes"),
        "A minimum of three periods is required for the calculation.\n",
        window_method="expanding",
        aggregation_description="unbiased skewness",
        agg_method="skew",
    )
    def skew(self, **kwargs):
        return super().skew(**kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "scipy.stats.kurtosis : Reference SciPy method.\n",
        template_see_also,
        create_section_header("Notes"),
        "A minimum of four periods is required for the calculation.\n\n",
        create_section_header("Examples"),
        dedent(
            """
        The example below will show an expanding calculation with a minimum of
        four periods matching the equivalent function call using `scipy.stats`.

        >>> arr = [1, 2, 3, 4, 999]
        >>> import scipy.stats
        >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}")
        -1.200000
        >>> print(f"{{scipy.stats.kurtosis(arr, bias=False):.6f}}")
        4.999874
        >>> s = pd.Series(arr)
        >>> s.expanding(4).kurt()
        0         NaN
        1         NaN
        2         NaN
        3   -1.200000
        4    4.999874
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="Fisher's definition of kurtosis without bias",
        agg_method="kurt",
    )
    def kurt(self, **kwargs):
        return super().kurt(**kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        quantile : float
            Quantile to compute. 0 <= quantile <= 1.
        interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

                * linear: `i + (j - i) * fraction`, where `fraction` is the
                  fractional part of the index surrounded by `i` and `j`.
                * lower: `i`.
                * higher: `j`.
                * nearest: `i` or `j` whichever is nearest.
                * midpoint: (`i` + `j`) / 2.
        """
        ).replace("\n", "", 1),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="quantile",
        agg_method="quantile",
    )
    def quantile(
        self,
        quantile: float,
        interpolation: str = "linear",
        **kwargs,
    ):
        return super().quantile(
            quantile=quantile,
            interpolation=interpolation,
            **kwargs,
        )

    @doc(
        template_header,
        ".. versionadded:: 1.4.0\n\n",
        create_section_header("Parameters"),
        dedent(
            """
        method : {{'average', 'min', 'max'}}, default 'average'
            How to rank the group of records that have the same value (i.e. ties):

            * average: average rank of the group
            * min: lowest rank in the group
            * max: highest rank in the group

        ascending : bool, default True
            Whether or not the elements should be ranked in ascending order.
        pct : bool, default False
            Whether or not to display the returned rankings in percentile
            form.
        """
        ).replace("\n", "", 1),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([1, 4, 2, 3, 5, 3])
        >>> s.expanding().rank()
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    3.5
        dtype: float64

        >>> s.expanding().rank(method="max")
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    4.0
        dtype: float64

        >>> s.expanding().rank(method="min")
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    3.0
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="rank",
        agg_method="rank",
    )
    def rank(
        self,
        method: WindowingRankType = "average",
        ascending: bool = True,
        pct: bool = False,
        **kwargs,
    ):
        return super().rank(
            method=method,
            ascending=ascending,
            pct=pct,
            **kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        other : Series or DataFrame, optional
            If not supplied then will default to self and produce pairwise
            output.
        pairwise : bool, default None
            If False then only matching columns between self and other will be
            used and the output will be a DataFrame.
            If True then all pairwise combinations will be calculated and the
            output will be a MultiIndexed DataFrame in the case of DataFrame
            inputs. In the case of missing elements, only complete pairwise
            observations will be used.
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.
        """
        ).replace("\n", "", 1),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="sample covariance",
        agg_method="cov",
    )
    def cov(
        self,
        other: DataFrame | Series | None = None,
        pairwise: bool | None = None,
        ddof: int = 1,
        **kwargs,
    ):
        return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs)

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        other : Series or DataFrame, optional
            If not supplied then will default to self and produce pairwise
            output.
        pairwise : bool, default None
            If False then only matching columns between self and other will be
            used and the output will be a DataFrame.
            If True then all pairwise combinations will be calculated and the
            output will be a MultiIndexed DataFrame in the case of DataFrame
            inputs. In the case of missing elements, only complete pairwise
            observations will be used.
        """
        ).replace("\n", "", 1),
        kwargs_compat,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        dedent(
            """
        cov : Similar method to calculate covariance.
        numpy.corrcoef : NumPy Pearson's correlation calculation.
        """
        ).replace("\n", "", 1),
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        This function uses Pearson's definition of correlation
        (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).

        When `other` is not specified, the output will be self correlation (e.g.
        all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise`
        set to `True`.

        Function will return ``NaN`` for correlations of equal valued sequences;
        this is the result of a 0/0 division error.

        When `pairwise` is set to `False`, only matching columns between `self` and
        `other` will be used.

        When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame
        with the original index on the first level, and the `other` DataFrame
        columns on the second level.

        In the case of missing elements, only complete pairwise observations
        will be used.
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="correlation",
        agg_method="corr",
    )
    def corr(
        self,
        other: DataFrame | Series | None = None,
        pairwise: bool | None = None,
        ddof: int = 1,
        **kwargs,
    ):
        return super().corr(other=other, pairwise=pairwise, ddof=ddof, **kwargs)


class ExpandingGroupby(BaseWindowGroupby, Expanding):
    """
    Provide an expanding groupby implementation.
    """

    _attributes = Expanding._attributes + BaseWindowGroupby._attributes

    def _get_window_indexer(self) -> GroupbyIndexer:
        """
        Return an indexer class that will compute the window start and end bounds

        Returns
        -------
        GroupbyIndexer
        """
        window_indexer = GroupbyIndexer(
            groupby_indices=self._grouper.indices,
            window_indexer=ExpandingIndexer,
        )
        return window_indexer
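
Neither class above is constructed directly; both come from the public accessors. A brief usage sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, 2.0, 3.0, 4.0]})

# Expanding: min_periods controls when results become non-NaN.
print(df["val"].expanding(min_periods=2).mean())   # NaN, 1.5, 2.0, 2.5

# ExpandingGroupby: the expanding window restarts within each group.
print(df.groupby("key")["val"].expanding().sum())  # a: 1.0, 3.0; b: 3.0, 7.0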

pandas/core/window/numba_.py

@@ -0,0 +1,364 @@
from __future__ import annotations

import functools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

import numpy as np

from pandas._typing import Scalar
from pandas.compat._optional import import_optional_dependency

from pandas.core.util.numba_ import (
    NUMBA_FUNC_CACHE,
    get_jit_arguments,
    jit_user_function,
)


def generate_numba_apply_func(
    kwargs: dict[str, Any],
    func: Callable[..., Scalar],
    engine_kwargs: dict[str, bool] | None,
    name: str,
):
    """
    Generate a numba jitted apply function specified by values from engine_kwargs.

    1. jit the user's function
    2. Return a rolling apply function with the jitted function inline

    Configurations specified in engine_kwargs apply to both the user's
    function _AND_ the rolling apply function.

    Parameters
    ----------
    kwargs : dict
        **kwargs to be passed into the function
    func : function
        function to be applied to each window and will be JITed
    engine_kwargs : dict
        dictionary of arguments to be passed into numba.jit
    name : str
        name of the caller (Rolling/Expanding)

    Returns
    -------
    Numba function
    """
    nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs)

    cache_key = (func, f"{name}_apply_single")
    if cache_key in NUMBA_FUNC_CACHE:
        return NUMBA_FUNC_CACHE[cache_key]

    numba_func = jit_user_function(func, nopython, nogil, parallel)
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def roll_apply(
        values: np.ndarray,
        begin: np.ndarray,
        end: np.ndarray,
        minimum_periods: int,
        *args: Any,
    ) -> np.ndarray:
        result = np.empty(len(begin))
        for i in numba.prange(len(result)):
            start = begin[i]
            stop = end[i]
            window = values[start:stop]
            count_nan = np.sum(np.isnan(window))
            if len(window) - count_nan >= minimum_periods:
                result[i] = numba_func(window, *args)
            else:
                result[i] = np.nan
        return result

    return roll_apply


def generate_numba_ewm_func(
    engine_kwargs: dict[str, bool] | None,
    com: float,
    adjust: bool,
    ignore_na: bool,
    deltas: np.ndarray,
    normalize: bool,
):
    """
    Generate a numba jitted ewm mean or sum function specified by values
    from engine_kwargs.

    Parameters
    ----------
    engine_kwargs : dict
        dictionary of arguments to be passed into numba.jit
    com : float
    adjust : bool
    ignore_na : bool
    deltas : numpy.ndarray
    normalize : bool

    Returns
    -------
    Numba function
    """
    nopython, nogil, parallel = get_jit_arguments(engine_kwargs)

    str_key = "ewm_mean" if normalize else "ewm_sum"
    cache_key = (lambda x: x, str_key)
    if cache_key in NUMBA_FUNC_CACHE:
        return NUMBA_FUNC_CACHE[cache_key]

    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def ewm(
        values: np.ndarray,
        begin: np.ndarray,
        end: np.ndarray,
        minimum_periods: int,
    ) -> np.ndarray:
        result = np.empty(len(values))
        alpha = 1.0 / (1.0 + com)
        old_wt_factor = 1.0 - alpha
        new_wt = 1.0 if adjust else alpha

        for i in numba.prange(len(begin)):
            start = begin[i]
            stop = end[i]
            window = values[start:stop]
            sub_result = np.empty(len(window))

            weighted = window[0]
            nobs = int(not np.isnan(weighted))
            sub_result[0] = weighted if nobs >= minimum_periods else np.nan
            old_wt = 1.0

            for j in range(1, len(window)):
                cur = window[j]
                is_observation = not np.isnan(cur)
                nobs += is_observation
                if not np.isnan(weighted):
                    if is_observation or not ignore_na:
                        if normalize:
                            # note that len(deltas) = len(vals) - 1 and deltas[i]
                            # is to be used in conjunction with vals[i+1]
                            old_wt *= old_wt_factor ** deltas[start + j - 1]
                        else:
                            weighted = old_wt_factor * weighted
                        if is_observation:
                            if normalize:
                                # avoid numerical errors on constant series
                                if weighted != cur:
                                    weighted = old_wt * weighted + new_wt * cur
                                    if normalize:
                                        weighted = weighted / (old_wt + new_wt)
                                if adjust:
                                    old_wt += new_wt
                                else:
                                    old_wt = 1.0
                            else:
                                weighted += cur
                elif is_observation:
                    weighted = cur
                sub_result[j] = weighted if nobs >= minimum_periods else np.nan

            result[start:stop] = sub_result

        return result

    return ewm


def generate_numba_table_func(
    kwargs: dict[str, Any],
    func: Callable[..., np.ndarray],
    engine_kwargs: dict[str, bool] | None,
    name: str,
):
    """
    Generate a numba jitted function to apply window calculations table-wise.

    Func will be passed an M (window size) x N (number of columns) array, and
    must return a 1 x N array. Func is intended to operate row-wise, but the
    result will be transposed for axis=1.

    1. jit the user's function
    2. Return a rolling apply function with the jitted function inline

    Parameters
    ----------
    kwargs : dict
        **kwargs to be passed into the function
    func : function
        function to be applied to each window and will be JITed
    engine_kwargs : dict
        dictionary of arguments to be passed into numba.jit
    name : str
        caller (Rolling/Expanding) and original method name for numba cache key

    Returns
    -------
    Numba function
    """
    nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs)

    cache_key = (func, f"{name}_table")
    if cache_key in NUMBA_FUNC_CACHE:
        return NUMBA_FUNC_CACHE[cache_key]

    numba_func = jit_user_function(func, nopython, nogil, parallel)
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def roll_table(
        values: np.ndarray,
        begin: np.ndarray,
        end: np.ndarray,
        minimum_periods: int,
        *args: Any,
    ):
        result = np.empty(values.shape)
        min_periods_mask = np.empty(values.shape)
        for i in numba.prange(len(result)):
            start = begin[i]
            stop = end[i]
            window = values[start:stop]
            count_nan = np.sum(np.isnan(window), axis=0)
            sub_result = numba_func(window, *args)
            nan_mask = len(window) - count_nan >= minimum_periods
            min_periods_mask[i, :] = nan_mask
            result[i, :] = sub_result
        result = np.where(min_periods_mask, result, np.nan)
        return result

    return roll_table


# This function will no longer be needed once numba supports
# axis for all np.nan* agg functions
# https://github.com/numba/numba/issues/1269
@functools.lru_cache(maxsize=None)
def generate_manual_numpy_nan_agg_with_axis(nan_func):
    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=True, nogil=True, parallel=True)
    def nan_agg_with_axis(table):
        result = np.empty(table.shape[1])
        for i in numba.prange(table.shape[1]):
            partition = table[:, i]
            result[i] = nan_func(partition)
        return result

    return nan_agg_with_axis


def generate_numba_ewm_table_func(
    engine_kwargs: dict[str, bool] | None,
    com: float,
    adjust: bool,
    ignore_na: bool,
    deltas: np.ndarray,
    normalize: bool,
):
    """
    Generate a numba jitted ewm mean or sum function applied table wise specified
    by values from engine_kwargs.

    Parameters
    ----------
    engine_kwargs : dict
        dictionary of arguments to be passed into numba.jit
    com : float
    adjust : bool
    ignore_na : bool
    deltas : numpy.ndarray
    normalize : bool

    Returns
    -------
    Numba function
    """
    nopython, nogil, parallel = get_jit_arguments(engine_kwargs)

    str_key = "ewm_mean_table" if normalize else "ewm_sum_table"
    cache_key = (lambda x: x, str_key)
    if cache_key in NUMBA_FUNC_CACHE:
        return NUMBA_FUNC_CACHE[cache_key]

    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def ewm_table(
        values: np.ndarray,
        begin: np.ndarray,
        end: np.ndarray,
        minimum_periods: int,
    ) -> np.ndarray:
        alpha = 1.0 / (1.0 + com)
        old_wt_factor = 1.0 - alpha
        new_wt = 1.0 if adjust else alpha
        old_wt = np.ones(values.shape[1])

        result = np.empty(values.shape)
        weighted = values[0].copy()
        nobs = (~np.isnan(weighted)).astype(np.int64)
        result[0] = np.where(nobs >= minimum_periods, weighted, np.nan)
        for i in range(1, len(values)):
            cur = values[i]
            is_observations = ~np.isnan(cur)
            nobs += is_observations.astype(np.int64)
            for j in numba.prange(len(cur)):
                if not np.isnan(weighted[j]):
                    if is_observations[j] or not ignore_na:
                        if normalize:
                            # note that len(deltas) = len(vals) - 1 and deltas[i]
                            # is to be used in conjunction with vals[i+1]
                            old_wt[j] *= old_wt_factor ** deltas[i - 1]
                        else:
                            weighted[j] = old_wt_factor * weighted[j]
                        if is_observations[j]:
                            if normalize:
                                # avoid numerical errors on constant series
                                if weighted[j] != cur[j]:
                                    weighted[j] = (
                                        old_wt[j] * weighted[j] + new_wt * cur[j]
                                    )
                                    if normalize:
                                        weighted[j] = weighted[j] / (
                                            old_wt[j] + new_wt
                                        )
                                if adjust:
                                    old_wt[j] += new_wt
                                else:
                                    old_wt[j] = 1.0
                            else:
                                weighted[j] += cur[j]
                elif is_observations[j]:
                    weighted[j] = cur[j]

            result[i] = np.where(nobs >= minimum_periods, weighted, np.nan)

        return result

    return ewm_table
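
These generators are reached by passing ``engine='numba'`` to the window aggregations. For example, the rolling-apply path goes through ``generate_numba_apply_func`` with a call like this sketch (requires numba to be installed; the ``engine_kwargs`` shown are the documented defaults):

import numpy as np
import pandas as pd

def window_sum(window):
    # receives a raw ndarray window; jitted via jit_user_function
    return window.sum()

s = pd.Series(np.arange(10.0))
out = s.rolling(3).apply(
    window_sum,
    raw=True,  # the numba engine requires raw=True
    engine="numba",
    engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
)
print(out)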

pandas/core/window/online.py

@@ -0,0 +1,122 @@
from typing import (
    TYPE_CHECKING,
    Dict,
    Optional,
)

import numpy as np

from pandas.compat._optional import import_optional_dependency

from pandas.core.util.numba_ import (
    NUMBA_FUNC_CACHE,
    get_jit_arguments,
)


def generate_online_numba_ewma_func(engine_kwargs: Optional[Dict[str, bool]]):
    """
    Generate a numba jitted online ewma function specified by values
    from engine_kwargs.

    Parameters
    ----------
    engine_kwargs : dict
        dictionary of arguments to be passed into numba.jit

    Returns
    -------
    Numba function
    """
    nopython, nogil, parallel = get_jit_arguments(engine_kwargs)

    cache_key = (lambda x: x, "online_ewma")
    if cache_key in NUMBA_FUNC_CACHE:
        return NUMBA_FUNC_CACHE[cache_key]

    if TYPE_CHECKING:
        import numba
    else:
        numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def online_ewma(
        values: np.ndarray,
        deltas: np.ndarray,
        minimum_periods: int,
        old_wt_factor: float,
        new_wt: float,
        old_wt: np.ndarray,
        adjust: bool,
        ignore_na: bool,
    ):
        """
        Compute online exponentially weighted mean per column over 2D values.

        Takes the first observation as is, then computes the subsequent
        exponentially weighted mean accounting for minimum periods.
        """
        result = np.empty(values.shape)
        weighted_avg = values[0]
        nobs = (~np.isnan(weighted_avg)).astype(np.int64)
        result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)

        for i in range(1, len(values)):
            cur = values[i]
            is_observations = ~np.isnan(cur)
            nobs += is_observations.astype(np.int64)
            for j in numba.prange(len(cur)):
                if not np.isnan(weighted_avg[j]):
                    if is_observations[j] or not ignore_na:
                        # note that len(deltas) = len(vals) - 1 and deltas[i] is to be
                        # used in conjunction with vals[i+1]
                        old_wt[j] *= old_wt_factor ** deltas[i - 1]
                        if is_observations[j]:
                            # avoid numerical errors on constant series
                            if weighted_avg[j] != cur[j]:
                                weighted_avg[j] = (
                                    (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j])
                                ) / (old_wt[j] + new_wt)
                            if adjust:
                                old_wt[j] += new_wt
                            else:
                                old_wt[j] = 1.0
                elif is_observations[j]:
                    weighted_avg[j] = cur[j]

            result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)

        return result, old_wt

    return online_ewma


class EWMMeanState:
    def __init__(self, com, adjust, ignore_na, axis, shape):
        alpha = 1.0 / (1.0 + com)
        self.axis = axis
        self.shape = shape
        self.adjust = adjust
        self.ignore_na = ignore_na
        self.new_wt = 1.0 if adjust else alpha
        self.old_wt_factor = 1.0 - alpha
        self.old_wt = np.ones(self.shape[self.axis - 1])
        self.last_ewm = None

    def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func):
        result, old_wt = ewm_func(
            weighted_avg,
            deltas,
            min_periods,
            self.old_wt_factor,
            self.new_wt,
            self.old_wt,
            self.adjust,
            self.ignore_na,
        )
        self.old_wt = old_wt
        self.last_ewm = result[-1]
        return result

    def reset(self):
        self.old_wt = np.ones(self.shape[self.axis - 1])
        self.last_ewm = None
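
``generate_online_numba_ewma_func`` and ``EWMMeanState`` back the experimental online EWM API, whose entry point is the ``.online()`` accessor (requires numba). A usage sketch:

import pandas as pd

s = pd.Series([0.0, 1.0, 2.0])
online_ewm = s.ewm(com=0.5).online()                  # wraps an EWMMeanState
print(online_ewm.mean())                              # first batch
print(online_ewm.mean(update=pd.Series([3.0, 4.0])))  # resumes from the saved state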

pandas/core/window/rolling.py (file diff suppressed because it is too large)