Mirror of https://github.com/aykhans/AzSuicideDataVisualization.git, synced 2025-07-03 22:57:06 +00:00
first commit
23
.venv/Lib/site-packages/pandas/core/reshape/api.py
Normal file
@@ -0,0 +1,23 @@
# flake8: noqa:F401

from pandas.core.reshape.concat import concat
from pandas.core.reshape.melt import (
    lreshape,
    melt,
    wide_to_long,
)
from pandas.core.reshape.merge import (
    merge,
    merge_asof,
    merge_ordered,
)
from pandas.core.reshape.pivot import (
    crosstab,
    pivot,
    pivot_table,
)
from pandas.core.reshape.reshape import get_dummies
from pandas.core.reshape.tile import (
    cut,
    qcut,
)
793
.venv/Lib/site-packages/pandas/core/reshape/concat.py
Normal file
@@ -0,0 +1,793 @@
|
||||
"""
|
||||
Concat routines.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import abc
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
Hashable,
|
||||
Iterable,
|
||||
Literal,
|
||||
Mapping,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import Axis
|
||||
from pandas.util._decorators import (
|
||||
cache_readonly,
|
||||
deprecate_nonkeyword_arguments,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.concat import concat_compat
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDataFrame,
|
||||
ABCSeries,
|
||||
)
|
||||
from pandas.core.dtypes.inference import is_bool
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas.core.arrays.categorical import (
|
||||
factorize_from_iterable,
|
||||
factorize_from_iterables,
|
||||
)
|
||||
import pandas.core.common as com
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
all_indexes_same,
|
||||
default_index,
|
||||
ensure_index,
|
||||
get_objs_combined_axis,
|
||||
get_unanimous_names,
|
||||
)
|
||||
from pandas.core.internals import concatenate_managers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
from pandas.core.generic import NDFrame
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Concatenate DataFrame objects
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[DataFrame] | Mapping[Hashable, DataFrame],
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys=...,
|
||||
levels=...,
|
||||
names=...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool = ...,
|
||||
) -> DataFrame:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[Series] | Mapping[Hashable, Series],
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys=...,
|
||||
levels=...,
|
||||
names=...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool = ...,
|
||||
) -> Series:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[NDFrame] | Mapping[Hashable, NDFrame],
|
||||
axis: Literal[0, "index"] = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys=...,
|
||||
levels=...,
|
||||
names=...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool = ...,
|
||||
) -> DataFrame | Series:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[NDFrame] | Mapping[Hashable, NDFrame],
|
||||
axis: Literal[1, "columns"],
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys=...,
|
||||
levels=...,
|
||||
names=...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool = ...,
|
||||
) -> DataFrame:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def concat(
|
||||
objs: Iterable[NDFrame] | Mapping[Hashable, NDFrame],
|
||||
axis: Axis = ...,
|
||||
join: str = ...,
|
||||
ignore_index: bool = ...,
|
||||
keys=...,
|
||||
levels=...,
|
||||
names=...,
|
||||
verify_integrity: bool = ...,
|
||||
sort: bool = ...,
|
||||
copy: bool = ...,
|
||||
) -> DataFrame | Series:
|
||||
...
|
||||
|
||||
|
||||
@deprecate_nonkeyword_arguments(version=None, allowed_args=["objs"])
|
||||
def concat(
|
||||
objs: Iterable[NDFrame] | Mapping[Hashable, NDFrame],
|
||||
axis: Axis = 0,
|
||||
join: str = "outer",
|
||||
ignore_index: bool = False,
|
||||
keys=None,
|
||||
levels=None,
|
||||
names=None,
|
||||
verify_integrity: bool = False,
|
||||
sort: bool = False,
|
||||
copy: bool = True,
|
||||
) -> DataFrame | Series:
|
||||
"""
|
||||
Concatenate pandas objects along a particular axis with optional set logic
|
||||
along the other axes.
|
||||
|
||||
Can also add a layer of hierarchical indexing on the concatenation axis,
|
||||
which may be useful if the labels are the same (or overlapping) on
|
||||
the passed axis number.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
objs : a sequence or mapping of Series or DataFrame objects
|
||||
If a mapping is passed, the sorted keys will be used as the `keys`
|
||||
argument, unless it is passed, in which case the values will be
|
||||
selected (see below). Any None objects will be dropped silently unless
|
||||
they are all None in which case a ValueError will be raised.
|
||||
axis : {0/'index', 1/'columns'}, default 0
|
||||
The axis to concatenate along.
|
||||
join : {'inner', 'outer'}, default 'outer'
|
||||
How to handle indexes on other axis (or axes).
|
||||
ignore_index : bool, default False
|
||||
If True, do not use the index values along the concatenation axis. The
|
||||
resulting axis will be labeled 0, ..., n - 1. This is useful if you are
|
||||
concatenating objects where the concatenation axis does not have
|
||||
meaningful indexing information. Note the index values on the other
|
||||
axes are still respected in the join.
|
||||
keys : sequence, default None
|
||||
If multiple levels passed, should contain tuples. Construct
|
||||
hierarchical index using the passed keys as the outermost level.
|
||||
levels : list of sequences, default None
|
||||
Specific levels (unique values) to use for constructing a
|
||||
MultiIndex. Otherwise they will be inferred from the keys.
|
||||
names : list, default None
|
||||
Names for the levels in the resulting hierarchical index.
|
||||
verify_integrity : bool, default False
|
||||
Check whether the new concatenated axis contains duplicates. This can
|
||||
be very expensive relative to the actual data concatenation.
|
||||
sort : bool, default False
|
||||
Sort non-concatenation axis if it is not already aligned when `join`
|
||||
is 'outer'.
|
||||
This has no effect when ``join='inner'``, which already preserves
|
||||
the order of the non-concatenation axis.
|
||||
|
||||
.. versionchanged:: 1.0.0
|
||||
|
||||
Changed to not sort by default.
|
||||
|
||||
copy : bool, default True
|
||||
If False, do not copy data unnecessarily.
|
||||
|
||||
Returns
|
||||
-------
|
||||
object, type of objs
|
||||
When concatenating all ``Series`` along the index (axis=0), a
|
||||
``Series`` is returned. When ``objs`` contains at least one
|
||||
``DataFrame``, a ``DataFrame`` is returned. When concatenating along
|
||||
the columns (axis=1), a ``DataFrame`` is returned.
|
||||
|
||||
See Also
|
||||
--------
|
||||
Series.append : Concatenate Series.
|
||||
DataFrame.append : Concatenate DataFrames.
|
||||
DataFrame.join : Join DataFrames using indexes.
|
||||
DataFrame.merge : Merge DataFrames by indexes or columns.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The keys, levels, and names arguments are all optional.
|
||||
|
||||
A walkthrough of how this method fits in with other tools for combining
|
||||
pandas objects can be found `here
|
||||
<https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Combine two ``Series``.
|
||||
|
||||
>>> s1 = pd.Series(['a', 'b'])
|
||||
>>> s2 = pd.Series(['c', 'd'])
|
||||
>>> pd.concat([s1, s2])
|
||||
0 a
|
||||
1 b
|
||||
0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Clear the existing index and reset it in the result
|
||||
by setting the ``ignore_index`` option to ``True``.
|
||||
|
||||
>>> pd.concat([s1, s2], ignore_index=True)
|
||||
0 a
|
||||
1 b
|
||||
2 c
|
||||
3 d
|
||||
dtype: object
|
||||
|
||||
Add a hierarchical index at the outermost level of
|
||||
the data with the ``keys`` option.
|
||||
|
||||
>>> pd.concat([s1, s2], keys=['s1', 's2'])
|
||||
s1 0 a
|
||||
1 b
|
||||
s2 0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Label the index keys you create with the ``names`` option.
|
||||
|
||||
>>> pd.concat([s1, s2], keys=['s1', 's2'],
|
||||
... names=['Series name', 'Row ID'])
|
||||
Series name Row ID
|
||||
s1 0 a
|
||||
1 b
|
||||
s2 0 c
|
||||
1 d
|
||||
dtype: object
|
||||
|
||||
Combine two ``DataFrame`` objects with identical columns.
|
||||
|
||||
>>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
|
||||
... columns=['letter', 'number'])
|
||||
>>> df1
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
>>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
|
||||
... columns=['letter', 'number'])
|
||||
>>> df2
|
||||
letter number
|
||||
0 c 3
|
||||
1 d 4
|
||||
>>> pd.concat([df1, df2])
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
0 c 3
|
||||
1 d 4
|
||||
|
||||
Combine ``DataFrame`` objects with overlapping columns
|
||||
and return everything. Columns outside the intersection will
|
||||
be filled with ``NaN`` values.
|
||||
|
||||
>>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
|
||||
... columns=['letter', 'number', 'animal'])
|
||||
>>> df3
|
||||
letter number animal
|
||||
0 c 3 cat
|
||||
1 d 4 dog
|
||||
>>> pd.concat([df1, df3], sort=False)
|
||||
letter number animal
|
||||
0 a 1 NaN
|
||||
1 b 2 NaN
|
||||
0 c 3 cat
|
||||
1 d 4 dog
|
||||
|
||||
Combine ``DataFrame`` objects with overlapping columns
|
||||
and return only those that are shared by passing ``inner`` to
|
||||
the ``join`` keyword argument.
|
||||
|
||||
>>> pd.concat([df1, df3], join="inner")
|
||||
letter number
|
||||
0 a 1
|
||||
1 b 2
|
||||
0 c 3
|
||||
1 d 4
|
||||
|
||||
Combine ``DataFrame`` objects horizontally along the x axis by
|
||||
passing in ``axis=1``.
|
||||
|
||||
>>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
|
||||
... columns=['animal', 'name'])
|
||||
>>> pd.concat([df1, df4], axis=1)
|
||||
letter number animal name
|
||||
0 a 1 bird polly
|
||||
1 b 2 monkey george
|
||||
|
||||
Prevent the result from including duplicate index values with the
|
||||
``verify_integrity`` option.
|
||||
|
||||
>>> df5 = pd.DataFrame([1], index=['a'])
|
||||
>>> df5
|
||||
0
|
||||
a 1
|
||||
>>> df6 = pd.DataFrame([2], index=['a'])
|
||||
>>> df6
|
||||
0
|
||||
a 2
|
||||
>>> pd.concat([df5, df6], verify_integrity=True)
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: Indexes have overlapping values: ['a']
|
||||
"""
|
||||
op = _Concatenator(
|
||||
objs,
|
||||
axis=axis,
|
||||
ignore_index=ignore_index,
|
||||
join=join,
|
||||
keys=keys,
|
||||
levels=levels,
|
||||
names=names,
|
||||
verify_integrity=verify_integrity,
|
||||
copy=copy,
|
||||
sort=sort,
|
||||
)
|
||||
|
||||
return op.get_result()
|
||||
|
||||
|
||||
class _Concatenator:
|
||||
"""
|
||||
Orchestrates a concatenation operation for BlockManagers
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
objs: Iterable[NDFrame] | Mapping[Hashable, NDFrame],
|
||||
axis=0,
|
||||
join: str = "outer",
|
||||
keys=None,
|
||||
levels=None,
|
||||
names=None,
|
||||
ignore_index: bool = False,
|
||||
verify_integrity: bool = False,
|
||||
copy: bool = True,
|
||||
sort=False,
|
||||
):
|
||||
if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
|
||||
raise TypeError(
|
||||
"first argument must be an iterable of pandas "
|
||||
f'objects, you passed an object of type "{type(objs).__name__}"'
|
||||
)
|
||||
|
||||
if join == "outer":
|
||||
self.intersect = False
|
||||
elif join == "inner":
|
||||
self.intersect = True
|
||||
else: # pragma: no cover
|
||||
raise ValueError(
|
||||
"Only can inner (intersect) or outer (union) join the other axis"
|
||||
)
|
||||
|
||||
if isinstance(objs, abc.Mapping):
|
||||
if keys is None:
|
||||
keys = list(objs.keys())
|
||||
objs = [objs[k] for k in keys]
|
||||
else:
|
||||
objs = list(objs)
|
||||
|
||||
if len(objs) == 0:
|
||||
raise ValueError("No objects to concatenate")
|
||||
|
||||
if keys is None:
|
||||
objs = list(com.not_none(*objs))
|
||||
else:
|
||||
# #1649
|
||||
clean_keys = []
|
||||
clean_objs = []
|
||||
for k, v in zip(keys, objs):
|
||||
if v is None:
|
||||
continue
|
||||
clean_keys.append(k)
|
||||
clean_objs.append(v)
|
||||
objs = clean_objs
|
||||
|
||||
if isinstance(keys, MultiIndex):
|
||||
# TODO: retain levels?
|
||||
keys = type(keys).from_tuples(clean_keys, names=keys.names)
|
||||
else:
|
||||
name = getattr(keys, "name", None)
|
||||
keys = Index(clean_keys, name=name)
|
||||
|
||||
if len(objs) == 0:
|
||||
raise ValueError("All objects passed were None")
|
||||
|
||||
# figure out what our result ndim is going to be
|
||||
ndims = set()
|
||||
for obj in objs:
|
||||
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
|
||||
msg = (
|
||||
f"cannot concatenate object of type '{type(obj)}'; "
|
||||
"only Series and DataFrame objs are valid"
|
||||
)
|
||||
raise TypeError(msg)
|
||||
|
||||
ndims.add(obj.ndim)
|
||||
|
||||
# get the sample
|
||||
# want the highest ndim that we have, and must be non-empty
|
||||
# unless all objs are empty
|
||||
sample: NDFrame | None = None
|
||||
if len(ndims) > 1:
|
||||
max_ndim = max(ndims)
|
||||
for obj in objs:
|
||||
if obj.ndim == max_ndim and np.sum(obj.shape):
|
||||
sample = obj
|
||||
break
|
||||
|
||||
else:
|
||||
# filter out the empties if we have not multi-index possibilities
|
||||
# note to keep empty Series as it affect to result columns / name
|
||||
non_empties = [
|
||||
obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, ABCSeries)
|
||||
]
|
||||
|
||||
if len(non_empties) and (
|
||||
keys is None and names is None and levels is None and not self.intersect
|
||||
):
|
||||
objs = non_empties
|
||||
sample = objs[0]
|
||||
|
||||
if sample is None:
|
||||
sample = objs[0]
|
||||
self.objs = objs
|
||||
|
||||
# Standardize axis parameter to int
|
||||
if isinstance(sample, ABCSeries):
|
||||
from pandas import DataFrame
|
||||
|
||||
axis = DataFrame._get_axis_number(axis)
|
||||
else:
|
||||
axis = sample._get_axis_number(axis)
|
||||
|
||||
# Need to flip BlockManager axis in the DataFrame special case
|
||||
self._is_frame = isinstance(sample, ABCDataFrame)
|
||||
if self._is_frame:
|
||||
axis = sample._get_block_manager_axis(axis)
|
||||
|
||||
self._is_series = isinstance(sample, ABCSeries)
|
||||
if not 0 <= axis <= sample.ndim:
|
||||
raise AssertionError(
|
||||
f"axis must be between 0 and {sample.ndim}, input was {axis}"
|
||||
)
|
||||
|
||||
# if we have mixed ndims, then convert to highest ndim
|
||||
# creating column numbers as needed
|
||||
if len(ndims) > 1:
|
||||
current_column = 0
|
||||
max_ndim = sample.ndim
|
||||
self.objs, objs = [], self.objs
|
||||
for obj in objs:
|
||||
|
||||
ndim = obj.ndim
|
||||
if ndim == max_ndim:
|
||||
pass
|
||||
|
||||
elif ndim != max_ndim - 1:
|
||||
raise ValueError(
|
||||
"cannot concatenate unaligned mixed "
|
||||
"dimensional NDFrame objects"
|
||||
)
|
||||
|
||||
else:
|
||||
name = getattr(obj, "name", None)
|
||||
if ignore_index or name is None:
|
||||
name = current_column
|
||||
current_column += 1
|
||||
|
||||
# doing a row-wise concatenation so need everything
|
||||
# to line up
|
||||
if self._is_frame and axis == 1:
|
||||
name = 0
|
||||
# mypy needs to know sample is not an NDFrame
|
||||
sample = cast("DataFrame | Series", sample)
|
||||
obj = sample._constructor({name: obj})
|
||||
|
||||
self.objs.append(obj)
|
||||
|
||||
# note: this is the BlockManager axis (since DataFrame is transposed)
|
||||
self.bm_axis = axis
|
||||
self.axis = 1 - self.bm_axis if self._is_frame else 0
|
||||
self.keys = keys
|
||||
self.names = names or getattr(keys, "names", None)
|
||||
self.levels = levels
|
||||
|
||||
if not is_bool(sort):
|
||||
warnings.warn(
|
||||
"Passing non boolean values for sort is deprecated and "
|
||||
"will error in a future version!",
|
||||
FutureWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
self.sort = sort
|
||||
|
||||
self.ignore_index = ignore_index
|
||||
self.verify_integrity = verify_integrity
|
||||
self.copy = copy
|
||||
|
||||
self.new_axes = self._get_new_axes()
|
||||
|
||||
def get_result(self):
|
||||
cons: Callable[..., DataFrame | Series]
|
||||
sample: DataFrame | Series
|
||||
|
||||
# series only
|
||||
if self._is_series:
|
||||
sample = cast("Series", self.objs[0])
|
||||
|
||||
# stack blocks
|
||||
if self.bm_axis == 0:
|
||||
name = com.consensus_name_attr(self.objs)
|
||||
cons = sample._constructor
|
||||
|
||||
arrs = [ser._values for ser in self.objs]
|
||||
|
||||
res = concat_compat(arrs, axis=0)
|
||||
result = cons(res, index=self.new_axes[0], name=name, dtype=res.dtype)
|
||||
return result.__finalize__(self, method="concat")
|
||||
|
||||
# combine as columns in a frame
|
||||
else:
|
||||
data = dict(zip(range(len(self.objs)), self.objs))
|
||||
|
||||
# GH28330 Preserves subclassed objects through concat
|
||||
cons = sample._constructor_expanddim
|
||||
|
||||
index, columns = self.new_axes
|
||||
df = cons(data, index=index, copy=self.copy)
|
||||
df.columns = columns
|
||||
return df.__finalize__(self, method="concat")
|
||||
|
||||
# combine block managers
|
||||
else:
|
||||
sample = cast("DataFrame", self.objs[0])
|
||||
|
||||
mgrs_indexers = []
|
||||
for obj in self.objs:
|
||||
indexers = {}
|
||||
for ax, new_labels in enumerate(self.new_axes):
|
||||
# ::-1 to convert BlockManager ax to DataFrame ax
|
||||
if ax == self.bm_axis:
|
||||
# Suppress reindexing on concat axis
|
||||
continue
|
||||
|
||||
# 1-ax to convert BlockManager axis to DataFrame axis
|
||||
obj_labels = obj.axes[1 - ax]
|
||||
if not new_labels.equals(obj_labels):
|
||||
indexers[ax] = obj_labels.get_indexer(new_labels)
|
||||
|
||||
mgrs_indexers.append((obj._mgr, indexers))
|
||||
|
||||
new_data = concatenate_managers(
|
||||
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
|
||||
)
|
||||
if not self.copy:
|
||||
new_data._consolidate_inplace()
|
||||
|
||||
cons = sample._constructor
|
||||
return cons(new_data).__finalize__(self, method="concat")
|
||||
|
||||
def _get_result_dim(self) -> int:
|
||||
if self._is_series and self.bm_axis == 1:
|
||||
return 2
|
||||
else:
|
||||
return self.objs[0].ndim
|
||||
|
||||
def _get_new_axes(self) -> list[Index]:
|
||||
ndim = self._get_result_dim()
|
||||
return [
|
||||
self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
|
||||
for i in range(ndim)
|
||||
]
|
||||
|
||||
def _get_comb_axis(self, i: int) -> Index:
|
||||
data_axis = self.objs[0]._get_block_manager_axis(i)
|
||||
return get_objs_combined_axis(
|
||||
self.objs,
|
||||
axis=data_axis,
|
||||
intersect=self.intersect,
|
||||
sort=self.sort,
|
||||
copy=self.copy,
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def _get_concat_axis(self) -> Index:
|
||||
"""
|
||||
Return index to be used along concatenation axis.
|
||||
"""
|
||||
if self._is_series:
|
||||
if self.bm_axis == 0:
|
||||
indexes = [x.index for x in self.objs]
|
||||
elif self.ignore_index:
|
||||
idx = default_index(len(self.objs))
|
||||
return idx
|
||||
elif self.keys is None:
|
||||
names: list[Hashable] = [None] * len(self.objs)
|
||||
num = 0
|
||||
has_names = False
|
||||
for i, x in enumerate(self.objs):
|
||||
if not isinstance(x, ABCSeries):
|
||||
raise TypeError(
|
||||
f"Cannot concatenate type 'Series' with "
|
||||
f"object of type '{type(x).__name__}'"
|
||||
)
|
||||
if x.name is not None:
|
||||
names[i] = x.name
|
||||
has_names = True
|
||||
else:
|
||||
names[i] = num
|
||||
num += 1
|
||||
if has_names:
|
||||
return Index(names)
|
||||
else:
|
||||
return default_index(len(self.objs))
|
||||
else:
|
||||
return ensure_index(self.keys).set_names(self.names)
|
||||
else:
|
||||
indexes = [x.axes[self.axis] for x in self.objs]
|
||||
|
||||
if self.ignore_index:
|
||||
idx = default_index(sum(len(i) for i in indexes))
|
||||
return idx
|
||||
|
||||
if self.keys is None:
|
||||
concat_axis = _concat_indexes(indexes)
|
||||
else:
|
||||
concat_axis = _make_concat_multiindex(
|
||||
indexes, self.keys, self.levels, self.names
|
||||
)
|
||||
|
||||
self._maybe_check_integrity(concat_axis)
|
||||
|
||||
return concat_axis
|
||||
|
||||
def _maybe_check_integrity(self, concat_index: Index):
|
||||
if self.verify_integrity:
|
||||
if not concat_index.is_unique:
|
||||
overlap = concat_index[concat_index.duplicated()].unique()
|
||||
raise ValueError(f"Indexes have overlapping values: {overlap}")
|
||||
|
||||
|
||||
def _concat_indexes(indexes) -> Index:
|
||||
return indexes[0].append(indexes[1:])
|
||||
|
||||
|
||||
def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:
|
||||
|
||||
if (levels is None and isinstance(keys[0], tuple)) or (
|
||||
levels is not None and len(levels) > 1
|
||||
):
|
||||
zipped = list(zip(*keys))
|
||||
if names is None:
|
||||
names = [None] * len(zipped)
|
||||
|
||||
if levels is None:
|
||||
_, levels = factorize_from_iterables(zipped)
|
||||
else:
|
||||
levels = [ensure_index(x) for x in levels]
|
||||
else:
|
||||
zipped = [keys]
|
||||
if names is None:
|
||||
names = [None]
|
||||
|
||||
if levels is None:
|
||||
levels = [ensure_index(keys)]
|
||||
else:
|
||||
levels = [ensure_index(x) for x in levels]
|
||||
|
||||
if not all_indexes_same(indexes) or not all(level.is_unique for level in levels):
|
||||
codes_list = []
|
||||
|
||||
# things are potentially different sizes, so compute the exact codes
|
||||
# for each level and pass those to MultiIndex.from_arrays
|
||||
|
||||
for hlevel, level in zip(zipped, levels):
|
||||
to_concat = []
|
||||
for key, index in zip(hlevel, indexes):
|
||||
# Find matching codes, include matching nan values as equal.
|
||||
mask = (isna(level) & isna(key)) | (level == key)
|
||||
if not mask.any():
|
||||
raise ValueError(f"Key {key} not in level {level}")
|
||||
i = np.nonzero(mask)[0][0]
|
||||
|
||||
to_concat.append(np.repeat(i, len(index)))
|
||||
codes_list.append(np.concatenate(to_concat))
|
||||
|
||||
concat_index = _concat_indexes(indexes)
|
||||
|
||||
# these go at the end
|
||||
if isinstance(concat_index, MultiIndex):
|
||||
levels.extend(concat_index.levels)
|
||||
codes_list.extend(concat_index.codes)
|
||||
else:
|
||||
codes, categories = factorize_from_iterable(concat_index)
|
||||
levels.append(categories)
|
||||
codes_list.append(codes)
|
||||
|
||||
if len(names) == len(levels):
|
||||
names = list(names)
|
||||
else:
|
||||
# make sure that all of the passed indices have the same nlevels
|
||||
if not len({idx.nlevels for idx in indexes}) == 1:
|
||||
raise AssertionError(
|
||||
"Cannot concat indices that do not have the same number of levels"
|
||||
)
|
||||
|
||||
# also copies
|
||||
names = list(names) + list(get_unanimous_names(*indexes))
|
||||
|
||||
return MultiIndex(
|
||||
levels=levels, codes=codes_list, names=names, verify_integrity=False
|
||||
)
|
||||
|
||||
new_index = indexes[0]
|
||||
n = len(new_index)
|
||||
kpieces = len(indexes)
|
||||
|
||||
# also copies
|
||||
new_names = list(names)
|
||||
new_levels = list(levels)
|
||||
|
||||
# construct codes
|
||||
new_codes = []
|
||||
|
||||
# do something a bit more speedy
|
||||
|
||||
for hlevel, level in zip(zipped, levels):
|
||||
hlevel = ensure_index(hlevel)
|
||||
mapped = level.get_indexer(hlevel)
|
||||
|
||||
mask = mapped == -1
|
||||
if mask.any():
|
||||
raise ValueError(f"Values not found in passed level: {hlevel[mask]!s}")
|
||||
|
||||
new_codes.append(np.repeat(mapped, n))
|
||||
|
||||
if isinstance(new_index, MultiIndex):
|
||||
new_levels.extend(new_index.levels)
|
||||
new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
|
||||
else:
|
||||
new_levels.append(new_index.unique())
|
||||
single_codes = new_index.unique().get_indexer(new_index)
|
||||
new_codes.append(np.tile(single_codes, kpieces))
|
||||
|
||||
if len(new_names) < len(new_levels):
|
||||
new_names.extend(new_index.names)
|
||||
|
||||
return MultiIndex(
|
||||
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
|
||||
)
|
547
.venv/Lib/site-packages/pandas/core/reshape/melt.py
Normal file
@@ -0,0 +1,547 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.util._decorators import (
|
||||
Appender,
|
||||
deprecate_kwarg,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_extension_array_dtype,
|
||||
is_list_like,
|
||||
)
|
||||
from pandas.core.dtypes.concat import concat_compat
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
import pandas.core.algorithms as algos
|
||||
from pandas.core.arrays import Categorical
|
||||
import pandas.core.common as com
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.util import tile_compat
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
from pandas.core.tools.numeric import to_numeric
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
|
||||
def melt(
|
||||
frame: DataFrame,
|
||||
id_vars=None,
|
||||
value_vars=None,
|
||||
var_name=None,
|
||||
value_name="value",
|
||||
col_level=None,
|
||||
ignore_index: bool = True,
|
||||
) -> DataFrame:
|
||||
# If multiindex, gather names of columns on all level for checking presence
|
||||
# of `id_vars` and `value_vars`
|
||||
if isinstance(frame.columns, MultiIndex):
|
||||
cols = [x for c in frame.columns for x in c]
|
||||
else:
|
||||
cols = list(frame.columns)
|
||||
|
||||
if value_name in frame.columns:
|
||||
warnings.warn(
|
||||
"This dataframe has a column name that matches the 'value_name' column "
|
||||
"name of the resulting Dataframe. "
|
||||
"In the future this will raise an error, please set the 'value_name' "
|
||||
"parameter of DataFrame.melt to a unique name.",
|
||||
FutureWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
|
||||
if id_vars is not None:
|
||||
if not is_list_like(id_vars):
|
||||
id_vars = [id_vars]
|
||||
elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
|
||||
raise ValueError(
|
||||
"id_vars must be a list of tuples when columns are a MultiIndex"
|
||||
)
|
||||
else:
|
||||
# Check that `id_vars` are in frame
|
||||
id_vars = list(id_vars)
|
||||
missing = Index(com.flatten(id_vars)).difference(cols)
|
||||
if not missing.empty:
|
||||
raise KeyError(
|
||||
"The following 'id_vars' are not present "
|
||||
f"in the DataFrame: {list(missing)}"
|
||||
)
|
||||
else:
|
||||
id_vars = []
|
||||
|
||||
if value_vars is not None:
|
||||
if not is_list_like(value_vars):
|
||||
value_vars = [value_vars]
|
||||
elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
|
||||
raise ValueError(
|
||||
"value_vars must be a list of tuples when columns are a MultiIndex"
|
||||
)
|
||||
else:
|
||||
value_vars = list(value_vars)
|
||||
# Check that `value_vars` are in frame
|
||||
missing = Index(com.flatten(value_vars)).difference(cols)
|
||||
if not missing.empty:
|
||||
raise KeyError(
|
||||
"The following 'value_vars' are not present in "
|
||||
f"the DataFrame: {list(missing)}"
|
||||
)
|
||||
if col_level is not None:
|
||||
idx = frame.columns.get_level_values(col_level).get_indexer(
|
||||
id_vars + value_vars
|
||||
)
|
||||
else:
|
||||
idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
|
||||
frame = frame.iloc[:, idx]
|
||||
else:
|
||||
frame = frame.copy()
|
||||
|
||||
if col_level is not None: # allow list or other?
|
||||
# frame is a copy
|
||||
frame.columns = frame.columns.get_level_values(col_level)
|
||||
|
||||
if var_name is None:
|
||||
if isinstance(frame.columns, MultiIndex):
|
||||
if len(frame.columns.names) == len(set(frame.columns.names)):
|
||||
var_name = frame.columns.names
|
||||
else:
|
||||
var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
|
||||
else:
|
||||
var_name = [
|
||||
frame.columns.name if frame.columns.name is not None else "variable"
|
||||
]
|
||||
if isinstance(var_name, str):
|
||||
var_name = [var_name]
|
||||
|
||||
N, K = frame.shape
|
||||
K -= len(id_vars)
|
||||
|
||||
mdata = {}
|
||||
for col in id_vars:
|
||||
id_data = frame.pop(col)
|
||||
if is_extension_array_dtype(id_data):
|
||||
id_data = concat([id_data] * K, ignore_index=True)
|
||||
else:
|
||||
# Incompatible types in assignment (expression has type
|
||||
# "ndarray[Any, dtype[Any]]", variable has type "Series") [assignment]
|
||||
id_data = np.tile(id_data._values, K) # type: ignore[assignment]
|
||||
mdata[col] = id_data
|
||||
|
||||
mcolumns = id_vars + var_name + [value_name]
|
||||
|
||||
# error: Incompatible types in assignment (expression has type "ndarray",
|
||||
# target has type "Series")
|
||||
mdata[value_name] = frame._values.ravel("F") # type: ignore[assignment]
|
||||
for i, col in enumerate(var_name):
|
||||
# asanyarray will keep the columns as an Index
|
||||
|
||||
# error: Incompatible types in assignment (expression has type "ndarray", target
|
||||
# has type "Series")
|
||||
mdata[col] = np.asanyarray( # type: ignore[assignment]
|
||||
frame.columns._get_level_values(i)
|
||||
).repeat(N)
|
||||
|
||||
result = frame._constructor(mdata, columns=mcolumns)
|
||||
|
||||
if not ignore_index:
|
||||
result.index = tile_compat(frame.index, K)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@deprecate_kwarg(old_arg_name="label", new_arg_name=None)
|
||||
def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame:
|
||||
"""
|
||||
Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.
|
||||
|
||||
Accepts a dictionary, ``groups``, in which each key is a new column name
|
||||
and each value is a list of old column names that will be "melted" under
|
||||
the new column name as part of the reshape.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : DataFrame
|
||||
The wide-format DataFrame.
|
||||
groups : dict
|
||||
{new_name : list_of_columns}.
|
||||
dropna : bool, default True
|
||||
Do not include columns whose entries are all NaN.
|
||||
label : None
|
||||
Not used.
|
||||
|
||||
.. deprecated:: 1.0.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Reshaped DataFrame.
|
||||
|
||||
See Also
|
||||
--------
|
||||
melt : Unpivot a DataFrame from wide to long format, optionally leaving
|
||||
identifiers set.
|
||||
pivot : Create a spreadsheet-style pivot table as a DataFrame.
|
||||
DataFrame.pivot : Pivot without aggregation that can handle
|
||||
non-numeric data.
|
||||
DataFrame.pivot_table : Generalization of pivot that can handle
|
||||
duplicate values for one index/column pair.
|
||||
DataFrame.unstack : Pivot based on the index values instead of a
|
||||
column.
|
||||
wide_to_long : Wide panel to long format. Less flexible but more
|
||||
user-friendly than melt.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
|
||||
... 'team': ['Red Sox', 'Yankees'],
|
||||
... 'year1': [2007, 2007], 'year2': [2008, 2008]})
|
||||
>>> data
|
||||
hr1 hr2 team year1 year2
|
||||
0 514 545 Red Sox 2007 2008
|
||||
1 573 526 Yankees 2007 2008
|
||||
|
||||
>>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
|
||||
team year hr
|
||||
0 Red Sox 2007 514
|
||||
1 Yankees 2007 573
|
||||
2 Red Sox 2008 545
|
||||
3 Yankees 2008 526
|
||||
"""
|
||||
if isinstance(groups, dict):
|
||||
keys = list(groups.keys())
|
||||
values = list(groups.values())
|
||||
else:
|
||||
keys, values = zip(*groups)
|
||||
|
||||
all_cols = list(set.union(*(set(x) for x in values)))
|
||||
id_cols = list(data.columns.difference(all_cols))
|
||||
|
||||
K = len(values[0])
|
||||
|
||||
for seq in values:
|
||||
if len(seq) != K:
|
||||
raise ValueError("All column lists must be same length")
|
||||
|
||||
mdata = {}
|
||||
pivot_cols = []
|
||||
|
||||
for target, names in zip(keys, values):
|
||||
to_concat = [data[col]._values for col in names]
|
||||
|
||||
mdata[target] = concat_compat(to_concat)
|
||||
pivot_cols.append(target)
|
||||
|
||||
for col in id_cols:
|
||||
mdata[col] = np.tile(data[col]._values, K)
|
||||
|
||||
if dropna:
|
||||
mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
|
||||
for c in pivot_cols:
|
||||
mask &= notna(mdata[c])
|
||||
if not mask.all():
|
||||
mdata = {k: v[mask] for k, v in mdata.items()}
|
||||
|
||||
return data._constructor(mdata, columns=id_cols + pivot_cols)
|
||||
|
||||
|
||||
def wide_to_long(
|
||||
df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
|
||||
) -> DataFrame:
|
||||
r"""
|
||||
Unpivot a DataFrame from wide to long format.
|
||||
|
||||
Less flexible but more user-friendly than melt.
|
||||
|
||||
With stubnames ['A', 'B'], this function expects to find one or more
|
||||
group of columns with format
|
||||
A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
|
||||
You specify what you want to call this suffix in the resulting long format
|
||||
with `j` (for example `j='year'`)
|
||||
|
||||
Each row of these wide variables are assumed to be uniquely identified by
|
||||
`i` (can be a single column name or a list of column names)
|
||||
|
||||
All remaining variables in the data frame are left intact.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
The wide-format DataFrame.
|
||||
stubnames : str or list-like
|
||||
The stub name(s). The wide format variables are assumed to
|
||||
start with the stub names.
|
||||
i : str or list-like
|
||||
Column(s) to use as id variable(s).
|
||||
j : str
|
||||
The name of the sub-observation variable. What you wish to name your
|
||||
suffix in the long format.
|
||||
sep : str, default ""
|
||||
A character indicating the separation of the variable names
|
||||
in the wide format, to be stripped from the names in the long format.
|
||||
For example, if your column names are A-suffix1, A-suffix2, you
|
||||
can strip the hyphen by specifying `sep='-'`.
|
||||
suffix : str, default '\\d+'
|
||||
A regular expression capturing the wanted suffixes. '\\d+' captures
|
||||
numeric suffixes. Suffixes with no numbers could be specified with the
|
||||
negated character class '\\D+'. You can also further disambiguate
|
||||
suffixes, for example, if your wide variables are of the form A-one,
|
||||
B-two,.., and you have an unrelated column A-rating, you can ignore the
|
||||
last one by specifying `suffix='(!?one|two)'`. When all suffixes are
|
||||
numeric, they are cast to int64/float64.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
A DataFrame that contains each stub name as a variable, with new index
|
||||
(i, j).
|
||||
|
||||
See Also
|
||||
--------
|
||||
melt : Unpivot a DataFrame from wide to long format, optionally leaving
|
||||
identifiers set.
|
||||
pivot : Create a spreadsheet-style pivot table as a DataFrame.
|
||||
DataFrame.pivot : Pivot without aggregation that can handle
|
||||
non-numeric data.
|
||||
DataFrame.pivot_table : Generalization of pivot that can handle
|
||||
duplicate values for one index/column pair.
|
||||
DataFrame.unstack : Pivot based on the index values instead of a
|
||||
column.
|
||||
|
||||
Notes
|
||||
-----
|
||||
All extra variables are left untouched. This simply uses
|
||||
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
|
||||
in a typical case.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> np.random.seed(123)
|
||||
>>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
|
||||
... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
|
||||
... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
|
||||
... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
|
||||
... "X" : dict(zip(range(3), np.random.randn(3)))
|
||||
... })
|
||||
>>> df["id"] = df.index
|
||||
>>> df
|
||||
A1970 A1980 B1970 B1980 X id
|
||||
0 a d 2.5 3.2 -1.085631 0
|
||||
1 b e 1.2 1.3 0.997345 1
|
||||
2 c f 0.7 0.1 0.282978 2
|
||||
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
X A B
|
||||
id year
|
||||
0 1970 -1.085631 a 2.5
|
||||
1 1970 0.997345 b 1.2
|
||||
2 1970 0.282978 c 0.7
|
||||
0 1980 -1.085631 d 3.2
|
||||
1 1980 0.997345 e 1.3
|
||||
2 1980 0.282978 f 0.1
|
||||
|
||||
With multiple id columns
|
||||
|
||||
>>> df = pd.DataFrame({
|
||||
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||||
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||||
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||||
... })
|
||||
>>> df
|
||||
famid birth ht1 ht2
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
|
||||
>>> l
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
ht
|
||||
famid birth age
|
||||
1 1 1 2.8
|
||||
2 3.4
|
||||
2 1 2.9
|
||||
2 3.8
|
||||
3 1 2.2
|
||||
2 2.9
|
||||
2 1 1 2.0
|
||||
2 3.2
|
||||
2 1 1.8
|
||||
2 2.8
|
||||
3 1 1.9
|
||||
2 2.4
|
||||
3 1 1 2.2
|
||||
2 3.3
|
||||
2 1 2.3
|
||||
2 3.4
|
||||
3 1 2.1
|
||||
2 2.9
|
||||
|
||||
Going from long back to wide just takes some creative use of `unstack`
|
||||
|
||||
>>> w = l.unstack()
|
||||
>>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
|
||||
>>> w.reset_index()
|
||||
famid birth ht1 ht2
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
|
||||
Less wieldy column names are also handled
|
||||
|
||||
>>> np.random.seed(0)
|
||||
>>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
|
||||
... 'A(weekly)-2011': np.random.rand(3),
|
||||
... 'B(weekly)-2010': np.random.rand(3),
|
||||
... 'B(weekly)-2011': np.random.rand(3),
|
||||
... 'X' : np.random.randint(3, size=3)})
|
||||
>>> df['id'] = df.index
|
||||
>>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
|
||||
A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id
|
||||
0 0.548814 0.544883 0.437587 0.383442 0 0
|
||||
1 0.715189 0.423655 0.891773 0.791725 1 1
|
||||
2 0.602763 0.645894 0.963663 0.528895 1 2
|
||||
|
||||
>>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
|
||||
... j='year', sep='-')
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
X A(weekly) B(weekly)
|
||||
id year
|
||||
0 2010 0 0.548814 0.437587
|
||||
1 2010 1 0.715189 0.891773
|
||||
2 2010 1 0.602763 0.963663
|
||||
0 2011 0 0.544883 0.383442
|
||||
1 2011 1 0.423655 0.791725
|
||||
2 2011 1 0.645894 0.528895
|
||||
|
||||
If we have many columns, we could also use a regex to find our
|
||||
stubnames and pass that list on to wide_to_long
|
||||
|
||||
>>> stubnames = sorted(
|
||||
... set([match[0] for match in df.columns.str.findall(
|
||||
... r'[A-B]\(.*\)').values if match != []])
|
||||
... )
|
||||
>>> list(stubnames)
|
||||
['A(weekly)', 'B(weekly)']
|
||||
|
||||
All of the above examples have integers as suffixes. It is possible to
|
||||
have non-integers as suffixes.
|
||||
|
||||
>>> df = pd.DataFrame({
|
||||
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||||
... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||||
... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||||
... })
|
||||
>>> df
|
||||
famid birth ht_one ht_two
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
|
||||
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
|
||||
... sep='_', suffix=r'\w+')
|
||||
>>> l
|
||||
... # doctest: +NORMALIZE_WHITESPACE
|
||||
ht
|
||||
famid birth age
|
||||
1 1 one 2.8
|
||||
two 3.4
|
||||
2 one 2.9
|
||||
two 3.8
|
||||
3 one 2.2
|
||||
two 2.9
|
||||
2 1 one 2.0
|
||||
two 3.2
|
||||
2 one 1.8
|
||||
two 2.8
|
||||
3 one 1.9
|
||||
two 2.4
|
||||
3 1 one 2.2
|
||||
two 3.3
|
||||
2 one 2.3
|
||||
two 3.4
|
||||
3 one 2.1
|
||||
two 2.9
|
||||
"""
|
||||
|
||||
def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]:
|
||||
regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
|
||||
pattern = re.compile(regex)
|
||||
return [col for col in df.columns if pattern.match(col)]
|
||||
|
||||
def melt_stub(df, stub: str, i, j, value_vars, sep: str):
|
||||
newdf = melt(
|
||||
df,
|
||||
id_vars=i,
|
||||
value_vars=value_vars,
|
||||
value_name=stub.rstrip(sep),
|
||||
var_name=j,
|
||||
)
|
||||
newdf[j] = Categorical(newdf[j])
|
||||
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)
|
||||
|
||||
# GH17627 Cast numerics suffixes to int/float
|
||||
newdf[j] = to_numeric(newdf[j], errors="ignore")
|
||||
|
||||
return newdf.set_index(i + [j])
|
||||
|
||||
if not is_list_like(stubnames):
|
||||
stubnames = [stubnames]
|
||||
else:
|
||||
stubnames = list(stubnames)
|
||||
|
||||
if any(col in stubnames for col in df.columns):
|
||||
raise ValueError("stubname can't be identical to a column name")
|
||||
|
||||
if not is_list_like(i):
|
||||
i = [i]
|
||||
else:
|
||||
i = list(i)
|
||||
|
||||
if df[i].duplicated().any():
|
||||
raise ValueError("the id variables need to uniquely identify each row")
|
||||
|
||||
value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
|
||||
|
||||
value_vars_flattened = [e for sublist in value_vars for e in sublist]
|
||||
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
|
||||
|
||||
_melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
|
||||
melted = _melted[0].join(_melted[1:], how="outer")
|
||||
|
||||
if len(i) == 1:
|
||||
new = df[id_vars].set_index(i).join(melted)
|
||||
return new
|
||||
|
||||
new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
|
||||
|
||||
return new
|
2356
.venv/Lib/site-packages/pandas/core/reshape/merge.py
Normal file
File diff suppressed because it is too large
842
.venv/Lib/site-packages/pandas/core/reshape/pivot.py
Normal file
@@ -0,0 +1,842 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Callable,
|
||||
Hashable,
|
||||
Sequence,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
AggFuncType,
|
||||
AggFuncTypeBase,
|
||||
AggFuncTypeDict,
|
||||
IndexLabel,
|
||||
)
|
||||
from pandas.util._decorators import (
|
||||
Appender,
|
||||
Substitution,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
|
||||
from pandas.core.dtypes.common import (
|
||||
is_integer_dtype,
|
||||
is_list_like,
|
||||
is_nested_list_like,
|
||||
is_scalar,
|
||||
)
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDataFrame,
|
||||
ABCSeries,
|
||||
)
|
||||
|
||||
import pandas.core.common as com
|
||||
from pandas.core.frame import _shared_docs
|
||||
from pandas.core.groupby import Grouper
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
get_objs_combined_axis,
|
||||
)
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.util import cartesian_product
|
||||
from pandas.core.series import Series
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
# Note: We need to make sure `frame` is imported before `pivot`, otherwise
|
||||
# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency
|
||||
@Substitution("\ndata : DataFrame")
|
||||
@Appender(_shared_docs["pivot_table"], indents=1)
|
||||
def pivot_table(
|
||||
data: DataFrame,
|
||||
values=None,
|
||||
index=None,
|
||||
columns=None,
|
||||
aggfunc: AggFuncType = "mean",
|
||||
fill_value=None,
|
||||
margins: bool = False,
|
||||
dropna: bool = True,
|
||||
margins_name: str = "All",
|
||||
observed: bool = False,
|
||||
sort: bool = True,
|
||||
) -> DataFrame:
|
||||
index = _convert_by(index)
|
||||
columns = _convert_by(columns)
|
||||
|
||||
if isinstance(aggfunc, list):
|
||||
pieces: list[DataFrame] = []
|
||||
keys = []
|
||||
for func in aggfunc:
|
||||
_table = __internal_pivot_table(
|
||||
data,
|
||||
values=values,
|
||||
index=index,
|
||||
columns=columns,
|
||||
fill_value=fill_value,
|
||||
aggfunc=func,
|
||||
margins=margins,
|
||||
dropna=dropna,
|
||||
margins_name=margins_name,
|
||||
observed=observed,
|
||||
sort=sort,
|
||||
)
|
||||
pieces.append(_table)
|
||||
keys.append(getattr(func, "__name__", func))
|
||||
|
||||
table = concat(pieces, keys=keys, axis=1)
|
||||
return table.__finalize__(data, method="pivot_table")
|
||||
|
||||
table = __internal_pivot_table(
|
||||
data,
|
||||
values,
|
||||
index,
|
||||
columns,
|
||||
aggfunc,
|
||||
fill_value,
|
||||
margins,
|
||||
dropna,
|
||||
margins_name,
|
||||
observed,
|
||||
sort,
|
||||
)
|
||||
return table.__finalize__(data, method="pivot_table")
|
||||
|
||||
|
||||
def __internal_pivot_table(
|
||||
data: DataFrame,
|
||||
values,
|
||||
index,
|
||||
columns,
|
||||
aggfunc: AggFuncTypeBase | AggFuncTypeDict,
|
||||
fill_value,
|
||||
margins: bool,
|
||||
dropna: bool,
|
||||
margins_name: str,
|
||||
observed: bool,
|
||||
sort: bool,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``.
|
||||
"""
|
||||
keys = index + columns
|
||||
|
||||
values_passed = values is not None
|
||||
if values_passed:
|
||||
if is_list_like(values):
|
||||
values_multi = True
|
||||
values = list(values)
|
||||
else:
|
||||
values_multi = False
|
||||
values = [values]
|
||||
|
||||
# GH14938 Make sure value labels are in data
|
||||
for i in values:
|
||||
if i not in data:
|
||||
raise KeyError(i)
|
||||
|
||||
to_filter = []
|
||||
for x in keys + values:
|
||||
if isinstance(x, Grouper):
|
||||
x = x.key
|
||||
try:
|
||||
if x in data:
|
||||
to_filter.append(x)
|
||||
except TypeError:
|
||||
pass
|
||||
if len(to_filter) < len(data.columns):
|
||||
data = data[to_filter]
|
||||
|
||||
else:
|
||||
values = data.columns
|
||||
for key in keys:
|
||||
try:
|
||||
values = values.drop(key)
|
||||
except (TypeError, ValueError, KeyError):
|
||||
pass
|
||||
values = list(values)
|
||||
|
||||
grouped = data.groupby(keys, observed=observed, sort=sort)
|
||||
agged = grouped.agg(aggfunc)
|
||||
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
|
||||
agged = agged.dropna(how="all")
|
||||
|
||||
# gh-21133
|
||||
# we want to down cast if
|
||||
# the original values are ints
|
||||
# as we grouped with a NaN value
|
||||
# and then dropped, coercing to floats
|
||||
for v in values:
|
||||
if (
|
||||
v in data
|
||||
and is_integer_dtype(data[v])
|
||||
and v in agged
|
||||
and not is_integer_dtype(agged[v])
|
||||
):
|
||||
if not isinstance(agged[v], ABCDataFrame):
|
||||
# exclude DataFrame case bc maybe_downcast_to_dtype expects
|
||||
# ArrayLike
|
||||
# e.g. test_pivot_table_multiindex_columns_doctest_case
|
||||
# agged.columns is a MultiIndex and 'v' is indexing only
|
||||
# on its first level.
|
||||
agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)
|
||||
|
||||
table = agged
|
||||
|
||||
# GH17038, this check should only happen if index is defined (not None)
|
||||
if table.index.nlevels > 1 and index:
|
||||
# Related GH #17123
|
||||
# If index_names are integers, determine whether the integers refer
|
||||
# to the level position or name.
|
||||
index_names = agged.index.names[: len(index)]
|
||||
to_unstack = []
|
||||
for i in range(len(index), len(keys)):
|
||||
name = agged.index.names[i]
|
||||
if name is None or name in index_names:
|
||||
to_unstack.append(i)
|
||||
else:
|
||||
to_unstack.append(name)
|
||||
table = agged.unstack(to_unstack)
|
||||
|
||||
if not dropna:
|
||||
if isinstance(table.index, MultiIndex):
|
||||
m = MultiIndex.from_arrays(
|
||||
cartesian_product(table.index.levels), names=table.index.names
|
||||
)
|
||||
table = table.reindex(m, axis=0)
|
||||
|
||||
if isinstance(table.columns, MultiIndex):
|
||||
m = MultiIndex.from_arrays(
|
||||
cartesian_product(table.columns.levels), names=table.columns.names
|
||||
)
|
||||
table = table.reindex(m, axis=1)
|
||||
|
||||
if isinstance(table, ABCDataFrame):
|
||||
table = table.sort_index(axis=1)
|
||||
|
||||
if fill_value is not None:
|
||||
table = table.fillna(fill_value, downcast="infer")
|
||||
|
||||
if margins:
|
||||
if dropna:
|
||||
data = data[data.notna().all(axis=1)]
|
||||
table = _add_margins(
|
||||
table,
|
||||
data,
|
||||
values,
|
||||
rows=index,
|
||||
cols=columns,
|
||||
aggfunc=aggfunc,
|
||||
observed=dropna,
|
||||
margins_name=margins_name,
|
||||
fill_value=fill_value,
|
||||
)
|
||||
|
||||
# discard the top level
|
||||
if values_passed and not values_multi and table.columns.nlevels > 1:
|
||||
table = table.droplevel(0, axis=1)
|
||||
if len(index) == 0 and len(columns) > 0:
|
||||
table = table.T
|
||||
|
||||
# GH 15193 Make sure empty columns are removed if dropna=True
|
||||
if isinstance(table, ABCDataFrame) and dropna:
|
||||
table = table.dropna(how="all", axis=1)
|
||||
|
||||
return table
|
||||
|
||||
|
||||
def _add_margins(
|
||||
table: DataFrame | Series,
|
||||
data: DataFrame,
|
||||
values,
|
||||
rows,
|
||||
cols,
|
||||
aggfunc,
|
||||
observed=None,
|
||||
margins_name: str = "All",
|
||||
fill_value=None,
|
||||
):
|
||||
if not isinstance(margins_name, str):
|
||||
raise ValueError("margins_name argument must be a string")
|
||||
|
||||
msg = f'Conflicting name "{margins_name}" in margins'
|
||||
for level in table.index.names:
|
||||
if margins_name in table.index.get_level_values(level):
|
||||
raise ValueError(msg)
|
||||
|
||||
grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
|
||||
|
||||
if table.ndim == 2:
|
||||
# i.e. DataFrame
|
||||
for level in table.columns.names[1:]:
|
||||
if margins_name in table.columns.get_level_values(level):
|
||||
raise ValueError(msg)
|
||||
|
||||
key: str | tuple[str, ...]
|
||||
if len(rows) > 1:
|
||||
key = (margins_name,) + ("",) * (len(rows) - 1)
|
||||
else:
|
||||
key = margins_name
|
||||
|
||||
if not values and isinstance(table, ABCSeries):
|
||||
# If there are no values and the table is a series, then there is only
|
||||
# one column in the data. Compute grand margin and return it.
|
||||
return table._append(Series({key: grand_margin[margins_name]}))
|
||||
|
||||
elif values:
|
||||
marginal_result_set = _generate_marginal_results(
|
||||
table, data, values, rows, cols, aggfunc, observed, margins_name
|
||||
)
|
||||
if not isinstance(marginal_result_set, tuple):
|
||||
return marginal_result_set
|
||||
result, margin_keys, row_margin = marginal_result_set
|
||||
else:
|
||||
# no values, and table is a DataFrame
|
||||
assert isinstance(table, ABCDataFrame)
|
||||
marginal_result_set = _generate_marginal_results_without_values(
|
||||
table, data, rows, cols, aggfunc, observed, margins_name
|
||||
)
|
||||
if not isinstance(marginal_result_set, tuple):
|
||||
return marginal_result_set
|
||||
result, margin_keys, row_margin = marginal_result_set
|
||||
|
||||
row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
|
||||
# populate grand margin
|
||||
for k in margin_keys:
|
||||
if isinstance(k, str):
|
||||
row_margin[k] = grand_margin[k]
|
||||
else:
|
||||
row_margin[k] = grand_margin[k[0]]
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
margin_dummy = DataFrame(row_margin, columns=[key]).T
|
||||
|
||||
row_names = result.index.names
|
||||
# check the result column and leave floats
|
||||
for dtype in set(result.dtypes):
|
||||
cols = result.select_dtypes([dtype]).columns
|
||||
margin_dummy[cols] = margin_dummy[cols].apply(
|
||||
maybe_downcast_to_dtype, args=(dtype,)
|
||||
)
|
||||
result = result._append(margin_dummy)
|
||||
result.index.names = row_names
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _compute_grand_margin(data: DataFrame, values, aggfunc, margins_name: str = "All"):
|
||||
|
||||
if values:
|
||||
grand_margin = {}
|
||||
for k, v in data[values].items():
|
||||
try:
|
||||
if isinstance(aggfunc, str):
|
||||
grand_margin[k] = getattr(v, aggfunc)()
|
||||
elif isinstance(aggfunc, dict):
|
||||
if isinstance(aggfunc[k], str):
|
||||
grand_margin[k] = getattr(v, aggfunc[k])()
|
||||
else:
|
||||
grand_margin[k] = aggfunc[k](v)
|
||||
else:
|
||||
grand_margin[k] = aggfunc(v)
|
||||
except TypeError:
|
||||
pass
|
||||
return grand_margin
|
||||
else:
|
||||
return {margins_name: aggfunc(data.index)}
|
||||
|
||||
|
||||
def _generate_marginal_results(
|
||||
table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All"
|
||||
):
|
||||
if len(cols) > 0:
|
||||
# need to "interleave" the margins
|
||||
table_pieces = []
|
||||
margin_keys = []
|
||||
|
||||
def _all_key(key):
|
||||
return (key, margins_name) + ("",) * (len(cols) - 1)
|
||||
|
||||
if len(rows) > 0:
|
||||
margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
|
||||
cat_axis = 1
|
||||
|
||||
for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
|
||||
all_key = _all_key(key)
|
||||
|
||||
# we are going to mutate this, so need to copy!
|
||||
piece = piece.copy()
|
||||
piece[all_key] = margin[key]
|
||||
|
||||
table_pieces.append(piece)
|
||||
margin_keys.append(all_key)
|
||||
else:
|
||||
from pandas import DataFrame
|
||||
|
||||
cat_axis = 0
|
||||
for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
|
||||
if len(cols) > 1:
|
||||
all_key = _all_key(key)
|
||||
else:
|
||||
all_key = margins_name
|
||||
table_pieces.append(piece)
|
||||
# GH31016 this is to calculate margin for each group, and assign
|
||||
# corresponded key as index
|
||||
transformed_piece = DataFrame(piece.apply(aggfunc)).T
|
||||
transformed_piece.index = Index([all_key], name=piece.index.name)
|
||||
|
||||
# append piece for margin into table_piece
|
||||
table_pieces.append(transformed_piece)
|
||||
margin_keys.append(all_key)
|
||||
|
||||
result = concat(table_pieces, axis=cat_axis)
|
||||
|
||||
if len(rows) == 0:
|
||||
return result
|
||||
else:
|
||||
result = table
|
||||
margin_keys = table.columns
|
||||
|
||||
if len(cols) > 0:
|
||||
row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
|
||||
row_margin = row_margin.stack()
|
||||
|
||||
# slight hack
|
||||
new_order = [len(cols)] + list(range(len(cols)))
|
||||
row_margin.index = row_margin.index.reorder_levels(new_order)
|
||||
else:
|
||||
row_margin = Series(np.nan, index=result.columns)
|
||||
|
||||
return result, margin_keys, row_margin
|
||||
|
||||
|
||||
def _generate_marginal_results_without_values(
    table: DataFrame, data, rows, cols, aggfunc, observed, margins_name: str = "All"
):
    if len(cols) > 0:
        # need to "interleave" the margins
        margin_keys: list | Index = []

        def _all_key():
            if len(cols) == 1:
                return margins_name
            return (margins_name,) + ("",) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows].groupby(rows, observed=observed).apply(aggfunc)
            all_key = _all_key()
            table[all_key] = margin
            result = table
            margin_keys.append(all_key)

        else:
            margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc)
            all_key = _all_key()
            table[all_key] = margin
            result = table
            margin_keys.append(all_key)
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols):
        row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin


def _convert_by(by):
    if by is None:
        by = []
    elif (
        is_scalar(by)
        or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper))
        or callable(by)
    ):
        by = [by]
    else:
        by = list(by)
    return by


@Substitution("\ndata : DataFrame")
@Appender(_shared_docs["pivot"], indents=1)
def pivot(
    data: DataFrame,
    index: IndexLabel | None = None,
    columns: IndexLabel | None = None,
    values: IndexLabel | None = None,
) -> DataFrame:
    if columns is None:
        raise TypeError("pivot() missing 1 required argument: 'columns'")

    columns_listlike = com.convert_to_list_like(columns)

    if values is None:
        if index is not None:
            cols = com.convert_to_list_like(index)
        else:
            cols = []

        append = index is None
        # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
        # error: Unsupported left operand type for + ("ExtensionArray")
        indexed = data.set_index(
            cols + columns_listlike, append=append  # type: ignore[operator]
        )
    else:
        if index is None:
            if isinstance(data.index, MultiIndex):
                # GH 23955
                index_list = [
                    data.index.get_level_values(i) for i in range(data.index.nlevels)
                ]
            else:
                index_list = [Series(data.index, name=data.index.name)]
        else:
            index_list = [data[idx] for idx in com.convert_to_list_like(index)]

        data_columns = [data[col] for col in columns_listlike]
        index_list.extend(data_columns)
        multiindex = MultiIndex.from_arrays(index_list)

        if is_list_like(values) and not isinstance(values, tuple):
            # Exclude tuple because it is seen as a single column name
            values = cast(Sequence[Hashable], values)
            indexed = data._constructor(
                data[values]._values, index=multiindex, columns=values
            )
        else:
            indexed = data._constructor_sliced(data[values]._values, index=multiindex)
    return indexed.unstack(columns_listlike)


def crosstab(
    index,
    columns,
    values=None,
    rownames=None,
    colnames=None,
    aggfunc=None,
    margins: bool = False,
    margins_name: str = "All",
    dropna: bool = True,
    normalize=False,
) -> DataFrame:
    """
    Compute a simple cross tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed.

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows.
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns.
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    rownames : sequence, default None
        If passed, must match number of row arrays passed.
    colnames : sequence, default None
        If passed, must match number of column arrays passed.
    aggfunc : function, optional
        If specified, requires `values` be specified as well.
    margins : bool, default False
        Add row/column margins (subtotals).
    margins_name : str, default 'All'
        Name of the row/column that will contain the totals
        when margins is True.
    dropna : bool, default True
        Do not include columns whose entries are all NaN.
    normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

    Returns
    -------
    DataFrame
        Cross tabulation of the data.

    See Also
    --------
    DataFrame.pivot : Reshape data based on column values.
    pivot_table : Create a pivot table as a DataFrame.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Reference :ref:`the user guide <reshaping.crosstabulations>` for more examples.

    Examples
    --------
    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
    >>> b = np.array(["one", "one", "one", "two", "one", "one",
    ...               "one", "two", "two", "two", "one"], dtype=object)
    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
    ...              dtype=object)
    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b   one        two
    c   dull shiny dull shiny
    a
    bar    1     2    1     0
    foo    2     2    1     2

    Here 'c' and 'f' are not represented in the data and will not be
    shown in the output because dropna is True by default. Set
    dropna=False to preserve categories with no data.

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> pd.crosstab(foo, bar)
    col_0  d  e
    row_0
    a      1  0
    b      0  1
    >>> pd.crosstab(foo, bar, dropna=False)
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0
    """
    if values is None and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")

    if values is not None and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    if not is_nested_list_like(index):
        index = [index]
    if not is_nested_list_like(columns):
        columns = [columns]

    common_idx = None
    pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))]
    if pass_objs:
        common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False)

    rownames = _get_names(index, rownames, prefix="row")
    colnames = _get_names(columns, colnames, prefix="col")

    # duplicate names mapped to unique names for pivot op
    (
        rownames_mapper,
        unique_rownames,
        colnames_mapper,
        unique_colnames,
    ) = _build_names_mapper(rownames, colnames)

    from pandas import DataFrame

    data = {
        **dict(zip(unique_rownames, index)),
        **dict(zip(unique_colnames, columns)),
    }
    df = DataFrame(data, index=common_idx)

    if values is None:
        df["__dummy__"] = 0
        kwargs = {"aggfunc": len, "fill_value": 0}
    else:
        df["__dummy__"] = values
        kwargs = {"aggfunc": aggfunc}

    table = df.pivot_table(
        "__dummy__",
        index=unique_rownames,
        columns=unique_colnames,
        margins=margins,
        margins_name=margins_name,
        dropna=dropna,
        **kwargs,
    )

    # Post-process
    if normalize is not False:
        table = _normalize(
            table, normalize=normalize, margins=margins, margins_name=margins_name
        )

    table = table.rename_axis(index=rownames_mapper, axis=0)
    table = table.rename_axis(columns=colnames_mapper, axis=1)

    return table


def _normalize(
    table: DataFrame, normalize, margins: bool, margins_name="All"
) -> DataFrame:

    if not isinstance(normalize, (bool, str)):
        axis_subs = {0: "index", 1: "columns"}
        try:
            normalize = axis_subs[normalize]
        except KeyError as err:
            raise ValueError("Not a valid normalize argument") from err

    if margins is False:

        # Actual Normalizations
        normalizers: dict[bool | str, Callable] = {
            "all": lambda x: x / x.sum(axis=1).sum(axis=0),
            "columns": lambda x: x / x.sum(),
            "index": lambda x: x.div(x.sum(axis=1), axis=0),
        }

        normalizers[True] = normalizers["all"]

        try:
            f = normalizers[normalize]
        except KeyError as err:
            raise ValueError("Not a valid normalize argument") from err

        table = f(table)
        table = table.fillna(0)

    elif margins is True:
        # keep index and column of pivoted table
        table_index = table.index
        table_columns = table.columns
        last_ind_or_col = table.iloc[-1, :].name

        # check if margin name is not in (for MI cases) and not equal to last
        # index/column and save the column and index margin
        if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col):
            raise ValueError(f"{margins_name} not in pivoted DataFrame")
        column_margin = table.iloc[:-1, -1]
        index_margin = table.iloc[-1, :-1]

        # keep the core table
        table = table.iloc[:-1, :-1]

        # Normalize core
        table = _normalize(table, normalize=normalize, margins=False)

        # Fix Margins
        if normalize == "columns":
            column_margin = column_margin / column_margin.sum()
            table = concat([table, column_margin], axis=1)
            table = table.fillna(0)
            table.columns = table_columns

        elif normalize == "index":
            index_margin = index_margin / index_margin.sum()
            table = table._append(index_margin)
            table = table.fillna(0)
            table.index = table_index

        elif normalize == "all" or normalize is True:
            column_margin = column_margin / column_margin.sum()
            index_margin = index_margin / index_margin.sum()
            index_margin.loc[margins_name] = 1
            table = concat([table, column_margin], axis=1)
            table = table._append(index_margin)

            table = table.fillna(0)
            table.index = table_index
            table.columns = table_columns

        else:
            raise ValueError("Not a valid normalize argument")

    else:
        raise ValueError("Not a valid margins argument")

    return table


def _get_names(arrs, names, prefix: str = "row"):
    if names is None:
        names = []
        for i, arr in enumerate(arrs):
            if isinstance(arr, ABCSeries) and arr.name is not None:
                names.append(arr.name)
            else:
                names.append(f"{prefix}_{i}")
    else:
        if len(names) != len(arrs):
            raise AssertionError("arrays and names must have the same length")
        if not isinstance(names, list):
            names = list(names)

    return names


def _build_names_mapper(
    rownames: list[str], colnames: list[str]
) -> tuple[dict[str, str], list[str], dict[str, str], list[str]]:
    """
    Given the names of a DataFrame's rows and columns, returns a set of unique row
    and column names and mappers that convert to original names.

    A row or column name is replaced if it is duplicate among the rows of the inputs,
    among the columns of the inputs or between the rows and the columns.

    Parameters
    ----------
    rownames: list[str]
    colnames: list[str]

    Returns
    -------
    Tuple(Dict[str, str], List[str], Dict[str, str], List[str])

    rownames_mapper: dict[str, str]
        a dictionary with new row names as keys and original rownames as values
    unique_rownames: list[str]
        a list of rownames with duplicate names replaced by dummy names
    colnames_mapper: dict[str, str]
        a dictionary with new column names as keys and original column names as values
    unique_colnames: list[str]
        a list of column names with duplicate names replaced by dummy names

    """

    def get_duplicates(names):
        seen: set = set()
        return {name for name in names if name not in seen}

    shared_names = set(rownames).intersection(set(colnames))
    dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names

    rownames_mapper = {
        f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names
    }
    unique_rownames = [
        f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames)
    ]

    colnames_mapper = {
        f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names
    }
    unique_colnames = [
        f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames)
    ]

    return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames
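The margin and normalization helpers above are reached through the public crosstab entry point. A minimal usage sketch (standard pandas API, not part of the vendored file) showing how margins and normalize combine:

import numpy as np
import pandas as pd

a = np.array(["foo", "foo", "bar", "bar", "foo"])
b = np.array(["one", "two", "one", "two", "one"])

# Raw counts plus an "All" row/column appended by the margins machinery.
print(pd.crosstab(a, b, margins=True))

# Proportions of the grand total; the margins branch of _normalize
# rescales the appended totals as well.
print(pd.crosstab(a, b, margins=True, normalize="all"))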
1143
.venv/Lib/site-packages/pandas/core/reshape/reshape.py
Normal file
1143
.venv/Lib/site-packages/pandas/core/reshape/reshape.py
Normal file
File diff suppressed because it is too large
646
.venv/Lib/site-packages/pandas/core/reshape/tile.py
Normal file
646
.venv/Lib/site-packages/pandas/core/reshape/tile.py
Normal file
@ -0,0 +1,646 @@
"""
Quantilization functions and related stuff
"""
from __future__ import annotations

from typing import (
    Any,
    Callable,
    Literal,
)

import numpy as np

from pandas._libs import (
    Timedelta,
    Timestamp,
)
from pandas._libs.lib import infer_dtype

from pandas.core.dtypes.common import (
    DT64NS_DTYPE,
    ensure_platform_int,
    is_bool_dtype,
    is_categorical_dtype,
    is_datetime64_dtype,
    is_datetime64tz_dtype,
    is_datetime_or_timedelta_dtype,
    is_extension_array_dtype,
    is_integer,
    is_list_like,
    is_numeric_dtype,
    is_scalar,
    is_timedelta64_dtype,
)
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.dtypes.missing import isna

from pandas import (
    Categorical,
    Index,
    IntervalIndex,
    to_datetime,
    to_timedelta,
)
import pandas.core.algorithms as algos
import pandas.core.nanops as nanops


def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Bin values into discrete intervals.

    Use `cut` when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable. For example, `cut` could convert ages to groups of
    age ranges. Supports binning into an equal number of bins, or a
    pre-specified array of bins.

    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.

        * int : Defines the number of equal-width bins in the range of `x`. The
          range of `x` is extended by .1% on each side to include the minimum
          and maximum values of `x`.
        * sequence of scalars : Defines the bin edges allowing for non-uniform
          width. No extension of the range of `x` is done.
        * IntervalIndex : Defines the exact bins to be used. Note that
          IntervalIndex for `bins` must be non-overlapping.

    right : bool, default True
        Indicates whether `bins` includes the rightmost edge or not. If
        ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
        indicate (1,2], (2,3], (3,4]. This argument is ignored when
        `bins` is an IntervalIndex.
    labels : array or False, default None
        Specifies the labels for the returned bins. Must be the same length as
        the resulting bins. If False, returns only integer indicators of the
        bins. This affects the type of the output container (see below).
        This argument is ignored when `bins` is an IntervalIndex. If True,
        raises an error. When `ordered=False`, labels must be provided.
    retbins : bool, default False
        Whether to return the bins or not. Useful when bins is provided
        as a scalar.
    precision : int, default 3
        The precision at which to store and display the bins labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.
    ordered : bool, default True
        Whether the labels are ordered or not. Applies to returned types
        Categorical and Series (with Categorical dtype). If True,
        the resulting categorical will be ordered. If False, the resulting
        categorical will be unordered (labels must be provided).

        .. versionadded:: 1.1.0

    Returns
    -------
    out : Categorical, Series, or ndarray
        An array-like object representing the respective bin for each value
        of `x`. The type depends on the value of `labels`.

        * None (default) : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are Interval dtype.

        * sequence of scalars : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are whatever the type in the sequence is.

        * False : returns an ndarray of integers.

    bins : numpy.ndarray or IntervalIndex.
        The computed or specified bins. Only returned when `retbins=True`.
        For scalar or sequence `bins`, this is an ndarray with the computed
        bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
        an IntervalIndex `bins`, this is equal to `bins`.

    See Also
    --------
    qcut : Discretize variable into equal-sized buckets based on rank
        or based on sample quantiles.
    Categorical : Array type for storing data that come from a
        fixed set of values.
    Series : One-dimensional array with axis labels (including time series).
    IntervalIndex : Immutable Index implementing an ordered, sliceable set.

    Notes
    -----
    Any NA values will be NA in the result. Out of bounds values will be NA in
    the resulting Series or Categorical object.

    Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.

    Examples
    --------
    Discretize into three equal-sized bins.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    ... # doctest: +ELLIPSIS
    [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    ... # doctest: +ELLIPSIS
    ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
    array([0.994, 3.   , 5.   , 7.   ]))

    Discovers the same bins, but assign them specific labels. Notice that
    the returned Categorical's categories are `labels` and is ordered.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    ['bad', 'good', 'medium', 'medium', 'good', 'bad']
    Categories (3, object): ['bad' < 'medium' < 'good']

    ``ordered=False`` will result in unordered categories when labels are passed.
    This parameter can be used to allow non-unique labels:

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
    ...        labels=["B", "A", "B"], ordered=False)
    ['B', 'B', 'A', 'A', 'B', 'B']
    Categories (2, object): ['A', 'B']

    ``labels=False`` implies you just want the bins back.

    >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3])

    Passing a Series as an input returns a Series with categorical dtype:

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, 3)
    ... # doctest: +ELLIPSIS
    a    (1.992, 4.667]
    b    (1.992, 4.667]
    c    (4.667, 7.333]
    d     (7.333, 10.0]
    e     (7.333, 10.0]
    dtype: category
    Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...

    Passing a Series as an input returns a Series with mapping value.
    It is used to map numerically to intervals based on bins.

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    4.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6,  8, 10]))

    Use `drop` optional when bins is not unique

    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
    ...        right=False, duplicates='drop')
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    3.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6, 10]))

    Passing an IntervalIndex for `bins` results in those categories exactly.
    Notice that values not covered by the IntervalIndex are set to NaN. 0
    is to the left of the first bin (which is closed on the right), and 1.5
    falls between two bins.

    >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
    [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
    Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
    """
    # NOTE: this binning code is changed a bit from histogram for var(x) == 0

    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    if not np.iterable(bins):
        if is_scalar(bins) and bins < 1:
            raise ValueError("`bins` should be a positive integer.")

        try:  # for array-like
            sz = x.size
        except AttributeError:
            x = np.asarray(x)
            sz = x.size

        if sz == 0:
            raise ValueError("Cannot cut empty array")

        rng = (nanops.nanmin(x), nanops.nanmax(x))
        mn, mx = (mi + 0.0 for mi in rng)

        if np.isinf(mn) or np.isinf(mx):
            # GH 24314
            raise ValueError(
                "cannot specify integer `bins` when input data contains infinity"
            )
        elif mn == mx:  # adjust end points before binning
            mn -= 0.001 * abs(mn) if mn != 0 else 0.001
            mx += 0.001 * abs(mx) if mx != 0 else 0.001
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
        else:  # adjust end points after binning
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            adj = (mx - mn) * 0.001  # 0.1% of the range
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj

    elif isinstance(bins, IntervalIndex):
        if bins.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    else:
        if is_datetime64tz_dtype(bins):
            bins = np.asarray(bins, dtype=DT64NS_DTYPE)
        else:
            bins = np.asarray(bins)
        bins = _convert_bin_to_numeric_type(bins, dtype)

        # GH 26045: cast to float64 to avoid an overflow
        if (np.diff(bins.astype("float64")) < 0).any():
            raise ValueError("bins must increase monotonically.")

    fac, bins = _bins_to_cuts(
        x,
        bins,
        right=right,
        labels=labels,
        precision=precision,
        include_lowest=include_lowest,
        dtype=dtype,
        duplicates=duplicates,
        ordered=ordered,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)


def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q

    x_np = np.asarray(x)
    x_np = x_np[~np.isnan(x_np)]
    bins = np.quantile(x_np, quantiles)

    fac, bins = _bins_to_cuts(
        x,
        bins,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)


def _bins_to_cuts(
    x,
    bins: np.ndarray,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
    ordered: bool = True,
):
    if not ordered and labels is None:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        else:
            bins = unique_bins

    side: Literal["left", "right"] = "left" if right else "right"
    ids = ensure_platform_int(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[np.asarray(x) == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )

        elif labels is None:
            labels = _format_labels(
                bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
            )
        elif ordered and len(set(labels)) != len(labels):
            raise ValueError(
                "labels must be unique if ordered=True; pass ordered=False "
                "for duplicate labels"
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
        if not is_categorical_dtype(labels):
            labels = Categorical(
                labels,
                categories=labels if len(set(labels)) == len(labels) else None,
                ordered=ordered,
            )
        # TODO: handle mismatch between categorical label order and pandas.cut order.
        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins


def _coerce_to_type(x):
    """
    if the passed data is of datetime/timedelta, bool or nullable int type,
    this method converts it to numeric so that cut or qcut method can
    handle it
    """
    dtype = None

    if is_datetime64tz_dtype(x.dtype):
        dtype = x.dtype
    elif is_datetime64_dtype(x.dtype):
        x = to_datetime(x)
        dtype = np.dtype("datetime64[ns]")
    elif is_timedelta64_dtype(x.dtype):
        x = to_timedelta(x)
        dtype = np.dtype("timedelta64[ns]")
    elif is_bool_dtype(x.dtype):
        # GH 20303
        x = x.astype(np.int64)
    # To support cut and qcut for IntegerArray we convert to float dtype.
    # Will properly support in the future.
    # https://github.com/pandas-dev/pandas/pull/31290
    # https://github.com/pandas-dev/pandas/issues/31389
    elif is_extension_array_dtype(x.dtype) and is_numeric_dtype(x.dtype):
        x = x.to_numpy(dtype=np.float64, na_value=np.nan)

    if dtype is not None:
        # GH 19768: force NaT to NaN during integer conversion
        x = np.where(x.notna(), x.view(np.int64), np.nan)

    return x, dtype


def _convert_bin_to_numeric_type(bins, dtype):
    """
    if the passed bin is of datetime/timedelta type,
    this method converts it to integer

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Raises
    ------
    ValueError if bins are not of a compat dtype to dtype
    """
    bins_dtype = infer_dtype(bins, skipna=False)
    if is_timedelta64_dtype(dtype):
        if bins_dtype in ["timedelta", "timedelta64"]:
            bins = to_timedelta(bins).view(np.int64)
        else:
            raise ValueError("bins must be of timedelta64 dtype")
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        if bins_dtype in ["datetime", "datetime64"]:
            bins = to_datetime(bins).view(np.int64)
        else:
            raise ValueError("bins must be of datetime64 dtype")

    return bins


def _convert_bin_to_datelike_type(bins, dtype):
    """
    Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
    datelike

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Returns
    -------
    bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
           datelike
    """
    if is_datetime64tz_dtype(dtype):
        bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
    elif is_datetime_or_timedelta_dtype(dtype):
        bins = Index(bins.astype(np.int64), dtype=dtype)
    return bins


def _format_labels(
    bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None
):
    """based on the dtype, return our labels"""
    closed = "right" if right else "left"

    formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]

    if is_datetime64tz_dtype(dtype):
        formatter = lambda x: Timestamp(x, tz=dtype.tz)
        adjust = lambda x: x - Timedelta("1ns")
    elif is_datetime64_dtype(dtype):
        formatter = Timestamp
        adjust = lambda x: x - Timedelta("1ns")
    elif is_timedelta64_dtype(dtype):
        formatter = Timedelta
        adjust = lambda x: x - Timedelta("1ns")
    else:
        precision = _infer_precision(precision, bins)
        formatter = lambda x: _round_frac(x, precision)
        adjust = lambda x: x - 10 ** (-precision)

    breaks = [formatter(b) for b in bins]
    if right and include_lowest:
        # adjust lhs of first interval by precision to account for being right closed
        breaks[0] = adjust(breaks[0])

    return IntervalIndex.from_breaks(breaks, closed=closed)


def _preprocess_for_cut(x):
    """
    handles preprocessing for cut where we convert passed
    input to array, strip the index information and store it
    separately
    """
    # Check that the passed array is a Pandas or Numpy object
    # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
    ndim = getattr(x, "ndim", None)
    if ndim is None:
        x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Input array must be 1 dimensional")

    return x


def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original):
    """
    handles post processing for the cut method where
    we combine the index information if the originally passed
    datatype was a series
    """
    if isinstance(original, ABCSeries):
        fac = original._constructor(fac, index=original.index, name=original.name)

    if not retbins:
        return fac

    bins = _convert_bin_to_datelike_type(bins, dtype)

    return fac, bins


def _round_frac(x, precision: int):
    """
    Round the fractional part of the given number
    """
    if not np.isfinite(x) or x == 0:
        return x
    else:
        frac, whole = np.modf(x)
        if whole == 0:
            digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
        else:
            digits = precision
        return np.around(x, digits)


def _infer_precision(base_precision: int, bins) -> int:
    """
    Infer an appropriate precision for _round_frac
    """
    for precision in range(base_precision, 20):
        levels = [_round_frac(b, precision) for b in bins]
        if algos.unique(levels).size == bins.size:
            return precision
    return base_precision  # default
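A short sketch (standard pandas API, not part of the vendored file) of the cut/qcut paths implemented above, including the duplicates="drop" handling forwarded to _bins_to_cuts:

import pandas as pd

ages = pd.Series([3, 17, 25, 40, 41, 90])

# Three equal-width bins; the lowest edge is stretched slightly so the
# minimum value falls inside the first interval.
print(pd.cut(ages, bins=3))

# Quartiles returned as integer codes instead of Interval labels.
print(pd.qcut(ages, q=4, labels=False))

# Repeated edges raise by default; duplicates="drop" collapses them.
print(pd.cut(ages, bins=[0, 18, 18, 100], duplicates="drop"))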
70
.venv/Lib/site-packages/pandas/core/reshape/util.py
Normal file
70
.venv/Lib/site-packages/pandas/core/reshape/util.py
Normal file
@ -0,0 +1,70 @@
import numpy as np

from pandas.core.dtypes.common import is_list_like


def cartesian_product(X):
    """
    Numpy version of itertools.product.
    Sometimes faster (for large inputs)...

    Parameters
    ----------
    X : list-like of list-likes

    Returns
    -------
    product : list of ndarrays

    Examples
    --------
    >>> cartesian_product([list('ABC'), [1, 2]])
    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])]

    See Also
    --------
    itertools.product : Cartesian product of input iterables. Equivalent to
        nested for-loops.
    """
    msg = "Input must be a list-like of list-likes"
    if not is_list_like(X):
        raise TypeError(msg)
    for x in X:
        if not is_list_like(x):
            raise TypeError(msg)

    if len(X) == 0:
        return []

    lenX = np.fromiter((len(x) for x in X), dtype=np.intp)
    cumprodX = np.cumproduct(lenX)

    if np.any(cumprodX < 0):
        raise ValueError("Product space too large to allocate arrays!")

    a = np.roll(cumprodX, 1)
    a[0] = 1

    if cumprodX[-1] != 0:
        b = cumprodX[-1] / cumprodX
    else:
        # if any factor is empty, the cartesian product is empty
        b = np.zeros_like(cumprodX)

    return [tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)]


def tile_compat(arr, num: int):
    """
    Index compat for np.tile.

    Notes
    -----
    Does not support multi-dimensional `num`.
    """
    if isinstance(arr, np.ndarray):
        return np.tile(arr, num)

    # Otherwise we have an Index
    taker = np.tile(np.arange(len(arr)), num)
    return arr.take(taker)
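A minimal sketch comparing cartesian_product with the itertools equivalent named in its docstring; the import path below is the internal pandas.core.reshape.util module shown above and may move between pandas versions:

import itertools

from pandas.core.reshape.util import cartesian_product

letters, nums = list("ABC"), [1, 2]
cols = cartesian_product([letters, nums])

# Row-major pairing, matching itertools.product over the same inputs.
pairs = list(zip(cols[0], cols[1]))
assert pairs == list(itertools.product(letters, nums))
print(pairs)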