Mirror of https://github.com/aykhans/AzSuicideDataVisualization.git, synced 2025-07-02 06:22:25 +00:00.

Commit: first commit

.venv/Lib/site-packages/pandas/tests/base/common.py (new normal file, 9 lines)
@@ -0,0 +1,9 @@
from typing import Any

from pandas import Index


def allow_na_ops(obj: Any) -> bool:
    """Whether to skip test cases including NaN"""
    is_bool_index = isinstance(obj, Index) and obj.is_boolean()
    return not is_bool_index and obj._can_hold_na
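
`allow_na_ops` is the shared gate used by the test modules later in this commit (test_fillna.py, test_unique.py, test_value_counts.py) before they overwrite values with NA. A minimal usage sketch mirroring that pattern (the test function itself is illustrative, not part of the commit):

    import pytest
    from pandas.tests.base.common import allow_na_ops

    def test_with_nan(index_or_series_obj):
        obj = index_or_series_obj
        if not allow_na_ops(obj):
            # boolean Index objects and dtypes that cannot hold NA are skipped
            pytest.skip("type doesn't allow for NA operations")
        ...  # mutate obj._values with np.nan / None as the tests below do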

.venv/Lib/site-packages/pandas/tests/base/test_constructors.py (new normal file, 176 lines)
@@ -0,0 +1,176 @@
from datetime import datetime
import sys

import numpy as np
import pytest

from pandas.compat import PYPY

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm
from pandas.core.accessor import PandasDelegate
from pandas.core.base import (
    NoNewAttributesMixin,
    PandasObject,
)


@pytest.fixture(
    params=[
        Series,
        lambda x, **kwargs: DataFrame({"a": x}, **kwargs)["a"],
        lambda x, **kwargs: DataFrame(x, **kwargs)[0],
        Index,
    ],
    ids=["Series", "DataFrame-dict", "DataFrame-array", "Index"],
)
def constructor(request):
    return request.param


class TestPandasDelegate:
    class Delegator:
        _properties = ["foo"]
        _methods = ["bar"]

        def _set_foo(self, value):
            self.foo = value

        def _get_foo(self):
            return self.foo

        foo = property(_get_foo, _set_foo, doc="foo property")

        def bar(self, *args, **kwargs):
            """a test bar method"""
            pass

    class Delegate(PandasDelegate, PandasObject):
        def __init__(self, obj):
            self.obj = obj

    def setup_method(self, method):
        pass

    def test_invalid_delegation(self):
        # these show that in order for the delegation to work
        # the _delegate_* methods need to be overridden to not raise
        # a TypeError

        self.Delegate._add_delegate_accessors(
            delegate=self.Delegator,
            accessors=self.Delegator._properties,
            typ="property",
        )
        self.Delegate._add_delegate_accessors(
            delegate=self.Delegator, accessors=self.Delegator._methods, typ="method"
        )

        delegate = self.Delegate(self.Delegator())

        msg = "You cannot access the property foo"
        with pytest.raises(TypeError, match=msg):
            delegate.foo

        msg = "The property foo cannot be set"
        with pytest.raises(TypeError, match=msg):
            delegate.foo = 5

        msg = "You cannot access the property foo"
        with pytest.raises(TypeError, match=msg):
            delegate.foo()

    @pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
    def test_memory_usage(self):
        # Delegate does not implement memory_usage.
        # Check that we fall back to in-built `__sizeof__`
        # GH 12924
        delegate = self.Delegate(self.Delegator())
        sys.getsizeof(delegate)


class TestNoNewAttributesMixin:
    def test_mixin(self):
        class T(NoNewAttributesMixin):
            pass

        t = T()
        assert not hasattr(t, "__frozen")

        t.a = "test"
        assert t.a == "test"

        t._freeze()
        assert "__frozen" in dir(t)
        assert getattr(t, "__frozen")
        msg = "You cannot add any new attribute"
        with pytest.raises(AttributeError, match=msg):
            t.b = "test"

        assert not hasattr(t, "b")


class TestConstruction:
    # test certain constructor behaviours on dtype inference across Series,
    # Index and DataFrame

    @pytest.mark.parametrize(
        "klass",
        [
            Series,
            lambda x, **kwargs: DataFrame({"a": x}, **kwargs)["a"],
            lambda x, **kwargs: DataFrame(x, **kwargs)[0],
            Index,
        ],
    )
    @pytest.mark.parametrize(
        "a",
        [
            np.array(["2263-01-01"], dtype="datetime64[D]"),
            np.array([datetime(2263, 1, 1)], dtype=object),
            np.array([np.datetime64("2263-01-01", "D")], dtype=object),
            np.array(["2263-01-01"], dtype=object),
        ],
        ids=[
            "datetime64[D]",
            "object-datetime.datetime",
            "object-numpy-scalar",
            "object-string",
        ],
    )
    def test_constructor_datetime_outofbound(self, a, klass):
        # GH-26853 (+ bug GH-26206 out of bound non-ns unit)

        # No dtype specified (dtype inference)
        # datetime64[non-ns] raise error, other cases result in object dtype
        # and preserve original data
        if a.dtype.kind == "M":
            msg = "Out of bounds"
            with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg):
                klass(a)
        else:
            result = klass(a)
            assert result.dtype == "object"
            tm.assert_numpy_array_equal(result.to_numpy(), a)

        # Explicit dtype specified
        # Forced conversion fails for all -> all cases raise error
        msg = "Out of bounds"
        with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg):
            klass(a, dtype="datetime64[ns]")

    def test_constructor_datetime_nonns(self, constructor):
        arr = np.array(["2020-01-01T00:00:00.000000"], dtype="datetime64[us]")
        expected = constructor(pd.to_datetime(["2020-01-01"]))
        result = constructor(arr)
        tm.assert_equal(result, expected)

        # https://github.com/pandas-dev/pandas/issues/34843
        arr.flags.writeable = False
        result = constructor(arr)
        tm.assert_equal(result, expected)
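
`test_invalid_delegation` above exercises the default behaviour: `PandasDelegate` raises `TypeError` from `_delegate_property_get`, `_delegate_property_set`, and `_delegate_method` until a subclass overrides them. A minimal sketch of a delegate that actually forwards to the wrapped object, assuming only the internal `pandas.core.accessor` API already used above (`ForwardingDelegate` is an illustrative name, not part of pandas):

    from pandas.core.accessor import PandasDelegate
    from pandas.core.base import PandasObject

    class ForwardingDelegate(PandasDelegate, PandasObject):
        def __init__(self, obj):
            self.obj = obj

        def _delegate_property_get(self, name):
            # forward attribute reads to the wrapped object
            return getattr(self.obj, name)

        def _delegate_property_set(self, name, value):
            # forward attribute writes to the wrapped object
            setattr(self.obj, name, value)

        def _delegate_method(self, name, *args, **kwargs):
            # look up and call the wrapped object's method
            return getattr(self.obj, name)(*args, **kwargs)

With the same `_add_delegate_accessors(delegate=Delegator, accessors=["foo"], typ="property")` call as in the test, `ForwardingDelegate(Delegator()).foo` would then resolve instead of raising.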

.venv/Lib/site-packages/pandas/tests/base/test_conversion.py (new normal file, 509 lines)
@@ -0,0 +1,509 @@
import numpy as np
import pytest

from pandas.core.dtypes.common import (
    is_datetime64_dtype,
    is_timedelta64_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype

import pandas as pd
from pandas import (
    CategoricalIndex,
    Series,
    Timedelta,
    Timestamp,
    date_range,
)
import pandas._testing as tm
from pandas.core.arrays import (
    DatetimeArray,
    IntervalArray,
    PandasArray,
    PeriodArray,
    SparseArray,
    TimedeltaArray,
)


class TestToIterable:
    # test that we convert an iterable to python types

    dtypes = [
        ("int8", int),
        ("int16", int),
        ("int32", int),
        ("int64", int),
        ("uint8", int),
        ("uint16", int),
        ("uint32", int),
        ("uint64", int),
        ("float16", float),
        ("float32", float),
        ("float64", float),
        ("datetime64[ns]", Timestamp),
        ("datetime64[ns, US/Eastern]", Timestamp),
        ("timedelta64[ns]", Timedelta),
    ]

    @pytest.mark.parametrize("dtype, rdtype", dtypes)
    @pytest.mark.parametrize(
        "method",
        [
            lambda x: x.tolist(),
            lambda x: x.to_list(),
            lambda x: list(x),
            lambda x: list(x.__iter__()),
        ],
        ids=["tolist", "to_list", "list", "iter"],
    )
    def test_iterable(self, index_or_series, method, dtype, rdtype):
        # gh-10904
        # gh-13258
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        s = typ([1], dtype=dtype)
        result = method(s)[0]
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize(
        "dtype, rdtype, obj",
        [
            ("object", object, "a"),
            ("object", int, 1),
            ("category", object, "a"),
            ("category", int, 1),
        ],
    )
    @pytest.mark.parametrize(
        "method",
        [
            lambda x: x.tolist(),
            lambda x: x.to_list(),
            lambda x: list(x),
            lambda x: list(x.__iter__()),
        ],
        ids=["tolist", "to_list", "list", "iter"],
    )
    def test_iterable_object_and_category(
        self, index_or_series, method, dtype, rdtype, obj
    ):
        # gh-10904
        # gh-13258
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        s = typ([obj], dtype=dtype)
        result = method(s)[0]
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize("dtype, rdtype", dtypes)
    def test_iterable_items(self, dtype, rdtype):
        # gh-13258
        # test if items yields the correct boxed scalars
        # this only applies to series
        s = Series([1], dtype=dtype)
        _, result = list(s.items())[0]
        assert isinstance(result, rdtype)

        _, result = list(s.items())[0]
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize(
        "dtype, rdtype", dtypes + [("object", int), ("category", int)]
    )
    def test_iterable_map(self, index_or_series, dtype, rdtype):
        # gh-13236
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        s = typ([1], dtype=dtype)
        result = s.map(type)[0]
        if not isinstance(rdtype, tuple):
            rdtype = (rdtype,)
        assert result in rdtype

    @pytest.mark.parametrize(
        "method",
        [
            lambda x: x.tolist(),
            lambda x: x.to_list(),
            lambda x: list(x),
            lambda x: list(x.__iter__()),
        ],
        ids=["tolist", "to_list", "list", "iter"],
    )
    def test_categorial_datetimelike(self, method):
        i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")])

        result = method(i)[0]
        assert isinstance(result, Timestamp)

    def test_iter_box(self):
        vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")]
        s = Series(vals)
        assert s.dtype == "datetime64[ns]"
        for res, exp in zip(s, vals):
            assert isinstance(res, Timestamp)
            assert res.tz is None
            assert res == exp

        vals = [
            Timestamp("2011-01-01", tz="US/Eastern"),
            Timestamp("2011-01-02", tz="US/Eastern"),
        ]
        s = Series(vals)

        assert s.dtype == "datetime64[ns, US/Eastern]"
        for res, exp in zip(s, vals):
            assert isinstance(res, Timestamp)
            assert res.tz == exp.tz
            assert res == exp

        # timedelta
        vals = [Timedelta("1 days"), Timedelta("2 days")]
        s = Series(vals)
        assert s.dtype == "timedelta64[ns]"
        for res, exp in zip(s, vals):
            assert isinstance(res, Timedelta)
            assert res == exp

        # period
        vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
        s = Series(vals)
        assert s.dtype == "Period[M]"
        for res, exp in zip(s, vals):
            assert isinstance(res, pd.Period)
            assert res.freq == "M"
            assert res == exp


@pytest.mark.parametrize(
    "arr, expected_type, dtype",
    [
        (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"),
        (np.array(["a", "b"]), np.ndarray, "object"),
        (pd.Categorical(["a", "b"]), pd.Categorical, "category"),
        (
            pd.DatetimeIndex(["2017", "2018"], tz="US/Central"),
            DatetimeArray,
            "datetime64[ns, US/Central]",
        ),
        (
            pd.PeriodIndex([2018, 2019], freq="A"),
            PeriodArray,
            pd.core.dtypes.dtypes.PeriodDtype("A-DEC"),
        ),
        (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"),
        (
            pd.DatetimeIndex(["2017", "2018"]),
            DatetimeArray,
            "datetime64[ns]",
        ),
        (
            pd.TimedeltaIndex([10**10]),
            TimedeltaArray,
            "m8[ns]",
        ),
    ],
)
def test_values_consistent(arr, expected_type, dtype):
    l_values = Series(arr)._values
    r_values = pd.Index(arr)._values
    assert type(l_values) is expected_type
    assert type(l_values) is type(r_values)

    tm.assert_equal(l_values, r_values)


@pytest.mark.parametrize("arr", [np.array([1, 2, 3])])
def test_numpy_array(arr):
    ser = Series(arr)
    result = ser.array
    expected = PandasArray(arr)
    tm.assert_extension_array_equal(result, expected)


def test_numpy_array_all_dtypes(any_numpy_dtype):
    ser = Series(dtype=any_numpy_dtype)
    result = ser.array
    if is_datetime64_dtype(any_numpy_dtype):
        assert isinstance(result, DatetimeArray)
    elif is_timedelta64_dtype(any_numpy_dtype):
        assert isinstance(result, TimedeltaArray)
    else:
        assert isinstance(result, PandasArray)


@pytest.mark.parametrize(
    "arr, attr",
    [
        (pd.Categorical(["a", "b"]), "_codes"),
        (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"),
        (pd.array([0, np.nan], dtype="Int64"), "_data"),
        (IntervalArray.from_breaks([0, 1]), "_left"),
        (SparseArray([0, 1]), "_sparse_values"),
        (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"),
        # tz-aware Datetime
        (
            DatetimeArray(
                np.array(
                    ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]"
                ),
                dtype=DatetimeTZDtype(tz="US/Central"),
            ),
            "_data",
        ),
    ],
)
def test_array(arr, attr, index_or_series, request):
    box = index_or_series
    warn = None
    if arr.dtype.name in ("Sparse[int64, 0]",) and box is pd.Index:
        mark = pytest.mark.xfail(reason="Index cannot yet store sparse dtype")
        request.node.add_marker(mark)
        warn = FutureWarning

    with tm.assert_produces_warning(warn):
        result = box(arr, copy=False).array

    if attr:
        arr = getattr(arr, attr)
        result = getattr(result, attr)

    assert result is arr


def test_array_multiindex_raises():
    idx = pd.MultiIndex.from_product([["A"], ["a", "b"]])
    msg = "MultiIndex has no single backing array"
    with pytest.raises(ValueError, match=msg):
        idx.array


@pytest.mark.parametrize(
    "arr, expected",
    [
        (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
        (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)),
        (
            pd.core.arrays.period_array(["2000", "2001"], freq="D"),
            np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
        ),
        (pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)),
        (
            IntervalArray.from_breaks([0, 1, 2]),
            np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
        ),
        (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
        # tz-naive datetime
        (
            DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")),
            np.array(["2000", "2001"], dtype="M8[ns]"),
        ),
        # tz-aware stays tz-aware
        (
            DatetimeArray(
                np.array(
                    ["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]"
                ),
                dtype=DatetimeTZDtype(tz="US/Central"),
            ),
            np.array(
                [
                    Timestamp("2000-01-01", tz="US/Central"),
                    Timestamp("2000-01-02", tz="US/Central"),
                ]
            ),
        ),
        # Timedelta
        (
            TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"),
            np.array([0, 3600000000000], dtype="m8[ns]"),
        ),
        # GH#26406 tz is preserved in Categorical[dt64tz]
        (
            pd.Categorical(date_range("2016-01-01", periods=2, tz="US/Pacific")),
            np.array(
                [
                    Timestamp("2016-01-01", tz="US/Pacific"),
                    Timestamp("2016-01-02", tz="US/Pacific"),
                ]
            ),
        ),
    ],
)
def test_to_numpy(arr, expected, index_or_series_or_array, request):
    box = index_or_series_or_array

    warn = None
    if index_or_series_or_array is pd.Index and isinstance(arr, SparseArray):
        warn = FutureWarning
    with tm.assert_produces_warning(warn):
        thing = box(arr)

    if arr.dtype.name == "int64" and box is pd.array:
        mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
        request.node.add_marker(mark)

    result = thing.to_numpy()
    tm.assert_numpy_array_equal(result, expected)

    result = np.asarray(thing)
    tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("as_series", [True, False])
@pytest.mark.parametrize(
    "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
)
def test_to_numpy_copy(arr, as_series):
    obj = pd.Index(arr, copy=False)
    if as_series:
        obj = Series(obj.values, copy=False)

    # no copy by default
    result = obj.to_numpy()
    assert np.shares_memory(arr, result) is True

    result = obj.to_numpy(copy=False)
    assert np.shares_memory(arr, result) is True

    # copy=True
    result = obj.to_numpy(copy=True)
    assert np.shares_memory(arr, result) is False


@pytest.mark.parametrize("as_series", [True, False])
def test_to_numpy_dtype(as_series):
    tz = "US/Eastern"
    obj = pd.DatetimeIndex(["2000", "2001"], tz=tz)
    if as_series:
        obj = Series(obj)

    # preserve tz by default
    result = obj.to_numpy()
    expected = np.array(
        [Timestamp("2000", tz=tz), Timestamp("2001", tz=tz)], dtype=object
    )
    tm.assert_numpy_array_equal(result, expected)

    result = obj.to_numpy(dtype="object")
    tm.assert_numpy_array_equal(result, expected)

    result = obj.to_numpy(dtype="M8[ns]")
    expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]")
    tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize(
    "values, dtype, na_value, expected",
    [
        ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]),
        (
            [Timestamp("2000"), Timestamp("2000"), pd.NaT],
            None,
            Timestamp("2000"),
            [np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
        ),
    ],
)
def test_to_numpy_na_value_numpy_dtype(
    index_or_series, values, dtype, na_value, expected
):
    obj = index_or_series(values)
    result = obj.to_numpy(dtype=dtype, na_value=na_value)
    expected = np.array(expected)
    tm.assert_numpy_array_equal(result, expected)


def test_to_numpy_kwargs_raises():
    # numpy
    s = Series([1, 2, 3])
    msg = r"to_numpy\(\) got an unexpected keyword argument 'foo'"
    with pytest.raises(TypeError, match=msg):
        s.to_numpy(foo=True)

    # extension
    s = Series([1, 2, 3], dtype="Int64")
    with pytest.raises(TypeError, match=msg):
        s.to_numpy(foo=True)


@pytest.mark.parametrize(
    "data",
    [
        {"a": [1, 2, 3], "b": [1, 2, None]},
        {"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
        {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
    ],
)
@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
def test_to_numpy_dataframe_na_value(data, dtype, na_value):
    # https://github.com/pandas-dev/pandas/issues/33820
    df = pd.DataFrame(data)
    result = df.to_numpy(dtype=dtype, na_value=na_value)
    expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype)
    tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize(
    "data, expected",
    [
        (
            {"a": pd.array([1, 2, None])},
            np.array([[1.0], [2.0], [np.nan]], dtype=float),
        ),
        (
            {"a": [1, 2, 3], "b": [1, 2, 3]},
            np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
        ),
    ],
)
def test_to_numpy_dataframe_single_block(data, expected):
    # https://github.com/pandas-dev/pandas/issues/33820
    df = pd.DataFrame(data)
    result = df.to_numpy(dtype=float, na_value=np.nan)
    tm.assert_numpy_array_equal(result, expected)


def test_to_numpy_dataframe_single_block_no_mutate():
    # https://github.com/pandas-dev/pandas/issues/33820
    result = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
    expected = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
    result.to_numpy(na_value=0.0)
    tm.assert_frame_equal(result, expected)


class TestAsArray:
    @pytest.mark.parametrize("tz", [None, "US/Central"])
    def test_asarray_object_dt64(self, tz):
        ser = Series(date_range("2000", periods=2, tz=tz))

        with tm.assert_produces_warning(None):
            # Future behavior (for tzaware case) with no warning
            result = np.asarray(ser, dtype=object)

        expected = np.array(
            [Timestamp("2000-01-01", tz=tz), Timestamp("2000-01-02", tz=tz)]
        )
        tm.assert_numpy_array_equal(result, expected)

    def test_asarray_tz_naive(self):
        # This shouldn't produce a warning.
        ser = Series(date_range("2000", periods=2))
        expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
        result = np.asarray(ser)

        tm.assert_numpy_array_equal(result, expected)

    def test_asarray_tz_aware(self):
        tz = "US/Central"
        ser = Series(date_range("2000", periods=2, tz=tz))
        expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]")
        result = np.asarray(ser, dtype="datetime64[ns]")

        tm.assert_numpy_array_equal(result, expected)

        # Old behavior with no warning
        result = np.asarray(ser, dtype="M8[ns]")

        tm.assert_numpy_array_equal(result, expected)
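
The `test_array*` and `test_to_numpy*` cases above hinge on the split pandas draws between `.array` (the backing ExtensionArray, no conversion) and `.to_numpy()` (a materialized, possibly converted ndarray). A small sketch of the distinction they assert, grounded in the Int64 expectations parametrized above:

    import numpy as np
    import pandas as pd

    ser = pd.Series(pd.array([0, None], dtype="Int64"))

    backing = ser.array            # IntegerArray; missing value stays pd.NA
    materialized = ser.to_numpy()  # object ndarray: array([0, <NA>], dtype=object)
    filled = ser.to_numpy(dtype="float64", na_value=np.nan)  # array([0., nan])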

.venv/Lib/site-packages/pandas/tests/base/test_fillna.py (new normal file, 60 lines)
@@ -0,0 +1,60 @@
"""
Though Index.fillna and Series.fillna have separate implementations,
test here to confirm they work the same.
"""

import numpy as np
import pytest

from pandas import MultiIndex
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops


def test_fillna(index_or_series_obj):
    # GH 11343
    obj = index_or_series_obj

    if isinstance(obj, MultiIndex):
        msg = "isna is not defined for MultiIndex"
        with pytest.raises(NotImplementedError, match=msg):
            obj.fillna(0)
        return

    # values will not be changed
    fill_value = obj.values[0] if len(obj) > 0 else 0
    result = obj.fillna(fill_value)

    tm.assert_equal(obj, result)

    # check that a shallow copy was made
    assert obj is not result


@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_fillna_null(null_obj, index_or_series_obj):
    # GH 11343
    obj = index_or_series_obj
    klass = type(obj)

    if not allow_na_ops(obj):
        pytest.skip(f"{klass} doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(obj, MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj._values
    fill_value = values[0]
    expected = values.copy()
    values[0:2] = null_obj
    expected[0:2] = fill_value

    expected = klass(expected)
    obj = klass(values)

    result = obj.fillna(fill_value)
    tm.assert_equal(result, expected)

    # check that a shallow copy was made
    assert obj is not result

.venv/Lib/site-packages/pandas/tests/base/test_misc.py (new normal file, 188 lines)
@@ -0,0 +1,188 @@
import sys

import numpy as np
import pytest

from pandas.compat import (
    IS64,
    PYPY,
)

from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_dtype_equal,
    is_object_dtype,
)

import pandas as pd
from pandas import (
    Index,
    Series,
)
import pandas._testing as tm


def test_isnull_notnull_docstrings():
    # GH#41855 make sure it's clear these are aliases
    doc = pd.DataFrame.notnull.__doc__
    assert doc.startswith("\nDataFrame.notnull is an alias for DataFrame.notna.\n")
    doc = pd.DataFrame.isnull.__doc__
    assert doc.startswith("\nDataFrame.isnull is an alias for DataFrame.isna.\n")

    doc = Series.notnull.__doc__
    assert doc.startswith("\nSeries.notnull is an alias for Series.notna.\n")
    doc = Series.isnull.__doc__
    assert doc.startswith("\nSeries.isnull is an alias for Series.isna.\n")


@pytest.mark.parametrize(
    "op_name, op",
    [
        ("add", "+"),
        ("sub", "-"),
        ("mul", "*"),
        ("mod", "%"),
        ("pow", "**"),
        ("truediv", "/"),
        ("floordiv", "//"),
    ],
)
def test_binary_ops_docstring(frame_or_series, op_name, op):
    # not using the all_arithmetic_functions fixture with _get_opstr
    # as _get_opstr is used internally in the dynamic implementation of the docstring
    klass = frame_or_series

    operand1 = klass.__name__.lower()
    operand2 = "other"
    expected_str = " ".join([operand1, op, operand2])
    assert expected_str in getattr(klass, op_name).__doc__

    # reverse version of the binary ops
    expected_str = " ".join([operand2, op, operand1])
    assert expected_str in getattr(klass, "r" + op_name).__doc__


def test_ndarray_compat_properties(index_or_series_obj):
    obj = index_or_series_obj

    # Check that we work.
    for p in ["shape", "dtype", "T", "nbytes"]:
        assert getattr(obj, p, None) is not None

    # deprecated properties
    for p in ["strides", "itemsize", "base", "data"]:
        assert not hasattr(obj, p)

    msg = "can only convert an array of size 1 to a Python scalar"
    with pytest.raises(ValueError, match=msg):
        obj.item()  # len > 1

    assert obj.ndim == 1
    assert obj.size == len(obj)

    assert Index([1]).item() == 1
    assert Series([1]).item() == 1


def test_array_wrap_compat():
    # Note: at time of dask 2022.01.0, this is still used by eg dask
    # (https://github.com/dask/dask/issues/8580).
    # This test is a small dummy ensuring coverage
    orig = Series([1, 2, 3], dtype="int64", index=["a", "b", "c"])
    result = orig.__array_wrap__(np.array([2, 4, 6], dtype="int64"))
    expected = orig * 2
    tm.assert_series_equal(result, expected)


@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
def test_memory_usage(index_or_series_obj):
    obj = index_or_series_obj

    res = obj.memory_usage()
    res_deep = obj.memory_usage(deep=True)

    is_ser = isinstance(obj, Series)
    is_object = is_object_dtype(obj) or (
        isinstance(obj, Series) and is_object_dtype(obj.index)
    )
    is_categorical = is_categorical_dtype(obj.dtype) or (
        isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype)
    )
    is_object_string = is_dtype_equal(obj, "string[python]") or (
        is_ser and is_dtype_equal(obj.index.dtype, "string[python]")
    )

    if len(obj) == 0:
        if isinstance(obj, Index):
            expected = 0
        else:
            expected = 108 if IS64 else 64
        assert res_deep == res == expected
    elif is_object or is_categorical or is_object_string:
        # only deep will pick them up
        assert res_deep > res
    else:
        assert res == res_deep

    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = res_deep - sys.getsizeof(obj)
    assert abs(diff) < 100


def test_memory_usage_components_series(series_with_simple_index):
    series = series_with_simple_index
    total_usage = series.memory_usage(index=True)
    non_index_usage = series.memory_usage(index=False)
    index_usage = series.index.memory_usage()
    assert total_usage == non_index_usage + index_usage


@pytest.mark.parametrize("dtype", tm.NARROW_NP_DTYPES)
def test_memory_usage_components_narrow_series(dtype):
    series = tm.makeFloatSeries(name="a").astype(dtype)
    total_usage = series.memory_usage(index=True)
    non_index_usage = series.memory_usage(index=False)
    index_usage = series.index.memory_usage()
    assert total_usage == non_index_usage + index_usage


def test_searchsorted(index_or_series_obj):
    # numpy.searchsorted calls obj.searchsorted under the hood.
    # See gh-12238
    obj = index_or_series_obj

    if isinstance(obj, pd.MultiIndex):
        # See gh-14833
        pytest.skip("np.searchsorted doesn't work on pd.MultiIndex")

    max_obj = max(obj, default=0)
    index = np.searchsorted(obj, max_obj)
    assert 0 <= index <= len(obj)

    index = np.searchsorted(obj, max_obj, sorter=range(len(obj)))
    assert 0 <= index <= len(obj)


def test_access_by_position(index_flat):
    index = index_flat

    if len(index) == 0:
        pytest.skip("Test doesn't make sense on empty data")

    series = Series(index)
    assert index[0] == series.iloc[0]
    assert index[5] == series.iloc[5]
    assert index[-1] == series.iloc[-1]

    size = len(index)
    assert index[-1] == index[size - 1]

    msg = f"index {size} is out of bounds for axis 0 with size {size}"
    if is_dtype_equal(index.dtype, "string[pyarrow]"):
        msg = "index out of bounds"
    with pytest.raises(IndexError, match=msg):
        index[size]
    msg = "single positional indexer is out-of-bounds"
    with pytest.raises(IndexError, match=msg):
        series.iloc[size]
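
`test_memory_usage_components_series` above pins down the accounting identity behind `memory_usage`; stated directly as a sketch:

    import pandas as pd

    ser = pd.Series(range(3), index=["a", "b", "c"])

    total = ser.memory_usage(index=True)        # default: index included
    values_only = ser.memory_usage(index=False)
    index_only = ser.index.memory_usage()
    assert total == values_only + index_only
    # deep=True additionally sizes the Python objects referenced by object
    # dtypes, which is why the object/categorical branches above expect
    # res_deep > res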

.venv/Lib/site-packages/pandas/tests/base/test_transpose.py (new normal file, 56 lines)
@@ -0,0 +1,56 @@
import numpy as np
import pytest

from pandas import (
    CategoricalDtype,
    DataFrame,
)
import pandas._testing as tm


def test_transpose(index_or_series_obj):
    obj = index_or_series_obj
    tm.assert_equal(obj.transpose(), obj)


def test_transpose_non_default_axes(index_or_series_obj):
    msg = "the 'axes' parameter is not supported"
    obj = index_or_series_obj
    with pytest.raises(ValueError, match=msg):
        obj.transpose(1)
    with pytest.raises(ValueError, match=msg):
        obj.transpose(axes=1)


def test_numpy_transpose(index_or_series_obj):
    msg = "the 'axes' parameter is not supported"
    obj = index_or_series_obj
    tm.assert_equal(np.transpose(obj), obj)

    with pytest.raises(ValueError, match=msg):
        np.transpose(obj, axes=1)


@pytest.mark.parametrize(
    "data, transposed_data, index, columns, dtype",
    [
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
        ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
        (
            [[1, 2], [3, 4]],
            [[1, 3], [2, 4]],
            ["a", "a"],
            ["b", "b"],
            CategoricalDtype([1, 2, 3, 4]),
        ),
    ],
)
def test_duplicate_labels(data, transposed_data, index, columns, dtype):
    # GH 42380
    df = DataFrame(data, index=index, columns=columns, dtype=dtype)
    result = df.T
    expected = DataFrame(transposed_data, index=columns, columns=index, dtype=dtype)
    tm.assert_frame_equal(result, expected)
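
Context for the first three tests: transposing a one-dimensional pandas object is defined as the identity, so only the unsupported `axes` argument is rejected. A quick sketch, assuming current pandas behavior where `Series.transpose` returns self:

    import numpy as np
    import pandas as pd

    ser = pd.Series([1, 2, 3])
    assert ser.transpose() is ser    # 1-D transpose is the identity
    assert np.transpose(ser) is ser  # numpy defers to the pandas method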

.venv/Lib/site-packages/pandas/tests/base/test_unique.py (new normal file, 131 lines)
@@ -0,0 +1,131 @@
import numpy as np
import pytest

from pandas.core.dtypes.common import is_datetime64tz_dtype

import pandas as pd
import pandas._testing as tm
from pandas.core.api import NumericIndex
from pandas.tests.base.common import allow_na_ops


def test_unique(index_or_series_obj):
    obj = index_or_series_obj
    obj = np.repeat(obj, range(1, len(obj) + 1))
    result = obj.unique()

    # dict.fromkeys preserves the order
    unique_values = list(dict.fromkeys(obj.values))
    if isinstance(obj, pd.MultiIndex):
        expected = pd.MultiIndex.from_tuples(unique_values)
        expected.names = obj.names
        tm.assert_index_equal(result, expected, exact=True)
    elif isinstance(obj, pd.Index) and obj._is_backward_compat_public_numeric_index:
        expected = NumericIndex(unique_values, dtype=obj.dtype)
        tm.assert_index_equal(result, expected, exact=True)
    elif isinstance(obj, pd.Index):
        expected = pd.Index(unique_values, dtype=obj.dtype)
        if is_datetime64tz_dtype(obj.dtype):
            expected = expected.normalize()
        tm.assert_index_equal(result, expected, exact=True)
    else:
        expected = np.array(unique_values)
        tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_unique_null(null_obj, index_or_series_obj):
    obj = index_or_series_obj

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj._values
    values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)
    result = obj.unique()

    unique_values_raw = dict.fromkeys(obj.values)
    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)]
    unique_values = [null_obj] + unique_values_not_null

    if isinstance(obj, pd.Index) and obj._is_backward_compat_public_numeric_index:
        expected = NumericIndex(unique_values, dtype=obj.dtype)
        tm.assert_index_equal(result, expected, exact=True)
    elif isinstance(obj, pd.Index):
        expected = pd.Index(unique_values, dtype=obj.dtype)
        if is_datetime64tz_dtype(obj.dtype):
            result = result.normalize()
            expected = expected.normalize()
        tm.assert_index_equal(result, expected, exact=True)
    else:
        expected = np.array(unique_values, dtype=obj.dtype)
        tm.assert_numpy_array_equal(result, expected)


def test_nunique(index_or_series_obj):
    obj = index_or_series_obj
    obj = np.repeat(obj, range(1, len(obj) + 1))
    expected = len(obj.unique())
    assert obj.nunique(dropna=False) == expected


@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_nunique_null(null_obj, index_or_series_obj):
    obj = index_or_series_obj

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj._values
    values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    if isinstance(obj, pd.CategoricalIndex):
        assert obj.nunique() == len(obj.categories)
        assert obj.nunique(dropna=False) == len(obj.categories) + 1
    else:
        num_unique_values = len(obj.unique())
        assert obj.nunique() == max(0, num_unique_values - 1)
        assert obj.nunique(dropna=False) == max(0, num_unique_values)


@pytest.mark.single_cpu
@pytest.mark.xfail(
    reason="Flaky in the CI. Remove once CI has a single build: GH 44584", strict=False
)
def test_unique_bad_unicode(index_or_series):
    # regression test for #34550
    uval = "\ud83d"  # lone surrogate (first half of a smiley emoji)

    obj = index_or_series([uval] * 2)
    result = obj.unique()

    if isinstance(obj, pd.Index):
        expected = pd.Index(["\ud83d"], dtype=object)
        tm.assert_index_equal(result, expected, exact=True)
    else:
        expected = np.array(["\ud83d"], dtype=object)
        tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("dropna", [True, False])
def test_nunique_dropna(dropna):
    # GH37566
    ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])
    res = ser.nunique(dropna)
    assert res == (1 if dropna else 5)
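
The comment in `test_unique_null` about `np.nan` vs `None` is doing real work: plain Python deduplication treats them differently, while pandas collapses both to missing. A small demonstration of the asymmetry the `pd.isnull` filter sidesteps:

    import numpy as np
    import pandas as pd

    print(np.nan == np.nan)  # False: NaN is unequal to itself
    print(None == None)      # True

    # Distinct float("nan") objects therefore survive dict-based dedup:
    print(len(dict.fromkeys([float("nan"), float("nan")])))  # 2
    print(len(dict.fromkeys([None, None])))                  # 1

    # pandas treats both as missing, which is what pd.isnull relies on:
    print(pd.isnull(np.nan), pd.isnull(None))  # True True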

.venv/Lib/site-packages/pandas/tests/base/test_value_counts.py (new normal file, 290 lines)
@@ -0,0 +1,290 @@
import collections
from datetime import timedelta

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DatetimeIndex,
    Index,
    Interval,
    IntervalIndex,
    Series,
    Timedelta,
    TimedeltaIndex,
)
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops


def test_value_counts(index_or_series_obj):
    obj = index_or_series_obj
    obj = np.repeat(obj, range(1, len(obj) + 1))
    result = obj.value_counts()

    counter = collections.Counter(obj)
    expected = Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
    expected.index = expected.index.astype(obj.dtype)
    if isinstance(obj, pd.MultiIndex):
        expected.index = Index(expected.index)

    if not isinstance(result.dtype, np.dtype):
        # i.e IntegerDtype
        expected = expected.astype("Int64")

    # TODO(GH#32514): Order of entries with the same count is inconsistent
    # on CI (gh-32449)
    if obj.duplicated().any():
        result = result.sort_index()
        expected = expected.sort_index()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_value_counts_null(null_obj, index_or_series_obj):
    orig = index_or_series_obj
    obj = orig.copy()

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(orig, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj._values
    values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    counter = collections.Counter(obj.dropna())
    expected = Series(dict(counter.most_common()), dtype=np.int64)
    expected.index = expected.index.astype(obj.dtype)

    result = obj.value_counts()
    if obj.duplicated().any():
        # TODO(GH#32514):
        # Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()

    if not isinstance(result.dtype, np.dtype):
        # i.e IntegerDtype
        expected = expected.astype("Int64")
    tm.assert_series_equal(result, expected)

    expected[null_obj] = 3

    result = obj.value_counts(dropna=False)
    if obj.duplicated().any():
        # TODO(GH#32514):
        # Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()
    tm.assert_series_equal(result, expected)


def test_value_counts_inferred(index_or_series):
    klass = index_or_series
    s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
    s = klass(s_values)
    expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"])
    tm.assert_series_equal(s.value_counts(), expected)

    if isinstance(s, Index):
        exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
        tm.assert_index_equal(s.unique(), exp)
    else:
        exp = np.unique(np.array(s_values, dtype=np.object_))
        tm.assert_numpy_array_equal(s.unique(), exp)

    assert s.nunique() == 4
    # don't sort, have to sort after the fact as not sorting is
    # platform-dep
    hist = s.value_counts(sort=False).sort_values()
    expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values()
    tm.assert_series_equal(hist, expected)

    # sort ascending
    hist = s.value_counts(ascending=True)
    expected = Series([1, 2, 3, 4], index=list("cdab"))
    tm.assert_series_equal(hist, expected)

    # relative histogram.
    hist = s.value_counts(normalize=True)
    expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"])
    tm.assert_series_equal(hist, expected)


def test_value_counts_bins(index_or_series):
    klass = index_or_series
    s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
    s = klass(s_values)

    # bins
    msg = "bins argument only works with numeric data"
    with pytest.raises(TypeError, match=msg):
        s.value_counts(bins=1)

    s1 = Series([1, 1, 2, 3])
    res1 = s1.value_counts(bins=1)
    exp1 = Series({Interval(0.997, 3.0): 4})
    tm.assert_series_equal(res1, exp1)
    res1n = s1.value_counts(bins=1, normalize=True)
    exp1n = Series({Interval(0.997, 3.0): 1.0})
    tm.assert_series_equal(res1n, exp1n)

    if isinstance(s1, Index):
        tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
    else:
        exp = np.array([1, 2, 3], dtype=np.int64)
        tm.assert_numpy_array_equal(s1.unique(), exp)

    assert s1.nunique() == 3

    # these return the same
    res4 = s1.value_counts(bins=4, dropna=True)
    intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
    exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
    tm.assert_series_equal(res4, exp4)

    res4 = s1.value_counts(bins=4, dropna=False)
    intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
    exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
    tm.assert_series_equal(res4, exp4)

    res4n = s1.value_counts(bins=4, normalize=True)
    exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]))
    tm.assert_series_equal(res4n, exp4n)

    # handle NA's properly
    s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
    s = klass(s_values)
    expected = Series([4, 3, 2], index=["b", "a", "d"])
    tm.assert_series_equal(s.value_counts(), expected)

    if isinstance(s, Index):
        exp = Index(["a", "b", np.nan, "d"])
        tm.assert_index_equal(s.unique(), exp)
    else:
        exp = np.array(["a", "b", np.nan, "d"], dtype=object)
        tm.assert_numpy_array_equal(s.unique(), exp)
    assert s.nunique() == 3

    s = klass({}) if klass is dict else klass({}, dtype=object)
    expected = Series([], dtype=np.int64)
    tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
    # returned dtype differs depending on original
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), Index([]), exact=False)
    else:
        tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)

    assert s.nunique() == 0


def test_value_counts_datetime64(index_or_series):
    klass = index_or_series

    # GH 3002, datetime64[ns]
    # don't test names though
    df = pd.DataFrame(
        {
            "person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
            "dt": pd.to_datetime(
                [
                    "2010-01-01",
                    "2010-01-01",
                    "2010-01-01",
                    "2009-01-01",
                    "2008-09-09",
                    "2008-09-09",
                ]
            ),
            "food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
        }
    )

    s = klass(df["dt"].copy())
    s.name = None
    idx = pd.to_datetime(
        ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"]
    )
    expected_s = Series([3, 2, 1], index=idx)
    tm.assert_series_equal(s.value_counts(), expected_s)

    expected = np.array(
        ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
        dtype="datetime64[ns]",
    )
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
    else:
        tm.assert_numpy_array_equal(s.unique(), expected)

    assert s.nunique() == 3

    # with NaT
    s = df["dt"].copy()
    s = klass(list(s.values) + [pd.NaT] * 4)

    result = s.value_counts()
    assert result.index.dtype == "datetime64[ns]"
    tm.assert_series_equal(result, expected_s)

    result = s.value_counts(dropna=False)
    expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s])
    tm.assert_series_equal(result, expected_s)

    assert s.dtype == "datetime64[ns]"
    unique = s.unique()
    assert unique.dtype == "datetime64[ns]"

    # numpy_array_equal cannot compare pd.NaT
    if isinstance(s, Index):
        exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
        tm.assert_index_equal(unique, exp_idx)
    else:
        tm.assert_numpy_array_equal(unique[:3], expected)
        assert pd.isna(unique[3])

    assert s.nunique() == 3
    assert s.nunique(dropna=False) == 4

    # timedelta64[ns]
    td = df.dt - df.dt + timedelta(1)
    td = klass(td, name="dt")

    result = td.value_counts()
    expected_s = Series([6], index=[Timedelta("1day")], name="dt")
    tm.assert_series_equal(result, expected_s)

    expected = TimedeltaIndex(["1 days"], name="dt")
    if isinstance(td, Index):
        tm.assert_index_equal(td.unique(), expected)
    else:
        tm.assert_numpy_array_equal(td.unique(), expected.values)

    td2 = timedelta(1) + (df.dt - df.dt)
    td2 = klass(td2, name="dt")
    result2 = td2.value_counts()
    tm.assert_series_equal(result2, expected_s)


@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts_with_nan(dropna, index_or_series):
    # GH31944
    klass = index_or_series
    values = [True, pd.NA, np.nan]
    obj = klass(values)
    res = obj.value_counts(dropna=dropna)
    if dropna is True:
        expected = Series([1], index=[True])
    else:
        expected = Series([1, 1, 1], index=[True, pd.NA, np.nan])
    tm.assert_series_equal(res, expected)
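
Taken together, the cases above pin down the `value_counts` contract: counts sorted descending by default, `normalize=True` for relative frequencies, `bins=` only for numeric data, and NA rows dropped unless `dropna=False`. A compact sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series(["a", "b", "b", np.nan])

    print(s.value_counts())                # b: 2, a: 1 (NaN dropped, desc order)
    print(s.value_counts(normalize=True))  # b: 2/3, a: 1/3
    print(s.value_counts(dropna=False))    # b: 2, a: 1, NaN: 1

    nums = pd.Series([1, 1, 2, 3])
    print(nums.value_counts(bins=2))       # counts per interval, numeric only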