first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

@@ -0,0 +1,121 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import FloatingArray
@pytest.fixture
def data():
return pd.array(
[True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False],
dtype="boolean",
)
@pytest.fixture
def left_array():
return pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
@pytest.fixture
def right_array():
return pd.array([True, False, None] * 3, dtype="boolean")
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
"opname, exp",
[
("add", [True, True, None, True, False, None, None, None, None]),
("mul", [True, False, None, False, False, None, None, None, None]),
],
ids=["add", "mul"],
)
def test_add_mul(left_array, right_array, opname, exp):
op = getattr(operator, opname)
result = op(left_array, right_array)
expected = pd.array(exp, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
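# Subtraction is not defined for boolean arrays: as with numpy booleans, the
# `-` operator should raise and point users to xor instead.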
def test_sub(left_array, right_array):
msg = (
r"numpy boolean subtract, the `-` operator, is (?:deprecated|not supported), "
r"use the bitwise_xor, the `\^` operator, or the logical_xor function instead\."
)
with pytest.raises(TypeError, match=msg):
left_array - right_array
def test_div(left_array, right_array):
result = left_array / right_array
expected = FloatingArray(
np.array(
[1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
dtype="float64",
),
np.array([False, False, True, False, False, True, True, True, True]),
)
tm.assert_extension_array_equal(result, expected)
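# floordiv and mod (and eventually pow) on boolean arrays should match the
# result of casting both operands to the nullable Int8 dtype first.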
@pytest.mark.parametrize(
"opname",
[
"floordiv",
"mod",
pytest.param(
"pow", marks=pytest.mark.xfail(reason="TODO follow int8 behaviour? GH34686")
),
],
)
def test_op_int8(left_array, right_array, opname):
op = getattr(operator, opname)
result = op(left_array, right_array)
expected = op(left_array.astype("Int8"), right_array.astype("Int8"))
tm.assert_extension_array_equal(result, expected)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators):
# invalid ops
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
# invalid scalars
msg = (
"did not contain a loop with signature matching types|"
"BooleanArray cannot perform the operation|"
"not supported for the input types, and the inputs could not be safely coerced "
"to any supported types according to the casting rule ''safe''"
)
with pytest.raises(TypeError, match=msg):
ops("foo")
msg = (
r"unsupported operand type\(s\) for|"
"Concatenation operation is not implemented for NumPy arrays"
)
with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
if op not in ("__mul__", "__rmul__"):
# TODO(extension) numpy's mul with object array sees booleans as numbers
msg = (
r"unsupported operand type\(s\) for|can only concatenate str|"
"not all arguments converted during string formatting"
)
with pytest.raises(TypeError, match=msg):
ops(pd.Series("foo", index=s.index))

@@ -0,0 +1,53 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
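# Casting a boolean array with missing values to numpy int64/bool raises,
# while float64 (NA -> NaN) and str (NA -> "<NA>") conversions succeed.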
def test_astype():
# with missing values
arr = pd.array([True, False, None], dtype="boolean")
with pytest.raises(ValueError, match="cannot convert NA to integer"):
arr.astype("int64")
with pytest.raises(ValueError, match="cannot convert float NaN to"):
arr.astype("bool")
result = arr.astype("float64")
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
result = arr.astype("str")
expected = np.array(["True", "False", "<NA>"], dtype="<U5")
tm.assert_numpy_array_equal(result, expected)
# no missing values
arr = pd.array([True, False, True], dtype="boolean")
result = arr.astype("int64")
expected = np.array([1, 0, 1], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
result = arr.astype("bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
def test_astype_to_boolean_array():
# astype to BooleanArray
arr = pd.array([True, False, None], dtype="boolean")
result = arr.astype("boolean")
tm.assert_extension_array_equal(result, arr)
result = arr.astype(pd.BooleanDtype())
tm.assert_extension_array_equal(result, arr)
def test_astype_to_integer_array():
# astype to IntegerArray
arr = pd.array([True, False, None], dtype="boolean")
result = arr.astype("Int64")
expected = pd.array([1, 0, None], dtype="Int64")
tm.assert_extension_array_equal(result, expected)

@@ -0,0 +1,58 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.tests.arrays.masked_shared import ComparisonOps
@pytest.fixture
def data():
return pd.array(
[True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False],
dtype="boolean",
)
@pytest.fixture
def dtype():
return pd.BooleanDtype()
class TestComparisonOps(ComparisonOps):
def test_compare_scalar(self, data, comparison_op):
self._compare_other(data, comparison_op, True)
def test_compare_array(self, data, comparison_op):
other = pd.array([True] * len(data), dtype="boolean")
self._compare_other(data, comparison_op, other)
other = np.array([True] * len(data))
self._compare_other(data, comparison_op, other)
other = pd.Series([True] * len(data))
self._compare_other(data, comparison_op, other)
@pytest.mark.parametrize("other", [True, False, pd.NA])
def test_scalar(self, other, comparison_op, dtype):
ComparisonOps.test_scalar(self, other, comparison_op, dtype)
def test_array(self, comparison_op):
op = comparison_op
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = op(a, b)
values = op(a._data, b._data)
mask = a._mask | b._mask
expected = BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
result[0] = None
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)

@@ -0,0 +1,323 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.core.arrays.boolean import coerce_to_array
def test_boolean_array_constructor():
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
result = BooleanArray(values, mask)
expected = pd.array([True, False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
with pytest.raises(TypeError, match="values should be boolean numpy array"):
BooleanArray(values.tolist(), mask)
with pytest.raises(TypeError, match="mask should be boolean numpy array"):
BooleanArray(values, mask.tolist())
with pytest.raises(TypeError, match="values should be boolean numpy array"):
BooleanArray(values.astype(int), mask)
with pytest.raises(TypeError, match="mask should be boolean numpy array"):
BooleanArray(values, None)
with pytest.raises(ValueError, match="values.shape must match mask.shape"):
BooleanArray(values.reshape(1, -1), mask)
with pytest.raises(ValueError, match="values.shape must match mask.shape"):
BooleanArray(values, mask.reshape(1, -1))
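# By default the constructor stores the passed numpy arrays without copying;
# copy=True must allocate new buffers for both values and mask.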
def test_boolean_array_constructor_copy():
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
result = BooleanArray(values, mask)
assert result._data is values
assert result._mask is mask
result = BooleanArray(values, mask, copy=True)
assert result._data is not values
assert result._mask is not mask
def test_to_boolean_array():
expected = BooleanArray(
np.array([True, False, True]), np.array([False, False, False])
)
result = pd.array([True, False, True], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = pd.array(np.array([True, False, True]), dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean")
tm.assert_extension_array_equal(result, expected)
# with missing values
expected = BooleanArray(
np.array([True, False, True]), np.array([False, False, True])
)
result = pd.array([True, False, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_to_boolean_array_all_none():
expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True]))
result = pd.array([None, None, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"a, b",
[
([True, False, None, np.nan, pd.NA], [True, False, None, None, None]),
([True, np.nan], [True, None]),
([True, pd.NA], [True, None]),
([np.nan, np.nan], [None, None]),
(np.array([np.nan, np.nan], dtype=float), [None, None]),
],
)
def test_to_boolean_array_missing_indicators(a, b):
result = pd.array(a, dtype="boolean")
expected = pd.array(b, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
["foo", "bar"],
["1", "2"],
# "foo",
[1, 2],
[1.0, 2.0],
pd.date_range("20130101", periods=2),
np.array(["foo"]),
np.array([1, 2]),
np.array([1.0, 2.0]),
[np.nan, {"a": 1}],
],
)
def test_to_boolean_array_error(values):
# error in converting existing arrays to BooleanArray
msg = "Need to pass bool-like value"
with pytest.raises(TypeError, match=msg):
pd.array(values, dtype="boolean")
def test_to_boolean_array_from_integer_array():
result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean")
expected = pd.array([True, False, True, False], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
# with missing values
result = pd.array(np.array([1, 0, 1, None]), dtype="boolean")
expected = pd.array([True, False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_to_boolean_array_from_float_array():
result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean")
expected = pd.array([True, False, True, False], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
# with missing values
result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean")
expected = pd.array([True, False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_to_boolean_array_integer_like():
# integers of 0's and 1's
result = pd.array([1, 0, 1, 0], dtype="boolean")
expected = pd.array([True, False, True, False], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
# with missing values
result = pd.array([1, 0, 1, None], dtype="boolean")
expected = pd.array([True, False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_coerce_to_array():
# TODO this is currently not public API
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
result = BooleanArray(*coerce_to_array(values, mask=mask))
expected = BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)
assert result._data is values
assert result._mask is mask
result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True))
expected = BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)
assert result._data is not values
assert result._mask is not mask
# mixed missing from values and mask
values = [True, False, None, False]
mask = np.array([False, False, False, True], dtype="bool")
result = BooleanArray(*coerce_to_array(values, mask=mask))
expected = BooleanArray(
np.array([True, False, True, True]), np.array([False, False, True, True])
)
tm.assert_extension_array_equal(result, expected)
result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask))
tm.assert_extension_array_equal(result, expected)
result = BooleanArray(*coerce_to_array(values, mask=mask.tolist()))
tm.assert_extension_array_equal(result, expected)
# raise errors for wrong dimension
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
with pytest.raises(ValueError, match="values.shape and mask.shape must match"):
coerce_to_array(values.reshape(1, -1))
with pytest.raises(ValueError, match="values.shape and mask.shape must match"):
coerce_to_array(values, mask=mask.reshape(1, -1))
def test_coerce_to_array_from_boolean_array():
# passing BooleanArray to coerce_to_array
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
arr = BooleanArray(values, mask)
result = BooleanArray(*coerce_to_array(arr))
tm.assert_extension_array_equal(result, arr)
# no copy
assert result._data is arr._data
assert result._mask is arr._mask
result = BooleanArray(*coerce_to_array(arr), copy=True)
tm.assert_extension_array_equal(result, arr)
assert result._data is not arr._data
assert result._mask is not arr._mask
with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"):
coerce_to_array(arr, mask=mask)
def test_coerce_to_numpy_array():
# with missing values -> object dtype
arr = pd.array([True, False, None], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)
# also with no missing values -> object dtype
arr = pd.array([True, False, True], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, True], dtype="object")
tm.assert_numpy_array_equal(result, expected)
# force bool dtype
result = np.array(arr, dtype="bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
# with missing values will raise error
arr = pd.array([True, False, None], dtype="boolean")
msg = (
"cannot convert to 'bool'-dtype NumPy array with missing values. "
"Specify an appropriate 'na_value' for this dtype."
)
with pytest.raises(ValueError, match=msg):
np.array(arr, dtype="bool")
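# _from_sequence_of_strings parses "True"/"1"/"1.0" as True and
# "False"/"0"/"0.0" as False, treating NaN entries as missing.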
def test_to_boolean_array_from_strings():
result = BooleanArray._from_sequence_of_strings(
np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object)
)
expected = BooleanArray(
np.array([True, False, True, True, False, False, False]),
np.array([False, False, False, False, False, False, True]),
)
tm.assert_extension_array_equal(result, expected)
def test_to_boolean_array_from_strings_invalid_string():
with pytest.raises(ValueError, match="cannot be cast"):
BooleanArray._from_sequence_of_strings(["donkey"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
con = pd.Series if box else pd.array
# default (with or without missing values) -> object dtype
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, True], dtype="object")
tm.assert_numpy_array_equal(result, expected)
arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)
arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy(dtype="str")
expected = np.array([True, False, pd.NA], dtype="<U5")
tm.assert_numpy_array_equal(result, expected)
# no missing values -> can convert to bool, otherwise raises
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype="bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
arr = con([True, False, None], dtype="boolean")
with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"):
result = arr.to_numpy(dtype="bool")
# specify dtype and na_value
arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy(dtype=object, na_value=None)
expected = np.array([True, False, None], dtype="object")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype=bool, na_value=False)
expected = np.array([True, False, False], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="int64", na_value=-99)
expected = np.array([1, 0, -99], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
# converting to int or float without specifying na_value raises
with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
arr.to_numpy(dtype="int64")
with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
arr.to_numpy(dtype="float64")
def test_to_numpy_copy():
# to_numpy can be zero-copy if no missing values
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype=bool)
result[0] = False
tm.assert_extension_array_equal(
arr, pd.array([False, False, True], dtype="boolean")
)
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype=bool, copy=True)
result[0] = False
tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean"))

@@ -0,0 +1,126 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
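# Binary ufuncs should operate on the underlying ._data and re-mask any
# position that was missing in the BooleanArray operand(s).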
@pytest.mark.parametrize(
"ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor]
)
def test_ufuncs_binary(ufunc):
# two BooleanArrays
a = pd.array([True, False, None], dtype="boolean")
result = ufunc(a, a)
expected = pd.array(ufunc(a._data, a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
s = pd.Series(a)
result = ufunc(s, a)
expected = pd.Series(ufunc(a._data, a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_series_equal(result, expected)
# Boolean with numpy array
arr = np.array([True, True, False])
result = ufunc(a, arr)
expected = pd.array(ufunc(a._data, arr), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
result = ufunc(arr, a)
expected = pd.array(ufunc(arr, a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
# BooleanArray with scalar
result = ufunc(a, True)
expected = pd.array(ufunc(a._data, True), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
result = ufunc(True, a)
expected = pd.array(ufunc(True, a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
# unhandled types
msg = r"operand type\(s\) all returned NotImplemented from __array_ufunc__"
with pytest.raises(TypeError, match=msg):
ufunc(a, "test")
@pytest.mark.parametrize("ufunc", [np.logical_not])
def test_ufuncs_unary(ufunc):
a = pd.array([True, False, None], dtype="boolean")
result = ufunc(a)
expected = pd.array(ufunc(a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
ser = pd.Series(a)
result = ufunc(ser)
expected = pd.Series(ufunc(a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_series_equal(result, expected)
def test_ufunc_numeric():
# np.sqrt on np.bool_ returns float16, which we upcast to Float32
# because we do not have Float16
arr = pd.array([True, False, None], dtype="boolean")
res = np.sqrt(arr)
expected = pd.array([1, 0, None], dtype="Float32")
tm.assert_extension_array_equal(res, expected)
@pytest.mark.parametrize("values", [[True, False], [True, None]])
def test_ufunc_reduce_raises(values):
arr = pd.array(values, dtype="boolean")
res = np.add.reduce(arr)
if arr[-1] is pd.NA:
expected = pd.NA
else:
expected = arr._data.sum()
tm.assert_almost_equal(res, expected)
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=arr, dtype="Int64")
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)
result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64")
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)
def test_value_counts_with_normalize():
ser = pd.Series([True, False, pd.NA], dtype="boolean")
result = ser.value_counts(normalize=True)
expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2
assert expected.index.dtype == "boolean"
tm.assert_series_equal(result, expected)
def test_diff():
a = pd.array(
[True, True, False, False, True, None, True, None, False], dtype="boolean"
)
result = pd.core.algorithms.diff(a, 1)
expected = pd.array(
[None, False, True, False, True, None, None, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)
ser = pd.Series(a)
result = ser.diff()
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)

@@ -0,0 +1,13 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("na", [None, np.nan, pd.NA])
def test_setitem_missing_values(na):
arr = pd.array([True, False, None], dtype="boolean")
expected = pd.array([True, None, None], dtype="boolean")
arr[1] = na
tm.assert_extension_array_equal(arr, expected)

@@ -0,0 +1,254 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.core.ops.mask_ops import (
kleene_and,
kleene_or,
kleene_xor,
)
from pandas.tests.extension.base import BaseOpsUtil
class TestLogicalOps(BaseOpsUtil):
def test_numpy_scalars_ok(self, all_logical_operators):
a = pd.array([True, False, None], dtype="boolean")
op = getattr(a, all_logical_operators)
tm.assert_extension_array_equal(op(True), op(np.bool_(True)))
tm.assert_extension_array_equal(op(False), op(np.bool_(False)))
def get_op_from_name(self, op_name):
short_opname = op_name.strip("_")
short_opname = short_opname if "xor" in short_opname else short_opname + "_"
try:
op = getattr(operator, short_opname)
except AttributeError:
# Assume it is the reverse operator
rop = getattr(operator, short_opname[1:])
op = lambda x, y: rop(y, x)
return op
def test_empty_ok(self, all_logical_operators):
a = pd.array([], dtype="boolean")
op_name = all_logical_operators
result = getattr(a, op_name)(True)
tm.assert_extension_array_equal(a, result)
result = getattr(a, op_name)(False)
tm.assert_extension_array_equal(a, result)
result = getattr(a, op_name)(pd.NA)
tm.assert_extension_array_equal(a, result)
@pytest.mark.parametrize(
"other", ["a", pd.Timestamp(2017, 1, 1, 12), np.timedelta64(4)]
)
def test_eq_mismatched_type(self, other):
# GH-44499
arr = pd.array([True, False])
result = arr == other
expected = pd.array([False, False])
tm.assert_extension_array_equal(result, expected)
result = arr != other
expected = pd.array([True, True])
tm.assert_extension_array_equal(result, expected)
def test_logical_length_mismatch_raises(self, all_logical_operators):
op_name = all_logical_operators
a = pd.array([True, False, None], dtype="boolean")
msg = "Lengths must match to compare"
with pytest.raises(ValueError, match=msg):
getattr(a, op_name)([True, False])
with pytest.raises(ValueError, match=msg):
getattr(a, op_name)(np.array([True, False]))
with pytest.raises(ValueError, match=msg):
getattr(a, op_name)(pd.array([True, False], dtype="boolean"))
def test_logical_nan_raises(self, all_logical_operators):
op_name = all_logical_operators
a = pd.array([True, False, None], dtype="boolean")
msg = "Got float instead"
with pytest.raises(TypeError, match=msg):
getattr(a, op_name)(np.nan)
@pytest.mark.parametrize("other", ["a", 1])
def test_non_bool_or_na_other_raises(self, other, all_logical_operators):
a = pd.array([True, False], dtype="boolean")
with pytest.raises(TypeError, match=str(type(other).__name__)):
getattr(a, all_logical_operators)(other)
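# Kleene (three-valued) logic for |: True | NA is True, False | NA is NA,
# and NA | NA is NA.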
def test_kleene_or(self):
# A clear test of behavior.
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a | b
expected = pd.array(
[True, True, True, True, False, None, True, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)
result = b | a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)
@pytest.mark.parametrize(
"other, expected",
[
(pd.NA, [True, None, None]),
(True, [True, True, True]),
(np.bool_(True), [True, True, True]),
(False, [True, False, None]),
(np.bool_(False), [True, False, None]),
],
)
def test_kleene_or_scalar(self, other, expected):
# TODO: test True & False
a = pd.array([True, False, None], dtype="boolean")
result = a | other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = other | a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True, False, None], dtype="boolean")
)
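# Kleene logic for &: False & NA is False, True & NA is NA, NA & NA is NA.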
def test_kleene_and(self):
# A clear test of behavior.
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a & b
expected = pd.array(
[True, False, None, False, False, False, None, False, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)
result = b & a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)
@pytest.mark.parametrize(
"other, expected",
[
(pd.NA, [None, False, None]),
(True, [True, False, None]),
(False, [False, False, False]),
(np.bool_(True), [True, False, None]),
(np.bool_(False), [False, False, False]),
],
)
def test_kleene_and_scalar(self, other, expected):
a = pd.array([True, False, None], dtype="boolean")
result = a & other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = other & a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True, False, None], dtype="boolean")
)
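# Kleene logic for ^: any NA operand makes the result NA.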
def test_kleene_xor(self):
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a ^ b
expected = pd.array(
[False, True, None, True, False, None, None, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)
result = b ^ a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)
@pytest.mark.parametrize(
"other, expected",
[
(pd.NA, [None, None, None]),
(True, [False, True, None]),
(np.bool_(True), [False, True, None]),
(np.bool_(False), [True, False, None]),
],
)
def test_kleene_xor_scalar(self, other, expected):
a = pd.array([True, False, None], dtype="boolean")
result = a ^ other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = other ^ a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True, False, None], dtype="boolean")
)
@pytest.mark.parametrize("other", [True, False, pd.NA, [True, False, None] * 3])
def test_no_masked_assumptions(self, other, all_logical_operators):
# The logical operations should not assume that masked values are False!
a = pd.arrays.BooleanArray(
np.array([True, True, True, False, False, False, True, False, True]),
np.array([False] * 6 + [True, True, True]),
)
b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
if isinstance(other, list):
other = pd.array(other, dtype="boolean")
result = getattr(a, all_logical_operators)(other)
expected = getattr(b, all_logical_operators)(other)
tm.assert_extension_array_equal(result, expected)
if isinstance(other, BooleanArray):
other._data[other._mask] = True
a._data[a._mask] = False
result = getattr(a, all_logical_operators)(other)
expected = getattr(b, all_logical_operators)(other)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("operation", [kleene_or, kleene_xor, kleene_and])
def test_error_both_scalar(operation):
msg = r"Either `left` or `right` need to be a np\.ndarray."
with pytest.raises(TypeError, match=msg):
# masks need to be non-None, otherwise it ends up in an infinite recursion
operation(True, True, np.zeros(1), np.zeros(1))

@@ -0,0 +1,27 @@
import pandas as pd
import pandas._testing as tm
class TestUnaryOps:
def test_invert(self):
a = pd.array([True, False, None], dtype="boolean")
expected = pd.array([False, True, None], dtype="boolean")
tm.assert_extension_array_equal(~a, expected)
expected = pd.Series(expected, index=["a", "b", "c"], name="name")
result = ~pd.Series(a, index=["a", "b", "c"], name="name")
tm.assert_series_equal(result, expected)
df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=["a", "b", "c"])
result = ~df
expected = pd.DataFrame(
{"A": expected, "B": [False, True, True]}, index=["a", "b", "c"]
)
tm.assert_frame_equal(result, expected)
def test_abs(self):
# matching numpy behavior, abs is the identity function
arr = pd.array([True, False, None], dtype="boolean")
result = abs(arr)
tm.assert_extension_array_equal(result, arr)

@@ -0,0 +1,60 @@
import numpy as np
import pytest
import pandas as pd
@pytest.fixture
def data():
return pd.array(
[True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False],
dtype="boolean",
)
@pytest.mark.parametrize(
"values, exp_any, exp_all, exp_any_noskip, exp_all_noskip",
[
([True, pd.NA], True, True, True, pd.NA),
([False, pd.NA], False, False, pd.NA, False),
([pd.NA], False, True, pd.NA, pd.NA),
([], False, True, False, True),
# GH-33253: all True / all False values buggy with skipna=False
([True, True], True, True, True, True),
([False, False], False, False, False, False),
],
)
def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip):
# the methods return numpy scalars
exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any)
exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all)
exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip)
exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip)
for con in [pd.array, pd.Series]:
a = con(values, dtype="boolean")
assert a.any() is exp_any
assert a.all() is exp_all
assert a.any(skipna=False) is exp_any_noskip
assert a.all(skipna=False) is exp_all_noskip
assert np.any(a.any()) is exp_any
assert np.all(a.all()) is exp_all
@pytest.mark.parametrize("dropna", [True, False])
def test_reductions_return_types(dropna, data, all_numeric_reductions):
op = all_numeric_reductions
s = pd.Series(data)
if dropna:
s = s.dropna()
if op == "sum":
assert isinstance(getattr(s, op)(), np.int_)
elif op == "prod":
assert isinstance(getattr(s, op)(), np.int_)
elif op in ("min", "max"):
assert isinstance(getattr(s, op)(), np.bool_)
else:
# "mean", "std", "var", "median", "kurt", "skew"
assert isinstance(getattr(s, op)(), np.float64)

@@ -0,0 +1,13 @@
import pandas as pd
def test_repr():
df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")})
expected = "       A\n0   True\n1  False\n2   <NA>"
assert repr(df) == expected
expected = "0     True\n1    False\n2     <NA>\nName: A, dtype: boolean"
assert repr(df.A) == expected
expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean"
assert repr(df.A.array) == expected

@@ -0,0 +1,8 @@
from pandas import Categorical
class TestCategorical:
def setup_method(self, method):
self.factor = Categorical(
["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True
)

@@ -0,0 +1,7 @@
import pytest
@pytest.fixture(params=[True, False])
def allow_fill(request):
"""Boolean 'allow_fill' parameter for Categorical.take"""
return request.param

@@ -0,0 +1,83 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
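# factorize on a Categorical returns integer codes (-1 marks missing values)
# and the unique values, in order of appearance, as a Categorical.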
@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]])
def test_factorize(categories, ordered):
cat = pd.Categorical(
["b", "b", "a", "c", None], categories=categories, ordered=ordered
)
codes, uniques = pd.factorize(cat)
expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a", "c"], categories=categories, ordered=ordered
)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_factorized_sort():
cat = pd.Categorical(["b", "b", None, "a"])
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([1, 1, -1, 0], dtype=np.intp)
expected_uniques = pd.Categorical(["a", "b"])
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_factorized_sort_ordered():
cat = pd.Categorical(
["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True
)
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([0, 0, -1, 1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a"], categories=["c", "b", "a"], ordered=True
)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_isin_cats():
# GH2003
cat = pd.Categorical(["a", "b", np.nan])
result = cat.isin(["a", np.nan])
expected = np.array([True, False, True], dtype=bool)
tm.assert_numpy_array_equal(expected, result)
result = cat.isin(["a", "c"])
expected = np.array([True, False, False], dtype=bool)
tm.assert_numpy_array_equal(expected, result)
@pytest.mark.parametrize("empty", [[], pd.Series(dtype=object), np.array([])])
def test_isin_empty(empty):
s = pd.Categorical(["a", "b"])
expected = np.array([False, False], dtype=bool)
result = s.isin(empty)
tm.assert_numpy_array_equal(expected, result)
def test_diff():
s = pd.Series([1, 2, 3], dtype="category")
with tm.assert_produces_warning(FutureWarning):
result = s.diff()
expected = pd.Series([np.nan, 1, 1])
tm.assert_series_equal(result, expected)
expected = expected.to_frame(name="A")
df = s.to_frame(name="A")
with tm.assert_produces_warning(FutureWarning):
result = df.diff()
tm.assert_frame_equal(result, expected)

@@ -0,0 +1,365 @@
import re
import sys
import numpy as np
import pytest
from pandas.compat import PYPY
from pandas import (
Categorical,
CategoricalDtype,
Index,
NaT,
Series,
date_range,
)
import pandas._testing as tm
from pandas.api.types import is_scalar
class TestCategoricalAnalytics:
@pytest.mark.parametrize("aggregation", ["min", "max"])
def test_min_max_not_ordered_raises(self, aggregation):
# unordered cats have no min/max
cat = Categorical(["a", "b", "c", "d"], ordered=False)
msg = f"Categorical is not ordered for operation {aggregation}"
agg_func = getattr(cat, aggregation)
with pytest.raises(TypeError, match=msg):
agg_func()
ufunc = np.minimum if aggregation == "min" else np.maximum
with pytest.raises(TypeError, match=msg):
ufunc.reduce(cat)
def test_min_max_ordered(self, index_or_series_or_array):
cat = Categorical(["a", "b", "c", "d"], ordered=True)
obj = index_or_series_or_array(cat)
_min = obj.min()
_max = obj.max()
assert _min == "a"
assert _max == "d"
assert np.minimum.reduce(obj) == "a"
assert np.maximum.reduce(obj) == "d"
# TODO: raises if we pass axis=0 (on Index and Categorical, not Series)
cat = Categorical(
["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True
)
obj = index_or_series_or_array(cat)
_min = obj.min()
_max = obj.max()
assert _min == "d"
assert _max == "a"
assert np.minimum.reduce(obj) == "d"
assert np.maximum.reduce(obj) == "a"
@pytest.mark.parametrize(
"categories,expected",
[
(list("ABC"), np.NaN),
([1, 2, 3], np.NaN),
pytest.param(
Series(date_range("2020-01-01", periods=3), dtype="category"),
NaT,
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/29962"
),
),
],
)
@pytest.mark.parametrize("aggregation", ["min", "max"])
def test_min_max_ordered_empty(self, categories, expected, aggregation):
# GH 30227
cat = Categorical([], categories=categories, ordered=True)
agg_func = getattr(cat, aggregation)
result = agg_func()
assert result is expected
@pytest.mark.parametrize(
"values, categories",
[(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])],
)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("function", ["min", "max"])
def test_min_max_with_nan(self, values, categories, function, skipna):
# GH 25303
cat = Categorical(values, categories=categories, ordered=True)
result = getattr(cat, function)(skipna=skipna)
if skipna is False:
assert result is np.nan
else:
expected = categories[0] if function == "min" else categories[2]
assert result == expected
@pytest.mark.parametrize("function", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_only_nan(self, function, skipna):
# https://github.com/pandas-dev/pandas/issues/33450
cat = Categorical([np.nan], categories=[1, 2], ordered=True)
result = getattr(cat, function)(skipna=skipna)
assert result is np.nan
@pytest.mark.parametrize("method", ["min", "max"])
def test_deprecate_numeric_only_min_max(self, method):
# GH 25303
cat = Categorical(
[np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
)
with tm.assert_produces_warning(expected_warning=FutureWarning):
getattr(cat, method)(numeric_only=True)
@pytest.mark.parametrize("method", ["min", "max"])
def test_numpy_min_max_raises(self, method):
cat = Categorical(["a", "b", "c", "b"], ordered=False)
msg = (
f"Categorical is not ordered for operation {method}\n"
"you can use .as_ordered() to change the Categorical to an ordered one"
)
method = getattr(np, method)
with pytest.raises(TypeError, match=re.escape(msg)):
method(cat)
@pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"])
@pytest.mark.parametrize("method", ["min", "max"])
def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg):
cat = Categorical(["a", "b", "c", "b"], ordered=True)
msg = (
f"the '{kwarg}' parameter is not supported in the pandas implementation "
f"of {method}"
)
if kwarg == "axis":
msg = r"`axis` must be fewer than the number of dimensions \(1\)"
kwargs = {kwarg: 42}
method = getattr(np, method)
with pytest.raises(ValueError, match=msg):
method(cat, **kwargs)
@pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")])
def test_numpy_min_max_axis_equals_none(self, method, expected):
cat = Categorical(["a", "b", "c", "b"], ordered=True)
method = getattr(np, method)
result = method(cat, axis=None)
assert result == expected
@pytest.mark.parametrize(
"values,categories,exp_mode",
[
([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
],
)
def test_mode(self, values, categories, exp_mode):
s = Categorical(values, categories=categories, ordered=True)
msg = "Use Series.mode instead"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = s.mode()
exp = Categorical(exp_mode, categories=categories, ordered=True)
tm.assert_categorical_equal(res, exp)
def test_searchsorted(self, ordered):
# https://github.com/pandas-dev/pandas/issues/8420
# https://github.com/pandas-dev/pandas/issues/14522
cat = Categorical(
["cheese", "milk", "apple", "bread", "bread"],
categories=["cheese", "milk", "apple", "bread"],
ordered=ordered,
)
ser = Series(cat)
# Searching for single item argument, side='left' (default)
res_cat = cat.searchsorted("apple")
assert res_cat == 2
assert is_scalar(res_cat)
res_ser = ser.searchsorted("apple")
assert res_ser == 2
assert is_scalar(res_ser)
# Searching for single item array, side='left' (default)
res_cat = cat.searchsorted(["bread"])
res_ser = ser.searchsorted(["bread"])
exp = np.array([3], dtype=np.intp)
tm.assert_numpy_array_equal(res_cat, exp)
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for several items array, side='right'
res_cat = cat.searchsorted(["apple", "bread"], side="right")
res_ser = ser.searchsorted(["apple", "bread"], side="right")
exp = np.array([3, 5], dtype=np.intp)
tm.assert_numpy_array_equal(res_cat, exp)
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for a single value that is not from the Categorical
with pytest.raises(TypeError, match="cucumber"):
cat.searchsorted("cucumber")
with pytest.raises(TypeError, match="cucumber"):
ser.searchsorted("cucumber")
# Searching for multiple values, one of which is not from the Categorical
msg = (
"Cannot setitem on a Categorical with a new category, "
"set the categories first"
)
with pytest.raises(TypeError, match=msg):
cat.searchsorted(["bread", "cucumber"])
with pytest.raises(TypeError, match=msg):
ser.searchsorted(["bread", "cucumber"])
def test_unique(self, ordered):
# GH38140
dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)
# categories are reordered based on value when ordered=False
cat = Categorical(["a", "b", "c"], dtype=dtype)
res = cat.unique()
tm.assert_categorical_equal(res, cat)
cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
res = cat.unique()
tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))
cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
res = cat.unique()
exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
tm.assert_categorical_equal(res, exp_cat)
# duplicate nans are dropped, but a single nan is kept in the result
cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
res = cat.unique()
exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
tm.assert_categorical_equal(res, exp_cat)
def test_unique_index_series(self, ordered):
# GH38140
dtype = CategoricalDtype([3, 2, 1], ordered=ordered)
c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
# Categorical.unique sorts categories by appearance order
# if ordered=False
exp = Categorical([3, 1, 2], dtype=dtype)
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
c = Categorical([1, 1, 2, 2], dtype=dtype)
exp = Categorical([1, 2], dtype=dtype)
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
def test_shift(self):
# GH 9416
cat = Categorical(["a", "b", "c", "d", "a"])
# shift forward
sp1 = cat.shift(1)
xp1 = Categorical([np.nan, "a", "b", "c", "d"])
tm.assert_categorical_equal(sp1, xp1)
tm.assert_categorical_equal(cat[:-1], sp1[1:])
# shift back
sn2 = cat.shift(-2)
xp2 = Categorical(
["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"]
)
tm.assert_categorical_equal(sn2, xp2)
tm.assert_categorical_equal(cat[2:], sn2[:-2])
# shift by zero
tm.assert_categorical_equal(cat, cat.shift(0))
def test_nbytes(self):
cat = Categorical([1, 2, 3])
exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
assert cat.nbytes == exp
def test_memory_usage(self):
cat = Categorical([1, 2, 3])
# .categories is an index, so we include the hashtable
assert 0 < cat.nbytes <= cat.memory_usage()
assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
cat = Categorical(["foo", "foo", "bar"])
assert cat.memory_usage(deep=True) > cat.nbytes
if not PYPY:
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
assert abs(diff) < 100
def test_map(self):
c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
result = c.map(lambda x: x.lower())
exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
tm.assert_categorical_equal(result, exp)
c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False)
result = c.map(lambda x: x.lower())
exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
tm.assert_categorical_equal(result, exp)
result = c.map(lambda x: 1)
# GH 12766: Return an index not an array
tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
def test_validate_inplace_raises(self, value):
cat = Categorical(["A", "B", "B", "C", "A"])
msg = (
'For argument "inplace" expected type bool, '
f"received type {type(value).__name__}"
)
with pytest.raises(ValueError, match=msg):
cat.set_ordered(value=True, inplace=value)
with pytest.raises(ValueError, match=msg):
cat.as_ordered(inplace=value)
with pytest.raises(ValueError, match=msg):
cat.as_unordered(inplace=value)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
cat.rename_categories(["X", "Y", "Z"], inplace=value)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
cat.add_categories(new_categories=["D", "E", "F"], inplace=value)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
cat.remove_categories(removals=["D", "E", "F"], inplace=value)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
cat.remove_unused_categories(inplace=value)
with pytest.raises(ValueError, match=msg):
cat.sort_values(inplace=value)

@@ -0,0 +1,561 @@
import re
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Index,
Series,
)
import pandas._testing as tm
from pandas.core.arrays.categorical import recode_for_categories
from pandas.tests.arrays.categorical.common import TestCategorical
class TestCategoricalAPI:
def test_ordered_api(self):
# GH 9347
cat1 = Categorical(list("acb"), ordered=False)
tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"]))
assert not cat1.ordered
cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False)
tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"]))
assert not cat2.ordered
cat3 = Categorical(list("acb"), ordered=True)
tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"]))
assert cat3.ordered
cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True)
tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"]))
assert cat4.ordered
def test_set_ordered(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
cat2 = cat.as_unordered()
assert not cat2.ordered
cat2 = cat.as_ordered()
assert cat2.ordered
cat2.as_unordered(inplace=True)
assert not cat2.ordered
cat2.as_ordered(inplace=True)
assert cat2.ordered
assert cat2.set_ordered(True).ordered
assert not cat2.set_ordered(False).ordered
cat2.set_ordered(True, inplace=True)
assert cat2.ordered
cat2.set_ordered(False, inplace=True)
assert not cat2.ordered
# removed in 0.19.0
msg = "can't set attribute"
with pytest.raises(AttributeError, match=msg):
cat.ordered = True
with pytest.raises(AttributeError, match=msg):
cat.ordered = False
def test_rename_categories(self):
cat = Categorical(["a", "b", "c", "a"])
# inplace=False: the old one must not be changed
res = cat.rename_categories([1, 2, 3])
tm.assert_numpy_array_equal(
res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
)
tm.assert_index_equal(res.categories, Index([1, 2, 3]))
exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
exp_cat = Index(["a", "b", "c"])
tm.assert_index_equal(cat.categories, exp_cat)
# GH18862 (let rename_categories take callables)
result = cat.rename_categories(lambda x: x.upper())
expected = Categorical(["A", "B", "C", "A"])
tm.assert_categorical_equal(result, expected)
# and now inplace
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
res = cat.rename_categories([1, 2, 3], inplace=True)
assert res is None
tm.assert_numpy_array_equal(
cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
)
tm.assert_index_equal(cat.categories, Index([1, 2, 3]))
@pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
def test_rename_categories_wrong_length_raises(self, new_categories):
cat = Categorical(["a", "b", "c", "a"])
msg = (
"new categories need to have the same number of items as the "
"old categories!"
)
with pytest.raises(ValueError, match=msg):
cat.rename_categories(new_categories)
def test_rename_categories_series(self):
# https://github.com/pandas-dev/pandas/issues/17981
c = Categorical(["a", "b"])
result = c.rename_categories(Series([0, 1], index=["a", "b"]))
expected = Categorical([0, 1])
tm.assert_categorical_equal(result, expected)
def test_rename_categories_dict(self):
# GH 17336
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1})
expected = Index([4, 3, 2, 1])
tm.assert_index_equal(res.categories, expected)
# Test for inplace
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True)
assert res is None
tm.assert_index_equal(cat.categories, expected)
# Test for dicts of smaller length
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 1, "c": 3})
expected = Index([1, "b", 3, "d"])
tm.assert_index_equal(res.categories, expected)
# Test for dicts with bigger length
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6})
expected = Index([1, 2, 3, 4])
tm.assert_index_equal(res.categories, expected)
# Test for dicts with no items from old categories
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"f": 1, "g": 3})
expected = Index(["a", "b", "c", "d"])
tm.assert_index_equal(res.categories, expected)
def test_reorder_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(
["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True
)
# first inplace == False
res = cat.reorder_categories(["c", "b", "a"])
# cat must be the same as before
tm.assert_categorical_equal(cat, old)
# only res is changed
tm.assert_categorical_equal(res, new)
# inplace == True
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
res = cat.reorder_categories(["c", "b", "a"], inplace=True)
assert res is None
tm.assert_categorical_equal(cat, new)
@pytest.mark.parametrize(
"new_categories",
[
["a"], # not all "old" included in "new"
["a", "b", "d"], # still not all "old" in "new"
["a", "b", "c", "d"], # all "old" included in "new", but too long
],
)
def test_reorder_categories_raises(self, new_categories):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
msg = "items in new_categories are not the same as in old categories"
with pytest.raises(ValueError, match=msg):
cat.reorder_categories(new_categories)
def test_add_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(
["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True
)
# first inplace == False
res = cat.add_categories("d")
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
res = cat.add_categories(["d"])
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
# inplace == True
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
res = cat.add_categories("d", inplace=True)
tm.assert_categorical_equal(cat, new)
assert res is None
# GH 9927
cat = Categorical(list("abc"), ordered=True)
expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
# test with Series, np.array, index, list
res = cat.add_categories(Series(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(np.array(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(Index(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(["d", "e"])
tm.assert_categorical_equal(res, expected)
def test_add_categories_existing_raises(self):
# new is in old categories
cat = Categorical(["a", "b", "c", "d"], ordered=True)
msg = re.escape("new categories must not include old categories: {'d'}")
with pytest.raises(ValueError, match=msg):
cat.add_categories(["d"])
def test_set_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
exp_categories = Index(["c", "b", "a"])
exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
res = cat.set_categories(["c", "b", "a"], inplace=True)
tm.assert_index_equal(cat.categories, exp_categories)
tm.assert_numpy_array_equal(cat.__array__(), exp_values)
assert res is None
res = cat.set_categories(["a", "b", "c"])
# cat must be the same as before
tm.assert_index_equal(cat.categories, exp_categories)
tm.assert_numpy_array_equal(cat.__array__(), exp_values)
# only res is changed
exp_categories_back = Index(["a", "b", "c"])
tm.assert_index_equal(res.categories, exp_categories_back)
tm.assert_numpy_array_equal(res.__array__(), exp_values)
# not all "old" included in "new" -> all not included ones are now
# np.nan
cat = Categorical(["a", "b", "c", "a"], ordered=True)
res = cat.set_categories(["a"])
tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8))
# still not all "old" in "new"
res = cat.set_categories(["a", "b", "d"])
tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
# all "old" included in "new"
cat = cat.set_categories(["a", "b", "c", "d"])
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_index_equal(cat.categories, exp_categories)
# internals...
c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
tm.assert_numpy_array_equal(np.asarray(c), exp)
# all "pointers" to '4' must be changed from 3 to 0,...
c = c.set_categories([4, 3, 2, 1])
# positions are changed
tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))
# categories are now in new order
tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
# output is the same
exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
tm.assert_numpy_array_equal(np.asarray(c), exp)
assert c.min() == 4
assert c.max() == 1
# set_categories should set the ordering if specified
c2 = c.set_categories([4, 3, 2, 1], ordered=False)
assert not c2.ordered
tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
# set_categories should pass thru the ordering
c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
assert not c2.ordered
tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
def test_to_dense_deprecated(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
with tm.assert_produces_warning(FutureWarning):
cat.to_dense()
@pytest.mark.parametrize(
"values, categories, new_categories",
[
# No NaNs, same cats, same order
(["a", "b", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["a", "b", "a"], ["a", "b"], ["b", "a"]),
# Same, unsorted
(["b", "a", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["b", "a", "a"], ["a", "b"], ["b", "a"]),
# NaNs
(["a", "b", "c"], ["a", "b"], ["a", "b"]),
(["a", "b", "c"], ["a", "b"], ["b", "a"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
# Introduce NaNs
(["a", "b", "c"], ["a", "b"], ["a"]),
(["a", "b", "c"], ["a", "b"], ["b"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
# No overlap
(["a", "b", "c"], ["a", "b"], ["d", "e"]),
],
)
@pytest.mark.parametrize("ordered", [True, False])
def test_set_categories_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
result = c.set_categories(new_categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_set_categories_rename_less(self):
# GH 24675
cat = Categorical(["A", "B"])
result = cat.set_categories(["A"], rename=True)
expected = Categorical(["A", np.nan])
tm.assert_categorical_equal(result, expected)
def test_set_categories_private(self):
cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
cat._set_categories(["a", "c", "d", "e"])
expected = Categorical(["a", "c", "d"], categories=list("acde"))
tm.assert_categorical_equal(cat, expected)
# fastpath
cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
cat._set_categories(["a", "c", "d", "e"], fastpath=True)
expected = Categorical(["a", "c", "d"], categories=list("acde"))
tm.assert_categorical_equal(cat, expected)
def test_remove_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True)
# first inplace == False
res = cat.remove_categories("c")
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
res = cat.remove_categories(["c"])
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
# inplace == True
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
res = cat.remove_categories("c", inplace=True)
tm.assert_categorical_equal(cat, new)
assert res is None
@pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]])
def test_remove_categories_raises(self, removals):
cat = Categorical(["a", "b", "a"])
message = re.escape("removals must all be in old categories: {'c'}")
with pytest.raises(ValueError, match=message):
cat.remove_categories(removals)
def test_remove_unused_categories(self):
c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"])
exp_categories_all = Index(["a", "b", "c", "d", "e"])
exp_categories_dropped = Index(["a", "b", "c", "d"])
tm.assert_index_equal(c.categories, exp_categories_all)
res = c.remove_unused_categories()
tm.assert_index_equal(res.categories, exp_categories_dropped)
tm.assert_index_equal(c.categories, exp_categories_all)
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
res = c.remove_unused_categories(inplace=True)
tm.assert_index_equal(c.categories, exp_categories_dropped)
assert res is None
# with NaN values (GH11599)
c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"])
res = c.remove_unused_categories()
tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"])))
exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
tm.assert_numpy_array_equal(res.codes, exp_codes)
tm.assert_index_equal(c.categories, exp_categories_all)
val = ["F", np.nan, "D", "B", "D", "F", np.nan]
cat = Categorical(values=val, categories=list("ABCDEFG"))
out = cat.remove_unused_categories()
tm.assert_index_equal(out.categories, Index(["B", "D", "F"]))
exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
tm.assert_numpy_array_equal(out.codes, exp_codes)
assert out.tolist() == val
alpha = list("abcdefghijklmnopqrstuvwxyz")
val = np.random.choice(alpha[::2], 10000).astype("object")
val[np.random.choice(len(val), 100)] = np.nan
cat = Categorical(values=val, categories=alpha)
out = cat.remove_unused_categories()
assert out.tolist() == val.tolist()
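# Illustrative sketch, not part of the original suite: a compact side-by-side of the
# behaviours exercised above. set_categories recodes the values, so anything outside
# the new categories becomes NaN (code -1), while remove_unused_categories only prunes
# categories that never occur and leaves the values untouched.
def _demo_set_vs_remove_unused():
    from pandas import Categorical
    demo = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "e"])
    subset = demo.set_categories(["a", "b"])
    assert list(subset.codes) == [0, 1, -1, 0]  # "c" has no category anymore -> -1
    pruned = demo.remove_unused_categories()
    assert list(pruned.categories) == ["a", "b", "c"]  # unused "e" dropped
    assert pruned.tolist() == demo.tolist()  # values themselves unchanged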
class TestCategoricalAPIWithFactor(TestCategorical):
def test_describe(self):
# string type
desc = self.factor.describe()
assert self.factor.ordered
exp_index = CategoricalIndex(
["a", "b", "c"], name="categories", ordered=self.factor.ordered
)
expected = DataFrame(
{"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index
)
tm.assert_frame_equal(desc, expected)
# check unused categories
cat = self.factor.copy()
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
cat.set_categories(["a", "b", "c", "d"], inplace=True)
desc = cat.describe()
exp_index = CategoricalIndex(
list("abcd"), ordered=self.factor.ordered, name="categories"
)
expected = DataFrame(
{"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]},
index=exp_index,
)
tm.assert_frame_equal(desc, expected)
# check an integer one
cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
desc = cat.describe()
exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories")
expected = DataFrame(
{"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]},
index=exp_index,
)
tm.assert_frame_equal(desc, expected)
# https://github.com/pandas-dev/pandas/issues/3678
# describe should work with NaN
cat = Categorical([np.nan, 1, 2, 2])
desc = cat.describe()
expected = DataFrame(
{"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]},
index=CategoricalIndex(
[1, 2, np.nan], categories=[1, 2], name="categories"
),
)
tm.assert_frame_equal(desc, expected)
def test_set_categories_inplace(self):
cat = self.factor.copy()
with tm.assert_produces_warning(FutureWarning):
# issue #37643 inplace kwarg deprecated
cat.set_categories(["a", "b", "c", "d"], inplace=True)
tm.assert_index_equal(cat.categories, Index(["a", "b", "c", "d"]))
def test_codes_setter_deprecated(self):
cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
new_codes = cat._codes + 1
with tm.assert_produces_warning(FutureWarning):
# GH#40606
cat._codes = new_codes
assert cat._codes is new_codes
class TestPrivateCategoricalAPI:
def test_codes_immutable(self):
# Codes should be read only
c = Categorical(["a", "b", "c", "a", np.nan])
exp = np.array([0, 1, 2, 0, -1], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
# Assignments to codes should raise
with pytest.raises(AttributeError, match="can't set attribute"):
c.codes = np.array([0, 1, 2, 0, 1], dtype="int8")
# changes in the codes array should raise
codes = c.codes
with pytest.raises(ValueError, match="assignment destination is read-only"):
codes[4] = 1
# But even after getting the codes, the original array should still be
# writeable!
c[4] = "a"
exp = np.array([0, 1, 2, 0, 0], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
c._codes[4] = 2
exp = np.array([0, 1, 2, 0, 2], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
@pytest.mark.parametrize(
"codes, old, new, expected",
[
([0, 1], ["a", "b"], ["a", "b"], [0, 1]),
([0, 1], ["b", "a"], ["b", "a"], [0, 1]),
([0, 1], ["a", "b"], ["b", "a"], [1, 0]),
([0, 1], ["b", "a"], ["a", "b"], [1, 0]),
([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]),
([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]),
([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]),
([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]),
([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]),
([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]),
([-1, -1], [], ["a", "b"], [-1, -1]),
([1, 0], ["b", "a"], ["a", "b"], [0, 1]),
],
)
def test_recode_to_categories(self, codes, old, new, expected):
codes = np.asanyarray(codes, dtype=np.int8)
expected = np.asanyarray(expected, dtype=np.int8)
old = Index(old)
new = Index(new)
result = recode_for_categories(codes, old, new)
tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories_large(self):
N = 1000
codes = np.arange(N)
old = Index(codes)
expected = np.arange(N - 1, -1, -1, dtype=np.int16)
new = Index(expected)
result = recode_for_categories(codes, old, new)
tm.assert_numpy_array_equal(result, expected)
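# Rough, illustrative re-implementation of the recoding exercised above, assuming a
# code simply maps to the position of its old category within the new categories
# (or -1 when that category is absent). The real recode_for_categories helper is
# vectorized; this sketch only mirrors its observable behaviour for small inputs.
def _recode_sketch(codes, old, new):
    import numpy as np
    new_pos = {cat: i for i, cat in enumerate(new)}
    out = np.empty(len(codes), dtype=np.int8)
    for i, code in enumerate(codes):
        # missing values (code -1) stay missing; otherwise look the old category up
        out[i] = -1 if code == -1 else new_pos.get(old[code], -1)
    return out
# e.g. _recode_sketch([0, 1, -1], Index(["a", "b", "c"]), Index(["b"])) -> [-1, 0, -1]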

View File

@ -0,0 +1,89 @@
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalDtype,
NaT,
Timestamp,
array,
to_datetime,
)
import pandas._testing as tm
class TestAstype:
def test_astype_str_int_categories_to_nullable_int(self):
# GH#39616
dtype = CategoricalDtype([str(i) for i in range(5)])
codes = np.random.randint(5, size=20)
arr = Categorical.from_codes(codes, dtype=dtype)
res = arr.astype("Int64")
expected = array(codes, dtype="Int64")
tm.assert_extension_array_equal(res, expected)
@pytest.mark.parametrize("ordered", [True, False])
def test_astype(self, ordered):
# string
cat = Categorical(list("abbaaccc"), ordered=ordered)
result = cat.astype(object)
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)
msg = r"Cannot cast object dtype to float64"
with pytest.raises(ValueError, match=msg):
cat.astype(float)
# numeric
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
result = cat.astype(object)
expected = np.array(cat, dtype=object)
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(int)
expected = np.array(cat, dtype="int")
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(float)
expected = np.array(cat, dtype=float)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype_ordered", [True, False])
@pytest.mark.parametrize("cat_ordered", [True, False])
def test_astype_category(self, dtype_ordered, cat_ordered):
# GH#10696/GH#18593
data = list("abcaacbab")
cat = Categorical(data, categories=list("bac"), ordered=cat_ordered)
# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered)
tm.assert_categorical_equal(result, expected)
# non-standard categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)
if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = cat.astype("category")
expected = cat
tm.assert_categorical_equal(result, expected)
def test_astype_object_datetime_categories(self):
# GH#40754
cat = Categorical(to_datetime(["2021-03-27", NaT]))
result = cat.astype(object)
expected = np.array([Timestamp("2021-03-27 00:00:00"), NaT], dtype="object")
tm.assert_numpy_array_equal(result, expected)
def test_astype_object_timestamp_categories(self):
# GH#18024
cat = Categorical([Timestamp("2014-01-01")])
result = cat.astype(object)
expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object")
tm.assert_numpy_array_equal(result, expected)
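# Illustrative sketch, not part of the original tests: casting to a CategoricalDtype
# with different categories keeps the values but recodes them, so anything missing
# from the new categories ends up as NaN -- the same behaviour test_astype_category
# checks above via the Categorical constructor.
def _demo_astype_recode():
    from pandas import Categorical, CategoricalDtype
    cat = Categorical(list("abca"), categories=list("abc"))
    recast = cat.astype(CategoricalDtype(list("ab")))
    assert list(recast.codes) == [0, 1, -1, 0]  # "c" is not a category anymore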

View File

@ -0,0 +1,762 @@
from datetime import (
date,
datetime,
)
import numpy as np
import pytest
from pandas.compat import (
IS64,
is_platform_windows,
)
from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
period_range,
timedelta_range,
)
import pandas._testing as tm
from pandas.core.api import Int64Index
class TestCategoricalConstructors:
def test_categorical_scalar_deprecated(self):
# GH#38433
with tm.assert_produces_warning(FutureWarning):
Categorical("A", categories=["A", "B"])
def test_categorical_1d_only(self):
# ndim > 1
msg = "> 1 ndim Categorical are not supported at this time"
with pytest.raises(NotImplementedError, match=msg):
Categorical(np.array([list("abcd")]))
def test_validate_ordered(self):
# see gh-14058
exp_msg = "'ordered' must either be 'True' or 'False'"
exp_err = TypeError
# This should be a boolean.
ordered = np.array([0, 1, 2])
with pytest.raises(exp_err, match=exp_msg):
Categorical([1, 2, 3], ordered=ordered)
with pytest.raises(exp_err, match=exp_msg):
Categorical.from_codes(
[0, 0, 1], categories=["a", "b", "c"], ordered=ordered
)
def test_constructor_empty(self):
# GH 17248
c = Categorical([])
expected = Index([])
tm.assert_index_equal(c.categories, expected)
c = Categorical([], categories=[1, 2, 3])
expected = Int64Index([1, 2, 3])
tm.assert_index_equal(c.categories, expected)
def test_constructor_empty_boolean(self):
# see gh-22702
cat = Categorical([], categories=[True, False])
categories = sorted(cat.categories.tolist())
assert categories == [False, True]
def test_constructor_tuples(self):
values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
result = Categorical(values)
expected = Index([(1,), (1, 2)], tupleize_cols=False)
tm.assert_index_equal(result.categories, expected)
assert result.ordered is False
def test_constructor_tuples_datetimes(self):
# numpy will auto-reshape when all of the tuples have the same
# length, so add an extra one with 2 items and slice it off
values = np.array(
[
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
("a", "b"),
],
dtype=object,
)[:-1]
result = Categorical(values)
expected = Index(
[(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)],
tupleize_cols=False,
)
tm.assert_index_equal(result.categories, expected)
def test_constructor_unsortable(self):
# it works!
arr = np.array([1, 2, 3, datetime.now()], dtype="O")
factor = Categorical(arr, ordered=False)
assert not factor.ordered
# this however will raise as cannot be sorted
msg = (
"'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument."
)
with pytest.raises(TypeError, match=msg):
Categorical(arr, ordered=True)
def test_constructor_interval(self):
result = Categorical(
[Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True
)
ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)])
exp = Categorical(ii, ordered=True)
tm.assert_categorical_equal(result, exp)
tm.assert_index_equal(result.categories, ii)
def test_constructor(self):
exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
c1 = Categorical(exp_arr)
tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["c", "b", "a"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
# categories must be unique
msg = "Categorical categories must be unique"
with pytest.raises(ValueError, match=msg):
Categorical([1, 2], [1, 2, 2])
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ["a", "b", "b"])
# The default should be unordered
c1 = Categorical(["a", "b", "c", "a"])
assert not c1.ordered
# Categorical as input
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
# Series of dtype category
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
# Series
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(Series(["a", "b", "c", "a"]))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"])
tm.assert_categorical_equal(c1, c2)
# This should result in integer categories, not float!
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
assert is_integer_dtype(cat.categories)
# https://github.com/pandas-dev/pandas/issues/3678
cat = Categorical([np.nan, 1, 2, 3])
assert is_integer_dtype(cat.categories)
# this should result in floats
cat = Categorical([np.nan, 1, 2.0, 3])
assert is_float_dtype(cat.categories)
cat = Categorical([np.nan, 1.0, 2.0, 3.0])
assert is_float_dtype(cat.categories)
# This doesn't work -> this would probably need some kind of "remember
# the original type" feature to try to cast the array interface result
# to...
# vals = np.asarray(cat[cat.notna()])
# assert is_integer_dtype(vals)
# corner cases
cat = Categorical([1])
assert len(cat.categories) == 1
assert cat.categories[0] == 1
assert len(cat.codes) == 1
assert cat.codes[0] == 0
cat = Categorical(["a"])
assert len(cat.categories) == 1
assert cat.categories[0] == "a"
assert len(cat.codes) == 1
assert cat.codes[0] == 0
with tm.assert_produces_warning(FutureWarning):
# GH#38433
cat = Categorical(1)
assert len(cat.categories) == 1
assert cat.categories[0] == 1
assert len(cat.codes) == 1
assert cat.codes[0] == 0
# two arrays
# - when the first is an integer dtype and the second is not
# - when the resulting codes are all -1/NaN
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])
# the next ones are from the old docs
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
cat = Categorical([1, 2], categories=[1, 2, 3])
# this is a legitimate constructor
with tm.assert_produces_warning(None):
Categorical(np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True)
def test_constructor_with_existing_categories(self):
# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)
c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)
def test_constructor_not_sequence(self):
# https://github.com/pandas-dev/pandas/issues/16022
msg = r"^Parameter 'categories' must be list-like, was"
with pytest.raises(TypeError, match=msg):
Categorical(["a", "b"], categories="a")
def test_constructor_with_null(self):
# Cannot have NaN in categories
msg = "Categorical categories cannot be null"
with pytest.raises(ValueError, match=msg):
Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical(
DatetimeIndex(["nat", "20160101"]),
categories=[NaT, Timestamp("20160101")],
)
def test_constructor_with_index(self):
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(ci.values, Categorical(ci))
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(
ci.values, Categorical(ci.astype(object), categories=ci.categories)
)
def test_constructor_with_generator(self):
# This was raising an Error in isna(single_val).any() because isna
# returned a scalar for a generator
exp = Categorical([0, 1, 2])
cat = Categorical(x for x in [0, 1, 2])
tm.assert_categorical_equal(cat, exp)
cat = Categorical(range(3))
tm.assert_categorical_equal(cat, exp)
MultiIndex.from_product([range(5), ["a", "b", "c"]])
# check that categories accept generators and sequences
cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
tm.assert_categorical_equal(cat, exp)
cat = Categorical([0, 1, 2], categories=range(3))
tm.assert_categorical_equal(cat, exp)
def test_constructor_with_rangeindex(self):
# RangeIndex is preserved in Categories
rng = Index(range(3))
cat = Categorical(rng)
tm.assert_index_equal(cat.categories, rng, exact=True)
cat = Categorical([1, 2, 0], categories=rng)
tm.assert_index_equal(cat.categories, rng, exact=True)
@pytest.mark.parametrize(
"dtl",
[
date_range("1995-01-01 00:00:00", periods=5, freq="s"),
date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"),
timedelta_range("1 day", periods=5, freq="s"),
],
)
def test_constructor_with_datetimelike(self, dtl):
# see gh-12077
# constructor with a datetimelike and NaT
s = Series(dtl)
c = Categorical(s)
expected = type(dtl)(s)
expected._data.freq = None
tm.assert_index_equal(c.categories, expected)
tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))
# with NaT
s2 = s.copy()
s2.iloc[-1] = NaT
c = Categorical(s2)
expected = type(dtl)(s2.dropna())
expected._data.freq = None
tm.assert_index_equal(c.categories, expected)
exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
tm.assert_numpy_array_equal(c.codes, exp)
result = repr(c)
assert "NaT" in result
def test_constructor_from_index_series_datetimetz(self):
idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern")
idx = idx._with_freq(None) # freq not preserved in result.categories
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_date_objects(self):
# we don't cast date objects to timestamps, matching the Index constructor
v = date.today()
cat = Categorical([v, v])
assert cat.categories.dtype == object
assert type(cat.categories[0]) is date
def test_constructor_from_index_series_timedelta(self):
idx = timedelta_range("1 days", freq="D", periods=3)
idx = idx._with_freq(None) # freq not preserved in result.categories
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_from_index_series_period(self):
idx = period_range("2015-01-01", freq="D", periods=3)
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
@pytest.mark.parametrize(
"values",
[
np.array([1.0, 1.2, 1.8, np.nan]),
np.array([1, 2, 3], dtype="int64"),
["a", "b", "c", np.nan],
[pd.Period("2014-01"), pd.Period("2014-02"), NaT],
[Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT],
[
Timestamp("2014-01-01", tz="US/Eastern"),
Timestamp("2014-01-02", tz="US/Eastern"),
NaT,
],
],
)
def test_constructor_invariant(self, values):
# GH 14190
c = Categorical(values)
c2 = Categorical(c)
tm.assert_categorical_equal(c, c2)
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_with_dtype(self, ordered):
categories = ["b", "a", "c"]
dtype = CategoricalDtype(categories, ordered=ordered)
result = Categorical(["a", "b", "a", "c"], dtype=dtype)
expected = Categorical(
["a", "b", "a", "c"], categories=categories, ordered=ordered
)
tm.assert_categorical_equal(result, expected)
assert result.ordered is ordered
def test_constructor_dtype_and_others_raises(self):
dtype = CategoricalDtype(["a", "b"], ordered=True)
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], categories=["a", "b"], dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=True, dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=False, dtype=dtype)
@pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_str_category(self, categories, ordered):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")
def test_constructor_np_strs(self):
# GH#31499 Hashtable.map_locations needs to work on np.str_ objects
cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
assert all(isinstance(x, np.str_) for x in cat.categories)
def test_constructor_from_categorical_with_dtype(self):
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use dtype.categories, not values.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_with_unknown_dtype(self):
dtype = CategoricalDtype(None, ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use values.categories, not dtype.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "d"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_string(self):
values = Categorical(["a", "b", "d"])
# use categories, ordered
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
# No string
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
tm.assert_categorical_equal(result, expected)
def test_constructor_with_categorical_categories(self):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("klass", [lambda x: np.array(x, dtype=object), list])
def test_construction_with_null(self, klass, nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/31927
values = klass(["a", nulls_fixture, "b"])
result = Categorical(values)
dtype = CategoricalDtype(["a", "b"])
codes = [0, -1, 1]
expected = Categorical.from_codes(codes=codes, dtype=dtype)
tm.assert_categorical_equal(result, expected)
def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype):
# GH#39649
cats = pd.array(range(5), dtype=any_numeric_ea_dtype)
codes = np.random.randint(5, size=3)
dtype = CategoricalDtype(cats)
arr = Categorical.from_codes(codes, dtype=dtype)
assert arr.categories.dtype == cats.dtype
tm.assert_index_equal(arr.categories, Index(cats))
def test_from_codes_empty(self):
cat = ["a", "b", "c"]
result = Categorical.from_codes([], categories=cat)
expected = Categorical([], categories=cat)
tm.assert_categorical_equal(result, expected)
def test_from_codes_too_few_categories(self):
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be between "
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], dtype=dtype)
def test_from_codes_non_int_codes(self):
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], dtype=dtype)
def test_from_codes_non_unique_categories(self):
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
def test_from_codes_nan_cat_included(self):
with pytest.raises(ValueError, match="Categorical categories cannot be null"):
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
def test_from_codes_too_negative(self):
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = r"codes need to be between -1 and len\(categories\)-1"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], dtype=dtype)
def test_from_codes(self):
dtype = CategoricalDtype(categories=["a", "b", "c"])
exp = Categorical(["a", "b", "c"], ordered=False)
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
tm.assert_categorical_equal(exp, res)
res = Categorical.from_codes([0, 1, 2], dtype=dtype)
tm.assert_categorical_equal(exp, res)
@pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
def test_from_codes_with_categorical_categories(self, klass):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
def test_from_codes_with_non_unique_categorical_categories(self, klass):
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1], klass(["a", "b", "a"]))
def test_from_codes_with_nan_code(self):
# GH21767
codes = [1, 2, np.nan]
dtype = CategoricalDtype(categories=["a", "b", "c"])
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, categories=dtype.categories)
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)
@pytest.mark.parametrize("codes", [[1.0, 2.0, 0], [1.1, 2.0, 0]])
def test_from_codes_with_float(self, codes):
# GH21767
# float codes should raise even if values are equal to integers
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, dtype=dtype)
def test_from_codes_with_dtype_raises(self):
msg = "Cannot specify"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"])
)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"])
)
def test_from_codes_neither(self):
msg = "Both were None"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1])
def test_from_codes_with_nullable_int(self):
codes = pd.array([0, 1], dtype="Int64")
categories = ["a", "b"]
result = Categorical.from_codes(codes, categories=categories)
expected = Categorical.from_codes(codes.to_numpy(int), categories=categories)
tm.assert_categorical_equal(result, expected)
def test_from_codes_with_nullable_int_na_raises(self):
codes = pd.array([0, None], dtype="Int64")
categories = ["a", "b"]
msg = "codes cannot contain NA values"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, categories=categories)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories(self, dtype):
cats = ["a", "b"]
codes = np.array([0, 0, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes(codes, cats)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories_sorts(self, dtype):
cats = ["b", "a"]
codes = np.array([0, 1, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_dtype(self):
cats = ["a", "b", "d"]
codes = np.array([0, 1, 0, 2], dtype="i8")
dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical(
["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_coerces(self):
cats = ["1", "2", "bad"]
codes = np.array([0, 0, 1, 2], dtype="i8")
dtype = CategoricalDtype([1, 2])
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("ordered", [None, True, False])
def test_construction_with_ordered(self, ordered):
# GH 9347, 9190
cat = Categorical([0, 1, 2], ordered=ordered)
assert cat.ordered == bool(ordered)
@pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
def test_constructor_imaginary(self):
values = [1, 2, 3 + 1j]
c1 = Categorical(values)
tm.assert_index_equal(c1.categories, Index(values))
tm.assert_numpy_array_equal(np.array(c1), np.array(values))
def test_constructor_string_and_tuples(self):
# GH 21416
c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
expected_index = Index([("a", "b"), ("b", "a"), "c"])
assert c.categories.equals(expected_index)
def test_interval(self):
idx = pd.interval_range(0, 10, periods=10)
cat = Categorical(idx, categories=idx)
expected_codes = np.arange(10, dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# infer categories
cat = Categorical(idx)
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# list values
cat = Categorical(list(idx))
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# list values, categories
cat = Categorical(list(idx), categories=list(idx))
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# shuffled
values = idx.take([1, 2, 0])
cat = Categorical(values, categories=idx)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
tm.assert_index_equal(cat.categories, idx)
# extra
values = pd.interval_range(8, 11, periods=3)
cat = Categorical(values, categories=idx)
expected_codes = np.array([8, 9, -1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# overlapping
idx = IntervalIndex([Interval(0, 2), Interval(0, 1)])
cat = Categorical(idx, categories=idx)
expected_codes = np.array([0, 1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
def test_categorical_extension_array_nullable(self, nulls_fixture):
# GH:
arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2)
result = Categorical(arr)
assert arr.dtype == result.categories.dtype
expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
tm.assert_categorical_equal(result, expected)
def test_from_sequence_copy(self):
cat = Categorical(np.arange(5).repeat(2))
result = Categorical._from_sequence(cat, dtype=None, copy=False)
# more generally, we'd be OK with a view
assert result._codes is cat._codes
result = Categorical._from_sequence(cat, dtype=None, copy=True)
assert not tm.shares_memory(result, cat)
@pytest.mark.xfail(
not IS64 or is_platform_windows(),
reason="Incorrectly raising in ensure_datetime64ns",
)
def test_constructor_datetime64_non_nano(self):
categories = np.arange(10).view("M8[D]")
values = categories[::2].copy()
cat = Categorical(values, categories=categories)
assert (cat == values).all()
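# Illustrative sketch, not part of the original suite: the from_codes constraints
# exercised above collected in one place -- codes must be integers in the range
# [-1, len(categories) - 1], and the categories themselves must be unique and
# non-null.
def _demo_from_codes_rules():
    import pytest
    from pandas import Categorical
    cat = Categorical.from_codes([0, 1, -1, 0], categories=["a", "b"])
    assert bool(cat.isna()[2])  # -1 always means "missing"
    with pytest.raises(ValueError):
        Categorical.from_codes([2], categories=["a", "b"])  # code out of range
    with pytest.raises(ValueError):
        Categorical.from_codes([0], categories=["a", "a"])  # duplicate categories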

View File

@ -0,0 +1,136 @@
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas import (
Categorical,
CategoricalIndex,
Index,
Series,
Timestamp,
)
import pandas._testing as tm
class TestCategoricalDtypes:
def test_is_dtype_equal_deprecated(self):
# GH#37545
c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)
with tm.assert_produces_warning(FutureWarning):
c1.is_dtype_equal(c1)
def test_categories_match_up_to_permutation(self):
# test dtype comparisons between cats
c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)
c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False)
c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True)
assert c1._categories_match_up_to_permutation(c1)
assert c2._categories_match_up_to_permutation(c2)
assert c3._categories_match_up_to_permutation(c3)
assert c1._categories_match_up_to_permutation(c2)
assert not c1._categories_match_up_to_permutation(c3)
assert not c1._categories_match_up_to_permutation(Index(list("aabca")))
assert not c1._categories_match_up_to_permutation(c1.astype(object))
assert c1._categories_match_up_to_permutation(CategoricalIndex(c1))
assert c1._categories_match_up_to_permutation(
CategoricalIndex(c1, categories=list("cab"))
)
assert not c1._categories_match_up_to_permutation(
CategoricalIndex(c1, ordered=True)
)
# GH 16659
s1 = Series(c1)
s2 = Series(c2)
s3 = Series(c3)
assert c1._categories_match_up_to_permutation(s1)
assert c2._categories_match_up_to_permutation(s2)
assert c3._categories_match_up_to_permutation(s3)
assert c1._categories_match_up_to_permutation(s2)
assert not c1._categories_match_up_to_permutation(s3)
assert not c1._categories_match_up_to_permutation(s1.astype(object))
def test_set_dtype_same(self):
c = Categorical(["a", "b", "c"])
result = c._set_dtype(CategoricalDtype(["a", "b", "c"]))
tm.assert_categorical_equal(result, c)
def test_set_dtype_new_categories(self):
c = Categorical(["a", "b", "c"])
result = c._set_dtype(CategoricalDtype(list("abcd")))
tm.assert_numpy_array_equal(result.codes, c.codes)
tm.assert_index_equal(result.dtype.categories, Index(list("abcd")))
@pytest.mark.parametrize(
"values, categories, new_categories",
[
# No NaNs, same cats, same order
(["a", "b", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["a", "b", "a"], ["a", "b"], ["b", "a"]),
# Same, unsorted
(["b", "a", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["b", "a", "a"], ["a", "b"], ["b", "a"]),
# NaNs
(["a", "b", "c"], ["a", "b"], ["a", "b"]),
(["a", "b", "c"], ["a", "b"], ["b", "a"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
# Introduce NaNs
(["a", "b", "c"], ["a", "b"], ["a"]),
(["a", "b", "c"], ["a", "b"], ["b"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
# No overlap
(["a", "b", "c"], ["a", "b"], ["d", "e"]),
],
)
@pytest.mark.parametrize("ordered", [True, False])
def test_set_dtype_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
result = c._set_dtype(expected.dtype)
tm.assert_categorical_equal(result, expected)
def test_set_dtype_no_overlap(self):
c = Categorical(["a", "b", "c"], ["d", "e"])
result = c._set_dtype(CategoricalDtype(["a", "b"]))
expected = Categorical([None, None, None], categories=["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_codes_dtypes(self):
# GH 8453
result = Categorical(["foo", "bar", "baz"])
assert result.codes.dtype == "int8"
result = Categorical([f"foo{i:05d}" for i in range(400)])
assert result.codes.dtype == "int16"
result = Categorical([f"foo{i:05d}" for i in range(40000)])
assert result.codes.dtype == "int32"
# adding cats
result = Categorical(["foo", "bar", "baz"])
assert result.codes.dtype == "int8"
result = result.add_categories([f"foo{i:05d}" for i in range(400)])
assert result.codes.dtype == "int16"
# removing cats
result = result.remove_categories([f"foo{i:05d}" for i in range(300)])
assert result.codes.dtype == "int8"
def test_iter_python_types(self):
# GH-19909
cat = Categorical([1, 2])
assert isinstance(list(cat)[0], int)
assert isinstance(cat.tolist()[0], int)
def test_iter_python_types_datetime(self):
cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")])
assert isinstance(list(cat)[0], Timestamp)
assert isinstance(cat.tolist()[0], Timestamp)
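# Illustrative sketch, not part of the original tests: the codes dtype observed in
# test_codes_dtypes above grows with the number of categories. A plausible rule --
# the smallest signed integer dtype able to hold len(categories) -- is mirrored
# here; the exact pandas heuristic may differ in detail.
def _smallest_codes_dtype(n_categories):
    import numpy as np
    for dtype in (np.int8, np.int16, np.int32, np.int64):
        if n_categories < np.iinfo(dtype).max:
            return np.dtype(dtype)
    raise ValueError("too many categories for any integer code dtype")
# e.g. _smallest_codes_dtype(3) -> int8, _smallest_codes_dtype(400) -> int16,
# _smallest_codes_dtype(40000) -> int32, matching the assertions above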

View File

@ -0,0 +1,377 @@
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalIndex,
Index,
Interval,
IntervalIndex,
PeriodIndex,
Series,
Timedelta,
Timestamp,
)
import pandas._testing as tm
import pandas.core.common as com
from pandas.tests.arrays.categorical.common import TestCategorical
class TestCategoricalIndexingWithFactor(TestCategorical):
def test_getitem(self):
assert self.factor[0] == "a"
assert self.factor[-1] == "c"
subf = self.factor[[0, 1, 2]]
tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8))
subf = self.factor[np.asarray(self.factor) == "c"]
tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8))
def test_setitem(self):
# int/positional
c = self.factor.copy()
c[0] = "b"
assert c[0] == "b"
c[-1] = "a"
assert c[-1] == "a"
# boolean
c = self.factor.copy()
indexer = np.zeros(len(c), dtype="bool")
indexer[0] = True
indexer[-1] = True
c[indexer] = "c"
expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
tm.assert_categorical_equal(c, expected)
@pytest.mark.parametrize(
"other",
[Categorical(["b", "a"]), Categorical(["b", "a"], categories=["b", "a"])],
)
def test_setitem_same_but_unordered(self, other):
# GH-24142
target = Categorical(["a", "b"], categories=["a", "b"])
mask = np.array([True, False])
target[mask] = other[mask]
expected = Categorical(["b", "b"], categories=["a", "b"])
tm.assert_categorical_equal(target, expected)
@pytest.mark.parametrize(
"other",
[
Categorical(["b", "a"], categories=["b", "a", "c"]),
Categorical(["b", "a"], categories=["a", "b", "c"]),
Categorical(["a", "a"], categories=["a"]),
Categorical(["b", "b"], categories=["b"]),
],
)
def test_setitem_different_unordered_raises(self, other):
# GH-24142
target = Categorical(["a", "b"], categories=["a", "b"])
mask = np.array([True, False])
msg = "Cannot set a Categorical with another, without identical categories"
with pytest.raises(TypeError, match=msg):
target[mask] = other[mask]
@pytest.mark.parametrize(
"other",
[
Categorical(["b", "a"]),
Categorical(["b", "a"], categories=["b", "a"], ordered=True),
Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True),
],
)
def test_setitem_same_ordered_raises(self, other):
# GH-24142
target = Categorical(["a", "b"], categories=["a", "b"], ordered=True)
mask = np.array([True, False])
msg = "Cannot set a Categorical with another, without identical categories"
with pytest.raises(TypeError, match=msg):
target[mask] = other[mask]
def test_setitem_tuple(self):
# GH#20439
cat = Categorical([(0, 1), (0, 2), (0, 1)])
# This should not raise
cat[1] = cat[0]
assert cat[1] == (0, 1)
def test_setitem_listlike(self):
# GH#9469
# properly coerce the input indexers
np.random.seed(1)
cat = Categorical(
np.random.randint(0, 5, size=150000).astype(np.int8)
).add_categories([-1000])
indexer = np.array([100000]).astype(np.int64)
cat[indexer] = -1000
# we are asserting the code result here
# which maps to the -1000 category
result = cat.codes[np.array([100000]).astype(np.int64)]
tm.assert_numpy_array_equal(result, np.array([5], dtype="int8"))
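# Illustrative sketch, not part of the original suite: assigning one Categorical into
# another only works when the categories match up to a permutation; anything else
# raises the TypeError the parametrized tests above check for.
def _demo_setitem_categories_must_match():
    import numpy as np
    import pytest
    from pandas import Categorical
    target = Categorical(["a", "b"], categories=["a", "b"])
    mask = np.array([True, False])
    same_set = Categorical(["b", "a"], categories=["b", "a"])  # same set, new order
    target[mask] = same_set[mask]
    assert list(target) == ["b", "b"]
    other_set = Categorical(["a", "a"], categories=["a"])  # different category set
    with pytest.raises(TypeError):
        target[mask] = other_set[mask]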
class TestCategoricalIndexing:
def test_getitem_slice(self):
cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
sliced = cat[3]
assert sliced == "d"
sliced = cat[3:5]
expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"])
tm.assert_categorical_equal(sliced, expected)
def test_getitem_listlike(self):
# GH 9469
# properly coerce the input indexers
np.random.seed(1)
c = Categorical(np.random.randint(0, 5, size=150000).astype(np.int8))
result = c.codes[np.array([100000]).astype(np.int64)]
expected = c[np.array([100000]).astype(np.int64)].codes
tm.assert_numpy_array_equal(result, expected)
def test_periodindex(self):
idx1 = PeriodIndex(
["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M"
)
cat1 = Categorical(idx1)
str(cat1)
exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
tm.assert_numpy_array_equal(cat1._codes, exp_arr)
tm.assert_index_equal(cat1.categories, exp_idx)
idx2 = PeriodIndex(
["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M"
)
cat2 = Categorical(idx2, ordered=True)
str(cat2)
exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
tm.assert_numpy_array_equal(cat2._codes, exp_arr)
tm.assert_index_equal(cat2.categories, exp_idx2)
idx3 = PeriodIndex(
[
"2013-12",
"2013-11",
"2013-10",
"2013-09",
"2013-08",
"2013-07",
"2013-05",
],
freq="M",
)
cat3 = Categorical(idx3, ordered=True)
exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
exp_idx = PeriodIndex(
[
"2013-05",
"2013-07",
"2013-08",
"2013-09",
"2013-10",
"2013-11",
"2013-12",
],
freq="M",
)
tm.assert_numpy_array_equal(cat3._codes, exp_arr)
tm.assert_index_equal(cat3.categories, exp_idx)
def test_categories_assignments(self):
cat = Categorical(["a", "b", "c", "a"])
exp = np.array([1, 2, 3, 1], dtype=np.int64)
cat.categories = [1, 2, 3]
tm.assert_numpy_array_equal(cat.__array__(), exp)
tm.assert_index_equal(cat.categories, Index([1, 2, 3]))
@pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
def test_categories_assignments_wrong_length_raises(self, new_categories):
cat = Categorical(["a", "b", "c", "a"])
msg = (
"new categories need to have the same number of items "
"as the old categories!"
)
with pytest.raises(ValueError, match=msg):
cat.categories = new_categories
# Combinations of sorted/unique:
@pytest.mark.parametrize(
"idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]]
)
# Combinations of missing/unique
@pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
@pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
@pytest.mark.parametrize("dtype", [None, "category", "key"])
def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype):
# GH 21448
key = key_class(key_values, categories=range(1, 5))
if dtype == "key":
dtype = key.dtype
# Test for flat index and CategoricalIndex with same/different cats:
idx = Index(idx_values, dtype=dtype)
expected, exp_miss = idx.get_indexer_non_unique(key_values)
result, res_miss = idx.get_indexer_non_unique(key)
tm.assert_numpy_array_equal(expected, result)
tm.assert_numpy_array_equal(exp_miss, res_miss)
exp_unique = idx.unique().get_indexer(key_values)
res_unique = idx.unique().get_indexer(key)
tm.assert_numpy_array_equal(res_unique, exp_unique)
def test_where_unobserved_nan(self):
ser = Series(Categorical(["a", "b"]))
result = ser.where([True, False])
expected = Series(Categorical(["a", None], categories=["a", "b"]))
tm.assert_series_equal(result, expected)
# all NA
ser = Series(Categorical(["a", "b"]))
result = ser.where([False, False])
expected = Series(Categorical([None, None], categories=["a", "b"]))
tm.assert_series_equal(result, expected)
def test_where_unobserved_categories(self):
ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
result = ser.where([True, True, False], other="b")
expected = Series(Categorical(["a", "b", "b"], categories=ser.cat.categories))
tm.assert_series_equal(result, expected)
def test_where_other_categorical(self):
ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"])
result = ser.where([True, False, True], other)
expected = Series(Categorical(["a", "c", "c"], dtype=ser.dtype))
tm.assert_series_equal(result, expected)
def test_where_new_category_raises(self):
ser = Series(Categorical(["a", "b", "c"]))
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(TypeError, match=msg):
ser.where([True, False, True], "d")
def test_where_ordered_differs_raises(self):
ser = Series(
Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True)
)
other = Categorical(
["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
)
with pytest.raises(TypeError, match="without identical categories"):
ser.where([True, False, True], other)
class TestContains:
def test_contains(self):
# GH#21508
cat = Categorical(list("aabbca"), categories=list("cab"))
assert "b" in cat
assert "z" not in cat
assert np.nan not in cat
with pytest.raises(TypeError, match="unhashable type: 'list'"):
assert [1] in cat
# assert codes NOT in index
assert 0 not in cat
assert 1 not in cat
cat = Categorical(list("aabbca") + [np.nan], categories=list("cab"))
assert np.nan in cat
@pytest.mark.parametrize(
"item, expected",
[
(Interval(0, 1), True),
(1.5, True),
(Interval(0.5, 1.5), False),
("a", False),
(Timestamp(1), False),
(Timedelta(1), False),
],
ids=str,
)
def test_contains_interval(self, item, expected):
# GH#23705
cat = Categorical(IntervalIndex.from_breaks(range(3)))
result = item in cat
assert result is expected
def test_contains_list(self):
# GH#21729
cat = Categorical([1, 2, 3])
assert "a" not in cat
with pytest.raises(TypeError, match="unhashable type"):
["a"] in cat
with pytest.raises(TypeError, match="unhashable type"):
["a", "b"] in cat
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean(index):
ser = Series(range(3))
idx = Categorical([True, False, True])
if index:
idx = CategoricalIndex(idx)
assert com.is_bool_indexer(idx)
result = ser[idx]
expected = ser[idx.astype("object")]
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean_na_treated_as_false(index):
# https://github.com/pandas-dev/pandas/issues/31503
ser = Series(range(3))
idx = Categorical([True, False, None])
if index:
idx = CategoricalIndex(idx)
result = ser[idx]
expected = ser[idx.fillna(False)]
tm.assert_series_equal(result, expected)
@pytest.fixture
def non_coercible_categorical(monkeypatch):
"""
Monkeypatch Categorical.__array__ to ensure no implicit conversion.
Raises
------
ValueError
When Categorical.__array__ is called.
"""
# TODO(Categorical): identify other places where this may be
# useful and move to a conftest.py
def array(self, dtype=None):
raise ValueError("I cannot be converted.")
with monkeypatch.context() as m:
m.setattr(Categorical, "__array__", array)
yield
def test_series_at(non_coercible_categorical):
arr = Categorical(["a", "b", "c"])
ser = Series(arr)
result = ser.at[0]
assert result == "a"

View File

@ -0,0 +1,213 @@
import collections
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
isna,
)
import pandas._testing as tm
class TestCategoricalMissing:
def test_isna(self):
exp = np.array([False, False, True])
cat = Categorical(["a", "b", np.nan])
res = cat.isna()
tm.assert_numpy_array_equal(res, exp)
def test_na_flags_int_categories(self):
# #1457
categories = list(range(10))
labels = np.random.randint(0, 10, 20)
labels[::5] = -1
cat = Categorical(labels, categories, fastpath=True)
repr(cat)
tm.assert_numpy_array_equal(isna(cat), labels == -1)
def test_nan_handling(self):
# Nans are represented as -1 in codes
c = Categorical(["a", "b", np.nan, "a"])
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))
c[1] = np.nan
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8))
# Adding nan to categories should make assigned nan point to the
# category!
c = Categorical(["a", "b", np.nan, "a"])
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))
def test_set_dtype_nans(self):
c = Categorical(["a", "b", np.nan])
result = c._set_dtype(CategoricalDtype(["a", "c"]))
tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8"))
def test_set_item_nan(self):
cat = Categorical([1, 2, 3])
cat[1] = np.nan
exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
tm.assert_categorical_equal(cat, exp)
@pytest.mark.parametrize(
"fillna_kwargs, msg",
[
(
{"value": 1, "method": "ffill"},
"Cannot specify both 'value' and 'method'.",
),
({}, "Must specify a fill 'value' or 'method'."),
({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
(
{"value": Series([1, 2, 3, 4, "a"])},
"Cannot setitem on a Categorical with a new category",
),
],
)
def test_fillna_raises(self, fillna_kwargs, msg):
# https://github.com/pandas-dev/pandas/issues/19682
# https://github.com/pandas-dev/pandas/issues/13628
cat = Categorical([1, 2, 3, None, None])
if len(fillna_kwargs) == 1 and "value" in fillna_kwargs:
err = TypeError
else:
err = ValueError
with pytest.raises(err, match=msg):
cat.fillna(**fillna_kwargs)
@pytest.mark.parametrize("named", [True, False])
def test_fillna_iterable_category(self, named):
# https://github.com/pandas-dev/pandas/issues/21097
if named:
Point = collections.namedtuple("Point", "x y")
else:
Point = lambda *args: args # tuple
cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object))
result = cat.fillna(Point(0, 0))
expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])
tm.assert_categorical_equal(result, expected)
# Case where the Point is not among our categories; we want ValueError,
# not NotImplementedError GH#41914
cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(TypeError, match=msg):
cat.fillna(Point(0, 0))
def test_fillna_array(self):
# accept Categorical or ndarray value if it holds appropriate values
cat = Categorical(["A", "B", "C", None, None])
other = cat.fillna("C")
result = cat.fillna(other)
tm.assert_categorical_equal(result, other)
assert isna(cat[-1]) # didn't modify original inplace
other = np.array(["A", "B", "C", "B", "A"])
result = cat.fillna(other)
expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
tm.assert_categorical_equal(result, expected)
assert isna(cat[-1]) # didn't modify original inplace
@pytest.mark.parametrize(
"values, expected",
[
([1, 2, 3], np.array([False, False, False])),
([1, 2, np.nan], np.array([False, False, True])),
([1, 2, np.inf], np.array([False, False, True])),
([1, 2, pd.NA], np.array([False, False, True])),
],
)
def test_use_inf_as_na(self, values, expected):
# https://github.com/pandas-dev/pandas/issues/33594
with pd.option_context("mode.use_inf_as_na", True):
cat = Categorical(values)
result = cat.isna()
tm.assert_numpy_array_equal(result, expected)
result = Series(cat).isna()
expected = Series(expected)
tm.assert_series_equal(result, expected)
result = DataFrame(cat).isna()
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values, expected",
[
([1, 2, 3], np.array([False, False, False])),
([1, 2, np.nan], np.array([False, False, True])),
([1, 2, np.inf], np.array([False, False, True])),
([1, 2, pd.NA], np.array([False, False, True])),
],
)
def test_use_inf_as_na_outside_context(self, values, expected):
# https://github.com/pandas-dev/pandas/issues/33594
# Using isna directly for Categorical will fail in general here
cat = Categorical(values)
with pd.option_context("mode.use_inf_as_na", True):
result = isna(cat)
tm.assert_numpy_array_equal(result, expected)
result = isna(Series(cat))
expected = Series(expected)
tm.assert_series_equal(result, expected)
result = isna(DataFrame(cat))
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"a1, a2, categories",
[
(["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]),
([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]),
],
)
def test_compare_categorical_with_missing(self, a1, a2, categories):
# GH 28384
cat_type = CategoricalDtype(categories)
# !=
result = Series(a1, dtype=cat_type) != Series(a2, dtype=cat_type)
expected = Series(a1) != Series(a2)
tm.assert_series_equal(result, expected)
# ==
result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type)
expected = Series(a1) == Series(a2)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"na_value, dtype",
[
(pd.NaT, "datetime64[ns]"),
(None, "float64"),
(np.nan, "float64"),
(pd.NA, "float64"),
],
)
def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
# GH#44900
result = Categorical([na_value, na_value])
tm.assert_index_equal(result.categories, Index([], dtype=dtype))
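# Illustrative sketch, not part of the original suite: missing values never get a
# category of their own -- they are stored as code -1 -- and fillna only accepts
# values that already are valid categories.
def _demo_missing_codes():
    import numpy as np
    import pytest
    from pandas import Categorical
    cat = Categorical(["a", "b", np.nan])
    assert list(cat.categories) == ["a", "b"]
    assert list(cat.codes) == [0, 1, -1]
    assert cat.fillna("a").tolist() == ["a", "b", "a"]
    with pytest.raises(TypeError):
        cat.fillna("z")  # "z" is not an existing category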

View File

@ -0,0 +1,399 @@
import warnings
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
from pandas.tests.arrays.categorical.common import TestCategorical
class TestCategoricalOpsWithFactor(TestCategorical):
def test_categories_none_comparisons(self):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
tm.assert_categorical_equal(factor, self.factor)
def test_comparisons(self):
result = self.factor[self.factor == "a"]
expected = self.factor[np.asarray(self.factor) == "a"]
tm.assert_categorical_equal(result, expected)
result = self.factor[self.factor != "a"]
expected = self.factor[np.asarray(self.factor) != "a"]
tm.assert_categorical_equal(result, expected)
result = self.factor[self.factor < "c"]
expected = self.factor[np.asarray(self.factor) < "c"]
tm.assert_categorical_equal(result, expected)
result = self.factor[self.factor > "a"]
expected = self.factor[np.asarray(self.factor) > "a"]
tm.assert_categorical_equal(result, expected)
result = self.factor[self.factor >= "b"]
expected = self.factor[np.asarray(self.factor) >= "b"]
tm.assert_categorical_equal(result, expected)
result = self.factor[self.factor <= "b"]
expected = self.factor[np.asarray(self.factor) <= "b"]
tm.assert_categorical_equal(result, expected)
n = len(self.factor)
other = self.factor[np.random.permutation(n)]
result = self.factor == other
expected = np.asarray(self.factor) == np.asarray(other)
tm.assert_numpy_array_equal(result, expected)
result = self.factor == "d"
expected = np.zeros(len(self.factor), dtype=bool)
tm.assert_numpy_array_equal(result, expected)
# comparisons with categoricals
cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
cat_rev_base = Categorical(
["b", "b", "b"], categories=["c", "b", "a"], ordered=True
)
cat = Categorical(["a", "b", "c"], ordered=True)
cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True)
# comparisons need to take categories ordering into account
res_rev = cat_rev > cat_rev_base
exp_rev = np.array([True, False, False])
tm.assert_numpy_array_equal(res_rev, exp_rev)
res_rev = cat_rev < cat_rev_base
exp_rev = np.array([False, False, True])
tm.assert_numpy_array_equal(res_rev, exp_rev)
res = cat > cat_base
exp = np.array([False, False, True])
tm.assert_numpy_array_equal(res, exp)
# Only categories with same categories can be compared
msg = "Categoricals can only be compared if 'categories' are the same"
with pytest.raises(TypeError, match=msg):
cat > cat_rev
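# a Categorical whose categories are a superset is likewise not comparable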
cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"])
with pytest.raises(TypeError, match=msg):
cat_rev > cat_rev_base2
# Only categories with same ordering information can be compared
cat_unordered = cat.set_ordered(False)
assert not (cat > cat).any()
with pytest.raises(TypeError, match=msg):
cat > cat_unordered
# comparison (in both directions) with Series will raise
s = Series(["b", "b", "b"])
msg = (
"Cannot compare a Categorical for op __gt__ with type "
r"<class 'numpy\.ndarray'>"
)
with pytest.raises(TypeError, match=msg):
cat > s
with pytest.raises(TypeError, match=msg):
cat_rev > s
with pytest.raises(TypeError, match=msg):
s < cat
with pytest.raises(TypeError, match=msg):
s < cat_rev
# comparison with numpy.array will raise in both direction, but only on
# newer numpy versions
a = np.array(["b", "b", "b"])
with pytest.raises(TypeError, match=msg):
cat > a
with pytest.raises(TypeError, match=msg):
cat_rev > a
# Make sure that unequal comparisons take the categories' order into account
cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True)
exp = np.array([True, False, False])
res = cat_rev > "b"
tm.assert_numpy_array_equal(res, exp)
# check that zero-dim array gets unboxed
res = cat_rev > np.array("b")
tm.assert_numpy_array_equal(res, exp)
class TestCategoricalOps:
def test_compare_frame(self):
# GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
data = ["a", "b", 2, "a"]
cat = Categorical(data)
df = DataFrame(cat)
result = cat == df.T
expected = DataFrame([[True, True, True, True]])
tm.assert_frame_equal(result, expected)
result = cat[::-1] != df.T
expected = DataFrame([[False, True, True, False]])
tm.assert_frame_equal(result, expected)
def test_compare_frame_raises(self, comparison_op):
# alignment raises unless we transpose
op = comparison_op
cat = Categorical(["a", "b", 2, "a"])
df = DataFrame(cat)
msg = "Unable to coerce to Series, length must be 1: given 4"
with pytest.raises(ValueError, match=msg):
op(cat, df)
def test_datetime_categorical_comparison(self):
dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)
tm.assert_numpy_array_equal(dt_cat > dt_cat[0], np.array([False, True, True]))
tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, np.array([False, True, True]))
def test_reflected_comparison_with_scalars(self):
# GH8658
cat = Categorical([1, 2, 3], ordered=True)
tm.assert_numpy_array_equal(cat > cat[0], np.array([False, True, True]))
tm.assert_numpy_array_equal(cat[0] < cat, np.array([False, True, True]))
def test_comparison_with_unknown_scalars(self):
# https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
# and following comparisons with scalars not in categories should raise
# for unequal comps, but not for equal/not equal
cat = Categorical([1, 2, 3], ordered=True)
msg = "Invalid comparison between dtype=category and int"
with pytest.raises(TypeError, match=msg):
cat < 4
with pytest.raises(TypeError, match=msg):
cat > 4
with pytest.raises(TypeError, match=msg):
4 < cat
with pytest.raises(TypeError, match=msg):
4 > cat
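# == and != against a scalar outside the categories do not raise;
# they return element-wise all-False / all-True respectively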
tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False]))
tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True]))
def test_comparison_with_tuple(self):
cat = Categorical(np.array(["foo", (0, 1), 3, (0, 1)], dtype=object))
result = cat == "foo"
expected = np.array([True, False, False, False], dtype=bool)
tm.assert_numpy_array_equal(result, expected)
result = cat == (0, 1)
expected = np.array([False, True, False, True], dtype=bool)
tm.assert_numpy_array_equal(result, expected)
result = cat != (0, 1)
tm.assert_numpy_array_equal(result, ~expected)
def test_comparison_of_ordered_categorical_with_nan_to_scalar(
self, compare_operators_no_eq_ne
):
# https://github.com/pandas-dev/pandas/issues/26504
# BUG: fix ordered categorical comparison with missing values (#26504)
# and following comparisons with scalars in categories with missing
# values should be evaluated as False
cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
scalar = 2
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar)
actual = getattr(cat, compare_operators_no_eq_ne)(scalar)
tm.assert_numpy_array_equal(actual, expected)
def test_comparison_of_ordered_categorical_with_nan_to_listlike(
self, compare_operators_no_eq_ne
):
# https://github.com/pandas-dev/pandas/issues/26504
# and following comparisons of missing values in ordered Categorical
# with listlike should be evaluated as False
cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2)
actual = getattr(cat, compare_operators_no_eq_ne)(other)
tm.assert_numpy_array_equal(actual, expected)
@pytest.mark.parametrize(
"data,reverse,base",
[(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])],
)
def test_comparisons(self, data, reverse, base):
cat_rev = Series(Categorical(data, categories=reverse, ordered=True))
cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True))
cat = Series(Categorical(data, ordered=True))
cat_base = Series(
Categorical(base, categories=cat.cat.categories, ordered=True)
)
s = Series(base)
a = np.array(base)
# comparisons need to take categories ordering into account
res_rev = cat_rev > cat_rev_base
exp_rev = Series([True, False, False])
tm.assert_series_equal(res_rev, exp_rev)
res_rev = cat_rev < cat_rev_base
exp_rev = Series([False, False, True])
tm.assert_series_equal(res_rev, exp_rev)
res = cat > cat_base
exp = Series([False, False, True])
tm.assert_series_equal(res, exp)
scalar = base[1]
res = cat > scalar
exp = Series([False, False, True])
exp2 = cat.values > scalar
tm.assert_series_equal(res, exp)
tm.assert_numpy_array_equal(res.values, exp2)
res_rev = cat_rev > scalar
exp_rev = Series([True, False, False])
exp_rev2 = cat_rev.values > scalar
tm.assert_series_equal(res_rev, exp_rev)
tm.assert_numpy_array_equal(res_rev.values, exp_rev2)
# Only categories with same categories can be compared
msg = "Categoricals can only be compared if 'categories' are the same"
with pytest.raises(TypeError, match=msg):
cat > cat_rev
# categorical cannot be compared to Series or numpy array, and also
# not the other way around
msg = (
"Cannot compare a Categorical for op __gt__ with type "
r"<class 'numpy\.ndarray'>"
)
with pytest.raises(TypeError, match=msg):
cat > s
with pytest.raises(TypeError, match=msg):
cat_rev > s
with pytest.raises(TypeError, match=msg):
cat > a
with pytest.raises(TypeError, match=msg):
cat_rev > a
with pytest.raises(TypeError, match=msg):
s < cat
with pytest.raises(TypeError, match=msg):
s < cat_rev
with pytest.raises(TypeError, match=msg):
a < cat
with pytest.raises(TypeError, match=msg):
a < cat_rev
@pytest.mark.parametrize(
"ctor",
[
lambda *args, **kwargs: Categorical(*args, **kwargs),
lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
],
)
def test_unordered_different_order_equal(self, ctor):
# https://github.com/pandas-dev/pandas/issues/16014
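# unordered categoricals with the same set of categories compare by value,
# regardless of the order in which the categories are listed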
c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False)
c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False)
assert (c1 == c2).all()
c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False)
c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False)
assert (c1 != c2).all()
c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False)
c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False)
assert (c1 != c2).all()
c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False)
c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False)
result = c1 == c2
tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))
def test_unordered_different_categories_raises(self):
c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False)
c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False)
with pytest.raises(TypeError, match=("Categoricals can only be compared")):
c1 == c2
def test_compare_different_lengths(self):
c1 = Categorical([], categories=["a", "b"])
c2 = Categorical([], categories=["a"])
msg = "Categoricals can only be compared if 'categories' are the same."
with pytest.raises(TypeError, match=msg):
c1 == c2
def test_compare_unordered_different_order(self):
# https://github.com/pandas-dev/pandas/issues/16603#issuecomment-349290078
a = Categorical(["a"], categories=["a", "b"])
b = Categorical(["b"], categories=["b", "a"])
assert not a.equals(b)
def test_numeric_like_ops(self):
df = DataFrame({"value": np.random.randint(0, 10000, 100)})
labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
cat_labels = Categorical(labels, labels)
df = df.sort_values(by=["value"], ascending=True)
df["value_group"] = pd.cut(
df.value, range(0, 10500, 500), right=False, labels=cat_labels
)
# numeric ops should not succeed
for op, str_rep in [
("__add__", r"\+"),
("__sub__", "-"),
("__mul__", r"\*"),
("__truediv__", "/"),
]:
msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
with pytest.raises(TypeError, match=msg):
getattr(df, op)(df)
# reduction ops should not succeed (unless specifically defined, e.g.
# min/max)
s = df["value_group"]
for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]:
msg = f"does not support reduction '{op}'"
with pytest.raises(TypeError, match=msg):
getattr(s, op)(numeric_only=False)
# mad technically works because it always takes the numeric data
# numpy ops
s = Series(Categorical([1, 2, 3, 4]))
with pytest.raises(TypeError, match="does not support reduction 'sum'"):
np.sum(s)
# numeric ops on a Series
for op, str_rep in [
("__add__", r"\+"),
("__sub__", "-"),
("__mul__", r"\*"),
("__truediv__", "/"),
]:
msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
with pytest.raises(TypeError, match=msg):
getattr(s, op)(2)
# invalid ufunc
msg = "Object with dtype category cannot perform the numpy op log"
with pytest.raises(TypeError, match=msg):
np.log(s)

View File

@ -0,0 +1,72 @@
import pytest
import pandas as pd
from pandas import Categorical
import pandas._testing as tm
@pytest.mark.parametrize(
"to_replace,value,expected,flip_categories",
[
# one-to-one
(1, 2, [2, 2, 3], False),
(1, 4, [4, 2, 3], False),
(4, 1, [1, 2, 3], False),
(5, 6, [1, 2, 3], False),
# many-to-one
([1], 2, [2, 2, 3], False),
([1, 2], 3, [3, 3, 3], False),
([1, 2], 4, [4, 4, 3], False),
((1, 2, 4), 5, [5, 5, 3], False),
((5, 6), 2, [1, 2, 3], False),
([1], [2], [2, 2, 3], False),
([1, 4], [5, 2], [5, 2, 3], False),
# check_categorical sorts categories, which crashes on mixed dtypes
(3, "4", [1, 2, "4"], False),
([1, 2, "3"], "5", ["5", "5", 3], True),
],
)
def test_replace_categorical_series(to_replace, value, expected, flip_categories):
# GH 31720
ser = pd.Series([1, 2, 3], dtype="category")
result = ser.replace(to_replace, value)
expected = pd.Series(expected, dtype="category")
ser.replace(to_replace, value, inplace=True)
if flip_categories:
expected = expected.cat.set_categories(expected.cat.categories[::-1])
tm.assert_series_equal(expected, result, check_category_order=False)
tm.assert_series_equal(expected, ser, check_category_order=False)
@pytest.mark.parametrize(
"to_replace, value, result, expected_error_msg",
[
("b", "c", ["a", "c"], "Categorical.categories are different"),
("c", "d", ["a", "b"], None),
# https://github.com/pandas-dev/pandas/issues/33288
("a", "a", ["a", "b"], None),
("b", None, ["a", None], "Categorical.categories length are different"),
],
)
def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
with tm.assert_produces_warning(FutureWarning, match="Series.replace"):
# GH#44929 replace->_replace
result = cat.replace(to_replace, value)
tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)
with tm.assert_produces_warning(FutureWarning, match="Series.replace"):
# GH#44929 replace->_replace
cat.replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)

View File

@ -0,0 +1,534 @@
import numpy as np
from pandas import (
Categorical,
CategoricalIndex,
Series,
date_range,
option_context,
period_range,
timedelta_range,
)
from pandas.tests.arrays.categorical.common import TestCategorical
class TestCategoricalReprWithFactor(TestCategorical):
def test_print(self):
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(self.factor)
assert actual == expected
class TestCategoricalRepr:
def test_big_print(self):
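# fastpath=True treats the first argument as the integer codes for the
# given categories, skipping value validation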
factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True)
expected = [
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
"Length: 600",
"Categories (3, object): ['a', 'b', 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
def test_empty_print(self):
factor = Categorical([], ["a", "b", "c"])
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected
assert expected == actual
factor = Categorical([], ["a", "b", "c"], ordered=True)
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual
factor = Categorical([], [])
expected = "[], Categories (0, object): []"
assert expected == repr(factor)
def test_print_none_width(self):
# GH10087
a = Series(Categorical([1, 2, 3, 4]))
exp = (
"0 1\n1 2\n2 3\n3 4\n"
"dtype: category\nCategories (4, int64): [1, 2, 3, 4]"
)
with option_context("display.width", None):
assert exp == repr(a)
def test_unicode_print(self):
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
expected = """\
['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc']
Length: 60
Categories (3, object): ['aaaaa', 'bb', 'cccc']"""
assert repr(c) == expected
c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """\
['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa:E501
assert repr(c) == expected
# unicode option should not affect Categorical, as it doesn't care
# about the repr width
with option_context("display.unicode.east_asian_width", True):
c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa:E501
assert repr(c) == expected
def test_categorical_repr(self):
c = Categorical([1, 2, 3])
exp = """[1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 4, 5] * 10)
exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1, 2, 3, 4, 5]"""
assert repr(c) == exp
c = Categorical(np.arange(20))
exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]"""
assert repr(c) == exp
def test_categorical_repr_ordered(self):
c = Categorical([1, 2, 3], ordered=True)
exp = """[1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True)
exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1 < 2 < 3 < 4 < 5]"""
assert repr(c) == exp
c = Categorical(np.arange(20), ordered=True)
exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]"""
assert repr(c) == exp
def test_categorical_repr_datetime(self):
idx = date_range("2011-01-01 09:00", freq="H", periods=5)
c = Categorical(idx)
exp = (
"[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
"2011-01-01 12:00:00, 2011-01-01 13:00:00]\n"
"Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
" 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]"
""
)
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = (
"[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
"2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]\n"
"Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
" 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]"
)
assert repr(c) == exp
idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
c = Categorical(idx)
exp = (
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
"2011-01-01 13:00:00-05:00]\n"
"Categories (5, datetime64[ns, US/Eastern]): "
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
" "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
" "
"2011-01-01 13:00:00-05:00]"
)
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = (
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
"2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, "
"2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, "
"2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n"
"Categories (5, datetime64[ns, US/Eastern]): "
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
" "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
" "
"2011-01-01 13:00:00-05:00]"
)
assert repr(c) == exp
def test_categorical_repr_datetime_ordered(self):
idx = date_range("2011-01-01 09:00", freq="H", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa:E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa:E501
assert repr(c) == exp
idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]""" # noqa:E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]""" # noqa:E501
assert repr(c) == exp
def test_categorical_repr_int_with_nan(self):
c = Categorical([1, 2, np.nan])
c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]"""
assert repr(c) == c_exp
s = Series([1, 2, np.nan], dtype="object").astype("category")
s_exp = """0 1\n1 2\n2 NaN
dtype: category
Categories (2, int64): [1, 2]"""
assert repr(s) == s_exp
def test_categorical_repr_period(self):
idx = period_range("2011-01-01 09:00", freq="H", periods=5)
c = Categorical(idx)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]""" # noqa:E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]""" # noqa:E501
assert repr(c) == exp
idx = period_range("2011-01", freq="M", periods=5)
c = Categorical(idx)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa:E501
assert repr(c) == exp
def test_categorical_repr_period_ordered(self):
idx = period_range("2011-01-01 09:00", freq="H", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]""" # noqa:E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]""" # noqa:E501
assert repr(c) == exp
idx = period_range("2011-01", freq="M", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa:E501
assert repr(c) == exp
def test_categorical_repr_timedelta(self):
idx = timedelta_range("1 days", periods=5)
c = Categorical(idx)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
assert repr(c) == exp
idx = timedelta_range("1 hours", periods=20)
c = Categorical(idx)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]""" # noqa:E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]""" # noqa:E501
assert repr(c) == exp
def test_categorical_repr_timedelta_ordered(self):
idx = timedelta_range("1 days", periods=5)
c = Categorical(idx, ordered=True)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]"""
assert repr(c) == exp
idx = timedelta_range("1 hours", periods=20)
c = Categorical(idx, ordered=True)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]""" # noqa:E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]""" # noqa:E501
assert repr(c) == exp
def test_categorical_index_repr(self):
idx = CategoricalIndex(Categorical([1, 2, 3]))
exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa:E501
assert repr(idx) == exp
i = CategoricalIndex(Categorical(np.arange(10)))
exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
def test_categorical_index_repr_ordered(self):
i = CategoricalIndex(Categorical([1, 2, 3], ordered=True))
exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(np.arange(10), ordered=True))
exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
def test_categorical_index_repr_datetime(self):
idx = date_range("2011-01-01 09:00", freq="H", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
def test_categorical_index_repr_datetime_ordered(self):
idx = date_range("2011-01-01 09:00", freq="H", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
'2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
'2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
def test_categorical_index_repr_period(self):
# test all lengths
idx = period_range("2011-01-01 09:00", freq="H", periods=1)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="H", periods=2)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="H", periods=3)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="H", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(idx.append(idx)))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
'2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
'2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = period_range("2011-01", freq="M", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
def test_categorical_index_repr_period_ordered(self):
idx = period_range("2011-01-01 09:00", freq="H", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = period_range("2011-01", freq="M", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
def test_categorical_index_repr_timedelta(self):
idx = timedelta_range("1 days", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = timedelta_range("1 hours", periods=10)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=False, dtype='category')""" # noqa:E501
assert repr(i) == exp
def test_categorical_index_repr_timedelta_ordered(self):
idx = timedelta_range("1 days", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
idx = timedelta_range("1 hours", periods=10)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa:E501
assert repr(i) == exp
def test_categorical_str_repr(self):
# GH 33676
result = repr(Categorical([1, "2", 3, 4]))
expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']"
assert result == expected

View File

@ -0,0 +1,129 @@
import numpy as np
import pytest
from pandas import (
Categorical,
Index,
)
import pandas._testing as tm
class TestCategoricalSort:
def test_argsort(self):
c = Categorical([5, 3, 1, 4, 2], ordered=True)
expected = np.array([2, 4, 1, 3, 0])
tm.assert_numpy_array_equal(
c.argsort(ascending=True), expected, check_dtype=False
)
expected = expected[::-1]
tm.assert_numpy_array_equal(
c.argsort(ascending=False), expected, check_dtype=False
)
def test_numpy_argsort(self):
c = Categorical([5, 3, 1, 4, 2], ordered=True)
expected = np.array([2, 4, 1, 3, 0])
tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False)
tm.assert_numpy_array_equal(
np.argsort(c, kind="mergesort"), expected, check_dtype=False
)
msg = "the 'axis' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.argsort(c, axis=0)
msg = "the 'order' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.argsort(c, order="C")
def test_sort_values(self):
# unordered cats are sortable
cat = Categorical(["a", "b", "b", "a"], ordered=False)
cat.sort_values()
cat = Categorical(["a", "c", "b", "d"], ordered=True)
# sort_values
res = cat.sort_values()
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
cat = Categorical(
["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True
)
res = cat.sort_values()
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
res = cat.sort_values(ascending=False)
exp = np.array(["d", "c", "b", "a"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
# sort (inplace order)
cat1 = cat.copy()
orig_codes = cat1._codes
cat1.sort_values(inplace=True)
assert cat1._codes is orig_codes
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(cat1.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
# reverse
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
res = cat.sort_values(ascending=False)
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)
def test_sort_values_na_position(self):
# see gh-12882
cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
exp_categories = Index([2, 5])
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
res = cat.sort_values() # default arguments
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
res = cat.sort_values(ascending=True, na_position="first")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
res = cat.sort_values(ascending=False, na_position="first")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
res = cat.sort_values(ascending=True, na_position="last")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
res = cat.sort_values(ascending=False, na_position="last")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position="last")
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position="first")
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)

View File

@ -0,0 +1,22 @@
from pandas import Categorical
import pandas._testing as tm
class TestCategoricalSubclassing:
def test_constructor(self):
sc = tm.SubclassedCategorical(["a", "b", "c"])
assert isinstance(sc, tm.SubclassedCategorical)
tm.assert_categorical_equal(sc, Categorical(["a", "b", "c"]))
def test_from_codes(self):
sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"])
assert isinstance(sc, tm.SubclassedCategorical)
exp = Categorical.from_codes([1, 0, 2], ["a", "b", "c"])
tm.assert_categorical_equal(sc, exp)
def test_map(self):
sc = tm.SubclassedCategorical(["a", "b", "c"])
res = sc.map(lambda x: x.upper())
assert isinstance(res, tm.SubclassedCategorical)
exp = Categorical(["A", "B", "C"])
tm.assert_categorical_equal(res, exp)

View File

@ -0,0 +1,95 @@
import numpy as np
import pytest
from pandas import (
Categorical,
Index,
)
import pandas._testing as tm
class TestTake:
# https://github.com/pandas-dev/pandas/issues/20664
def test_take_default_allow_fill(self):
cat = Categorical(["a", "b"])
with tm.assert_produces_warning(None):
result = cat.take([0, -1])
assert result.equals(cat)
def test_take_positive_no_warning(self):
cat = Categorical(["a", "b"])
with tm.assert_produces_warning(None):
cat.take([0, 0])
def test_take_bounds(self, allow_fill):
# https://github.com/pandas-dev/pandas/issues/20664
cat = Categorical(["a", "b", "a"])
if allow_fill:
msg = "indices are out-of-bounds"
else:
msg = "index 4 is out of bounds for( axis 0 with)? size 3"
with pytest.raises(IndexError, match=msg):
cat.take([4, 5], allow_fill=allow_fill)
def test_take_empty(self, allow_fill):
# https://github.com/pandas-dev/pandas/issues/20664
cat = Categorical([], categories=["a", "b"])
if allow_fill:
msg = "indices are out-of-bounds"
else:
msg = "cannot do a non-empty take from an empty axes"
with pytest.raises(IndexError, match=msg):
cat.take([0], allow_fill=allow_fill)
def test_positional_take(self, ordered):
cat = Categorical(["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered)
result = cat.take([0, 1, 2], allow_fill=False)
expected = Categorical(
["a", "a", "b"], categories=cat.categories, ordered=ordered
)
tm.assert_categorical_equal(result, expected)
def test_positional_take_unobserved(self, ordered):
cat = Categorical(["a", "b"], categories=["a", "b", "c"], ordered=ordered)
result = cat.take([1, 0], allow_fill=False)
expected = Categorical(["b", "a"], categories=cat.categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_take_allow_fill(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "a", "b"])
result = cat.take([0, -1, -1], allow_fill=True)
expected = Categorical(["a", np.nan, np.nan], categories=["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_take_fill_with_negative_one(self):
# -1 was a category
cat = Categorical([-1, 0, 1])
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
expected = Categorical([-1, -1, 0], categories=[-1, 0, 1])
tm.assert_categorical_equal(result, expected)
def test_take_fill_value(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "b", "c"])
result = cat.take([0, 1, -1], fill_value="a", allow_fill=True)
expected = Categorical(["a", "b", "a"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
def test_take_fill_value_new_raises(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "b", "c"])
xpr = r"Cannot setitem on a Categorical with a new category \(d\)"
with pytest.raises(TypeError, match=xpr):
cat.take([0, 1, -1], fill_value="d", allow_fill=True)
def test_take_nd_deprecated(self):
cat = Categorical(["a", "b", "c"])
with tm.assert_produces_warning(FutureWarning):
cat.take_nd([0, 1])
ci = Index(cat)
with tm.assert_produces_warning(FutureWarning):
ci.take_nd([0, 1])

View File

@ -0,0 +1,22 @@
import pytest
from pandas.util._test_decorators import async_mark
import pandas._testing as tm
class TestCategoricalWarnings:
@async_mark()
async def test_tab_complete_warning(self, ip):
# https://github.com/pandas-dev/pandas/issues/16409
pytest.importorskip("IPython", minversion="6.0.0")
from IPython.core.completer import provisionalcompleter
code = "import pandas as pd; c = pd.Categorical([])"
await ip.run_code(code)
# GH 31324 newer jedi version raises Deprecation warning;
# appears resolved 2021-02-02
with tm.assert_produces_warning(None):
with provisionalcompleter("ignore"):
list(ip.Completer.completions("c.", 1))

View File

@ -0,0 +1,156 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
from pandas.core.arrays.datetimes import _sequence_to_dt64ns
class TestDatetimeArrayConstructor:
def test_from_sequence_invalid_type(self):
mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)])
with pytest.raises(TypeError, match="Cannot create a DatetimeArray"):
DatetimeArray._from_sequence(mi)
def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")
with pytest.raises(ValueError, match="Only 1-dimensional"):
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
DatetimeArray(arr.reshape(2, 2, 1))
with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
DatetimeArray(arr[[0]].squeeze())
def test_freq_validation(self):
# GH#24623 check that invalid instances cannot be created with the
# public constructor
arr = np.arange(5, dtype=np.int64) * 3600 * 10**9
msg = (
"Inferred frequency H from passed values does not "
"conform to passed frequency W-SUN"
)
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr, freq="W")
@pytest.mark.parametrize(
"meth",
[
DatetimeArray._from_sequence,
_sequence_to_dt64ns,
pd.to_datetime,
pd.DatetimeIndex,
],
)
def test_mixing_naive_tzaware_raises(self, meth):
# GH#24569
arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")])
msg = (
"Cannot mix tz-aware with tz-naive values|"
"Tz-aware datetime.datetime cannot be converted "
"to datetime64 unless utc=True"
)
for obj in [arr, arr[::-1]]:
# check that we raise regardless of whether naive is found
# before aware or vice-versa
with pytest.raises(ValueError, match=msg):
meth(obj)
def test_from_pandas_array(self):
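# the integers are nanoseconds at hourly spacing, so freq "H" should be inferred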
arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9
result = DatetimeArray._from_sequence(arr)._with_freq("infer")
expected = pd.date_range("1970-01-01", periods=5, freq="H")._data
tm.assert_datetime_array_equal(result, expected)
def test_mismatched_timezone_raises(self):
arr = DatetimeArray(
np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"),
dtype=DatetimeTZDtype(tz="US/Central"),
)
dtype = DatetimeTZDtype(tz="US/Eastern")
with pytest.raises(TypeError, match="Timezone of the array"):
DatetimeArray(arr, dtype=dtype)
def test_non_array_raises(self):
with pytest.raises(ValueError, match="list"):
DatetimeArray([1, 2, 3])
def test_bool_dtype_raises(self):
arr = np.array([1, 2, 3], dtype="bool")
with pytest.raises(
ValueError, match="The dtype of 'values' is incorrect.*bool"
):
DatetimeArray(arr)
msg = r"dtype bool cannot be converted to datetime64\[ns\]"
with pytest.raises(TypeError, match=msg):
DatetimeArray._from_sequence(arr)
with pytest.raises(TypeError, match=msg):
_sequence_to_dt64ns(arr)
with pytest.raises(TypeError, match=msg):
pd.DatetimeIndex(arr)
with pytest.raises(TypeError, match=msg):
pd.to_datetime(arr)
def test_incorrect_dtype_raises(self):
with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category")
def test_freq_infer_raises(self):
with pytest.raises(ValueError, match="Frequency inference"):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer")
def test_copy(self):
data = np.array([1, 2, 3], dtype="M8[ns]")
arr = DatetimeArray(data, copy=False)
assert arr._data is data
arr = DatetimeArray(data, copy=True)
assert arr._data is not data
class TestSequenceToDT64NS:
def test_tz_dtype_mismatch_raises(self):
arr = DatetimeArray._from_sequence(
["2000"], dtype=DatetimeTZDtype(tz="US/Central")
)
with pytest.raises(TypeError, match="data is already tz-aware"):
_sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="UTC"))
def test_tz_dtype_matches(self):
arr = DatetimeArray._from_sequence(
["2000"], dtype=DatetimeTZDtype(tz="US/Central")
)
result, _, _ = _sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="US/Central"))
tm.assert_numpy_array_equal(arr._data, result)
@pytest.mark.parametrize("order", ["F", "C"])
def test_2d(self, order):
dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
arr = np.array(dti, dtype=object).reshape(3, 2)
if order == "F":
arr = arr.T
res = _sequence_to_dt64ns(arr)
expected = _sequence_to_dt64ns(arr.ravel())
tm.assert_numpy_array_equal(res[0].ravel(), expected[0])
assert res[1] == expected[1]
assert res[2] == expected[2]
res = DatetimeArray._from_sequence(arr)
expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape)
tm.assert_datetime_array_equal(res, expected)

View File

@ -0,0 +1,175 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas import NaT
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestReductions:
@pytest.fixture
def arr1d(self, tz_naive_fixture):
tz = tz_naive_fixture
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence(
[
"2000-01-03",
"2000-01-03",
"NaT",
"2000-01-02",
"2000-01-05",
"2000-01-04",
],
dtype=dtype,
)
return arr
def test_min_max(self, arr1d):
arr = arr1d
tz = arr.tz
result = arr.min()
expected = pd.Timestamp("2000-01-02", tz=tz)
assert result == expected
result = arr.max()
expected = pd.Timestamp("2000-01-05", tz=tz)
assert result == expected
result = arr.min(skipna=False)
assert result is NaT
result = arr.max(skipna=False)
assert result is NaT
@pytest.mark.parametrize("tz", [None, "US/Central"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_empty(self, skipna, tz):
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence([], dtype=dtype)
result = arr.min(skipna=skipna)
assert result is NaT
result = arr.max(skipna=skipna)
assert result is NaT
@pytest.mark.parametrize("tz", [None, "US/Central"])
@pytest.mark.parametrize("skipna", [True, False])
def test_median_empty(self, skipna, tz):
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence([], dtype=dtype)
result = arr.median(skipna=skipna)
assert result is NaT
arr = arr.reshape(0, 3)
result = arr.median(axis=0, skipna=skipna)
expected = type(arr)._from_sequence([NaT, NaT, NaT], dtype=arr.dtype)
tm.assert_equal(result, expected)
result = arr.median(axis=1, skipna=skipna)
expected = type(arr)._from_sequence([], dtype=arr.dtype)
tm.assert_equal(result, expected)
def test_median(self, arr1d):
arr = arr1d
result = arr.median()
assert result == arr[0]
result = arr.median(skipna=False)
assert result is NaT
result = arr.dropna().median(skipna=False)
assert result == arr[0]
result = arr.median(axis=0)
assert result == arr[0]
def test_median_axis(self, arr1d):
arr = arr1d
assert arr.median(axis=0) == arr.median()
assert arr.median(axis=0, skipna=False) is NaT
msg = r"abs\(axis\) must be less than ndim"
with pytest.raises(ValueError, match=msg):
arr.median(axis=1)
@pytest.mark.filterwarnings("ignore:All-NaN slice encountered:RuntimeWarning")
def test_median_2d(self, arr1d):
arr = arr1d.reshape(1, -1)
# axis = None
assert arr.median() == arr1d.median()
assert arr.median(skipna=False) is NaT
# axis = 0
result = arr.median(axis=0)
expected = arr1d
tm.assert_equal(result, expected)
# Since column 3 is all-NaT, we get NaT there with or without skipna
result = arr.median(axis=0, skipna=False)
expected = arr1d
tm.assert_equal(result, expected)
# axis = 1
result = arr.median(axis=1)
expected = type(arr)._from_sequence([arr1d.median()])
tm.assert_equal(result, expected)
result = arr.median(axis=1, skipna=False)
expected = type(arr)._from_sequence([NaT], dtype=arr.dtype)
tm.assert_equal(result, expected)
def test_mean(self, arr1d):
arr = arr1d
# manually verified result
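# the five non-NaT values are Jan 2, 3, 3, 4, 5, so the mean is Jan 3 + 0.4 days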
expected = arr[0] + 0.4 * pd.Timedelta(days=1)
result = arr.mean()
assert result == expected
result = arr.mean(skipna=False)
assert result is NaT
result = arr.dropna().mean(skipna=False)
assert result == expected
result = arr.mean(axis=0)
assert result == expected
def test_mean_2d(self):
dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
dta = dti._data.reshape(3, 2)
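# rows are [d0, d1], [d2, d3], [d4, d5]: the column means equal the middle
# row, and each row mean sits 12 hours past the row's first entry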
result = dta.mean(axis=0)
expected = dta[1]
tm.assert_datetime_array_equal(result, expected)
result = dta.mean(axis=1)
expected = dta[:, 0] + pd.Timedelta(hours=12)
tm.assert_datetime_array_equal(result, expected)
result = dta.mean(axis=None)
expected = dti.mean()
assert result == expected
@pytest.mark.parametrize("skipna", [True, False])
def test_mean_empty(self, arr1d, skipna):
arr = arr1d[:0]
assert arr.mean(skipna=skipna) is NaT
arr2d = arr.reshape(0, 3)
result = arr2d.mean(axis=0, skipna=skipna)
expected = DatetimeArray._from_sequence([NaT, NaT, NaT], dtype=arr.dtype)
tm.assert_datetime_array_equal(result, expected)
result = arr2d.mean(axis=1, skipna=skipna)
expected = arr # i.e. 1D, empty
tm.assert_datetime_array_equal(result, expected)
result = arr2d.mean(axis=None, skipna=skipna)
assert result is NaT

View File

@ -0,0 +1,39 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
@pytest.fixture(params=[Float32Dtype, Float64Dtype])
def dtype(request):
return request.param()
@pytest.fixture
def data(dtype):
return pd.array(
list(np.arange(0.1, 0.9, 0.1))
+ [pd.NA]
+ list(np.arange(1, 9.8, 0.1))
+ [pd.NA]
+ [9.9, 10.0],
dtype=dtype,
)
@pytest.fixture
def data_missing(dtype):
return pd.array([np.nan, 0.1], dtype=dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
"""Parametrized fixture giving 'data' and 'data_missing'"""
if request.param == "data":
return data
elif request.param == "data_missing":
return data_missing

View File

@ -0,0 +1,203 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
"opname, exp",
[
("add", [1.1, 2.2, None, None, 5.5]),
("mul", [0.1, 0.4, None, None, 2.5]),
("sub", [0.9, 1.8, None, None, 4.5]),
("truediv", [10.0, 10.0, None, None, 10.0]),
("floordiv", [9.0, 9.0, None, None, 10.0]),
("mod", [0.1, 0.2, None, None, 0.0]),
],
ids=["add", "mul", "sub", "div", "floordiv", "mod"],
)
def test_array_op(dtype, opname, exp):
a = pd.array([1.0, 2.0, None, 4.0, 5.0], dtype=dtype)
b = pd.array([0.1, 0.2, 0.3, None, 0.5], dtype=dtype)
op = getattr(operator, opname)
result = op(a, b)
expected = pd.array(exp, dtype=dtype)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(dtype, zero, negative):
# TODO pending NA/NaN discussion
# https://github.com/pandas-dev/pandas/issues/32265/
a = pd.array([0, 1, -1, None], dtype=dtype)
result = a / zero
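# FloatingArray(values, mask) is constructed directly; mask=True marks a
# missing entry, while 0/0 gives NaN and ±1/0 gives ±inf in the values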
expected = FloatingArray(
np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype),
np.array([False, False, False, True]),
)
if negative:
expected *= -1
tm.assert_extension_array_equal(result, expected)
def test_pow_scalar(dtype):
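# by convention x**0 == 1 and 1**x == 1 even when the other operand is NA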
a = pd.array([-1, 0, 1, None, 2], dtype=dtype)
result = a**0
expected = pd.array([1, 1, 1, 1, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**1
expected = pd.array([-1, 0, 1, None, 2], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**pd.NA
expected = pd.array([None, None, 1, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**np.nan
# TODO np.nan should be converted to pd.NA / missing before operation?
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype),
mask=a._mask,
)
tm.assert_extension_array_equal(result, expected)
# reversed
a = a[1:] # Can't raise integers to negative powers.
result = 0**a
expected = pd.array([1, 0, None, 0], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = 1**a
expected = pd.array([1, 1, 1, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = pd.NA**a
expected = pd.array([1, None, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = np.nan**a
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask
)
tm.assert_extension_array_equal(result, expected)
def test_pow_array(dtype):
a = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype=dtype)
b = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype=dtype)
result = a**b
expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_rpow_one_to_na():
# https://github.com/pandas-dev/pandas/issues/22022
# https://github.com/pandas-dev/pandas/issues/29997
arr = pd.array([np.nan, np.nan], dtype="Float64")
result = np.array([1.0, 2.0]) ** arr
expected = pd.array([1.0, np.nan], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("other", [0, 0.5])
def test_arith_zero_dim_ndarray(other):
arr = pd.array([1, None, 2], dtype="Float64")
result = arr + np.array(other)
expected = arr + other
tm.assert_equal(result, expected)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
# invalid scalars
msg = (
r"(:?can only perform ops with numeric values)"
r"|(:?FloatingArray cannot perform the operation mod)"
)
with pytest.raises(TypeError, match=msg):
ops("foo")
with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
with pytest.raises(TypeError, match=msg):
ops(pd.Series("foo", index=s.index))
msg = "|".join(
[
"can only perform ops with numeric values",
"cannot perform .* with this index type: DatetimeArray",
"Addition/subtraction of integers and integer-arrays "
"with DatetimeArray is no longer supported. *",
]
)
with pytest.raises(TypeError, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
# Various
# -----------------------------------------------------------------------------
def test_cross_type_arithmetic():
df = pd.DataFrame(
{
"A": pd.array([1, 2, np.nan], dtype="Float64"),
"B": pd.array([1, np.nan, 3], dtype="Float32"),
"C": np.array([1, 2, 3], dtype="float64"),
}
)
result = df.A + df.C
expected = pd.Series([2, 4, np.nan], dtype="Float64")
tm.assert_series_equal(result, expected)
result = (df.A + df.C) * 3 == 12
expected = pd.Series([False, True, None], dtype="boolean")
tm.assert_series_equal(result, expected)
result = df.A + df.B
expected = pd.Series([2, np.nan, np.nan], dtype="Float64")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"source, neg_target, abs_target",
[
([1.1, 2.2, 3.3], [-1.1, -2.2, -3.3], [1.1, 2.2, 3.3]),
([1.1, 2.2, None], [-1.1, -2.2, None], [1.1, 2.2, None]),
([-1.1, 0.0, 1.1], [1.1, 0.0, -1.1], [1.1, 0.0, 1.1]),
],
)
def test_unary_float_operators(float_ea_dtype, source, neg_target, abs_target):
# GH38794
dtype = float_ea_dtype
arr = pd.array(source, dtype=dtype)
neg_result, pos_result, abs_result = -arr, +arr, abs(arr)
neg_target = pd.array(neg_target, dtype=dtype)
abs_target = pd.array(abs_target, dtype=dtype)
tm.assert_extension_array_equal(neg_result, neg_target)
tm.assert_extension_array_equal(pos_result, arr)
assert not tm.shares_memory(pos_result, arr)
tm.assert_extension_array_equal(abs_result, abs_target)
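# Unary + is expected to return a copy with identical values rather than a
# view of the original data, which is what the shares_memory check verifies.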

View File

@ -0,0 +1,118 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_astype():
# with missing values
arr = pd.array([0.1, 0.2, None], dtype="Float64")
with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype NumPy"):
arr.astype("int64")
with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype NumPy"):
arr.astype("bool")
result = arr.astype("float64")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
# no missing values
arr = pd.array([0.0, 1.0, 0.5], dtype="Float64")
result = arr.astype("int64")
expected = np.array([0, 1, 0], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
result = arr.astype("bool")
expected = np.array([False, True, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
def test_astype_to_floating_array():
# astype to FloatingArray
arr = pd.array([0.0, 1.0, None], dtype="Float64")
result = arr.astype("Float64")
tm.assert_extension_array_equal(result, arr)
result = arr.astype(pd.Float64Dtype())
tm.assert_extension_array_equal(result, arr)
result = arr.astype("Float32")
expected = pd.array([0.0, 1.0, None], dtype="Float32")
tm.assert_extension_array_equal(result, expected)
def test_astype_to_boolean_array():
# astype to BooleanArray
arr = pd.array([0.0, 1.0, None], dtype="Float64")
result = arr.astype("boolean")
expected = pd.array([False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = arr.astype(pd.BooleanDtype())
tm.assert_extension_array_equal(result, expected)
def test_astype_to_integer_array():
# astype to IntegerArray
arr = pd.array([0.0, 1.5, None], dtype="Float64")
result = arr.astype("Int64")
expected = pd.array([0, 1, None], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
def test_astype_str():
a = pd.array([0.1, 0.2, None], dtype="Float64")
expected = np.array(["0.1", "0.2", "<NA>"], dtype=object)
tm.assert_numpy_array_equal(a.astype(str), expected)
tm.assert_numpy_array_equal(a.astype("str"), expected)
def test_astype_copy():
arr = pd.array([0.1, 0.2, None], dtype="Float64")
orig = pd.array([0.1, 0.2, None], dtype="Float64")
# copy=True -> ensure both data and mask are actual copies
result = arr.astype("Float64", copy=True)
assert result is not arr
assert not tm.shares_memory(result, arr)
result[0] = 10
tm.assert_extension_array_equal(arr, orig)
result[0] = pd.NA
tm.assert_extension_array_equal(arr, orig)
# copy=False
result = arr.astype("Float64", copy=False)
assert result is arr
assert np.shares_memory(result._data, arr._data)
assert np.shares_memory(result._mask, arr._mask)
result[0] = 10
assert arr[0] == 10
result[0] = pd.NA
assert arr[0] is pd.NA
# astype to different dtype -> always needs a copy -> even with copy=False
# we need to ensure that also the mask is actually copied
arr = pd.array([0.1, 0.2, None], dtype="Float64")
orig = pd.array([0.1, 0.2, None], dtype="Float64")
result = arr.astype("Float32", copy=False)
assert not tm.shares_memory(result, arr)
result[0] = 10
tm.assert_extension_array_equal(arr, orig)
result[0] = pd.NA
tm.assert_extension_array_equal(arr, orig)
def test_astype_object(dtype):
arr = pd.array([1.0, pd.NA], dtype=dtype)
result = arr.astype(object)
expected = np.array([1.0, pd.NA], dtype=object)
tm.assert_numpy_array_equal(result, expected)
# check exact element types
assert isinstance(result[0], float)
assert result[1] is pd.NA

View File

@ -0,0 +1,65 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.tests.arrays.masked_shared import (
ComparisonOps,
NumericOps,
)
class TestComparisonOps(NumericOps, ComparisonOps):
@pytest.mark.parametrize("other", [True, False, pd.NA, -1.0, 0.0, 1])
def test_scalar(self, other, comparison_op, dtype):
ComparisonOps.test_scalar(self, other, comparison_op, dtype)
def test_compare_with_integerarray(self, comparison_op):
op = comparison_op
a = pd.array([0, 1, None] * 3, dtype="Int64")
b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64")
other = b.astype("Int64")
expected = op(a, other)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
expected = op(other, a)
result = op(b, a)
tm.assert_extension_array_equal(result, expected)
def test_equals():
# GH-30652
# equals is generally tested in /tests/extension/base/methods, but this
# specifically tests that two arrays of the same class but different dtype
# do not evaluate equal
a1 = pd.array([1, 2, None], dtype="Float64")
a2 = pd.array([1, 2, None], dtype="Float32")
assert a1.equals(a2) is False
def test_equals_nan_vs_na():
# GH#44382
mask = np.zeros(3, dtype=bool)
data = np.array([1.0, np.nan, 3.0], dtype=np.float64)
left = FloatingArray(data, mask)
assert left.equals(left)
tm.assert_extension_array_equal(left, left)
assert left.equals(left.copy())
assert left.equals(FloatingArray(data.copy(), mask.copy()))
mask2 = np.array([False, True, False], dtype=bool)
data2 = np.array([1.0, 2.0, 3.0], dtype=np.float64)
right = FloatingArray(data2, mask2)
assert right.equals(right)
tm.assert_extension_array_equal(right, right)
assert not left.equals(right)
# with mask[1] = True, the only difference is data[1], which should
# not matter for equals
mask[1] = True
assert left.equals(right)

View File

@ -0,0 +1,21 @@
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
"to_concat_dtypes, result_dtype",
[
(["Float64", "Float64"], "Float64"),
(["Float32", "Float64"], "Float64"),
(["Float32", "Float32"], "Float32"),
],
)
def test_concat_series(to_concat_dtypes, result_dtype):
result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes])
expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype(
result_dtype
)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,197 @@
import numpy as np
import pytest
from pandas.compat import np_version_under1p20
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
def test_uses_pandas_na():
a = pd.array([1, None], dtype=Float64Dtype())
assert a[1] is pd.NA
def test_floating_array_constructor():
values = np.array([1, 2, 3, 4], dtype="float64")
mask = np.array([False, False, False, True], dtype="bool")
result = FloatingArray(values, mask)
expected = pd.array([1, 2, 3, np.nan], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
tm.assert_numpy_array_equal(result._data, values)
tm.assert_numpy_array_equal(result._mask, mask)
msg = r".* should be .* numpy array. Use the 'pd.array' function instead"
with pytest.raises(TypeError, match=msg):
FloatingArray(values.tolist(), mask)
with pytest.raises(TypeError, match=msg):
FloatingArray(values, mask.tolist())
with pytest.raises(TypeError, match=msg):
FloatingArray(values.astype(int), mask)
msg = r"__init__\(\) missing 1 required positional argument: 'mask'"
with pytest.raises(TypeError, match=msg):
FloatingArray(values)
def test_floating_array_disallows_float16():
# GH#44715
arr = np.array([1, 2], dtype=np.float16)
mask = np.array([False, False])
msg = "FloatingArray does not support np.float16 dtype"
with pytest.raises(TypeError, match=msg):
FloatingArray(arr, mask)
def test_floating_array_disallows_Float16_dtype(request):
# GH#44715
if np_version_under1p20:
# https://github.com/numpy/numpy/issues/20512
mark = pytest.mark.xfail(reason="numpy does not raise on np.dtype('Float16')")
request.node.add_marker(mark)
with pytest.raises(TypeError, match="data type 'Float16' not understood"):
pd.array([1.0, 2.0], dtype="Float16")
def test_floating_array_constructor_copy():
values = np.array([1, 2, 3, 4], dtype="float64")
mask = np.array([False, False, False, True], dtype="bool")
result = FloatingArray(values, mask)
assert result._data is values
assert result._mask is mask
result = FloatingArray(values, mask, copy=True)
assert result._data is not values
assert result._mask is not mask
def test_to_array():
result = pd.array([0.1, 0.2, 0.3, 0.4])
expected = pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"a, b",
[
([1, None], [1, pd.NA]),
([None], [pd.NA]),
([None, np.nan], [pd.NA, pd.NA]),
([1, np.nan], [1, pd.NA]),
([np.nan], [pd.NA]),
],
)
def test_to_array_none_is_nan(a, b):
result = pd.array(a, dtype="Float64")
expected = pd.array(b, dtype="Float64")
tm.assert_extension_array_equal(result, expected)
def test_to_array_mixed_integer_float():
result = pd.array([1, 2.0])
expected = pd.array([1.0, 2.0], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
result = pd.array([1, None, 2.0])
expected = pd.array([1.0, None, 2.0], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
["foo", "bar"],
["1", "2"],
"foo",
1,
1.0,
pd.date_range("20130101", periods=2),
np.array(["foo"]),
[[1, 2], [3, 4]],
[np.nan, {"a": 1}],
# GH#44514 all-NA case used to get quietly swapped out before checking ndim
np.array([pd.NA] * 6, dtype=object).reshape(3, 2),
],
)
def test_to_array_error(values):
# error in converting existing arrays to FloatingArray
msg = "|".join(
[
"cannot be converted to a FloatingDtype",
"values must be a 1D list-like",
"Cannot pass scalar",
]
)
with pytest.raises((TypeError, ValueError), match=msg):
pd.array(values, dtype="Float64")
def test_to_array_inferred_dtype():
# if values has dtype -> respect it
result = pd.array(np.array([1, 2], dtype="float32"))
assert result.dtype == Float32Dtype()
# if values have no dtype -> always float64
result = pd.array([1.0, 2.0])
assert result.dtype == Float64Dtype()
def test_to_array_dtype_keyword():
result = pd.array([1, 2], dtype="Float32")
assert result.dtype == Float32Dtype()
# if values has dtype -> override it
result = pd.array(np.array([1, 2], dtype="float32"), dtype="Float64")
assert result.dtype == Float64Dtype()
def test_to_array_integer():
result = pd.array([1, 2], dtype="Float64")
expected = pd.array([1.0, 2.0], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
# for integer dtypes, the itemsize is not preserved
# TODO can we specify "floating" in general?
result = pd.array(np.array([1, 2], dtype="int32"), dtype="Float64")
assert result.dtype == Float64Dtype()
@pytest.mark.parametrize(
"bool_values, values, target_dtype, expected_dtype",
[
([False, True], [0, 1], Float64Dtype(), Float64Dtype()),
([False, True], [0, 1], "Float64", Float64Dtype()),
([False, True, np.nan], [0, 1, np.nan], Float64Dtype(), Float64Dtype()),
],
)
def test_to_array_bool(bool_values, values, target_dtype, expected_dtype):
result = pd.array(bool_values, dtype=target_dtype)
assert result.dtype == expected_dtype
expected = pd.array(values, dtype=target_dtype)
tm.assert_extension_array_equal(result, expected)
def test_series_from_float(data):
# construct from our dtype & string dtype
dtype = data.dtype
# from float
expected = pd.Series(data)
result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype))
tm.assert_series_equal(result, expected)
# from list
expected = pd.Series(data)
result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,192 @@
import numpy as np
import pytest
from pandas.compat import IS64
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("ufunc", [np.abs, np.sign])
# np.sign emits a warning with nans, <https://github.com/numpy/numpy/issues/15127>
@pytest.mark.filterwarnings("ignore:invalid value encountered in sign")
def test_ufuncs_single(ufunc):
a = pd.array([1, 2, -3, np.nan], dtype="Float64")
result = ufunc(a)
expected = pd.array(ufunc(a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
s = pd.Series(a)
result = ufunc(s)
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
a = pd.array([1.0, 0.2, 3.0, np.nan], dtype="Float64")
with np.errstate(invalid="ignore"):
result = ufunc(a)
expected = pd.array(ufunc(a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
s = pd.Series(a)
with np.errstate(invalid="ignore"):
result = ufunc(s)
expected = pd.Series(ufunc(s.astype(float)), dtype="Float64")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.add, np.subtract])
def test_ufuncs_binary_float(ufunc):
# two FloatingArrays
a = pd.array([1, 0.2, -3, np.nan], dtype="Float64")
result = ufunc(a, a)
expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
# FloatingArray with numpy array
arr = np.array([1, 2, 3, 4])
result = ufunc(a, arr)
expected = pd.array(ufunc(a.astype(float), arr), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
result = ufunc(arr, a)
expected = pd.array(ufunc(arr, a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
# FloatingArray with scalar
result = ufunc(a, 1)
expected = pd.array(ufunc(a.astype(float), 1), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
result = ufunc(1, a)
expected = pd.array(ufunc(1, a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("values", [[0, 1], [0, None]])
def test_ufunc_reduce_raises(values):
arr = pd.array(values, dtype="Float64")
res = np.add.reduce(arr)
expected = arr.sum(skipna=False)
tm.assert_almost_equal(res, expected)
@pytest.mark.skipif(not IS64, reason="GH 36579: fail on 32-bit system")
@pytest.mark.parametrize(
"pandasmethname, kwargs",
[
("var", {"ddof": 0}),
("var", {"ddof": 1}),
("kurtosis", {}),
("skew", {}),
("sem", {}),
],
)
def test_stat_method(pandasmethname, kwargs):
s = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, np.nan], dtype="Float64")
pandasmeth = getattr(s, pandasmethname)
result = pandasmeth(**kwargs)
s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64")
pandasmeth = getattr(s2, pandasmethname)
expected = pandasmeth(**kwargs)
assert expected == result
def test_value_counts_na():
arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = arr.value_counts(dropna=False)
idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype)
assert idx.dtype == arr.dtype
expected = pd.Series([2, 1, 1], index=idx, dtype="Int64")
tm.assert_series_equal(result, expected)
result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64")
tm.assert_series_equal(result, expected)
def test_value_counts_empty():
ser = pd.Series([], dtype="Float64")
result = ser.value_counts()
idx = pd.Index([], dtype="Float64")
assert idx.dtype == "Float64"
expected = pd.Series([], index=idx, dtype="Int64")
tm.assert_series_equal(result, expected)
def test_value_counts_with_normalize():
ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
assert expected.index.dtype == ser.dtype
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_floating_array_sum(skipna, min_count, dtype):
arr = pd.array([1, 2, 3, None], dtype=dtype)
result = arr.sum(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 6.0
else:
assert result is pd.NA
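# With min_count=4 the array has only three non-missing values, so the sum is
# pd.NA even when skipna=True; with skipna=False the single missing value
# already forces a pd.NA result.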
@pytest.mark.parametrize(
"values, expected", [([1, 2, 3], 6.0), ([1, 2, 3, None], 6.0), ([None], 0.0)]
)
def test_floating_array_numpy_sum(values, expected):
arr = pd.array(values, dtype="Float64")
result = np.sum(arr)
assert result == expected
@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
def test_preserve_dtypes(op):
df = pd.DataFrame(
{
"A": ["a", "b", "b"],
"B": [1, None, 3],
"C": pd.array([0.1, None, 3.0], dtype="Float64"),
}
)
# op
result = getattr(df.C, op)()
assert isinstance(result, np.float64)
# groupby
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame(
{"B": np.array([1.0, 3.0]), "C": pd.array([0.1, 3], dtype="Float64")},
index=pd.Index(["a", "b"], name="A"),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("method", ["min", "max"])
def test_floating_array_min_max(skipna, method, dtype):
arr = pd.array([0.0, 1.0, None], dtype=dtype)
func = getattr(arr, method)
result = func(skipna=skipna)
if skipna:
assert result == (0 if method == "min" else 1)
else:
assert result is pd.NA
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 9])
def test_floating_array_prod(skipna, min_count, dtype):
arr = pd.array([1.0, 2.0, None], dtype=dtype)
result = arr.prod(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 2
else:
assert result is pd.NA

View File

@ -0,0 +1,48 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
def test_dtypes(dtype):
# smoke tests on auto dtype construction
assert np.dtype(dtype.type).kind == "f"
assert dtype.name is not None
@pytest.mark.parametrize(
"dtype, expected",
[(Float32Dtype(), "Float32Dtype()"), (Float64Dtype(), "Float64Dtype()")],
)
def test_repr_dtype(dtype, expected):
assert repr(dtype) == expected
def test_repr_array():
result = repr(pd.array([1.0, None, 3.0]))
expected = "<FloatingArray>\n[1.0, <NA>, 3.0]\nLength: 3, dtype: Float64"
assert result == expected
def test_repr_array_long():
data = pd.array([1.0, 2.0, None] * 1000)
expected = """<FloatingArray>
[ 1.0, 2.0, <NA>, 1.0, 2.0, <NA>, 1.0, 2.0, <NA>, 1.0,
...
<NA>, 1.0, 2.0, <NA>, 1.0, 2.0, <NA>, 1.0, 2.0, <NA>]
Length: 3000, dtype: Float64"""
result = repr(data)
assert result == expected
def test_frame_repr(data_missing):
df = pd.DataFrame({"A": data_missing})
result = repr(df)
expected = " A\n0 <NA>\n1 0.1"
assert result == expected

View File

@ -0,0 +1,132 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
con = pd.Series if box else pd.array
# default (with or without missing values) -> object dtype
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, 0.3], dtype="object")
tm.assert_numpy_array_equal(result, expected)
arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_float(box):
con = pd.Series if box else pd.array
# no missing values -> can convert to float, otherwise raises
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy(dtype="float64")
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
arr = con([0.1, 0.2, None], dtype="Float64")
with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
result = arr.to_numpy(dtype="float64")
# need to explicitly specify na_value
result = arr.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
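# A typical round-trip pattern, as a minimal sketch using the array above:
#   pd.array([0.1, 0.2, None], dtype="Float64").to_numpy(dtype="float64", na_value=np.nan)
# returns array([0.1, 0.2, nan]) instead of raising.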
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_int(box):
con = pd.Series if box else pd.array
# no missing values -> can convert to int, otherwise raises
arr = con([1.0, 2.0, 3.0], dtype="Float64")
result = arr.to_numpy(dtype="int64")
expected = np.array([1, 2, 3], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
arr = con([1.0, 2.0, None], dtype="Float64")
with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
result = arr.to_numpy(dtype="int64")
# automatic casting (floors the values)
arr = con([0.1, 0.9, 1.1], dtype="Float64")
result = arr.to_numpy(dtype="int64")
expected = np.array([0, 0, 1], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_value(box):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
result = arr.to_numpy(dtype=object, na_value=None)
expected = np.array([0.0, 1.0, None], dtype="object")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype=bool, na_value=False)
expected = np.array([False, True, False], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="int64", na_value=-99)
expected = np.array([0, 1, -99], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_na_value_with_nan():
# array with both NaN and NA -> only fill NA with `na_value`
arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True]))
result = arr.to_numpy(dtype="float64", na_value=-1)
expected = np.array([0.0, np.nan, -1.0], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
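# Only positions that are masked (pd.NA) are replaced by `na_value`; a plain
# NaN stored in the underlying data is left untouched, which is the
# distinction exercised above.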
@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_dtype(box, dtype):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0], dtype="Float64")
result = arr.to_numpy(dtype=dtype)
expected = np.array([0, 1], dtype=dtype)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_raises(box, dtype):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
with pytest.raises(ValueError, match=dtype):
arr.to_numpy(dtype=dtype)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_string(box, dtype):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
result = arr.to_numpy(dtype="str")
expected = np.array([0.0, 1.0, pd.NA], dtype="<U32")
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_copy():
# to_numpy can be zero-copy if no missing values
arr = pd.array([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy(dtype="float64")
result[0] = 10
tm.assert_extension_array_equal(arr, pd.array([10, 0.2, 0.3], dtype="Float64"))
arr = pd.array([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy(dtype="float64", copy=True)
result[0] = 10
tm.assert_extension_array_equal(arr, pd.array([0.1, 0.2, 0.3], dtype="Float64"))
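# Because the no-missing-values case can be zero-copy, mutating the returned
# ndarray is reflected in the original array unless copy=True is passed.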

View File

@ -0,0 +1,52 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
@pytest.fixture(
params=[
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
]
)
def dtype(request):
return request.param()
@pytest.fixture
def data(dtype):
return pd.array(
list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100],
dtype=dtype,
)
@pytest.fixture
def data_missing(dtype):
return pd.array([np.nan, 1], dtype=dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
"""Parametrized fixture giving 'data' and 'data_missing'"""
if request.param == "data":
return data
elif request.param == "data_missing":
return data_missing

View File

@ -0,0 +1,303 @@
import operator
import numpy as np
import pytest
from pandas.compat import np_version_under1p20
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
import pandas.core.ops as ops
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
"opname, exp",
[("add", [1, 3, None, None, 9]), ("mul", [0, 2, None, None, 20])],
ids=["add", "mul"],
)
def test_add_mul(dtype, opname, exp):
a = pd.array([0, 1, None, 3, 4], dtype=dtype)
b = pd.array([1, 2, 3, None, 5], dtype=dtype)
# array / array
expected = pd.array(exp, dtype=dtype)
op = getattr(operator, opname)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
op = getattr(ops, "r" + opname)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
def test_sub(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a - b
expected = pd.array([1, 1, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_div(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a / b
expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(zero, negative):
# https://github.com/pandas-dev/pandas/issues/27398, GH#22793
a = pd.array([0, 1, -1, None], dtype="Int64")
result = a / zero
expected = FloatingArray(
np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"),
np.array([False, False, False, True]),
)
if negative:
expected *= -1
tm.assert_extension_array_equal(result, expected)
def test_floordiv(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a // b
# Series op sets 1//0 to np.inf, which IntegerArray does not do (yet)
expected = pd.array([0, 2, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_mod(dtype):
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a % b
expected = pd.array([0, 0, None, None, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_pow_scalar():
a = pd.array([-1, 0, 1, None, 2], dtype="Int64")
result = a**0
expected = pd.array([1, 1, 1, 1, 1], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
result = a**1
expected = pd.array([-1, 0, 1, None, 2], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
result = a**pd.NA
expected = pd.array([None, None, 1, None, None], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
result = a**np.nan
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
np.array([False, False, False, True, False]),
)
tm.assert_extension_array_equal(result, expected)
# reversed
a = a[1:] # Can't raise integers to negative powers.
result = 0**a
expected = pd.array([1, 0, None, 0], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
result = 1**a
expected = pd.array([1, 1, 1, 1], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
result = pd.NA**a
expected = pd.array([1, None, None, None], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
result = np.nan**a
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
np.array([False, False, True, False]),
)
tm.assert_extension_array_equal(result, expected)
def test_pow_array():
a = pd.array([0, 0, 0, 1, 1, 1, None, None, None])
b = pd.array([0, 1, None, 0, 1, None, 0, 1, None])
result = a**b
expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None])
tm.assert_extension_array_equal(result, expected)
def test_rpow_one_to_na():
# https://github.com/pandas-dev/pandas/issues/22022
# https://github.com/pandas-dev/pandas/issues/29997
arr = pd.array([np.nan, np.nan], dtype="Int64")
result = np.array([1.0, 2.0]) ** arr
expected = pd.array([1.0, np.nan], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("other", [0, 0.5])
def test_numpy_zero_dim_ndarray(other):
arr = pd.array([1, None, 2])
result = arr + np.array(other)
expected = arr + other
tm.assert_equal(result, expected)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
# invalid scalars
msg = (
r"(:?can only perform ops with numeric values)"
r"|(:?IntegerArray cannot perform the operation mod)"
)
with pytest.raises(TypeError, match=msg):
ops("foo")
with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
with pytest.raises(TypeError, match=msg):
ops(pd.Series("foo", index=s.index))
msg = "|".join(
[
"can only perform ops with numeric values",
"cannot perform .* with this index type: DatetimeArray",
"Addition/subtraction of integers and integer-arrays "
"with DatetimeArray is no longer supported. *",
]
)
with pytest.raises(TypeError, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
# Various
# -----------------------------------------------------------------------------
# TODO test unsigned overflow
def test_arith_coerce_scalar(data, all_arithmetic_operators):
op = tm.get_op_from_name(all_arithmetic_operators)
s = pd.Series(data)
other = 0.01
result = op(s, other)
expected = op(s.astype(float), other)
expected = expected.astype("Float64")
# rfloordiv results in nan instead of inf
if all_arithmetic_operators == "__rfloordiv__" and np_version_under1p20:
# for numpy 1.20 https://github.com/numpy/numpy/pull/16161
# updated floordiv, now matches our behavior defined in core.ops
mask = (
((expected == np.inf) | (expected == -np.inf)).fillna(False).to_numpy(bool)
)
expected.array._data[mask] = np.nan
# rmod results in NaN that wasn't NA in original nullable Series -> unmask it
elif all_arithmetic_operators == "__rmod__":
mask = (s == 0).fillna(False).to_numpy(bool)
expected.array._mask[mask] = False
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("other", [1.0, np.array(1.0)])
def test_arithmetic_conversion(all_arithmetic_operators, other):
# if we have a float operand we should have a float result
# if that is equal to an integer
op = tm.get_op_from_name(all_arithmetic_operators)
s = pd.Series([1, 2, 3], dtype="Int64")
result = op(s, other)
assert result.dtype == "Float64"
def test_cross_type_arithmetic():
df = pd.DataFrame(
{
"A": pd.Series([1, 2, np.nan], dtype="Int64"),
"B": pd.Series([1, np.nan, 3], dtype="UInt8"),
"C": [1, 2, 3],
}
)
result = df.A + df.C
expected = pd.Series([2, 4, np.nan], dtype="Int64")
tm.assert_series_equal(result, expected)
result = (df.A + df.C) * 3 == 12
expected = pd.Series([False, True, None], dtype="boolean")
tm.assert_series_equal(result, expected)
result = df.A + df.B
expected = pd.Series([2, np.nan, np.nan], dtype="Int64")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("op", ["mean"])
def test_reduce_to_float(op):
# some reduce ops always return float, even if the result
# is a rounded number
df = pd.DataFrame(
{
"A": ["a", "b", "b"],
"B": [1, None, 3],
"C": pd.array([1, None, 3], dtype="Int64"),
}
)
# op
result = getattr(df.C, op)()
assert isinstance(result, float)
# groupby
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame(
{"B": np.array([1.0, 3.0]), "C": pd.array([1, 3], dtype="Float64")},
index=pd.Index(["a", "b"], name="A"),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"source, neg_target, abs_target",
[
([1, 2, 3], [-1, -2, -3], [1, 2, 3]),
([1, 2, None], [-1, -2, None], [1, 2, None]),
([-1, 0, 1], [1, 0, -1], [1, 0, 1]),
],
)
def test_unary_int_operators(any_signed_int_ea_dtype, source, neg_target, abs_target):
dtype = any_signed_int_ea_dtype
arr = pd.array(source, dtype=dtype)
neg_result, pos_result, abs_result = -arr, +arr, abs(arr)
neg_target = pd.array(neg_target, dtype=dtype)
abs_target = pd.array(abs_target, dtype=dtype)
tm.assert_extension_array_equal(neg_result, neg_target)
tm.assert_extension_array_equal(pos_result, arr)
assert not tm.shares_memory(pos_result, arr)
tm.assert_extension_array_equal(abs_result, abs_target)

View File

@ -0,0 +1,38 @@
import pytest
import pandas as pd
from pandas.tests.arrays.masked_shared import (
ComparisonOps,
NumericOps,
)
class TestComparisonOps(NumericOps, ComparisonOps):
@pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1])
def test_scalar(self, other, comparison_op, dtype):
ComparisonOps.test_scalar(self, other, comparison_op, dtype)
def test_compare_to_int(self, dtype, comparison_op):
# GH 28930
op_name = f"__{comparison_op.__name__}__"
s1 = pd.Series([1, None, 3], dtype=dtype)
s2 = pd.Series([1, None, 3], dtype="float")
method = getattr(s1, op_name)
result = method(2)
method = getattr(s2, op_name)
expected = method(2).astype("boolean")
expected[s2.isna()] = pd.NA
self.assert_series_equal(result, expected)
def test_equals():
# GH-30652
# equals is generally tested in /tests/extension/base/methods, but this
# specifically tests that two arrays of the same class but different dtype
# do not evaluate equal
a1 = pd.array([1, 2, None], dtype="Int64")
a2 = pd.array([1, 2, None], dtype="Int32")
assert a1.equals(a2) is False

View File

@ -0,0 +1,65 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
"to_concat_dtypes, result_dtype",
[
(["Int64", "Int64"], "Int64"),
(["UInt64", "UInt64"], "UInt64"),
(["Int8", "Int8"], "Int8"),
(["Int8", "Int16"], "Int16"),
(["UInt8", "Int8"], "Int16"),
(["Int32", "UInt32"], "Int64"),
(["Int64", "UInt64"], "Float64"),
(["Int64", "boolean"], "Int64"),
(["UInt8", "boolean"], "UInt8"),
],
)
def test_concat_series(to_concat_dtypes, result_dtype):
result = pd.concat([pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes])
expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype(
result_dtype
)
tm.assert_series_equal(result, expected)
# order doesn't matter for result
result = pd.concat(
[pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes[::-1]]
)
expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype(
result_dtype
)
tm.assert_series_equal(result, expected)
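# These concat cases follow the common-dtype rules for the masked integer
# dtypes: mixing widths takes the wider type, mixing signed and unsigned of
# the same width promotes to the next wider signed type (e.g. UInt8 + Int8 ->
# Int16), and Int64 with UInt64 has no common integer dtype, so it falls back
# to Float64.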
@pytest.mark.parametrize(
"to_concat_dtypes, result_dtype",
[
(["Int64", "int64"], "Int64"),
(["UInt64", "uint64"], "UInt64"),
(["Int8", "int8"], "Int8"),
(["Int8", "int16"], "Int16"),
(["UInt8", "int8"], "Int16"),
(["Int32", "uint32"], "Int64"),
(["Int64", "uint64"], "Float64"),
(["Int64", "bool"], "Int64"),
(["UInt8", "bool"], "UInt8"),
],
)
def test_concat_series_with_numpy(to_concat_dtypes, result_dtype):
s1 = pd.Series([0, 1, pd.NA], dtype=to_concat_dtypes[0])
s2 = pd.Series(np.array([0, 1], dtype=to_concat_dtypes[1]))
result = pd.concat([s1, s2], ignore_index=True)
expected = pd.Series([0, 1, pd.NA, 0, 1], dtype=object).astype(result_dtype)
tm.assert_series_equal(result, expected)
# order doesn't matter for result
result = pd.concat([s2, s1], ignore_index=True)
expected = pd.Series([0, 1, 0, 1, pd.NA], dtype=object).astype(result_dtype)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,231 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_integer
from pandas.core.arrays import IntegerArray
from pandas.core.arrays.integer import (
Int8Dtype,
Int32Dtype,
Int64Dtype,
)
@pytest.fixture(params=[pd.array, IntegerArray._from_sequence])
def constructor(request):
return request.param
def test_uses_pandas_na():
a = pd.array([1, None], dtype=Int64Dtype())
assert a[1] is pd.NA
def test_from_dtype_from_float(data):
# construct from our dtype & string dtype
dtype = data.dtype
# from float
expected = pd.Series(data)
result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype))
tm.assert_series_equal(result, expected)
# from int / list
expected = pd.Series(data)
result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
tm.assert_series_equal(result, expected)
# from int / array
expected = pd.Series(data).dropna().reset_index(drop=True)
dropped = np.array(data.dropna()).astype(np.dtype(dtype.type))
result = pd.Series(dropped, dtype=str(dtype))
tm.assert_series_equal(result, expected)
def test_conversions(data_missing):
# astype to object series
df = pd.DataFrame({"A": data_missing})
result = df["A"].astype("object")
expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A")
tm.assert_series_equal(result, expected)
# convert to object ndarray
# we assert that we are exactly equal
# including type conversions of scalars
result = df["A"].astype("object").values
expected = np.array([pd.NA, 1], dtype=object)
tm.assert_numpy_array_equal(result, expected)
for r, e in zip(result, expected):
if pd.isnull(r):
assert pd.isnull(e)
elif is_integer(r):
assert r == e
assert is_integer(e)
else:
assert r == e
assert type(r) == type(e)
def test_integer_array_constructor():
values = np.array([1, 2, 3, 4], dtype="int64")
mask = np.array([False, False, False, True], dtype="bool")
result = IntegerArray(values, mask)
expected = pd.array([1, 2, 3, np.nan], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
msg = r".* should be .* numpy array. Use the 'pd.array' function instead"
with pytest.raises(TypeError, match=msg):
IntegerArray(values.tolist(), mask)
with pytest.raises(TypeError, match=msg):
IntegerArray(values, mask.tolist())
with pytest.raises(TypeError, match=msg):
IntegerArray(values.astype(float), mask)
msg = r"__init__\(\) missing 1 required positional argument: 'mask'"
with pytest.raises(TypeError, match=msg):
IntegerArray(values)
def test_integer_array_constructor_copy():
values = np.array([1, 2, 3, 4], dtype="int64")
mask = np.array([False, False, False, True], dtype="bool")
result = IntegerArray(values, mask)
assert result._data is values
assert result._mask is mask
result = IntegerArray(values, mask, copy=True)
assert result._data is not values
assert result._mask is not mask
@pytest.mark.parametrize(
"a, b",
[
([1, None], [1, np.nan]),
([None], [np.nan]),
([None, np.nan], [np.nan, np.nan]),
([np.nan, np.nan], [np.nan, np.nan]),
],
)
def test_to_integer_array_none_is_nan(a, b):
result = pd.array(a, dtype="Int64")
expected = pd.array(b, dtype="Int64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
["foo", "bar"],
"foo",
1,
1.0,
pd.date_range("20130101", periods=2),
np.array(["foo"]),
[[1, 2], [3, 4]],
[np.nan, {"a": 1}],
],
)
def test_to_integer_array_error(values):
# error in converting existing arrays to IntegerArrays
msg = "|".join(
[
r"cannot be converted to an IntegerDtype",
r"invalid literal for int\(\) with base 10:",
r"values must be a 1D list-like",
r"Cannot pass scalar",
]
)
with pytest.raises((ValueError, TypeError), match=msg):
pd.array(values, dtype="Int64")
with pytest.raises((ValueError, TypeError), match=msg):
IntegerArray._from_sequence(values)
def test_to_integer_array_inferred_dtype(constructor):
# if values has dtype -> respect it
result = constructor(np.array([1, 2], dtype="int8"))
assert result.dtype == Int8Dtype()
result = constructor(np.array([1, 2], dtype="int32"))
assert result.dtype == Int32Dtype()
# if values have no dtype -> always int64
result = constructor([1, 2])
assert result.dtype == Int64Dtype()
def test_to_integer_array_dtype_keyword(constructor):
result = constructor([1, 2], dtype="Int8")
assert result.dtype == Int8Dtype()
# if values has dtype -> override it
result = constructor(np.array([1, 2], dtype="int8"), dtype="Int32")
assert result.dtype == Int32Dtype()
def test_to_integer_array_float():
result = IntegerArray._from_sequence([1.0, 2.0])
expected = pd.array([1, 2], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
with pytest.raises(TypeError, match="cannot safely cast non-equivalent"):
IntegerArray._from_sequence([1.5, 2.0])
# for float dtypes, the itemsize is not preserved
result = IntegerArray._from_sequence(np.array([1.0, 2.0], dtype="float32"))
assert result.dtype == Int64Dtype()
def test_to_integer_array_str():
result = IntegerArray._from_sequence(["1", "2", None])
expected = pd.array([1, 2, np.nan], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
with pytest.raises(
ValueError, match=r"invalid literal for int\(\) with base 10: .*"
):
IntegerArray._from_sequence(["1", "2", ""])
with pytest.raises(
ValueError, match=r"invalid literal for int\(\) with base 10: .*"
):
IntegerArray._from_sequence(["1.5", "2.0"])
@pytest.mark.parametrize(
"bool_values, int_values, target_dtype, expected_dtype",
[
([False, True], [0, 1], Int64Dtype(), Int64Dtype()),
([False, True], [0, 1], "Int64", Int64Dtype()),
([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()),
],
)
def test_to_integer_array_bool(
constructor, bool_values, int_values, target_dtype, expected_dtype
):
result = constructor(bool_values, dtype=target_dtype)
assert result.dtype == expected_dtype
expected = pd.array(int_values, dtype=target_dtype)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"values, to_dtype, result_dtype",
[
(np.array([1], dtype="int64"), None, Int64Dtype),
(np.array([1, np.nan]), None, Int64Dtype),
(np.array([1, np.nan]), "int8", Int8Dtype),
],
)
def test_to_integer_array(values, to_dtype, result_dtype):
# convert existing arrays to IntegerArrays
result = IntegerArray._from_sequence(values, dtype=to_dtype)
assert result.dtype == result_dtype()
expected = pd.array(values, dtype=result_dtype())
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,297 @@
import numpy as np
import pytest
from pandas.core.dtypes.generic import ABCIndex
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.integer import (
Int8Dtype,
UInt32Dtype,
)
def test_dtypes(dtype):
# smoke tests on auto dtype construction
if dtype.is_signed_integer:
assert np.dtype(dtype.type).kind == "i"
else:
assert np.dtype(dtype.type).kind == "u"
assert dtype.name is not None
@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
def test_preserve_dtypes(op):
# TODO(#22346): preserve Int64 dtype
# for ops that enable (mean would actually work here
# but generally it is a float return value)
df = pd.DataFrame(
{
"A": ["a", "b", "b"],
"B": [1, None, 3],
"C": pd.array([1, None, 3], dtype="Int64"),
}
)
# op
result = getattr(df.C, op)()
if op in {"sum", "prod", "min", "max"}:
assert isinstance(result, np.int64)
else:
assert isinstance(result, int)
# groupby
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame(
{"B": np.array([1.0, 3.0]), "C": pd.array([1, 3], dtype="Int64")},
index=pd.Index(["a", "b"], name="A"),
)
tm.assert_frame_equal(result, expected)
def test_astype_nansafe():
# see gh-22343
arr = pd.array([np.nan, 1, 2], dtype="Int8")
msg = "cannot convert to 'uint32'-dtype NumPy array with missing values."
with pytest.raises(ValueError, match=msg):
arr.astype("uint32")
@pytest.mark.parametrize("dropna", [True, False])
def test_construct_index(all_data, dropna):
# ensure that we do not coerce to Float64Index, rather
# keep as Index
all_data = all_data[:10]
if dropna:
other = np.array(all_data[~all_data.isna()])
else:
other = all_data
result = pd.Index(pd.array(other, dtype=all_data.dtype))
expected = pd.Index(other, dtype=all_data.dtype)
assert all_data.dtype == expected.dtype # dont coerce to object
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("dropna", [True, False])
def test_astype_index(all_data, dropna):
# as an int/uint index to Index
all_data = all_data[:10]
if dropna:
other = all_data[~all_data.isna()]
else:
other = all_data
dtype = all_data.dtype
idx = pd.Index._with_infer(np.array(other))
assert isinstance(idx, ABCIndex)
result = idx.astype(dtype)
expected = idx.astype(object).astype(dtype)
tm.assert_index_equal(result, expected)
def test_astype(all_data):
all_data = all_data[:10]
ints = all_data[~all_data.isna()]
mixed = all_data
dtype = Int8Dtype()
# coerce to same type - ints
s = pd.Series(ints)
result = s.astype(all_data.dtype)
expected = pd.Series(ints)
tm.assert_series_equal(result, expected)
# coerce to same other - ints
s = pd.Series(ints)
result = s.astype(dtype)
expected = pd.Series(ints, dtype=dtype)
tm.assert_series_equal(result, expected)
# coerce to same numpy_dtype - ints
s = pd.Series(ints)
result = s.astype(all_data.dtype.numpy_dtype)
expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype))
tm.assert_series_equal(result, expected)
# coerce to same type - mixed
s = pd.Series(mixed)
result = s.astype(all_data.dtype)
expected = pd.Series(mixed)
tm.assert_series_equal(result, expected)
# coerce to same other - mixed
s = pd.Series(mixed)
result = s.astype(dtype)
expected = pd.Series(mixed, dtype=dtype)
tm.assert_series_equal(result, expected)
# coerce to same numpy_dtype - mixed
s = pd.Series(mixed)
msg = r"cannot convert to .*-dtype NumPy array with missing values.*"
with pytest.raises(ValueError, match=msg):
s.astype(all_data.dtype.numpy_dtype)
# coerce to object
s = pd.Series(mixed)
result = s.astype("object")
expected = pd.Series(np.asarray(mixed))
tm.assert_series_equal(result, expected)
def test_astype_copy():
arr = pd.array([1, 2, 3, None], dtype="Int64")
orig = pd.array([1, 2, 3, None], dtype="Int64")
# copy=True -> ensure both data and mask are actual copies
result = arr.astype("Int64", copy=True)
assert result is not arr
assert not tm.shares_memory(result, arr)
result[0] = 10
tm.assert_extension_array_equal(arr, orig)
result[0] = pd.NA
tm.assert_extension_array_equal(arr, orig)
# copy=False
result = arr.astype("Int64", copy=False)
assert result is arr
assert np.shares_memory(result._data, arr._data)
assert np.shares_memory(result._mask, arr._mask)
result[0] = 10
assert arr[0] == 10
result[0] = pd.NA
assert arr[0] is pd.NA
# astype to different dtype -> always needs a copy -> even with copy=False
# we need to ensure that also the mask is actually copied
arr = pd.array([1, 2, 3, None], dtype="Int64")
orig = pd.array([1, 2, 3, None], dtype="Int64")
result = arr.astype("Int32", copy=False)
assert not tm.shares_memory(result, arr)
result[0] = 10
tm.assert_extension_array_equal(arr, orig)
result[0] = pd.NA
tm.assert_extension_array_equal(arr, orig)
def test_astype_to_larger_numpy():
a = pd.array([1, 2], dtype="Int32")
result = a.astype("int64")
expected = np.array([1, 2], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
a = pd.array([1, 2], dtype="UInt32")
result = a.astype("uint64")
expected = np.array([1, 2], dtype="uint64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"])
def test_astype_specific_casting(dtype):
s = pd.Series([1, 2, 3], dtype="Int64")
result = s.astype(dtype)
expected = pd.Series([1, 2, 3], dtype=dtype)
tm.assert_series_equal(result, expected)
s = pd.Series([1, 2, 3, None], dtype="Int64")
result = s.astype(dtype)
expected = pd.Series([1, 2, 3, None], dtype=dtype)
tm.assert_series_equal(result, expected)
def test_astype_floating():
arr = pd.array([1, 2, None], dtype="Int64")
result = arr.astype("Float64")
expected = pd.array([1.0, 2.0, None], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
def test_astype_dt64():
# GH#32435
arr = pd.array([1, 2, 3, pd.NA]) * 10**9
result = arr.astype("datetime64[ns]")
expected = np.array([1, 2, 3, "NaT"], dtype="M8[s]").astype("M8[ns]")
tm.assert_numpy_array_equal(result, expected)
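# The Int64 values are interpreted as nanoseconds since the epoch when cast to
# datetime64[ns], and pd.NA maps to NaT.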
def test_construct_cast_invalid(dtype):
msg = "cannot safely"
arr = [1.2, 2.3, 3.7]
with pytest.raises(TypeError, match=msg):
pd.array(arr, dtype=dtype)
with pytest.raises(TypeError, match=msg):
pd.Series(arr).astype(dtype)
arr = [1.2, 2.3, 3.7, np.nan]
with pytest.raises(TypeError, match=msg):
pd.array(arr, dtype=dtype)
with pytest.raises(TypeError, match=msg):
pd.Series(arr).astype(dtype)
@pytest.mark.parametrize("in_series", [True, False])
def test_to_numpy_na_nan(in_series):
a = pd.array([0, 1, None], dtype="Int64")
if in_series:
a = pd.Series(a)
result = a.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([0.0, 1.0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
result = a.to_numpy(dtype="int64", na_value=-1)
expected = np.array([0, 1, -1], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
result = a.to_numpy(dtype="bool", na_value=False)
expected = np.array([False, True, False], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("in_series", [True, False])
@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
def test_to_numpy_dtype(dtype, in_series):
a = pd.array([0, 1], dtype="Int64")
if in_series:
a = pd.Series(a)
result = a.to_numpy(dtype=dtype)
expected = np.array([0, 1], dtype=dtype)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"])
def test_to_numpy_na_raises(dtype):
a = pd.array([0, 1, None], dtype="Int64")
with pytest.raises(ValueError, match=dtype):
a.to_numpy(dtype=dtype)
def test_astype_str():
a = pd.array([1, 2, None], dtype="Int64")
expected = np.array(["1", "2", "<NA>"], dtype="<U21")
tm.assert_numpy_array_equal(a.astype(str), expected)
tm.assert_numpy_array_equal(a.astype("str"), expected)
def test_astype_boolean():
# https://github.com/pandas-dev/pandas/issues/31102
a = pd.array([1, 0, -1, 2, None], dtype="Int64")
result = a.astype("boolean")
expected = pd.array([True, False, True, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,201 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
@pytest.mark.parametrize("ufunc", [np.abs, np.sign])
# np.sign emits a warning with nans, <https://github.com/numpy/numpy/issues/15127>
@pytest.mark.filterwarnings("ignore:invalid value encountered in sign")
def test_ufuncs_single_int(ufunc):
a = pd.array([1, 2, -3, np.nan])
result = ufunc(a)
expected = pd.array(ufunc(a.astype(float)), dtype="Int64")
tm.assert_extension_array_equal(result, expected)
s = pd.Series(a)
result = ufunc(s)
expected = pd.Series(pd.array(ufunc(a.astype(float)), dtype="Int64"))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
a = pd.array([1, 2, -3, np.nan])
with np.errstate(invalid="ignore"):
result = ufunc(a)
expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask)
tm.assert_extension_array_equal(result, expected)
s = pd.Series(a)
with np.errstate(invalid="ignore"):
result = ufunc(s)
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.add, np.subtract])
def test_ufuncs_binary_int(ufunc):
# two IntegerArrays
a = pd.array([1, 2, -3, np.nan])
result = ufunc(a, a)
expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Int64")
tm.assert_extension_array_equal(result, expected)
# IntegerArray with numpy array
arr = np.array([1, 2, 3, 4])
result = ufunc(a, arr)
expected = pd.array(ufunc(a.astype(float), arr), dtype="Int64")
tm.assert_extension_array_equal(result, expected)
result = ufunc(arr, a)
expected = pd.array(ufunc(arr, a.astype(float)), dtype="Int64")
tm.assert_extension_array_equal(result, expected)
# IntegerArray with scalar
result = ufunc(a, 1)
expected = pd.array(ufunc(a.astype(float), 1), dtype="Int64")
tm.assert_extension_array_equal(result, expected)
result = ufunc(1, a)
expected = pd.array(ufunc(1, a.astype(float)), dtype="Int64")
tm.assert_extension_array_equal(result, expected)
def test_ufunc_binary_output():
a = pd.array([1, 2, np.nan])
result = np.modf(a)
expected = np.modf(a.to_numpy(na_value=np.nan, dtype="float"))
expected = (pd.array(expected[0]), pd.array(expected[1]))
assert isinstance(result, tuple)
assert len(result) == 2
for x, y in zip(result, expected):
tm.assert_extension_array_equal(x, y)
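# np.modf produces two outputs; the masked-array ufunc handling wraps each
# output separately, so the result is a tuple of two nullable arrays compared
# element-wise against the plain NumPy computation on a NaN-filled float copy.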
@pytest.mark.parametrize("values", [[0, 1], [0, None]])
def test_ufunc_reduce_raises(values):
arr = pd.array(values)
res = np.add.reduce(arr)
expected = arr.sum(skipna=False)
tm.assert_almost_equal(res, expected)
@pytest.mark.parametrize(
"pandasmethname, kwargs",
[
("var", {"ddof": 0}),
("var", {"ddof": 1}),
("kurtosis", {}),
("skew", {}),
("sem", {}),
],
)
def test_stat_method(pandasmethname, kwargs):
s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64")
pandasmeth = getattr(s, pandasmethname)
result = pandasmeth(**kwargs)
s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64")
pandasmeth = getattr(s2, pandasmethname)
expected = pandasmeth(**kwargs)
assert expected == result
def test_value_counts_na():
arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")
result = arr.value_counts(dropna=False)
ex_index = pd.Index([1, 2, pd.NA], dtype="Int64")
assert ex_index.dtype == "Int64"
expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64")
tm.assert_series_equal(result, expected)
result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=arr[:2], dtype="Int64")
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)
def test_value_counts_empty():
# https://github.com/pandas-dev/pandas/issues/33317
ser = pd.Series([], dtype="Int64")
result = ser.value_counts()
idx = pd.Index([], dtype=ser.dtype)
assert idx.dtype == ser.dtype
expected = pd.Series([], index=idx, dtype="Int64")
tm.assert_series_equal(result, expected)
def test_value_counts_with_normalize():
# GH 33172
ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
assert expected.index.dtype == ser.dtype
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_integer_array_sum(skipna, min_count, any_int_ea_dtype):
dtype = any_int_ea_dtype
arr = pd.array([1, 2, 3, None], dtype=dtype)
result = arr.sum(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 6
else:
assert result is pd.NA
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("method", ["min", "max"])
def test_integer_array_min_max(skipna, method, any_int_ea_dtype):
dtype = any_int_ea_dtype
arr = pd.array([0, 1, None], dtype=dtype)
func = getattr(arr, method)
result = func(skipna=skipna)
if skipna:
assert result == (0 if method == "min" else 1)
else:
assert result is pd.NA
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 9])
def test_integer_array_prod(skipna, min_count, any_int_ea_dtype):
dtype = any_int_ea_dtype
arr = pd.array([1, 2, None], dtype=dtype)
result = arr.prod(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 2
else:
assert result is pd.NA
@pytest.mark.parametrize(
"values, expected", [([1, 2, 3], 6), ([1, 2, 3, None], 6), ([None], 0)]
)
def test_integer_array_numpy_sum(values, expected):
arr = pd.array(values, dtype="Int64")
result = np.sum(arr)
assert result == expected
@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
def test_dataframe_reductions(op):
# https://github.com/pandas-dev/pandas/pull/32867
# ensure the integers are not cast to float during reductions
df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
# exercise the parametrized reduction so sum/prod/min/max are all covered
result = getattr(df, op)()
assert isinstance(result["a"], np.int64)
# TODO(jreback) - these need testing / are broken
# shift
# set_index (destroys type)

View File

@ -0,0 +1,19 @@
import pandas as pd
import pandas._testing as tm
def test_array_setitem_nullable_boolean_mask():
# GH 31446
ser = pd.Series([1, 2], dtype="Int64")
result = ser.where(ser > 1)
expected = pd.Series([pd.NA, 2], dtype="Int64")
tm.assert_series_equal(result, expected)
def test_array_setitem():
# GH 31446
arr = pd.Series([1, 2], dtype="Int64").array
arr[arr > 1] = 1
expected = pd.array([1, 1], dtype="Int64")
tm.assert_extension_array_equal(arr, expected)

View File

@ -0,0 +1,68 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
def test_dtypes(dtype):
# smoke tests on auto dtype construction
if dtype.is_signed_integer:
assert np.dtype(dtype.type).kind == "i"
else:
assert np.dtype(dtype.type).kind == "u"
assert dtype.name is not None
@pytest.mark.parametrize(
"dtype, expected",
[
(Int8Dtype(), "Int8Dtype()"),
(Int16Dtype(), "Int16Dtype()"),
(Int32Dtype(), "Int32Dtype()"),
(Int64Dtype(), "Int64Dtype()"),
(UInt8Dtype(), "UInt8Dtype()"),
(UInt16Dtype(), "UInt16Dtype()"),
(UInt32Dtype(), "UInt32Dtype()"),
(UInt64Dtype(), "UInt64Dtype()"),
],
)
def test_repr_dtype(dtype, expected):
assert repr(dtype) == expected
def test_repr_array():
result = repr(pd.array([1, None, 3]))
expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64"
assert result == expected
def test_repr_array_long():
data = pd.array([1, 2, None] * 1000)
expected = (
"<IntegerArray>\n"
"[ 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>, 1,\n"
" ...\n"
" <NA>, 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>]\n"
"Length: 3000, dtype: Int64"
)
result = repr(data)
assert result == expected
def test_frame_repr(data_missing):
df = pd.DataFrame({"A": data_missing})
result = repr(df)
expected = " A\n0 <NA>\n1 1"
assert result == expected

View File

@ -0,0 +1,28 @@
import pytest
from pandas import (
Categorical,
CategoricalDtype,
Index,
IntervalIndex,
)
import pandas._testing as tm
class TestAstype:
@pytest.mark.parametrize("ordered", [True, False])
def test_astype_categorical_retains_ordered(self, ordered):
index = IntervalIndex.from_breaks(range(5))
arr = index._data
dtype = CategoricalDtype(None, ordered=ordered)
expected = Categorical(list(arr), ordered=ordered)
result = arr.astype(dtype)
assert result.ordered is ordered
tm.assert_categorical_equal(result, expected)
# test IntervalIndex.astype while we're at it.
result = index.astype(dtype)
expected = Index(expected)
tm.assert_index_equal(result, expected)

View File

@ -0,0 +1,369 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
Index,
Interval,
IntervalIndex,
Timedelta,
Timestamp,
date_range,
timedelta_range,
)
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
@pytest.fixture(
params=[
(Index([0, 2, 4]), Index([1, 3, 5])),
(Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])),
(timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)),
(date_range("20170101", periods=3), date_range("20170102", periods=3)),
(
date_range("20170101", periods=3, tz="US/Eastern"),
date_range("20170102", periods=3, tz="US/Eastern"),
),
],
ids=lambda x: str(x[0].dtype),
)
def left_right_dtypes(request):
"""
Fixture for building an IntervalArray from various dtypes
"""
return request.param
class TestAttributes:
@pytest.mark.parametrize(
"left, right",
[
(0, 1),
(Timedelta("0 days"), Timedelta("1 day")),
(Timestamp("2018-01-01"), Timestamp("2018-01-02")),
(
Timestamp("2018-01-01", tz="US/Eastern"),
Timestamp("2018-01-02", tz="US/Eastern"),
),
],
)
@pytest.mark.parametrize("constructor", [IntervalArray, IntervalIndex])
def test_is_empty(self, constructor, left, right, closed):
# GH27219
tuples = [(left, left), (left, right), np.nan]
expected = np.array([closed != "both", False, False])
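# a zero-length (point) interval is empty unless closed on both sides;
# a proper interval and NA are never considered empty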
result = constructor.from_tuples(tuples, closed=closed).is_empty
tm.assert_numpy_array_equal(result, expected)
class TestMethods:
@pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"])
def test_set_closed(self, closed, new_closed):
# GH 21670
array = IntervalArray.from_breaks(range(10), closed=closed)
result = array.set_closed(new_closed)
expected = IntervalArray.from_breaks(range(10), closed=new_closed)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"other",
[
Interval(0, 1, closed="right"),
IntervalArray.from_breaks([1, 2, 3, 4], closed="right"),
],
)
def test_where_raises(self, other):
ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4], closed="left"))
match = "'value.closed' is 'right', expected 'left'."
with pytest.raises(ValueError, match=match):
ser.where([True, False, True], other=other)
def test_shift(self):
# https://github.com/pandas-dev/pandas/issues/31495
a = IntervalArray.from_breaks([1, 2, 3])
result = a.shift()
# int -> float
expected = IntervalArray.from_tuples([(np.nan, np.nan), (1.0, 2.0)])
tm.assert_interval_array_equal(result, expected)
def test_shift_datetime(self):
a = IntervalArray.from_breaks(date_range("2000", periods=4))
result = a.shift(2)
expected = a.take([-1, -1, 0], allow_fill=True)
tm.assert_interval_array_equal(result, expected)
result = a.shift(-1)
expected = a.take([1, 2, -1], allow_fill=True)
tm.assert_interval_array_equal(result, expected)
class TestSetitem:
def test_set_na(self, left_right_dtypes):
left, right = left_right_dtypes
left = left.copy(deep=True)
right = right.copy(deep=True)
result = IntervalArray.from_arrays(left, right)
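# NaT is only a valid NA for datetime/timedelta subtypes; integer-backed
# arrays also reject float NaN, so only float and datetimelike subtypes
# reach the actual assignment below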
if result.dtype.subtype.kind not in ["m", "M"]:
msg = "'value' should be an interval type, got <.*NaTType'> instead."
with pytest.raises(TypeError, match=msg):
result[0] = pd.NaT
if result.dtype.subtype.kind in ["i", "u"]:
msg = "Cannot set float NaN to integer-backed IntervalArray"
with pytest.raises(ValueError, match=msg):
result[0] = np.NaN
return
result[0] = np.nan
expected_left = Index([left._na_value] + list(left[1:]))
expected_right = Index([right._na_value] + list(right[1:]))
expected = IntervalArray.from_arrays(expected_left, expected_right)
tm.assert_extension_array_equal(result, expected)
def test_setitem_mismatched_closed(self):
arr = IntervalArray.from_breaks(range(4))
orig = arr.copy()
other = arr.set_closed("both")
msg = "'value.closed' is 'both', expected 'right'"
with pytest.raises(ValueError, match=msg):
arr[0] = other[0]
with pytest.raises(ValueError, match=msg):
arr[:1] = other[:1]
with pytest.raises(ValueError, match=msg):
arr[:0] = other[:0]
with pytest.raises(ValueError, match=msg):
arr[:] = other[::-1]
with pytest.raises(ValueError, match=msg):
arr[:] = list(other[::-1])
with pytest.raises(ValueError, match=msg):
arr[:] = other[::-1].astype(object)
with pytest.raises(ValueError, match=msg):
arr[:] = other[::-1].astype("category")
# empty list should be no-op
arr[:0] = []
tm.assert_interval_array_equal(arr, orig)
def test_repr():
# GH 25022
arr = IntervalArray.from_tuples([(0, 1), (1, 2)])
result = repr(arr)
expected = (
"<IntervalArray>\n"
"[(0, 1], (1, 2]]\n"
"Length: 2, dtype: interval[int64, right]"
)
assert result == expected
class TestReductions:
def test_min_max_invalid_axis(self, left_right_dtypes):
left, right = left_right_dtypes
left = left.copy(deep=True)
right = right.copy(deep=True)
arr = IntervalArray.from_arrays(left, right)
msg = "`axis` must be fewer than the number of dimensions"
for axis in [-2, 1]:
with pytest.raises(ValueError, match=msg):
arr.min(axis=axis)
with pytest.raises(ValueError, match=msg):
arr.max(axis=axis)
msg = "'>=' not supported between"
with pytest.raises(TypeError, match=msg):
arr.min(axis="foo")
with pytest.raises(TypeError, match=msg):
arr.max(axis="foo")
def test_min_max(self, left_right_dtypes, index_or_series_or_array):
# GH#44746
left, right = left_right_dtypes
left = left.copy(deep=True)
right = right.copy(deep=True)
arr = IntervalArray.from_arrays(left, right)
# The expected results below are only valid if monotonic
assert left.is_monotonic_increasing
assert Index(arr).is_monotonic_increasing
MIN = arr[0]
MAX = arr[-1]
indexer = np.arange(len(arr))
np.random.shuffle(indexer)
arr = arr.take(indexer)
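# shuffle after recording MIN/MAX so the reductions cannot rely on ordering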
arr_na = arr.insert(2, np.nan)
arr = index_or_series_or_array(arr)
arr_na = index_or_series_or_array(arr_na)
for skipna in [True, False]:
res = arr.min(skipna=skipna)
assert res == MIN
assert type(res) == type(MIN)
res = arr.max(skipna=skipna)
assert res == MAX
assert type(res) == type(MAX)
res = arr_na.min(skipna=False)
assert np.isnan(res)
res = arr_na.max(skipna=False)
assert np.isnan(res)
res = arr_na.min(skipna=True)
assert res == MIN
assert type(res) == type(MIN)
res = arr_na.max(skipna=True)
assert res == MAX
assert type(res) == type(MAX)
# ----------------------------------------------------------------------------
# Arrow interaction
pyarrow_skip = td.skip_if_no("pyarrow")
@pyarrow_skip
def test_arrow_extension_type():
import pyarrow as pa
from pandas.core.arrays._arrow_utils import ArrowIntervalType
p1 = ArrowIntervalType(pa.int64(), "left")
p2 = ArrowIntervalType(pa.int64(), "left")
p3 = ArrowIntervalType(pa.int64(), "right")
assert p1.closed == "left"
assert p1 == p2
assert not p1 == p3
assert hash(p1) == hash(p2)
assert not hash(p1) == hash(p3)
@pyarrow_skip
def test_arrow_array():
import pyarrow as pa
from pandas.core.arrays._arrow_utils import ArrowIntervalType
intervals = pd.interval_range(1, 5, freq=1).array
result = pa.array(intervals)
assert isinstance(result.type, ArrowIntervalType)
assert result.type.closed == intervals.closed
assert result.type.subtype == pa.int64()
assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64"))
assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64"))
expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)])
assert result.storage.equals(expected)
# convert to its storage type
result = pa.array(intervals, type=expected.type)
assert result.equals(expected)
# unsupported conversions
with pytest.raises(TypeError, match="Not supported to convert IntervalArray"):
pa.array(intervals, type="float64")
with pytest.raises(TypeError, match="different 'subtype'"):
pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left"))
@pyarrow_skip
def test_arrow_array_missing():
import pyarrow as pa
from pandas.core.arrays._arrow_utils import ArrowIntervalType
arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0])
arr[1] = None
result = pa.array(arr)
assert isinstance(result.type, ArrowIntervalType)
assert result.type.closed == arr.closed
assert result.type.subtype == pa.float64()
# fields have missing values (not NaN)
left = pa.array([0.0, None, 2.0], type="float64")
right = pa.array([1.0, None, 3.0], type="float64")
assert result.storage.field("left").equals(left)
assert result.storage.field("right").equals(right)
# structarray itself also has missing values on the array level
vals = [
{"left": 0.0, "right": 1.0},
{"left": None, "right": None},
{"left": 2.0, "right": 3.0},
]
expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False]))
assert result.storage.equals(expected)
@pyarrow_skip
@pytest.mark.parametrize(
"breaks",
[[0.0, 1.0, 2.0, 3.0], date_range("2017", periods=4, freq="D")],
ids=["float", "datetime64[ns]"],
)
def test_arrow_table_roundtrip(breaks):
import pyarrow as pa
from pandas.core.arrays._arrow_utils import ArrowIntervalType
arr = IntervalArray.from_breaks(breaks)
arr[1] = None
df = pd.DataFrame({"a": arr})
table = pa.table(df)
assert isinstance(table.field("a").type, ArrowIntervalType)
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.IntervalDtype)
tm.assert_frame_equal(result, df)
table2 = pa.concat_tables([table, table])
result = table2.to_pandas()
expected = pd.concat([df, df], ignore_index=True)
tm.assert_frame_equal(result, expected)
# GH-41040
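# a table whose column has zero chunks should still produce an empty frame
# with the IntervalDtype preserved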
table = pa.table(
[pa.chunked_array([], type=table.column(0).type)], schema=table.schema
)
result = table.to_pandas()
tm.assert_frame_equal(result, expected[0:0])
@pyarrow_skip
@pytest.mark.parametrize(
"breaks",
[[0.0, 1.0, 2.0, 3.0], date_range("2017", periods=4, freq="D")],
ids=["float", "datetime64[ns]"],
)
def test_arrow_table_roundtrip_without_metadata(breaks):
import pyarrow as pa
arr = IntervalArray.from_breaks(breaks)
arr[1] = None
df = pd.DataFrame({"a": arr})
table = pa.table(df)
# remove the metadata
table = table.replace_schema_metadata()
assert table.schema.metadata is None
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.IntervalDtype)
tm.assert_frame_equal(result, df)

View File

@ -0,0 +1,93 @@
"""Tests for Interval-Interval operations, such as overlaps, contains, etc."""
import numpy as np
import pytest
from pandas import (
Interval,
IntervalIndex,
Timedelta,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
@pytest.fixture(params=[IntervalArray, IntervalIndex])
def constructor(request):
"""
Fixture for testing both interval container classes.
"""
return request.param
@pytest.fixture(
params=[
(Timedelta("0 days"), Timedelta("1 day")),
(Timestamp("2018-01-01"), Timedelta("1 day")),
(0, 1),
],
ids=lambda x: type(x[0]).__name__,
)
def start_shift(request):
"""
Fixture for generating intervals of different types from a start value
and a shift value that can be added to start to generate an endpoint.
"""
return request.param
class TestOverlaps:
def test_overlaps_interval(self, constructor, start_shift, closed, other_closed):
start, shift = start_shift
interval = Interval(start, start + 3 * shift, other_closed)
# intervals: identical, nested, spanning, partial, adjacent, disjoint
tuples = [
(start, start + 3 * shift),
(start + shift, start + 2 * shift),
(start - shift, start + 4 * shift),
(start + 2 * shift, start + 4 * shift),
(start + 3 * shift, start + 4 * shift),
(start + 4 * shift, start + 5 * shift),
]
interval_container = constructor.from_tuples(tuples, closed)
adjacent = interval.closed_right and interval_container.closed_left
expected = np.array([True, True, True, True, adjacent, False])
result = interval_container.overlaps(interval)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex])
def test_overlaps_interval_container(self, constructor, other_constructor):
# TODO: modify this test when implemented
interval_container = constructor.from_breaks(range(5))
other_container = other_constructor.from_breaks(range(5))
with pytest.raises(NotImplementedError, match="^$"):
interval_container.overlaps(other_container)
def test_overlaps_na(self, constructor, start_shift):
"""NA values are marked as False"""
start, shift = start_shift
interval = Interval(start, start + shift)
tuples = [
(start, start + shift),
np.nan,
(start + 2 * shift, start + 3 * shift),
]
interval_container = constructor.from_tuples(tuples)
expected = np.array([True, False, False])
result = interval_container.overlaps(interval)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize(
"other",
[10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")],
ids=lambda x: type(x).__name__,
)
def test_overlaps_invalid_type(self, constructor, other):
interval_container = constructor.from_breaks(range(5))
msg = f"`other` must be Interval-like, got {type(other).__name__}"
with pytest.raises(TypeError, match=msg):
interval_container.overlaps(other)

View File

@ -0,0 +1,181 @@
from __future__ import annotations
from typing import Any
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
# integer dtypes
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
scalars: list[Any] = [2] * len(arrays)
# floating dtypes
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
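# one scalar for each of the two float dtypes in tm.FLOAT_EA_DTYPES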
scalars += [0.2, 0.2]
# boolean
arrays += [pd.array([True, False, True, None], dtype="boolean")]
scalars += [False]
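# each fixture param pairs an array with a scalar of matching dtype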
@pytest.fixture(params=zip(arrays, scalars), ids=[a.dtype.name for a in arrays])
def data(request):
return request.param
def check_skip(data, op_name):
if isinstance(data.dtype, pd.BooleanDtype) and "sub" in op_name:
pytest.skip("subtract not implemented for boolean")
# Test equivalence of scalars, numpy arrays with array ops
# -----------------------------------------------------------------------------
def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
data, scalar = data
op = tm.get_op_from_name(all_arithmetic_operators)
check_skip(data, all_arithmetic_operators)
scalar_array = pd.array([scalar] * len(data), dtype=data.dtype)
# TODO also add len-1 array (np.array([scalar], dtype=data.dtype.numpy_dtype))
for scalar in [scalar, data.dtype.type(scalar)]:
result = op(data, scalar)
expected = op(data, scalar_array)
tm.assert_extension_array_equal(result, expected)
def test_array_NA(data, all_arithmetic_operators, request):
data, _ = data
op = tm.get_op_from_name(all_arithmetic_operators)
check_skip(data, all_arithmetic_operators)
scalar = pd.NA
scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype)
result = op(data, scalar)
expected = op(data, scalar_array)
tm.assert_extension_array_equal(result, expected)
def test_numpy_array_equivalence(data, all_arithmetic_operators):
data, scalar = data
op = tm.get_op_from_name(all_arithmetic_operators)
check_skip(data, all_arithmetic_operators)
numpy_array = np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype)
pd_array = pd.array(numpy_array, dtype=data.dtype)
result = op(data, numpy_array)
expected = op(data, pd_array)
tm.assert_extension_array_equal(result, expected)
# Test equivalence with Series and DataFrame ops
# -----------------------------------------------------------------------------
def test_frame(data, all_arithmetic_operators):
data, scalar = data
op = tm.get_op_from_name(all_arithmetic_operators)
check_skip(data, all_arithmetic_operators)
# DataFrame with scalar
df = pd.DataFrame({"A": data})
result = op(df, scalar)
expected = pd.DataFrame({"A": op(data, scalar)})
tm.assert_frame_equal(result, expected)
def test_series(data, all_arithmetic_operators):
data, scalar = data
op = tm.get_op_from_name(all_arithmetic_operators)
check_skip(data, all_arithmetic_operators)
s = pd.Series(data)
# Series with scalar
result = op(s, scalar)
expected = pd.Series(op(data, scalar))
tm.assert_series_equal(result, expected)
# Series with np.ndarray
other = np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype)
result = op(s, other)
expected = pd.Series(op(data, other))
tm.assert_series_equal(result, expected)
# Series with pd.array
other = pd.array([scalar] * len(data), dtype=data.dtype)
result = op(s, other)
expected = pd.Series(op(data, other))
tm.assert_series_equal(result, expected)
# Series with Series
other = pd.Series([scalar] * len(data), dtype=data.dtype)
result = op(s, other)
expected = pd.Series(op(data, other.array))
tm.assert_series_equal(result, expected)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_object(data, all_arithmetic_operators):
data, _ = data
op = all_arithmetic_operators
opa = getattr(data, op)
# 2d -> return NotImplemented
result = opa(pd.DataFrame({"A": data}))
assert result is NotImplemented
msg = r"can only perform ops with 1-d structures"
with pytest.raises(NotImplementedError, match=msg):
opa(np.arange(len(data)).reshape(-1, len(data)))
def test_error_len_mismatch(data, all_arithmetic_operators):
# operating with a list-like with non-matching length raises
data, scalar = data
op = tm.get_op_from_name(all_arithmetic_operators)
other = [scalar] * (len(data) - 1)
for other in [other, np.array(other)]:
with pytest.raises(ValueError, match="Lengths must match"):
op(data, other)
s = pd.Series(data)
with pytest.raises(ValueError, match="Lengths must match"):
op(s, other)
@pytest.mark.parametrize("op", ["__neg__", "__abs__", "__invert__"])
def test_unary_op_does_not_propagate_mask(data, op, request):
# https://github.com/pandas-dev/pandas/issues/39943
data, _ = data
ser = pd.Series(data)
if op == "__invert__" and data.dtype.kind == "f":
# we follow numpy in raising
msg = "ufunc 'invert' not supported for the input types"
with pytest.raises(TypeError, match=msg):
getattr(ser, op)()
with pytest.raises(TypeError, match=msg):
getattr(data, op)()
with pytest.raises(TypeError, match=msg):
# Check that this is still the numpy behavior
getattr(data._data, op)()
return
result = getattr(ser, op)()
expected = result.copy(deep=True)
ser[0] = None
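# mutating the Series afterwards must not change the earlier result,
# i.e. the unary op did not share its mask/data with the input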
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,187 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
pa = pytest.importorskip("pyarrow", minversion="1.0.1")
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
arrays += [pd.array([True, False, True, None], dtype="boolean")]
@pytest.fixture(params=arrays, ids=[a.dtype.name for a in arrays])
def data(request):
return request.param
def test_arrow_array(data):
arr = pa.array(data)
expected = pa.array(
data.to_numpy(object, na_value=None),
type=pa.from_numpy_dtype(data.dtype.numpy_dtype),
)
assert arr.equals(expected)
def test_arrow_roundtrip(data):
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == str(data.dtype.numpy_dtype)
result = table.to_pandas()
assert result["a"].dtype == data.dtype
tm.assert_frame_equal(result, df)
def test_dataframe_from_arrow_types_mapper():
def types_mapper(arrow_type):
if pa.types.is_boolean(arrow_type):
return pd.BooleanDtype()
elif pa.types.is_integer(arrow_type):
return pd.Int64Dtype()
bools_array = pa.array([True, None, False], type=pa.bool_())
ints_array = pa.array([1, None, 2], type=pa.int64())
small_ints_array = pa.array([-1, 0, 7], type=pa.int8())
record_batch = pa.RecordBatch.from_arrays(
[bools_array, ints_array, small_ints_array], ["bools", "ints", "small_ints"]
)
result = record_batch.to_pandas(types_mapper=types_mapper)
bools = pd.Series([True, None, False], dtype="boolean")
ints = pd.Series([1, None, 2], dtype="Int64")
small_ints = pd.Series([-1, 0, 7], dtype="Int64")
expected = pd.DataFrame({"bools": bools, "ints": ints, "small_ints": small_ints})
tm.assert_frame_equal(result, expected)
def test_arrow_load_from_zero_chunks(data):
# GH-41040
df = pd.DataFrame({"a": data[0:0]})
table = pa.table(df)
assert table.field("a").type == str(data.dtype.numpy_dtype)
table = pa.table(
[pa.chunked_array([], type=table.field("a").type)], schema=table.schema
)
result = table.to_pandas()
assert result["a"].dtype == data.dtype
tm.assert_frame_equal(result, df)
def test_arrow_from_arrow_uint():
# https://github.com/pandas-dev/pandas/issues/31896
# possible mismatch in types
dtype = pd.UInt32Dtype()
result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")
tm.assert_extension_array_equal(result, expected)
def test_arrow_sliced(data):
# https://github.com/pandas-dev/pandas/issues/38525
df = pd.DataFrame({"a": data})
table = pa.table(df)
result = table.slice(2, None).to_pandas()
expected = df.iloc[2:].reset_index(drop=True)
tm.assert_frame_equal(result, expected)
# no missing values
df2 = df.fillna(data[0])
table = pa.table(df2)
result = table.slice(2, None).to_pandas()
expected = df2.iloc[2:].reset_index(drop=True)
tm.assert_frame_equal(result, expected)
@pytest.fixture
def np_dtype_to_arrays(any_real_numpy_dtype):
np_dtype = np.dtype(any_real_numpy_dtype)
pa_type = pa.from_numpy_dtype(np_dtype)
# None ensures the creation of a bitmask buffer.
pa_array = pa.array([0, 1, 2, None], type=pa_type)
# Since masked Arrow buffer slots are not required to contain a specific
# value, assert only the first three values of the created np.array
np_expected = np.array([0, 1, 2], dtype=np_dtype)
mask_expected = np.array([True, True, True, False])
return np_dtype, pa_array, np_expected, mask_expected
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
"""
Test conversion from pyarrow array to numpy array.
Modifies the pyarrow buffer to contain padding and offset, which are
considered valid buffers by pyarrow.
Also tests empty pyarrow arrays with non-empty buffers.
See https://github.com/pandas-dev/pandas/issues/40896
"""
np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays
data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
tm.assert_numpy_array_equal(data[:3], np_expected)
tm.assert_numpy_array_equal(mask, mask_expected)
mask_buffer = pa_array.buffers()[0]
data_buffer = pa_array.buffers()[1]
data_buffer_bytes = pa_array.buffers()[1].to_pybytes()
# Add trailing padding to the buffer.
data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
pa_array_trail = pa.Array.from_buffers(
type=pa_array.type,
length=len(pa_array),
buffers=[mask_buffer, data_buffer_trail],
offset=pa_array.offset,
)
pa_array_trail.validate()
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
tm.assert_numpy_array_equal(data[:3], np_expected)
tm.assert_numpy_array_equal(mask, mask_expected)
# Add offset to the buffer.
offset = b"\x00" * (pa_array.type.bit_width // 8)
data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
mask_buffer_offset = pa.py_buffer(b"\x0E")
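# 0x0E = 0b00001110: Arrow validity bitmaps are LSB-ordered, so with the
# logical offset of 1, bits 1-3 mark the three data values as valid and
# bit 4 leaves the final slot null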
pa_array_offset = pa.Array.from_buffers(
type=pa_array.type,
length=len(pa_array),
buffers=[mask_buffer_offset, data_buffer_offset],
offset=pa_array.offset + 1,
)
pa_array_offset.validate()
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
tm.assert_numpy_array_equal(data[:3], np_expected)
tm.assert_numpy_array_equal(mask, mask_expected)
# Empty array
np_expected_empty = np.array([], dtype=np_dtype)
mask_expected_empty = np.array([], dtype=np.bool_)
pa_array_offset = pa.Array.from_buffers(
type=pa_array.type,
length=0,
buffers=[mask_buffer, data_buffer],
offset=pa_array.offset,
)
pa_array_offset.validate()
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
tm.assert_numpy_array_equal(data[:3], np_expected_empty)
tm.assert_numpy_array_equal(mask, mask_expected_empty)
def test_from_arrow_type_error(data):
# ensure that __from_arrow__ returns a TypeError when getting a wrong
# array type
arr = pa.array(data).cast("string")
with pytest.raises(TypeError, match=None):
# we don't test the exact error message, only the fact that it raises
# a TypeError is relevant
data.dtype.__from_arrow__(arr)

View File

@ -0,0 +1,44 @@
import numpy as np
import pytest
from pandas.core.dtypes.common import is_integer_dtype
import pandas as pd
import pandas._testing as tm
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
arrays += [
pd.array([0.141, -0.268, 5.895, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES
]
@pytest.fixture(params=arrays, ids=[a.dtype.name for a in arrays])
def data(request):
return request.param
@pytest.fixture()
def numpy_dtype(data):
# For integer dtype, the numpy conversion must be done to float
if is_integer_dtype(data):
numpy_dtype = float
else:
numpy_dtype = data.dtype.type
return numpy_dtype
def test_round(data, numpy_dtype):
# No arguments
result = data.round()
expected = pd.array(
np.round(data.to_numpy(dtype=numpy_dtype, na_value=None)), dtype=data.dtype
)
tm.assert_extension_array_equal(result, expected)
# Decimals argument
result = data.round(decimals=2)
expected = pd.array(
np.round(data.to_numpy(dtype=numpy_dtype, na_value=None), decimals=2),
dtype=data.dtype,
)
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,138 @@
"""
Tests shared by MaskedArray subclasses.
"""
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.base import BaseOpsUtil
class ComparisonOps(BaseOpsUtil):
def _compare_other(self, data, op, other):
# array
result = pd.Series(op(data, other))
expected = pd.Series(op(data._data, other), dtype="boolean")
# fill the nan locations
expected[data._mask] = pd.NA
tm.assert_series_equal(result, expected)
# series
ser = pd.Series(data)
result = op(ser, other)
expected = op(pd.Series(data._data), other)
# fill the nan locations
expected[data._mask] = pd.NA
expected = expected.astype("boolean")
tm.assert_series_equal(result, expected)
# subclass will override to parametrize 'other'
def test_scalar(self, other, comparison_op, dtype):
op = comparison_op
left = pd.array([1, 0, None], dtype=dtype)
result = op(left, other)
if other is pd.NA:
expected = pd.array([None, None, None], dtype="boolean")
else:
values = op(left._data, other)
expected = pd.arrays.BooleanArray(values, left._mask, copy=True)
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
result[0] = pd.NA
tm.assert_extension_array_equal(left, pd.array([1, 0, None], dtype=dtype))
class NumericOps:
# Shared by IntegerArray and FloatingArray, not BooleanArray
def test_no_shared_mask(self, data):
result = data + 1
assert not tm.shares_memory(result, data)
def test_array(self, comparison_op, dtype):
op = comparison_op
left = pd.array([0, 1, 2, None, None, None], dtype=dtype)
right = pd.array([0, 1, None, 0, 1, None], dtype=dtype)
result = op(left, right)
values = op(left._data, right._data)
mask = left._mask | right._mask
expected = pd.arrays.BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
result[0] = pd.NA
tm.assert_extension_array_equal(
left, pd.array([0, 1, 2, None, None, None], dtype=dtype)
)
tm.assert_extension_array_equal(
right, pd.array([0, 1, None, 0, 1, None], dtype=dtype)
)
def test_compare_with_booleanarray(self, comparison_op, dtype):
op = comparison_op
left = pd.array([True, False, None] * 3, dtype="boolean")
right = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype=dtype)
other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean")
expected = op(left, other)
result = op(left, right)
tm.assert_extension_array_equal(result, expected)
# reversed op
expected = op(other, left)
result = op(right, left)
tm.assert_extension_array_equal(result, expected)
def test_compare_to_string(self, dtype):
# GH#28930
ser = pd.Series([1, None], dtype=dtype)
result = ser == "a"
expected = pd.Series([False, pd.NA], dtype="boolean")
self.assert_series_equal(result, expected)
def test_ufunc_with_out(self, dtype):
arr = pd.array([1, 2, 3], dtype=dtype)
arr2 = pd.array([1, 2, pd.NA], dtype=dtype)
mask = arr == arr
mask2 = arr2 == arr2
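# mask has no missing values; mask2 has pd.NA in its last slot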
result = np.zeros(3, dtype=bool)
result |= mask
# If MaskedArray.__array_ufunc__ handled "out" appropriately,
# `result` should still be an ndarray.
assert isinstance(result, np.ndarray)
assert result.all()
# result |= mask worked because mask could be cast losslessly to
# boolean ndarray. mask2 can't, so this raises
result = np.zeros(3, dtype=bool)
msg = "Specify an appropriate 'na_value' for this dtype"
with pytest.raises(ValueError, match=msg):
result |= mask2
# addition
res = np.add(arr, arr2)
expected = pd.array([2, 4, pd.NA], dtype=dtype)
tm.assert_extension_array_equal(res, expected)
# when passing out=arr, we will modify 'arr' inplace.
res = np.add(arr, arr2, out=arr)
assert res is arr
tm.assert_extension_array_equal(res, expected)
tm.assert_extension_array_equal(arr, expected)

View File

@ -0,0 +1,121 @@
import pytest
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
PeriodArray,
period_array,
)
pa = pytest.importorskip("pyarrow", minversion="1.0.1")
def test_arrow_extension_type():
from pandas.core.arrays._arrow_utils import ArrowPeriodType
p1 = ArrowPeriodType("D")
p2 = ArrowPeriodType("D")
p3 = ArrowPeriodType("M")
assert p1.freq == "D"
assert p1 == p2
assert not p1 == p3
assert hash(p1) == hash(p2)
assert not hash(p1) == hash(p3)
@pytest.mark.parametrize(
"data, freq",
[
(pd.date_range("2017", periods=3), "D"),
(pd.date_range("2017", periods=3, freq="A"), "A-DEC"),
],
)
def test_arrow_array(data, freq):
from pandas.core.arrays._arrow_utils import ArrowPeriodType
periods = period_array(data, freq=freq)
result = pa.array(periods)
assert isinstance(result.type, ArrowPeriodType)
assert result.type.freq == freq
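# the storage of the Arrow extension array should be the int64 ordinals
# (periods.asi8)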
expected = pa.array(periods.asi8, type="int64")
assert result.storage.equals(expected)
# convert to its storage type
result = pa.array(periods, type=pa.int64())
assert result.equals(expected)
# unsupported conversions
msg = "Not supported to convert PeriodArray to 'double' type"
with pytest.raises(TypeError, match=msg):
pa.array(periods, type="float64")
with pytest.raises(TypeError, match="different 'freq'"):
pa.array(periods, type=ArrowPeriodType("T"))
def test_arrow_array_missing():
from pandas.core.arrays._arrow_utils import ArrowPeriodType
arr = PeriodArray([1, 2, 3], freq="D")
arr[1] = pd.NaT
result = pa.array(arr)
assert isinstance(result.type, ArrowPeriodType)
assert result.type.freq == "D"
expected = pa.array([1, None, 3], type="int64")
assert result.storage.equals(expected)
def test_arrow_table_roundtrip():
from pandas.core.arrays._arrow_utils import ArrowPeriodType
arr = PeriodArray([1, 2, 3], freq="D")
arr[1] = pd.NaT
df = pd.DataFrame({"a": arr})
table = pa.table(df)
assert isinstance(table.field("a").type, ArrowPeriodType)
result = table.to_pandas()
assert isinstance(result["a"].dtype, PeriodDtype)
tm.assert_frame_equal(result, df)
table2 = pa.concat_tables([table, table])
result = table2.to_pandas()
expected = pd.concat([df, df], ignore_index=True)
tm.assert_frame_equal(result, expected)
def test_arrow_load_from_zero_chunks():
# GH-41040
from pandas.core.arrays._arrow_utils import ArrowPeriodType
arr = PeriodArray([], freq="D")
df = pd.DataFrame({"a": arr})
table = pa.table(df)
assert isinstance(table.field("a").type, ArrowPeriodType)
table = pa.table(
[pa.chunked_array([], type=table.column(0).type)], schema=table.schema
)
result = table.to_pandas()
assert isinstance(result["a"].dtype, PeriodDtype)
tm.assert_frame_equal(result, df)
def test_arrow_table_roundtrip_without_metadata():
arr = PeriodArray([1, 2, 3], freq="H")
arr[1] = pd.NaT
df = pd.DataFrame({"a": arr})
table = pa.table(df)
# remove the metadata
table = table.replace_schema_metadata()
assert table.schema.metadata is None
result = table.to_pandas()
assert isinstance(result["a"].dtype, PeriodDtype)
tm.assert_frame_equal(result, df)

View File

@ -0,0 +1,69 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import period_array
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
def test_astype(dtype):
# We choose to ignore the sign and size of integers for
# Period/Datetime/Timedelta astype
arr = period_array(["2000", "2001", None], freq="D")
result = arr.astype(dtype)
if np.dtype(dtype).kind == "u":
expected_dtype = np.dtype("uint64")
else:
expected_dtype = np.dtype("int64")
expected = arr.astype(expected_dtype)
assert result.dtype == expected_dtype
tm.assert_numpy_array_equal(result, expected)
def test_astype_copies():
arr = period_array(["2000", "2001", None], freq="D")
result = arr.astype(np.int64, copy=False)
# Add the `.base`, since we now use `.asi8` which returns a view.
# We could maybe override it in PeriodArray to return ._data directly.
assert result.base is arr._data
result = arr.astype(np.int64, copy=True)
assert result is not arr._data
tm.assert_numpy_array_equal(result, arr._data.view("i8"))
def test_astype_categorical():
arr = period_array(["2000", "2001", "2001", None], freq="D")
result = arr.astype("category")
categories = pd.PeriodIndex(["2000", "2001"], freq="D")
expected = pd.Categorical.from_codes([0, 1, 1, -1], categories=categories)
tm.assert_categorical_equal(result, expected)
def test_astype_period():
arr = period_array(["2000", "2001", None], freq="D")
result = arr.astype(PeriodDtype("M"))
expected = period_array(["2000", "2001", None], freq="M")
tm.assert_period_array_equal(result, expected)
@pytest.mark.parametrize("other", ["datetime64[ns]", "timedelta64[ns]"])
def test_astype_datetime(other):
arr = period_array(["2000", "2001", None], freq="D")
# slice off the [ns] so that the regex matches.
if other == "timedelta64[ns]":
with pytest.raises(TypeError, match=other[:-4]):
arr.astype(other)
else:
# GH#45038 allow period->dt64 because we allow dt64->period
result = arr.astype(other)
expected = pd.DatetimeIndex(["2000", "2001", pd.NaT])._data
tm.assert_datetime_array_equal(result, expected)

View File

@ -0,0 +1,123 @@
import numpy as np
import pytest
from pandas._libs.tslibs import iNaT
from pandas._libs.tslibs.period import IncompatibleFrequency
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
PeriodArray,
period_array,
)
@pytest.mark.parametrize(
"data, freq, expected",
[
([pd.Period("2017", "D")], None, [17167]),
([pd.Period("2017", "D")], "D", [17167]),
([2017], "D", [17167]),
(["2017"], "D", [17167]),
([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]),
([pd.Period("2017", "D"), None], None, [17167, iNaT]),
(pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]),
(pd.date_range("2017", periods=3), None, [17167, 17168, 17169]),
(pd.period_range("2017", periods=4, freq="Q"), None, [188, 189, 190, 191]),
],
)
def test_period_array_ok(data, freq, expected):
result = period_array(data, freq=freq).asi8
expected = np.asarray(expected, dtype=np.int64)
tm.assert_numpy_array_equal(result, expected)
def test_period_array_readonly_object():
# https://github.com/pandas-dev/pandas/issues/25403
pa = period_array([pd.Period("2019-01-01")])
arr = np.asarray(pa, dtype="object")
arr.setflags(write=False)
result = period_array(arr)
tm.assert_period_array_equal(result, pa)
result = pd.Series(arr)
tm.assert_series_equal(result, pd.Series(pa))
result = pd.DataFrame({"A": arr})
tm.assert_frame_equal(result, pd.DataFrame({"A": pa}))
def test_from_datetime64_freq_changes():
# https://github.com/pandas-dev/pandas/issues/23438
arr = pd.date_range("2017", periods=3, freq="D")
result = PeriodArray._from_datetime64(arr, freq="M")
expected = period_array(["2017-01-01", "2017-01-01", "2017-01-01"], freq="M")
tm.assert_period_array_equal(result, expected)
@pytest.mark.parametrize(
"data, freq, msg",
[
(
[pd.Period("2017", "D"), pd.Period("2017", "A")],
None,
"Input has different freq",
),
([pd.Period("2017", "D")], "A", "Input has different freq"),
],
)
def test_period_array_raises(data, freq, msg):
with pytest.raises(IncompatibleFrequency, match=msg):
period_array(data, freq)
def test_period_array_non_period_series_raises():
ser = pd.Series([1, 2, 3])
with pytest.raises(TypeError, match="dtype"):
PeriodArray(ser, freq="D")
def test_period_array_freq_mismatch():
arr = period_array(["2000", "2001"], freq="D")
with pytest.raises(IncompatibleFrequency, match="freq"):
PeriodArray(arr, freq="M")
with pytest.raises(IncompatibleFrequency, match="freq"):
PeriodArray(arr, freq=pd.tseries.offsets.MonthEnd())
def test_from_sequence_disallows_i8():
arr = period_array(["2000", "2001"], freq="D")
msg = str(arr[0].ordinal)
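# the raised TypeError message is expected to echo the raw ordinal,
# showing the i8 values were not silently interpreted as Periods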
with pytest.raises(TypeError, match=msg):
PeriodArray._from_sequence(arr.asi8, dtype=arr.dtype)
with pytest.raises(TypeError, match=msg):
PeriodArray._from_sequence(list(arr.asi8), dtype=arr.dtype)
def test_from_td64nat_sequence_raises():
# GH#44507
td = pd.NaT.to_numpy("m8[ns]")
dtype = pd.period_range("2005-01-01", periods=3, freq="D").dtype
arr = np.array([None], dtype=object)
arr[0] = td
msg = "Value must be Period, string, integer, or datetime"
with pytest.raises(ValueError, match=msg):
PeriodArray._from_sequence(arr, dtype=dtype)
with pytest.raises(ValueError, match=msg):
pd.PeriodIndex(arr, dtype=dtype)
with pytest.raises(ValueError, match=msg):
pd.Index(arr, dtype=dtype)
with pytest.raises(ValueError, match=msg):
pd.array(arr, dtype=dtype)
with pytest.raises(ValueError, match=msg):
pd.Series(arr, dtype=dtype)
with pytest.raises(ValueError, match=msg):
pd.DataFrame(arr, dtype=dtype)

View File

@ -0,0 +1,42 @@
import pytest
import pandas as pd
from pandas.core.arrays import period_array
class TestReductions:
def test_min_max(self):
arr = period_array(
[
"2000-01-03",
"2000-01-03",
"NaT",
"2000-01-02",
"2000-01-05",
"2000-01-04",
],
freq="D",
)
result = arr.min()
expected = pd.Period("2000-01-02", freq="D")
assert result == expected
result = arr.max()
expected = pd.Period("2000-01-05", freq="D")
assert result == expected
result = arr.min(skipna=False)
assert result is pd.NaT
result = arr.max(skipna=False)
assert result is pd.NaT
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_empty(self, skipna):
arr = period_array([], freq="D")
result = arr.min(skipna=skipna)
assert result is pd.NaT
result = arr.max(skipna=skipna)
assert result is pd.NaT

View File

@ -0,0 +1,159 @@
import string
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import (
SparseArray,
SparseDtype,
)
class TestSeriesAccessor:
# TODO: collect other Series accessor tests
def test_to_dense(self):
s = pd.Series([0, 1, 0, 10], dtype="Sparse[int64]")
result = s.sparse.to_dense()
expected = pd.Series([0, 1, 0, 10])
tm.assert_series_equal(result, expected)
class TestFrameAccessor:
def test_accessor_raises(self):
df = pd.DataFrame({"A": [0, 1]})
with pytest.raises(AttributeError, match="sparse"):
df.sparse
@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
@pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
@pytest.mark.parametrize("dtype", ["float64", "int64"])
@td.skip_if_no_scipy
def test_from_spmatrix(self, format, labels, dtype):
import scipy.sparse
sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item())
mat = scipy.sparse.eye(10, format=format, dtype=dtype)
result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)
expected = pd.DataFrame(
np.eye(10, dtype=dtype), index=labels, columns=labels
).astype(sp_dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
@td.skip_if_no_scipy
def test_from_spmatrix_including_explicit_zero(self, format):
import scipy.sparse
mat = scipy.sparse.random(10, 2, density=0.5, format=format)
mat.data[0] = 0
result = pd.DataFrame.sparse.from_spmatrix(mat)
dtype = SparseDtype("float64", 0.0)
expected = pd.DataFrame(mat.todense()).astype(dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"columns",
[["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]],
)
@td.skip_if_no_scipy
def test_from_spmatrix_columns(self, columns):
import scipy.sparse
dtype = SparseDtype("float64", 0.0)
mat = scipy.sparse.random(10, 2, density=0.5)
result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)
expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
)
@td.skip_if_no_scipy
def test_to_coo(self, colnames):
import scipy.sparse
df = pd.DataFrame(
{colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]"
)
result = df.sparse.to_coo()
expected = scipy.sparse.coo_matrix(np.asarray(df))
assert (result != expected).nnz == 0
@pytest.mark.parametrize("fill_value", [1, np.nan])
@td.skip_if_no_scipy
def test_to_coo_nonzero_fill_val_raises(self, fill_value):
df = pd.DataFrame(
{
"A": SparseArray(
[fill_value, fill_value, fill_value, 2], fill_value=fill_value
),
"B": SparseArray(
[fill_value, 2, fill_value, fill_value], fill_value=fill_value
),
}
)
with pytest.raises(ValueError, match="fill value must be 0"):
df.sparse.to_coo()
def test_to_dense(self):
df = pd.DataFrame(
{
"A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)),
"B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)),
"C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)),
},
index=["b", "a"],
)
result = df.sparse.to_dense()
expected = pd.DataFrame(
{"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=["b", "a"]
)
tm.assert_frame_equal(result, expected)
def test_density(self):
df = pd.DataFrame(
{
"A": SparseArray([1, 0, 2, 1], fill_value=0),
"B": SparseArray([0, 1, 1, 1], fill_value=0),
}
)
res = df.sparse.density
expected = 0.75
assert res == expected
@pytest.mark.parametrize("dtype", ["int64", "float64"])
@pytest.mark.parametrize("dense_index", [True, False])
@td.skip_if_no_scipy
def test_series_from_coo(self, dtype, dense_index):
import scipy.sparse
A = scipy.sparse.eye(3, format="coo", dtype=dtype)
result = pd.Series.sparse.from_coo(A, dense_index=dense_index)
index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)])
expected = pd.Series(SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index)
if dense_index:
expected = expected.reindex(pd.MultiIndex.from_product(index.levels))
tm.assert_series_equal(result, expected)
@td.skip_if_no_scipy
def test_series_from_coo_incorrect_format_raises(self):
# gh-26554
import scipy.sparse
m = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 0]]))
with pytest.raises(
TypeError, match="Expected coo_matrix. Got csr_matrix instead."
):
pd.Series.sparse.from_coo(m)
def test_with_column_named_sparse(self):
# https://github.com/pandas-dev/pandas/issues/30758
df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])})
assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor)

View File

@ -0,0 +1,530 @@
import operator
import numpy as np
import pytest
from pandas.compat import np_version_under1p20
import pandas as pd
import pandas._testing as tm
from pandas.core import ops
from pandas.core.arrays.sparse import (
SparseArray,
SparseDtype,
)
@pytest.fixture(params=["integer", "block"])
def kind(request):
"""kind kwarg to pass to SparseArray/SparseSeries"""
return request.param
@pytest.fixture(params=[True, False])
def mix(request):
# whether to operate op(sparse, dense) instead of op(sparse, sparse)
return request.param
class TestSparseArrayArithmetics:
_base = np.array
_klass = SparseArray
def _assert(self, a, b):
# We have to use tm.assert_sp_array_equal. See GH #45126
tm.assert_numpy_array_equal(a, b)
def _check_numeric_ops(self, a, b, a_dense, b_dense, mix: bool, op):
# Check that arithmetic behavior matches non-Sparse Series arithmetic
if isinstance(a_dense, np.ndarray):
expected = op(pd.Series(a_dense), b_dense).values
elif isinstance(b_dense, np.ndarray):
expected = op(a_dense, pd.Series(b_dense)).values
else:
raise NotImplementedError
with np.errstate(invalid="ignore", divide="ignore"):
if mix:
result = op(a, b_dense).to_dense()
else:
result = op(a, b).to_dense()
self._assert(result, expected)
def _check_bool_result(self, res):
assert isinstance(res, self._klass)
assert isinstance(res.dtype, SparseDtype)
assert res.dtype.subtype == np.bool_
assert isinstance(res.fill_value, bool)
def _check_comparison_ops(self, a, b, a_dense, b_dense):
with np.errstate(invalid="ignore"):
# Unfortunately, trying to wrap the computation of each expected
# value with np.errstate() is too tedious.
#
# sparse & sparse
self._check_bool_result(a == b)
self._assert((a == b).to_dense(), a_dense == b_dense)
self._check_bool_result(a != b)
self._assert((a != b).to_dense(), a_dense != b_dense)
self._check_bool_result(a >= b)
self._assert((a >= b).to_dense(), a_dense >= b_dense)
self._check_bool_result(a <= b)
self._assert((a <= b).to_dense(), a_dense <= b_dense)
self._check_bool_result(a > b)
self._assert((a > b).to_dense(), a_dense > b_dense)
self._check_bool_result(a < b)
self._assert((a < b).to_dense(), a_dense < b_dense)
# sparse & dense
self._check_bool_result(a == b_dense)
self._assert((a == b_dense).to_dense(), a_dense == b_dense)
self._check_bool_result(a != b_dense)
self._assert((a != b_dense).to_dense(), a_dense != b_dense)
self._check_bool_result(a >= b_dense)
self._assert((a >= b_dense).to_dense(), a_dense >= b_dense)
self._check_bool_result(a <= b_dense)
self._assert((a <= b_dense).to_dense(), a_dense <= b_dense)
self._check_bool_result(a > b_dense)
self._assert((a > b_dense).to_dense(), a_dense > b_dense)
self._check_bool_result(a < b_dense)
self._assert((a < b_dense).to_dense(), a_dense < b_dense)
def _check_logical_ops(self, a, b, a_dense, b_dense):
# sparse & sparse
self._check_bool_result(a & b)
self._assert((a & b).to_dense(), a_dense & b_dense)
self._check_bool_result(a | b)
self._assert((a | b).to_dense(), a_dense | b_dense)
# sparse & dense
self._check_bool_result(a & b_dense)
self._assert((a & b_dense).to_dense(), a_dense & b_dense)
self._check_bool_result(a | b_dense)
self._assert((a | b_dense).to_dense(), a_dense | b_dense)
@pytest.mark.parametrize("scalar", [0, 1, 3])
@pytest.mark.parametrize("fill_value", [None, 0, 2])
def test_float_scalar(
self, kind, mix, all_arithmetic_functions, fill_value, scalar, request
):
op = all_arithmetic_functions
if np_version_under1p20:
if op in [operator.floordiv, ops.rfloordiv]:
if op is operator.floordiv and scalar != 0:
pass
elif op is ops.rfloordiv and scalar == 0:
pass
else:
mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172")
request.node.add_marker(mark)
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = self._klass(values, kind=kind, fill_value=fill_value)
self._check_numeric_ops(a, scalar, values, scalar, mix, op)
def test_float_scalar_comparison(self, kind):
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = self._klass(values, kind=kind)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
a = self._klass(values, kind=kind, fill_value=0)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
a = self._klass(values, kind=kind, fill_value=2)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
def test_float_same_index_without_nans(self, kind, mix, all_arithmetic_functions):
# when sp_index are the same
op = all_arithmetic_functions
values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_same_index_with_nans(
self, kind, mix, all_arithmetic_functions, request
):
# when sp_index are the same
op = all_arithmetic_functions
if (
np_version_under1p20
and op is ops.rfloordiv
and not (mix and kind == "block")
):
mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172")
request.node.add_marker(mark)
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_same_index_comparison(self, kind):
# when sp_index are the same
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
def test_float_array(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_array_different_kind(self, mix, all_arithmetic_functions):
op = all_arithmetic_functions
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind="integer")
b = self._klass(rvalues, kind="block")
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind="integer", fill_value=0)
b = self._klass(rvalues, kind="block")
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind="integer", fill_value=0)
b = self._klass(rvalues, kind="block", fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind="integer", fill_value=1)
b = self._klass(rvalues, kind="block", fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_array_comparison(self, kind):
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
def test_int_array(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
# have to specify dtype explicitly until fixing GH 667
dtype = np.int64
values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
a = self._klass(values, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, fill_value=0, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, fill_value=0, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, fill_value=1, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype, fill_value=1)
b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_int_array_comparison(self, kind):
dtype = "int64"
# int32 not implemented at the moment
values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
a = self._klass(values, dtype=dtype, kind=kind)
b = self._klass(rvalues, dtype=dtype, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=0)
b = self._klass(rvalues, dtype=dtype, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=0)
b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=1)
b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
@pytest.mark.parametrize("fill_value", [True, False, np.nan])
def test_bool_same_index(self, kind, fill_value):
# GH 14000
# when the sp_index of both arrays is the same
values = self._base([True, False, True, True], dtype=np.bool_)
rvalues = self._base([True, False, True, True], dtype=np.bool_)
a = self._klass(values, kind=kind, dtype=np.bool_, fill_value=fill_value)
b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
@pytest.mark.parametrize("fill_value", [True, False, np.nan])
def test_bool_array_logical(self, kind, fill_value):
# GH 14000
# when the sp_index of both arrays is the same
values = self._base([True, False, True, False, True, True], dtype=np.bool_)
rvalues = self._base([True, False, False, True, False, True], dtype=np.bool_)
a = self._klass(values, kind=kind, dtype=np.bool_, fill_value=fill_value)
b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request):
op = all_arithmetic_functions
if np_version_under1p20 and op in [operator.floordiv, ops.rfloordiv] and mix:
mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172")
request.node.add_marker(mark)
rdtype = "int64"
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_mixed_array_comparison(self, kind):
rdtype = "int64"
# int32 not implemented at the moment
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
def test_xor(self):
s = SparseArray([True, True, False, False])
t = SparseArray([True, False, True, False])
result = s ^ t
sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
expected = SparseArray([False, True, True], sparse_index=sp_index)
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("op", [operator.eq, operator.add])
def test_with_list(op):
arr = SparseArray([0, 1], fill_value=0)
result = op(arr, [0, 1])
expected = op(arr, SparseArray([0, 1]))
tm.assert_sp_array_equal(result, expected)
def test_with_dataframe():
# GH#27910
arr = SparseArray([0, 1], fill_value=0)
df = pd.DataFrame([[1, 2], [3, 4]])
result = arr.__add__(df)
assert result is NotImplemented
def test_with_zerodim_ndarray():
# GH#27910
arr = SparseArray([0, 1], fill_value=0)
result = arr * np.array(2)
expected = arr * 2
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.abs, np.exp])
@pytest.mark.parametrize(
"arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])]
)
def test_ufuncs(ufunc, arr):
result = ufunc(arr)
fill_value = ufunc(arr.fill_value)
expected = SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value)
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize(
"a, b",
[
(SparseArray([0, 0, 0]), np.array([0, 1, 2])),
(SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
(SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
(SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
(SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
],
)
@pytest.mark.parametrize("ufunc", [np.add, np.greater])
def test_binary_ufuncs(ufunc, a, b):
# can't say anything about fill value here.
result = ufunc(a, b)
expected = ufunc(np.asarray(a), np.asarray(b))
assert isinstance(result, SparseArray)
tm.assert_numpy_array_equal(np.asarray(result), expected)
def test_ndarray_inplace():
sparray = SparseArray([0, 2, 0, 0])
ndarray = np.array([0, 1, 2, 3])
ndarray += sparray
expected = np.array([0, 3, 2, 3])
tm.assert_numpy_array_equal(ndarray, expected)
def test_sparray_inplace():
sparray = SparseArray([0, 2, 0, 0])
ndarray = np.array([0, 1, 2, 3])
sparray += ndarray
expected = SparseArray([0, 3, 2, 3], fill_value=0)
tm.assert_sp_array_equal(sparray, expected)
@pytest.mark.parametrize("fill_value", [True, False])
def test_invert(fill_value):
arr = np.array([True, False, False, True])
sparray = SparseArray(arr, fill_value=fill_value)
result = ~sparray
expected = SparseArray(~arr, fill_value=not fill_value)
tm.assert_sp_array_equal(result, expected)
result = ~pd.Series(sparray)
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)
result = ~pd.DataFrame({"A": sparray})
expected = pd.DataFrame({"A": expected})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("fill_value", [0, np.nan])
@pytest.mark.parametrize("op", [operator.pos, operator.neg])
def test_unary_op(op, fill_value):
arr = np.array([0, 1, np.nan, 2])
sparray = SparseArray(arr, fill_value=fill_value)
result = op(sparray)
expected = SparseArray(op(arr), fill_value=op(fill_value))
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("cons", [list, np.array, SparseArray])
def test_mismatched_length_cmp_op(cons):
left = SparseArray([True, True])
right = cons([True, True, True])
with pytest.raises(ValueError, match="operands have mismatched length"):
left & right

File diff suppressed because it is too large

View File

@ -0,0 +1,62 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestSparseArrayConcat:
@pytest.mark.parametrize("kind", ["integer", "block"])
def test_basic(self, kind):
a = SparseArray([1, 0, 0, 2], kind=kind)
b = SparseArray([1, 0, 2, 2], kind=kind)
result = SparseArray._concat_same_type([a, b])
# Can't make any assertions about the sparse index itself
# since we don't merge sparse blocks across arrays in to_concat
expected = np.array([1, 2, 1, 2, 2], dtype="int64")
tm.assert_numpy_array_equal(result.sp_values, expected)
assert result.kind == kind
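# e.g. SparseArray([1, 0, 0, 2]) stores sp_values [1, 2] (the zeros are the fill value),
# so concatenating with [1, 0, 2, 2] gives sp_values [1, 2, 1, 2, 2] as asserted above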
@pytest.mark.parametrize("kind", ["integer", "block"])
def test_uses_first_kind(self, kind):
other = "integer" if kind == "block" else "block"
a = SparseArray([1, 0, 0, 2], kind=kind)
b = SparseArray([1, 0, 2, 2], kind=other)
result = SparseArray._concat_same_type([a, b])
expected = np.array([1, 2, 1, 2, 2], dtype="int64")
tm.assert_numpy_array_equal(result.sp_values, expected)
assert result.kind == kind
@pytest.mark.parametrize(
"other, expected_dtype",
[
# compatible dtype -> preserve sparse
(pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
# (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
# incompatible dtype -> Sparse[common dtype]
(pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)),
# incompatible dtype -> Sparse[object] dtype
(pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)),
# categorical with compatible categories -> dtype of the categories
(pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
(pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
# categorical with incompatible categories -> object dtype
(pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
],
)
def test_concat_with_non_sparse(other, expected_dtype):
# https://github.com/pandas-dev/pandas/issues/34336
s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0))
result = pd.concat([s_sparse, other], ignore_index=True)
expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype)
tm.assert_series_equal(result, expected)
result = pd.concat([other, s_sparse], ignore_index=True)
expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,209 @@
import re
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.sparse import SparseDtype
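# The default fill_value depends on the subtype: 0 for int, NaN for float/object,
# False for bool, and NaT for datetime64/timedelta64 (see the cases below).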
@pytest.mark.parametrize(
"dtype, fill_value",
[
("int", 0),
("float", np.nan),
("bool", False),
("object", np.nan),
("datetime64[ns]", np.datetime64("NaT", "ns")),
("timedelta64[ns]", np.timedelta64("NaT", "ns")),
],
)
def test_inferred_dtype(dtype, fill_value):
sparse_dtype = SparseDtype(dtype)
result = sparse_dtype.fill_value
if pd.isna(fill_value):
assert pd.isna(result) and type(result) == type(fill_value)
else:
assert result == fill_value
def test_from_sparse_dtype():
dtype = SparseDtype("float", 0)
result = SparseDtype(dtype)
assert result.fill_value == 0
def test_from_sparse_dtype_fill_value():
dtype = SparseDtype("int", 1)
result = SparseDtype(dtype, fill_value=2)
expected = SparseDtype("int", 2)
assert result == expected
@pytest.mark.parametrize(
"dtype, fill_value",
[
("int", None),
("float", None),
("bool", None),
("object", None),
("datetime64[ns]", None),
("timedelta64[ns]", None),
("int", np.nan),
("float", 0),
],
)
def test_equal(dtype, fill_value):
a = SparseDtype(dtype, fill_value)
b = SparseDtype(dtype, fill_value)
assert a == b
assert b == a
def test_nans_equal():
a = SparseDtype(float, float("nan"))
b = SparseDtype(float, np.nan)
assert a == b
assert b == a
@pytest.mark.parametrize(
"a, b",
[
(SparseDtype("float64"), SparseDtype("float32")),
(SparseDtype("float64"), SparseDtype("float64", 0)),
(SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
(SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
(SparseDtype("float64"), np.dtype("float64")),
],
)
def test_not_equal(a, b):
assert a != b
def test_construct_from_string_raises():
with pytest.raises(
TypeError, match="Cannot construct a 'SparseDtype' from 'not a dtype'"
):
SparseDtype.construct_from_string("not a dtype")
@pytest.mark.parametrize(
"dtype, expected",
[
(SparseDtype(int), True),
(SparseDtype(float), True),
(SparseDtype(bool), True),
(SparseDtype(object), False),
(SparseDtype(str), False),
],
)
def test_is_numeric(dtype, expected):
assert dtype._is_numeric is expected
def test_str_uses_object():
result = SparseDtype(str).subtype
assert result == np.dtype("object")
@pytest.mark.parametrize(
"string, expected",
[
("Sparse[float64]", SparseDtype(np.dtype("float64"))),
("Sparse[float32]", SparseDtype(np.dtype("float32"))),
("Sparse[int]", SparseDtype(np.dtype("int"))),
("Sparse[str]", SparseDtype(np.dtype("str"))),
("Sparse[datetime64[ns]]", SparseDtype(np.dtype("datetime64[ns]"))),
("Sparse", SparseDtype(np.dtype("float"), np.nan)),
],
)
def test_construct_from_string(string, expected):
result = SparseDtype.construct_from_string(string)
assert result == expected
@pytest.mark.parametrize(
"a, b, expected",
[
(SparseDtype(float, 0.0), SparseDtype(np.dtype("float"), 0.0), True),
(SparseDtype(int, 0), SparseDtype(int, 0), True),
(SparseDtype(float, float("nan")), SparseDtype(float, np.nan), True),
(SparseDtype(float, 0), SparseDtype(float, np.nan), False),
(SparseDtype(int, 0.0), SparseDtype(float, 0.0), False),
],
)
def test_hash_equal(a, b, expected):
result = a == b
assert result is expected
result = hash(a) == hash(b)
assert result is expected
@pytest.mark.parametrize(
"string, expected",
[
("Sparse[int]", "int"),
("Sparse[int, 0]", "int"),
("Sparse[int64]", "int64"),
("Sparse[int64, 0]", "int64"),
("Sparse[datetime64[ns], 0]", "datetime64[ns]"),
],
)
def test_parse_subtype(string, expected):
subtype, _ = SparseDtype._parse_subtype(string)
assert subtype == expected
@pytest.mark.parametrize(
"string", ["Sparse[int, 1]", "Sparse[float, 0.0]", "Sparse[bool, True]"]
)
def test_construct_from_string_fill_value_raises(string):
with pytest.raises(TypeError, match="fill_value in the string is not"):
SparseDtype.construct_from_string(string)
@pytest.mark.parametrize(
"original, dtype, expected",
[
(SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
(SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
(SparseDtype(int, 1), str, SparseDtype(object, "1")),
(SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
],
)
def test_update_dtype(original, dtype, expected):
result = original.update_dtype(dtype)
assert result == expected
@pytest.mark.parametrize(
"original, dtype, expected_error_msg",
[
(
SparseDtype(float, np.nan),
int,
re.escape("Cannot convert non-finite values (NA or inf) to integer"),
),
(
SparseDtype(str, "abc"),
int,
re.escape("invalid literal for int() with base 10: 'abc'"),
),
],
)
def test_update_dtype_raises(original, dtype, expected_error_msg):
with pytest.raises(ValueError, match=expected_error_msg):
original.update_dtype(dtype)
def test_repr():
# GH-34352
result = str(SparseDtype("int64", fill_value=0))
expected = "Sparse[int64, 0]"
assert result == expected
result = str(SparseDtype(object, fill_value="0"))
expected = "Sparse[object, '0']"
assert result == expected

View File

@ -0,0 +1,618 @@
import operator
import numpy as np
import pytest
import pandas._libs.sparse as splib
import pandas.util._test_decorators as td
from pandas import Series
import pandas._testing as tm
from pandas.core.arrays.sparse import (
BlockIndex,
IntIndex,
make_sparse_index,
)
TEST_LENGTH = 20
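# Each case below describes two block layouts over a length-20 vector as parallel lists
# of block starts ("loc") and block lengths ("len"), plus the blocks expected from their
# intersection.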
plain_case = {
"xloc": [0, 7, 15],
"xlen": [3, 5, 5],
"yloc": [2, 9, 14],
"ylen": [2, 3, 5],
"intersect_loc": [2, 9, 15],
"intersect_len": [1, 3, 4],
}
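# Reading plain_case: x covers positions 0-2, 7-11 and 15-19, y covers 2-3, 9-11 and
# 14-18, so the intersection is 2, 9-11 and 15-18, i.e. blocks at [2, 9, 15] with
# lengths [1, 3, 4].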
delete_blocks = {
"xloc": [0, 5],
"xlen": [4, 4],
"yloc": [1],
"ylen": [4],
"intersect_loc": [1],
"intersect_len": [3],
}
split_blocks = {
"xloc": [0],
"xlen": [10],
"yloc": [0, 5],
"ylen": [3, 7],
"intersect_loc": [0, 5],
"intersect_len": [3, 5],
}
skip_block = {
"xloc": [10],
"xlen": [5],
"yloc": [0, 12],
"ylen": [5, 3],
"intersect_loc": [12],
"intersect_len": [3],
}
no_intersect = {
"xloc": [0, 10],
"xlen": [4, 6],
"yloc": [5, 17],
"ylen": [4, 2],
"intersect_loc": [],
"intersect_len": [],
}
def check_cases(_check_case):
def _check_case_dict(case):
_check_case(
case["xloc"],
case["xlen"],
case["yloc"],
case["ylen"],
case["intersect_loc"],
case["intersect_len"],
)
_check_case_dict(plain_case)
_check_case_dict(delete_blocks)
_check_case_dict(split_blocks)
_check_case_dict(skip_block)
_check_case_dict(no_intersect)
# one or both is empty
_check_case([0], [5], [], [], [], [])
_check_case([], [], [], [], [], [])
class TestSparseIndexUnion:
def test_index_make_union(self):
def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
bresult = xindex.make_union(yindex)
assert isinstance(bresult, BlockIndex)
tm.assert_numpy_array_equal(bresult.blocs, np.array(eloc, dtype=np.int32))
tm.assert_numpy_array_equal(
bresult.blengths, np.array(elen, dtype=np.int32)
)
ixindex = xindex.to_int_index()
iyindex = yindex.to_int_index()
iresult = ixindex.make_union(iyindex)
assert isinstance(iresult, IntIndex)
tm.assert_numpy_array_equal(iresult.indices, bresult.to_int_index().indices)
"""
x: -----
y:      ----
r: ---------
"""
xloc = [0]
xlen = [5]
yloc = [5]
ylen = [4]
eloc = [0]
elen = [9]
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
"""
x: -----     -----
y:   -----          --
"""
xloc = [0, 10]
xlen = [5, 5]
yloc = [2, 17]
ylen = [5, 2]
eloc = [0, 10, 17]
elen = [7, 5, 2]
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
"""
x:  -----
y:    -----
r:  -------
"""
xloc = [1]
xlen = [5]
yloc = [3]
ylen = [5]
eloc = [1]
elen = [7]
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
"""
x:   ----    ----
y:     --------
r:   ------------
"""
xloc = [2, 10]
xlen = [4, 4]
yloc = [4]
ylen = [8]
eloc = [2]
elen = [12]
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
"""
x: ---  -----
y: -------
r: ----------
"""
xloc = [0, 5]
xlen = [3, 5]
yloc = [0]
ylen = [7]
eloc = [0]
elen = [10]
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
"""
x:   ----    ----
y:     -------- ----
r:   ---------------
"""
xloc = [2, 10]
xlen = [4, 4]
yloc = [4, 13]
ylen = [8, 4]
eloc = [2]
elen = [15]
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
"""
x:   ---------------
y:     ---  --   --
r:   ---------------
"""
xloc = [2]
xlen = [15]
yloc = [4, 9, 14]
ylen = [3, 2, 2]
eloc = [2]
elen = [15]
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
"""
x: ---       ---
y:      --        --
"""
xloc = [0, 10]
xlen = [3, 3]
yloc = [5, 15]
ylen = [2, 2]
eloc = [0, 5, 10, 15]
elen = [3, 2, 3, 2]
_check_case(xloc, xlen, yloc, ylen, eloc, elen)
def test_int_index_make_union(self):
a = IntIndex(5, np.array([0, 3, 4], dtype=np.int32))
b = IntIndex(5, np.array([0, 2], dtype=np.int32))
res = a.make_union(b)
exp = IntIndex(5, np.array([0, 2, 3, 4], np.int32))
assert res.equals(exp)
a = IntIndex(5, np.array([], dtype=np.int32))
b = IntIndex(5, np.array([0, 2], dtype=np.int32))
res = a.make_union(b)
exp = IntIndex(5, np.array([0, 2], np.int32))
assert res.equals(exp)
a = IntIndex(5, np.array([], dtype=np.int32))
b = IntIndex(5, np.array([], dtype=np.int32))
res = a.make_union(b)
exp = IntIndex(5, np.array([], np.int32))
assert res.equals(exp)
a = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
b = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
res = a.make_union(b)
exp = IntIndex(5, np.array([0, 1, 2, 3, 4], np.int32))
assert res.equals(exp)
a = IntIndex(5, np.array([0, 1], dtype=np.int32))
b = IntIndex(4, np.array([0, 1], dtype=np.int32))
msg = "Indices must reference same underlying length"
with pytest.raises(ValueError, match=msg):
a.make_union(b)
class TestSparseIndexIntersect:
@td.skip_if_windows
def test_intersect(self):
def _check_correct(a, b, expected):
result = a.intersect(b)
assert result.equals(expected)
def _check_length_exc(a, longer):
msg = "Indices must reference same underlying length"
with pytest.raises(Exception, match=msg):
a.intersect(longer)
def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
expected = BlockIndex(TEST_LENGTH, eloc, elen)
longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen)
_check_correct(xindex, yindex, expected)
_check_correct(
xindex.to_int_index(), yindex.to_int_index(), expected.to_int_index()
)
_check_length_exc(xindex, longer_index)
_check_length_exc(xindex.to_int_index(), longer_index.to_int_index())
check_cases(_check_case)
def test_intersect_empty(self):
xindex = IntIndex(4, np.array([], dtype=np.int32))
yindex = IntIndex(4, np.array([2, 3], dtype=np.int32))
assert xindex.intersect(yindex).equals(xindex)
assert yindex.intersect(xindex).equals(xindex)
xindex = xindex.to_block_index()
yindex = yindex.to_block_index()
assert xindex.intersect(yindex).equals(xindex)
assert yindex.intersect(xindex).equals(xindex)
def test_intersect_identical(self):
cases = [
IntIndex(5, np.array([1, 2], dtype=np.int32)),
IntIndex(5, np.array([0, 2, 4], dtype=np.int32)),
IntIndex(0, np.array([], dtype=np.int32)),
IntIndex(5, np.array([], dtype=np.int32)),
]
for case in cases:
assert case.intersect(case).equals(case)
case = case.to_block_index()
assert case.intersect(case).equals(case)
class TestSparseIndexCommon:
def test_int_internal(self):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
idx = make_sparse_index(
4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
)
assert isinstance(idx, IntIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
def test_block_internal(self):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 3
tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))
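# lookup(loc) returns the offset of loc within the stored sparse points, or -1 when
# loc falls on a fill (or out-of-range) position.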
def test_lookup(self):
for kind in ["integer", "block"]:
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
assert idx.lookup(-1) == -1
assert idx.lookup(0) == -1
assert idx.lookup(1) == -1
assert idx.lookup(2) == 0
assert idx.lookup(3) == 1
assert idx.lookup(4) == -1
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
for i in range(-1, 5):
assert idx.lookup(i) == -1
idx = make_sparse_index(
4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind
)
assert idx.lookup(-1) == -1
assert idx.lookup(0) == 0
assert idx.lookup(1) == 1
assert idx.lookup(2) == 2
assert idx.lookup(3) == 3
assert idx.lookup(4) == -1
idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
assert idx.lookup(-1) == -1
assert idx.lookup(0) == 0
assert idx.lookup(1) == -1
assert idx.lookup(2) == 1
assert idx.lookup(3) == 2
assert idx.lookup(4) == -1
def test_lookup_array(self):
for kind in ["integer", "block"]:
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
exp = np.array([-1, -1, 0], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
exp = np.array([-1, 0, -1, 1], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32))
exp = np.array([-1, -1, -1, -1], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
idx = make_sparse_index(
4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind
)
res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
exp = np.array([-1, 0, 2], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
exp = np.array([-1, 2, 1, 3], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32))
exp = np.array([1, -1, 2, 0], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32))
exp = np.array([-1, -1, 1, -1], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
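# BlockIndex(20, [5, 12], [3, 6]) stores positions 5-7 and 12-17; lookup maps a
# position to its offset within those stored points (5 -> 0, 7 -> 2, 12 -> 3, 17 -> 8)
# and returns -1 everywhere else.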
@pytest.mark.parametrize(
"idx, expected",
[
[0, -1],
[5, 0],
[7, 2],
[8, -1],
[9, -1],
[10, -1],
[11, -1],
[12, 3],
[17, 8],
[18, -1],
],
)
def test_lookup_basics(self, idx, expected):
bindex = BlockIndex(20, [5, 12], [3, 6])
assert bindex.lookup(idx) == expected
iindex = bindex.to_int_index()
assert iindex.lookup(idx) == expected
class TestBlockIndex:
def test_block_internal(self):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 3
tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))
def test_make_block_boundary(self):
for i in [5, 10, 100, 101]:
idx = make_sparse_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block")
exp = np.arange(0, i, 2, dtype=np.int32)
tm.assert_numpy_array_equal(idx.blocs, exp)
tm.assert_numpy_array_equal(idx.blengths, np.ones(len(exp), dtype=np.int32))
def test_equals(self):
index = BlockIndex(10, [0, 4], [2, 5])
assert index.equals(index)
assert not index.equals(BlockIndex(10, [0, 4], [2, 6]))
def test_check_integrity(self):
locs = []
lengths = []
# 0-length OK
BlockIndex(0, locs, lengths)
# also OK even though empty
BlockIndex(1, locs, lengths)
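# a block starting at 5 with length 10 would run past the end of a length-10 index,
# and blocks [2, 5] with lengths [5, 3] overlap because the first covers positions 2-6
# while the second starts at 5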
msg = "Block 0 extends beyond end"
with pytest.raises(ValueError, match=msg):
BlockIndex(10, [5], [10])
msg = "Block 0 overlaps"
with pytest.raises(ValueError, match=msg):
BlockIndex(10, [2, 5], [5, 3])
def test_to_int_index(self):
locs = [0, 10]
lengths = [4, 6]
exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15]
block = BlockIndex(20, locs, lengths)
dense = block.to_int_index()
tm.assert_numpy_array_equal(dense.indices, np.array(exp_inds, dtype=np.int32))
def test_to_block_index(self):
index = BlockIndex(10, [0, 5], [4, 5])
assert index.to_block_index() is index
class TestIntIndex:
def test_check_integrity(self):
# More indices than specified in self.length
msg = "Too many indices"
with pytest.raises(ValueError, match=msg):
IntIndex(length=1, indices=[1, 2, 3])
# No index can be negative.
msg = "No index can be less than zero"
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, -2, 3])
# No index can be negative.
msg = "No index can be less than zero"
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, -2, 3])
# All indices must be less than the length.
msg = "All indices must be less than the length"
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, 2, 5])
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, 2, 6])
# Indices must be strictly ascending.
msg = "Indices must be strictly increasing"
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, 3, 2])
with pytest.raises(ValueError, match=msg):
IntIndex(length=5, indices=[1, 3, 3])
def test_int_internal(self):
idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
idx = make_sparse_index(
4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
)
assert isinstance(idx, IntIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
def test_equals(self):
index = IntIndex(10, [0, 1, 2, 3, 4])
assert index.equals(index)
assert not index.equals(IntIndex(10, [0, 1, 2, 3]))
def test_to_block_index(self):
def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
# see if they survive the round trip
xbindex = xindex.to_int_index().to_block_index()
ybindex = yindex.to_int_index().to_block_index()
assert isinstance(xbindex, BlockIndex)
assert xbindex.equals(xindex)
assert ybindex.equals(yindex)
check_cases(_check_case)
def test_to_int_index(self):
index = IntIndex(10, [2, 3, 4, 5, 6])
assert index.to_int_index() is index
class TestSparseOperators:
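# Each sparse_<op>_float64 kernel from pandas._libs.sparse is checked against the same
# operation applied to dense Series built by reindexing and filling with the fill values.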
def _op_tests(self, sparse_op, python_op):
def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
xdindex = xindex.to_int_index()
ydindex = yindex.to_int_index()
x = np.arange(xindex.npoints) * 10.0 + 1
y = np.arange(yindex.npoints) * 100.0 + 1
xfill = 0
yfill = 2
result_block_vals, rb_index, bfill = sparse_op(
x, xindex, xfill, y, yindex, yfill
)
result_int_vals, ri_index, ifill = sparse_op(
x, xdindex, xfill, y, ydindex, yfill
)
assert rb_index.to_int_index().equals(ri_index)
tm.assert_numpy_array_equal(result_block_vals, result_int_vals)
assert bfill == ifill
# check versus Series...
xseries = Series(x, xdindex.indices)
xseries = xseries.reindex(np.arange(TEST_LENGTH)).fillna(xfill)
yseries = Series(y, ydindex.indices)
yseries = yseries.reindex(np.arange(TEST_LENGTH)).fillna(yfill)
series_result = python_op(xseries, yseries)
series_result = series_result.reindex(ri_index.indices)
tm.assert_numpy_array_equal(result_block_vals, series_result.values)
tm.assert_numpy_array_equal(result_int_vals, series_result.values)
check_cases(_check_case)
@pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"])
def test_op(self, opname):
sparse_op = getattr(splib, f"sparse_{opname}_float64")
python_op = getattr(operator, opname)
self._op_tests(sparse_op, python_op)

View File

@ -0,0 +1,555 @@
"""
This module tests the functionality of StringArray and ArrowStringArray.
Tests for the str accessors are in pandas/tests/strings/test_string_array.py
"""
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_dtype_equal
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.string_arrow import ArrowStringArray
@pytest.fixture
def dtype(string_storage):
return pd.StringDtype(storage=string_storage)
@pytest.fixture
def cls(dtype):
return dtype.construct_array_type()
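# string_storage is the shared fixture parametrized over the "python" and "pyarrow"
# backends, so every test below runs against both string array implementations.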
def test_repr(dtype):
df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)})
expected = " A\n0 a\n1 <NA>\n2 b"
assert repr(df) == expected
expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string"
assert repr(df.A) == expected
arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray"
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
assert repr(df.A.array) == expected
def test_none_to_nan(cls):
a = cls._from_sequence(["a", None, "b"])
assert a[1] is not None
assert a[1] is pd.NA
def test_setitem_validates(cls):
arr = cls._from_sequence(["a", "b"])
if cls is pd.arrays.StringArray:
msg = "Cannot set non-string value '10' into a StringArray."
else:
msg = "Scalar must be NA or str"
with pytest.raises(ValueError, match=msg):
arr[0] = 10
if cls is pd.arrays.StringArray:
msg = "Must provide strings."
else:
msg = "Scalar must be NA or str"
with pytest.raises(ValueError, match=msg):
arr[:] = np.array([1, 2])
def test_setitem_with_scalar_string(dtype):
# is_float_dtype considers some strings, like 'd', to be floats
# which can cause issues.
arr = pd.array(["a", "c"], dtype=dtype)
arr[0] = "d"
expected = pd.array(["d", "c"], dtype=dtype)
tm.assert_extension_array_equal(arr, expected)
def test_astype_roundtrip(dtype, request):
if dtype.storage == "pyarrow":
reason = "ValueError: Could not convert object to NumPy datetime"
mark = pytest.mark.xfail(reason=reason, raises=ValueError)
request.node.add_marker(mark)
else:
mark = pytest.mark.xfail(
reason="GH#36153 casting from StringArray to dt64 fails", raises=ValueError
)
request.node.add_marker(mark)
ser = pd.Series(pd.date_range("2000", periods=12))
ser[0] = None
casted = ser.astype(dtype)
assert is_dtype_equal(casted.dtype, dtype)
result = casted.astype("datetime64[ns]")
tm.assert_series_equal(result, ser)
def test_add(dtype, request):
if dtype.storage == "pyarrow":
reason = (
"unsupported operand type(s) for +: 'ArrowStringArray' and "
"'ArrowStringArray'"
)
mark = pytest.mark.xfail(raises=TypeError, reason=reason)
request.node.add_marker(mark)
a = pd.Series(["a", "b", "c", None, None], dtype=dtype)
b = pd.Series(["x", "y", None, "z", None], dtype=dtype)
result = a + b
expected = pd.Series(["ax", "by", None, None, None], dtype=dtype)
tm.assert_series_equal(result, expected)
result = a.add(b)
tm.assert_series_equal(result, expected)
result = a.radd(b)
expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype)
tm.assert_series_equal(result, expected)
result = a.add(b, fill_value="-")
expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype)
tm.assert_series_equal(result, expected)
def test_add_2d(dtype, request):
if dtype.storage == "pyarrow":
reason = "Failed: DID NOT RAISE <class 'ValueError'>"
mark = pytest.mark.xfail(raises=None, reason=reason)
request.node.add_marker(mark)
a = pd.array(["a", "b", "c"], dtype=dtype)
b = np.array([["a", "b", "c"]], dtype=object)
with pytest.raises(ValueError, match="3 != 1"):
a + b
s = pd.Series(a)
with pytest.raises(ValueError, match="3 != 1"):
s + b
def test_add_sequence(dtype, request):
if dtype.storage == "pyarrow":
reason = "unsupported operand type(s) for +: 'ArrowStringArray' and 'list'"
mark = pytest.mark.xfail(raises=TypeError, reason=reason)
request.node.add_marker(mark)
a = pd.array(["a", "b", None, None], dtype=dtype)
other = ["x", None, "y", None]
result = a + other
expected = pd.array(["ax", None, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = other + a
expected = pd.array(["xa", None, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_mul(dtype, request):
if dtype.storage == "pyarrow":
reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
mark = pytest.mark.xfail(raises=TypeError, reason=reason)
request.node.add_marker(mark)
a = pd.array(["a", "b", None], dtype=dtype)
result = a * 2
expected = pd.array(["aa", "bb", None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = 2 * a
tm.assert_extension_array_equal(result, expected)
@pytest.mark.xfail(reason="GH-28527")
def test_add_strings(dtype):
arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
df = pd.DataFrame([["t", "u", "v", "w"]])
assert arr.__add__(df) is NotImplemented
result = arr + df
expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype(dtype)
tm.assert_frame_equal(result, expected)
result = df + arr
expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype(dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.xfail(reason="GH-28527")
def test_add_frame(dtype):
arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype)
df = pd.DataFrame([["x", np.nan, "y", np.nan]])
assert arr.__add__(df) is NotImplemented
result = arr + df
expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype)
tm.assert_frame_equal(result, expected)
result = df + arr
expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype)
tm.assert_frame_equal(result, expected)
def test_comparison_methods_scalar(comparison_op, dtype):
op_name = f"__{comparison_op.__name__}__"
a = pd.array(["a", None, "c"], dtype=dtype)
other = "a"
result = getattr(a, op_name)(other)
expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
op_name = f"__{comparison_op.__name__}__"
a = pd.array(["a", None, "c"], dtype=dtype)
result = getattr(a, op_name)(pd.NA)
expected = pd.array([None, None, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_comparison_methods_scalar_not_string(comparison_op, dtype):
op_name = f"__{comparison_op.__name__}__"
a = pd.array(["a", None, "c"], dtype=dtype)
other = 42
if op_name not in ["__eq__", "__ne__"]:
with pytest.raises(TypeError, match="not supported between"):
getattr(a, op_name)(other)
return
result = getattr(a, op_name)(other)
expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[
op_name
]
expected = pd.array(expected_data, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_comparison_methods_array(comparison_op, dtype):
op_name = f"__{comparison_op.__name__}__"
a = pd.array(["a", None, "c"], dtype=dtype)
other = [None, None, "c"]
result = getattr(a, op_name)(other)
expected = np.empty_like(a, dtype="object")
# positions where either operand is NA should compare as NA
expected[:2] = None
expected[-1] = getattr(other[-1], op_name)(a[-1])
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = getattr(a, op_name)(pd.NA)
expected = pd.array([None, None, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_constructor_raises(cls):
if cls is pd.arrays.StringArray:
msg = "StringArray requires a sequence of strings or pandas.NA"
else:
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowStringArray"
with pytest.raises(ValueError, match=msg):
cls(np.array(["a", "b"], dtype="S1"))
with pytest.raises(ValueError, match=msg):
cls(np.array([]))
with pytest.raises(ValueError, match=msg):
cls(np.array(["a", np.nan], dtype=object))
with pytest.raises(ValueError, match=msg):
cls(np.array(["a", None], dtype=object))
with pytest.raises(ValueError, match=msg):
cls(np.array(["a", pd.NaT], dtype=object))
@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy, cls, request):
if cls is ArrowStringArray and copy is False:
mark = pytest.mark.xfail(
raises=AssertionError, reason="numpy array are different"
)
request.node.add_marker(mark)
nan_arr = np.array(["a", np.nan], dtype=object)
na_arr = np.array(["a", pd.NA], dtype=object)
result = cls._from_sequence(nan_arr, copy=copy)
if cls is ArrowStringArray:
import pyarrow as pa
expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True))
else:
expected = cls(na_arr)
tm.assert_extension_array_equal(result, expected)
expected = nan_arr if copy else na_arr
tm.assert_numpy_array_equal(nan_arr, expected)
def test_astype_int(dtype):
arr = pd.array(["1", "2", "3"], dtype=dtype)
result = arr.astype("int64")
expected = np.array([1, 2, 3], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
arr = pd.array(["1", pd.NA, "3"], dtype=dtype)
msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number"
with pytest.raises(TypeError, match=msg):
arr.astype("int64")
def test_astype_nullable_int(dtype):
arr = pd.array(["1", pd.NA, "3"], dtype=dtype)
result = arr.astype("Int64")
expected = pd.array([1, pd.NA, 3], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
def test_astype_float(dtype, any_float_dtype):
# Don't compare arrays (37974)
ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype)
result = ser.astype(any_float_dtype)
expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.xfail(reason="Not implemented StringArray.sum")
def test_reduce(skipna, dtype):
arr = pd.Series(["a", "b", "c"], dtype=dtype)
result = arr.sum(skipna=skipna)
assert result == "abc"
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.xfail(reason="Not implemented StringArray.sum")
def test_reduce_missing(skipna, dtype):
arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype)
result = arr.sum(skipna=skipna)
if skipna:
assert result == "abc"
else:
assert pd.isna(result)
@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max(method, skipna, dtype, request):
if dtype.storage == "pyarrow":
reason = "'ArrowStringArray' object has no attribute 'max'"
mark = pytest.mark.xfail(raises=TypeError, reason=reason)
request.node.add_marker(mark)
arr = pd.Series(["a", "b", "c", None], dtype=dtype)
result = getattr(arr, method)(skipna=skipna)
if skipna:
expected = "a" if method == "min" else "c"
assert result == expected
else:
assert result is pd.NA
@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("box", [pd.Series, pd.array])
def test_min_max_numpy(method, box, dtype, request):
if dtype.storage == "pyarrow":
if box is pd.array:
reason = "'<=' not supported between instances of 'str' and 'NoneType'"
else:
reason = "'ArrowStringArray' object has no attribute 'max'"
mark = pytest.mark.xfail(raises=TypeError, reason=reason)
request.node.add_marker(mark)
arr = box(["a", "b", "c", None], dtype=dtype)
result = getattr(np, method)(arr)
expected = "a" if method == "min" else "c"
assert result == expected
def test_fillna_args(dtype, request):
# GH 37987
if dtype.storage == "pyarrow":
reason = (
"Regex pattern \"Cannot set non-string value '1' into "
"a StringArray.\" does not match 'Scalar must be NA or str'"
)
mark = pytest.mark.xfail(raises=AssertionError, reason=reason)
request.node.add_marker(mark)
arr = pd.array(["a", pd.NA], dtype=dtype)
res = arr.fillna(value="b")
expected = pd.array(["a", "b"], dtype=dtype)
tm.assert_extension_array_equal(res, expected)
res = arr.fillna(value=np.str_("b"))
expected = pd.array(["a", "b"], dtype=dtype)
tm.assert_extension_array_equal(res, expected)
msg = "Cannot set non-string value '1' into a StringArray."
with pytest.raises(ValueError, match=msg):
arr.fillna(value=1)
@td.skip_if_no("pyarrow")
def test_arrow_array(dtype):
# protocol added in 0.15.0
import pyarrow as pa
data = pd.array(["a", "b", "c"], dtype=dtype)
arr = pa.array(data)
expected = pa.array(list(data), type=pa.string(), from_pandas=True)
if dtype.storage == "pyarrow":
expected = pa.chunked_array(expected)
assert arr.equals(expected)
@td.skip_if_no("pyarrow")
def test_arrow_roundtrip(dtype, string_storage2):
# roundtrip possible from arrow 1.0.0
import pyarrow as pa
data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
with pd.option_context("string_storage", string_storage2):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage2}]")
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is pd.NA
@td.skip_if_no("pyarrow")
def test_arrow_load_from_zero_chunks(dtype, string_storage2):
# GH-41040
import pyarrow as pa
data = pd.array([], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
# Instantiate the same table with no chunks at all
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
with pd.option_context("string_storage", string_storage2):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage2}]")
tm.assert_frame_equal(result, expected)
def test_value_counts_na(dtype):
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64")
tm.assert_series_equal(result, expected)
result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=arr[:2], dtype="Int64")
tm.assert_series_equal(result, expected)
def test_value_counts_with_normalize(dtype):
ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"values, expected",
[
(["a", "b", "c"], np.array([False, False, False])),
(["a", "b", None], np.array([False, False, True])),
],
)
def test_use_inf_as_na(values, expected, dtype):
# https://github.com/pandas-dev/pandas/issues/33655
values = pd.array(values, dtype=dtype)
with pd.option_context("mode.use_inf_as_na", True):
result = values.isna()
tm.assert_numpy_array_equal(result, expected)
result = pd.Series(values).isna()
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)
result = pd.DataFrame(values).isna()
expected = pd.DataFrame(expected)
tm.assert_frame_equal(result, expected)
def test_memory_usage(dtype):
# GH 33963
if dtype.storage == "pyarrow":
pytest.skip(f"not applicable for {dtype.storage}")
series = pd.Series(["a", "b", "c"], dtype=dtype)
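# memory_usage() also counts the index on top of nbytes; deep=True additionally counts
# the per-element Python string objects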
assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True)
@pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64])
def test_astype_from_float_dtype(float_dtype, dtype):
# https://github.com/pandas-dev/pandas/issues/36451
ser = pd.Series([0.1], dtype=float_dtype)
result = ser.astype(dtype)
expected = pd.Series(["0.1"], dtype=dtype)
tm.assert_series_equal(result, expected)
def test_to_numpy_returns_pdna_default(dtype):
arr = pd.array(["a", pd.NA, "b"], dtype=dtype)
result = np.array(arr)
expected = np.array(["a", pd.NA, "b"], dtype=object)
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_na_value(dtype, nulls_fixture):
na_value = nulls_fixture
arr = pd.array(["a", pd.NA, "b"], dtype=dtype)
result = arr.to_numpy(na_value=na_value)
expected = np.array(["a", na_value, "b"], dtype=object)
tm.assert_numpy_array_equal(result, expected)
def test_isin(dtype, request, fixed_now_ts):
s = pd.Series(["a", "b", None], dtype=dtype)
result = s.isin(["a", "c"])
expected = pd.Series([True, False, False])
tm.assert_series_equal(result, expected)
result = s.isin(["a", pd.NA])
expected = pd.Series([True, False, True])
tm.assert_series_equal(result, expected)
result = s.isin([])
expected = pd.Series([False, False, False])
tm.assert_series_equal(result, expected)
result = s.isin(["a", fixed_now_ts])
expected = pd.Series([True, False, False])
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,134 @@
import re
import numpy as np
import pytest
from pandas.compat import pa_version_under1p01
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.string_ import (
StringArray,
StringDtype,
)
from pandas.core.arrays.string_arrow import ArrowStringArray
skip_if_no_pyarrow = pytest.mark.skipif(
pa_version_under1p01,
reason="pyarrow>=1.0.0 is required for PyArrow backed StringArray",
)
@skip_if_no_pyarrow
def test_eq_all_na():
a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow"))
result = a == a
expected = pd.array([pd.NA, pd.NA], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_config(string_storage):
with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
result = pd.array(["a", "b"])
assert result.dtype.storage == string_storage
expected = (
StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"])
)
tm.assert_equal(result, expected)
def test_config_bad_storage_raises():
msg = re.escape("Value must be one of python|pyarrow")
with pytest.raises(ValueError, match=msg):
pd.options.mode.string_storage = "foo"
@skip_if_no_pyarrow
@pytest.mark.parametrize("chunked", [True, False])
@pytest.mark.parametrize("array", ["numpy", "pyarrow"])
def test_constructor_not_string_type_raises(array, chunked):
import pyarrow as pa
array = pa if array == "pyarrow" else np
arr = array.array([1, 2, 3])
if chunked:
if array is np:
pytest.skip("chunked not applicable to numpy array")
arr = pa.chunked_array(arr)
if array is np:
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowStringArray"
else:
msg = re.escape(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
)
with pytest.raises(ValueError, match=msg):
ArrowStringArray(arr)
@skip_if_no_pyarrow
def test_from_sequence_wrong_dtype_raises():
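# constructing via _from_sequence with a dtype whose storage does not match the
# concrete array class should raise an AssertionError; matching combinations should
# construct without error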
with pd.option_context("string_storage", "python"):
ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")
with pd.option_context("string_storage", "pyarrow"):
ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")
with pytest.raises(AssertionError, match=None):
ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]")
ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")
with pytest.raises(AssertionError, match=None):
with pd.option_context("string_storage", "python"):
ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
with pd.option_context("string_storage", "pyarrow"):
ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
with pytest.raises(AssertionError, match=None):
ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))
ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
with pd.option_context("string_storage", "python"):
StringArray._from_sequence(["a", None, "c"], dtype="string")
with pd.option_context("string_storage", "pyarrow"):
StringArray._from_sequence(["a", None, "c"], dtype="string")
StringArray._from_sequence(["a", None, "c"], dtype="string[python]")
with pytest.raises(AssertionError, match=None):
StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")
with pd.option_context("string_storage", "python"):
StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
with pytest.raises(AssertionError, match=None):
with pd.option_context("string_storage", "pyarrow"):
StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))
with pytest.raises(AssertionError, match=None):
StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
@pytest.mark.skipif(
not pa_version_under1p01,
reason="pyarrow is installed",
)
def test_pyarrow_not_installed_raises():
msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed StringArray")
with pytest.raises(ImportError, match=msg):
StringDtype(storage="pyarrow")
with pytest.raises(ImportError, match=msg):
ArrowStringArray([])
with pytest.raises(ImportError, match=msg):
ArrowStringArray._from_sequence(["a", None, "b"])

View File

@ -0,0 +1,451 @@
import datetime
import decimal
import numpy as np
import pytest
import pytz
from pandas.core.dtypes.base import _registry as registry
import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import register_extension_dtype
from pandas.api.types import is_scalar
from pandas.arrays import (
BooleanArray,
DatetimeArray,
FloatingArray,
IntegerArray,
IntervalArray,
SparseArray,
TimedeltaArray,
)
from pandas.core.arrays import (
PandasArray,
period_array,
)
from pandas.tests.extension.decimal import (
DecimalArray,
DecimalDtype,
to_decimal,
)
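# pd.array should construct the expected ExtensionArray (or PandasArray fallback) for
# each data/dtype combination below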
@pytest.mark.parametrize(
"data, dtype, expected",
[
# Basic NumPy defaults.
([1, 2], None, IntegerArray._from_sequence([1, 2])),
([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
(
[1, 2],
np.dtype("float32"),
PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
),
(np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2])),
(
np.array([1.0, 2.0], dtype="float64"),
None,
FloatingArray._from_sequence([1.0, 2.0]),
),
# String alias passes through to NumPy
([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))),
([1, 2], "int64", PandasArray(np.array([1, 2], dtype=np.int64))),
# GH#44715 FloatingArray does not support float16, so fall back to PandasArray
(
np.array([1, 2], dtype=np.float16),
None,
PandasArray(np.array([1, 2], dtype=np.float16)),
),
# idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64"))
(
PandasArray(np.array([1, 2], dtype=np.int32)),
None,
PandasArray(np.array([1, 2], dtype=np.int32)),
),
# Period alias
(
[pd.Period("2000", "D"), pd.Period("2001", "D")],
"Period[D]",
period_array(["2000", "2001"], freq="D"),
),
# Period dtype
(
[pd.Period("2000", "D")],
pd.PeriodDtype("D"),
period_array(["2000"], freq="D"),
),
# Datetime (naive)
(
[1, 2],
np.dtype("datetime64[ns]"),
DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
),
(
np.array([1, 2], dtype="datetime64[ns]"),
None,
DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
),
(
pd.DatetimeIndex(["2000", "2001"]),
np.dtype("datetime64[ns]"),
DatetimeArray._from_sequence(["2000", "2001"]),
),
(
pd.DatetimeIndex(["2000", "2001"]),
None,
DatetimeArray._from_sequence(["2000", "2001"]),
),
(
["2000", "2001"],
np.dtype("datetime64[ns]"),
DatetimeArray._from_sequence(["2000", "2001"]),
),
# Datetime (tz-aware)
(
["2000", "2001"],
pd.DatetimeTZDtype(tz="CET"),
DatetimeArray._from_sequence(
["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
),
),
# Timedelta
(
["1H", "2H"],
np.dtype("timedelta64[ns]"),
TimedeltaArray._from_sequence(["1H", "2H"]),
),
(
pd.TimedeltaIndex(["1H", "2H"]),
np.dtype("timedelta64[ns]"),
TimedeltaArray._from_sequence(["1H", "2H"]),
),
(
pd.TimedeltaIndex(["1H", "2H"]),
None,
TimedeltaArray._from_sequence(["1H", "2H"]),
),
# Category
(["a", "b"], "category", pd.Categorical(["a", "b"])),
(
["a", "b"],
pd.CategoricalDtype(None, ordered=True),
pd.Categorical(["a", "b"], ordered=True),
),
# Interval
(
[pd.Interval(1, 2), pd.Interval(3, 4)],
"interval",
IntervalArray.from_tuples([(1, 2), (3, 4)]),
),
# Sparse
([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
# IntegerNA
([1, None], "Int16", pd.array([1, None], dtype="Int16")),
(pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
# String
(
["a", None],
"string",
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
(
["a", None],
pd.StringDtype(),
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
# Boolean
([True, None], "boolean", BooleanArray._from_sequence([True, None])),
([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])),
# Index
(pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
# Series[EA] returns the EA
(
pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
None,
pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
),
# "3rd party" EAs work
([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
# pass an ExtensionArray, but a different dtype
(
period_array(["2000", "2001"], freq="D"),
"category",
pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
),
],
)
def test_array(data, dtype, expected):
result = pd.array(data, dtype=dtype)
tm.assert_equal(result, expected)
def test_array_copy():
a = np.array([1, 2])
# default is to copy
b = pd.array(a, dtype=a.dtype)
assert not tm.shares_memory(a, b)
# copy=True
b = pd.array(a, dtype=a.dtype, copy=True)
assert not tm.shares_memory(a, b)
# copy=False
b = pd.array(a, dtype=a.dtype, copy=False)
assert tm.shares_memory(a, b)
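# Usage sketch, not part of the original suite; it relies only on the copy
# semantics asserted in test_array_copy (the helper name is illustrative).
def _sketch_copy_false_shares_buffer():
    src = np.array([1, 2])
    view = pd.array(src, dtype=src.dtype, copy=False)
    view[0] = 99
    # copy=False wraps the caller's buffer, so the edit is visible on src too
    assert src[0] == 99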
cet = pytz.timezone("CET")
@pytest.mark.parametrize(
"data, expected",
[
# period
(
[pd.Period("2000", "D"), pd.Period("2001", "D")],
period_array(["2000", "2001"], freq="D"),
),
# interval
([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
# datetime
(
[pd.Timestamp("2000"), pd.Timestamp("2001")],
DatetimeArray._from_sequence(["2000", "2001"]),
),
(
[datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
DatetimeArray._from_sequence(["2000", "2001"]),
),
(
np.array([1, 2], dtype="M8[ns]"),
DatetimeArray(np.array([1, 2], dtype="M8[ns]")),
),
(
np.array([1, 2], dtype="M8[us]"),
DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")),
),
# datetimetz
(
[pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
DatetimeArray._from_sequence(
["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
),
),
(
[
datetime.datetime(2000, 1, 1, tzinfo=cet),
datetime.datetime(2001, 1, 1, tzinfo=cet),
],
DatetimeArray._from_sequence(
["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet)
),
),
# timedelta
(
[pd.Timedelta("1H"), pd.Timedelta("2H")],
TimedeltaArray._from_sequence(["1H", "2H"]),
),
(
np.array([1, 2], dtype="m8[ns]"),
TimedeltaArray(np.array([1, 2], dtype="m8[ns]")),
),
(
np.array([1, 2], dtype="m8[us]"),
TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")),
),
# integer
([1, 2], IntegerArray._from_sequence([1, 2])),
([1, None], IntegerArray._from_sequence([1, None])),
([1, pd.NA], IntegerArray._from_sequence([1, pd.NA])),
([1, np.nan], IntegerArray._from_sequence([1, np.nan])),
# float
([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2])),
([0.1, None], FloatingArray._from_sequence([0.1, pd.NA])),
([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA])),
([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA])),
# integer-like float
([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
([1.0, None], FloatingArray._from_sequence([1.0, pd.NA])),
([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA])),
([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA])),
# mixed-integer-float
([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])),
# string
(
["a", "b"],
pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]),
),
(
["a", None],
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
# Boolean
([True, False], BooleanArray._from_sequence([True, False])),
([True, None], BooleanArray._from_sequence([True, None])),
],
)
def test_array_inference(data, expected):
result = pd.array(data)
tm.assert_equal(result, expected)
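# Small sketch consistent with the inference cases above (helper name is
# illustrative): without an explicit dtype, missing values are inferred into
# the NA-aware masked Int64/Float64 arrays rather than object dtype.
def _sketch_masked_inference():
    ints = pd.array([1, None])
    floats = pd.array([0.1, np.nan])
    assert ints.dtype == "Int64" and ints[1] is pd.NA
    assert floats.dtype == "Float64" and floats[1] is pd.NA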
@pytest.mark.parametrize(
"data",
[
# mix of frequencies
[pd.Period("2000", "D"), pd.Period("2001", "A")],
# mix of closed
[pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
# Mix of timezones
[pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
# Mix of tz-aware and tz-naive
[pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
],
)
def test_array_inference_fails(data):
result = pd.array(data)
expected = PandasArray(np.array(data, dtype=object))
tm.assert_extension_array_equal(result, expected)
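# Sketch mirroring the fallback asserted above (helper name is illustrative):
# ambiguous mixtures are not coerced; the result is an object-dtype
# PandasArray that still holds the original scalars.
def _sketch_ambiguous_mixture_fallback():
    mixed = pd.array([pd.Period("2000", "D"), pd.Period("2001", "A")])
    assert isinstance(mixed, PandasArray)
    assert isinstance(mixed[0], pd.Period)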
@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"):
pd.array(data, dtype="int64")
def test_scalar_raises():
with pytest.raises(ValueError, match="Cannot pass scalar '1'"):
pd.array(1)
def test_bounds_check():
# GH21796
with pytest.raises(
TypeError, match=r"cannot safely cast non-equivalent int(32|64) to uint16"
):
pd.array([-1, 2, 3], dtype="UInt16")
# ---------------------------------------------------------------------------
# A couple dummy classes to ensure that Series and Indexes are unboxed before
# getting to the EA classes.
@register_extension_dtype
class DecimalDtype2(DecimalDtype):
name = "decimal2"
@classmethod
def construct_array_type(cls):
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return DecimalArray2
class DecimalArray2(DecimalArray):
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
if isinstance(scalars, (pd.Series, pd.Index)):
raise TypeError("scalars should not be of type pd.Series or pd.Index")
return super()._from_sequence(scalars, dtype=dtype, copy=copy)
def test_array_unboxes(index_or_series):
box = index_or_series
data = box([decimal.Decimal("1"), decimal.Decimal("2")])
    # sanity check: _from_sequence itself rejects Series/Index input

with pytest.raises(
TypeError, match="scalars should not be of type pd.Series or pd.Index"
):
DecimalArray2._from_sequence(data)
result = pd.array(data, dtype="decimal2")
expected = DecimalArray2._from_sequence(data.values)
tm.assert_equal(result, expected)
@pytest.fixture
def registry_without_decimal():
idx = registry.dtypes.index(DecimalDtype)
registry.dtypes.pop(idx)
yield
registry.dtypes.append(DecimalDtype)
def test_array_not_registered(registry_without_decimal):
# check we aren't on it
assert registry.find("decimal") is None
data = [decimal.Decimal("1"), decimal.Decimal("2")]
result = pd.array(data, dtype=DecimalDtype)
expected = DecimalArray._from_sequence(data)
tm.assert_equal(result, expected)
class TestArrayAnalytics:
def test_searchsorted(self, string_dtype):
arr = pd.array(["a", "b", "c"], dtype=string_dtype)
result = arr.searchsorted("a", side="left")
assert is_scalar(result)
assert result == 0
result = arr.searchsorted("a", side="right")
assert is_scalar(result)
assert result == 1
def test_searchsorted_numeric_dtypes_scalar(self, any_real_numpy_dtype):
arr = pd.array([1, 3, 90], dtype=any_real_numpy_dtype)
result = arr.searchsorted(30)
assert is_scalar(result)
assert result == 2
result = arr.searchsorted([30])
expected = np.array([2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
def test_searchsorted_numeric_dtypes_vector(self, any_real_numpy_dtype):
arr = pd.array([1, 3, 90], dtype=any_real_numpy_dtype)
result = arr.searchsorted([2, 30])
expected = np.array([1, 2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize(
"arr, val",
[
[
pd.date_range("20120101", periods=10, freq="2D"),
pd.Timestamp("20120102"),
],
[
pd.date_range("20120101", periods=10, freq="2D", tz="Asia/Hong_Kong"),
pd.Timestamp("20120102", tz="Asia/Hong_Kong"),
],
[
pd.timedelta_range(start="1 day", end="10 days", periods=10),
pd.Timedelta("2 days"),
],
],
)
def test_search_sorted_datetime64_scalar(self, arr, val):
arr = pd.array(arr)
result = arr.searchsorted(val)
assert is_scalar(result)
assert result == 1
def test_searchsorted_sorter(self, any_real_numpy_dtype):
arr = pd.array([3, 1, 2], dtype=any_real_numpy_dtype)
result = arr.searchsorted([0, 3], sorter=np.argsort(arr))
expected = np.array([0, 2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
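        # note (assumption, for readers): ``sorter`` is the permutation that
        # sorts ``arr``, so the returned positions refer to the sorted order
        # ([1, 2, 3] here), not to positions in the original unsorted array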

File diff suppressed because it is too large

View File

@ -0,0 +1,408 @@
"""
Tests for DatetimeArray
"""
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestDatetimeArrayComparisons:
# TODO: merge this into tests/arithmetic/test_datetime64 once it is
# sufficiently robust
def test_cmp_dt64_arraylike_tznaive(self, comparison_op):
# arbitrary tz-naive DatetimeIndex
op = comparison_op
dti = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None)
arr = DatetimeArray(dti)
assert arr.freq == dti.freq
assert arr.tz == dti.tz
right = dti
expected = np.ones(len(arr), dtype=bool)
if comparison_op.__name__ in ["ne", "gt", "lt"]:
# for these the comparisons should be all-False
expected = ~expected
result = op(arr, arr)
tm.assert_numpy_array_equal(result, expected)
for other in [
right,
np.array(right),
list(right),
tuple(right),
right.astype(object),
]:
result = op(arr, other)
tm.assert_numpy_array_equal(result, expected)
result = op(other, arr)
tm.assert_numpy_array_equal(result, expected)
class TestDatetimeArray:
def test_astype_to_same(self):
arr = DatetimeArray._from_sequence(
["2000"], dtype=DatetimeTZDtype(tz="US/Central")
)
result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False)
assert result is arr
@pytest.mark.parametrize("dtype", ["datetime64[ns]", "datetime64[ns, UTC]"])
@pytest.mark.parametrize(
"other", ["datetime64[ns]", "datetime64[ns, UTC]", "datetime64[ns, CET]"]
)
def test_astype_copies(self, dtype, other):
# https://github.com/pandas-dev/pandas/pull/32490
ser = pd.Series([1, 2], dtype=dtype)
orig = ser.copy()
warn = None
if (dtype == "datetime64[ns]") ^ (other == "datetime64[ns]"):
# deprecated in favor of tz_localize
warn = FutureWarning
with tm.assert_produces_warning(warn):
t = ser.astype(other)
t[:] = pd.NaT
tm.assert_series_equal(ser, orig)
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
def test_astype_int(self, dtype):
arr = DatetimeArray._from_sequence([pd.Timestamp("2000"), pd.Timestamp("2001")])
result = arr.astype(dtype)
if np.dtype(dtype).kind == "u":
expected_dtype = np.dtype("uint64")
else:
expected_dtype = np.dtype("int64")
expected = arr.astype(expected_dtype)
assert result.dtype == expected_dtype
tm.assert_numpy_array_equal(result, expected)
def test_tz_setter_raises(self):
arr = DatetimeArray._from_sequence(
["2000"], dtype=DatetimeTZDtype(tz="US/Central")
)
with pytest.raises(AttributeError, match="tz_localize"):
arr.tz = "UTC"
def test_setitem_str_impute_tz(self, tz_naive_fixture):
# Like for getitem, if we are passed a naive-like string, we impute
# our own timezone.
tz = tz_naive_fixture
data = np.array([1, 2, 3], dtype="M8[ns]")
dtype = data.dtype if tz is None else DatetimeTZDtype(tz=tz)
arr = DatetimeArray(data, dtype=dtype)
expected = arr.copy()
ts = pd.Timestamp("2020-09-08 16:50").tz_localize(tz)
setter = str(ts.tz_localize(None))
# Setting a scalar tznaive string
expected[0] = ts
arr[0] = setter
tm.assert_equal(arr, expected)
# Setting a listlike of tznaive strings
expected[1] = ts
arr[:2] = [setter, setter]
tm.assert_equal(arr, expected)
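    # illustrative note (assumption restating the behavior above): with a
    # CET-aware array, ``arr[0] = "2020-09-08 16:50"`` stores the same instant
    # as ``arr[0] = pd.Timestamp("2020-09-08 16:50", tz="CET")``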
def test_setitem_different_tz_raises(self):
data = np.array([1, 2, 3], dtype="M8[ns]")
arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central"))
with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"):
arr[0] = pd.Timestamp("2000")
ts = pd.Timestamp("2000", tz="US/Eastern")
with pytest.raises(ValueError, match="US/Central"):
with tm.assert_produces_warning(
FutureWarning, match="mismatched timezones"
):
arr[0] = ts
# once deprecation is enforced
# assert arr[0] == ts.tz_convert("US/Central")
def test_setitem_clears_freq(self):
a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central"))
a[0] = pd.Timestamp("2000", tz="US/Central")
assert a.freq is None
@pytest.mark.parametrize(
"obj",
[
pd.Timestamp("2021-01-01"),
pd.Timestamp("2021-01-01").to_datetime64(),
pd.Timestamp("2021-01-01").to_pydatetime(),
],
)
def test_setitem_objects(self, obj):
# make sure we accept datetime64 and datetime in addition to Timestamp
dti = pd.date_range("2000", periods=2, freq="D")
arr = dti._data
arr[0] = obj
assert arr[0] == obj
def test_repeat_preserves_tz(self):
dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central")
arr = DatetimeArray(dti)
repeated = arr.repeat([1, 1])
# preserves tz and values, but not freq
expected = DatetimeArray(arr.asi8, freq=None, dtype=arr.dtype)
tm.assert_equal(repeated, expected)
def test_value_counts_preserves_tz(self):
dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central")
arr = DatetimeArray(dti).repeat([4, 3])
result = arr.value_counts()
# Note: not tm.assert_index_equal, since `freq`s do not match
assert result.index.equals(dti)
arr[-2] = pd.NaT
result = arr.value_counts(dropna=False)
expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["pad", "backfill"])
def test_fillna_preserves_tz(self, method):
dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central")
arr = DatetimeArray(dti, copy=True)
arr[2] = pd.NaT
fill_val = dti[1] if method == "pad" else dti[3]
expected = DatetimeArray._from_sequence(
[dti[0], dti[1], fill_val, dti[3], dti[4]],
dtype=DatetimeTZDtype(tz="US/Central"),
)
result = arr.fillna(method=method)
tm.assert_extension_array_equal(result, expected)
# assert that arr and dti were not modified in-place
assert arr[2] is pd.NaT
assert dti[2] == pd.Timestamp("2000-01-03", tz="US/Central")
def test_fillna_2d(self):
dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
dta = dti._data.reshape(3, 2).copy()
dta[0, 1] = pd.NaT
dta[1, 0] = pd.NaT
res1 = dta.fillna(method="pad")
expected1 = dta.copy()
expected1[1, 0] = dta[0, 0]
tm.assert_extension_array_equal(res1, expected1)
res2 = dta.fillna(method="backfill")
        expected2 = dta.copy()
expected2[1, 0] = dta[2, 0]
expected2[0, 1] = dta[1, 1]
tm.assert_extension_array_equal(res2, expected2)
# with different ordering for underlying ndarray; behavior should
# be unchanged
dta2 = dta._from_backing_data(dta._ndarray.copy(order="F"))
assert dta2._ndarray.flags["F_CONTIGUOUS"]
assert not dta2._ndarray.flags["C_CONTIGUOUS"]
tm.assert_extension_array_equal(dta, dta2)
res3 = dta2.fillna(method="pad")
tm.assert_extension_array_equal(res3, expected1)
res4 = dta2.fillna(method="backfill")
tm.assert_extension_array_equal(res4, expected2)
# test the DataFrame method while we're here
df = pd.DataFrame(dta)
res = df.fillna(method="pad")
expected = pd.DataFrame(expected1)
tm.assert_frame_equal(res, expected)
res = df.fillna(method="backfill")
expected = pd.DataFrame(expected2)
tm.assert_frame_equal(res, expected)
def test_array_interface_tz(self):
tz = "US/Central"
data = DatetimeArray(pd.date_range("2017", periods=2, tz=tz))
result = np.asarray(data)
expected = np.array(
[
pd.Timestamp("2017-01-01T00:00:00", tz=tz),
pd.Timestamp("2017-01-02T00:00:00", tz=tz),
],
dtype=object,
)
tm.assert_numpy_array_equal(result, expected)
result = np.asarray(data, dtype=object)
tm.assert_numpy_array_equal(result, expected)
result = np.asarray(data, dtype="M8[ns]")
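        # asking for M8[ns] drops the timezone but keeps the UTC instant, so
        # the US/Central midnights appear below as 06:00 UTC wall times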
expected = np.array(
["2017-01-01T06:00:00", "2017-01-02T06:00:00"], dtype="M8[ns]"
)
tm.assert_numpy_array_equal(result, expected)
def test_array_interface(self):
data = DatetimeArray(pd.date_range("2017", periods=2))
expected = np.array(
["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]"
)
result = np.asarray(data)
tm.assert_numpy_array_equal(result, expected)
result = np.asarray(data, dtype=object)
expected = np.array(
[pd.Timestamp("2017-01-01T00:00:00"), pd.Timestamp("2017-01-02T00:00:00")],
dtype=object,
)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("index", [True, False])
def test_searchsorted_different_tz(self, index):
data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
arr = DatetimeArray(data, freq="D").tz_localize("Asia/Tokyo")
if index:
arr = pd.Index(arr)
expected = arr.searchsorted(arr[2])
result = arr.searchsorted(arr[2].tz_convert("UTC"))
assert result == expected
expected = arr.searchsorted(arr[2:6])
result = arr.searchsorted(arr[2:6].tz_convert("UTC"))
tm.assert_equal(result, expected)
@pytest.mark.parametrize("index", [True, False])
def test_searchsorted_tzawareness_compat(self, index):
data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
arr = DatetimeArray(data, freq="D")
if index:
arr = pd.Index(arr)
mismatch = arr.tz_localize("Asia/Tokyo")
msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
with pytest.raises(TypeError, match=msg):
arr.searchsorted(mismatch[0])
with pytest.raises(TypeError, match=msg):
arr.searchsorted(mismatch)
with pytest.raises(TypeError, match=msg):
mismatch.searchsorted(arr[0])
with pytest.raises(TypeError, match=msg):
mismatch.searchsorted(arr)
@pytest.mark.parametrize(
"other",
[
1,
np.int64(1),
1.0,
np.timedelta64("NaT"),
pd.Timedelta(days=2),
"invalid",
np.arange(10, dtype="i8") * 24 * 3600 * 10**9,
np.arange(10).view("timedelta64[ns]") * 24 * 3600 * 10**9,
pd.Timestamp("2021-01-01").to_period("D"),
],
)
@pytest.mark.parametrize("index", [True, False])
def test_searchsorted_invalid_types(self, other, index):
data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
arr = DatetimeArray(data, freq="D")
if index:
arr = pd.Index(arr)
msg = "|".join(
[
"searchsorted requires compatible dtype or scalar",
"value should be a 'Timestamp', 'NaT', or array of those. Got",
]
)
with pytest.raises(TypeError, match=msg):
arr.searchsorted(other)
def test_shift_fill_value(self):
dti = pd.date_range("2016-01-01", periods=3)
dta = dti._data
expected = DatetimeArray(np.roll(dta._data, 1))
fv = dta[-1]
for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]:
result = dta.shift(1, fill_value=fill_value)
tm.assert_datetime_array_equal(result, expected)
dta = dta.tz_localize("UTC")
expected = expected.tz_localize("UTC")
fv = dta[-1]
for fill_value in [fv, fv.to_pydatetime()]:
result = dta.shift(1, fill_value=fill_value)
tm.assert_datetime_array_equal(result, expected)
def test_shift_value_tzawareness_mismatch(self):
dti = pd.date_range("2016-01-01", periods=3)
dta = dti._data
fv = dta[-1].tz_localize("UTC")
for invalid in [fv, fv.to_pydatetime()]:
with pytest.raises(TypeError, match="Cannot compare"):
dta.shift(1, fill_value=invalid)
dta = dta.tz_localize("UTC")
fv = dta[-1].tz_localize(None)
for invalid in [fv, fv.to_pydatetime(), fv.to_datetime64()]:
with pytest.raises(TypeError, match="Cannot compare"):
dta.shift(1, fill_value=invalid)
def test_shift_requires_tzmatch(self):
# since filling is setitem-like, we require a matching timezone,
        # not just matching tz-awareness
dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
dta = dti._data
fill_value = pd.Timestamp("2020-10-18 18:44", tz="US/Pacific")
msg = "Timezones don't match. 'UTC' != 'US/Pacific'"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match="mismatched timezones"
):
dta.shift(1, fill_value=fill_value)
# once deprecation is enforced
# expected = dta.shift(1, fill_value=fill_value.tz_convert("UTC"))
# tm.assert_equal(result, expected)
def test_tz_localize_t2d(self):
dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")
dta = dti._data.reshape(3, 4)
result = dta.tz_localize(None)
expected = dta.ravel().tz_localize(None).reshape(dta.shape)
tm.assert_datetime_array_equal(result, expected)
roundtrip = expected.tz_localize("US/Pacific")
tm.assert_datetime_array_equal(roundtrip, dta)

View File

@ -0,0 +1,75 @@
"""
Tests for subclasses of NDArrayBackedExtensionArray
"""
import numpy as np
from pandas import (
CategoricalIndex,
date_range,
)
from pandas.core.arrays import (
Categorical,
DatetimeArray,
PandasArray,
TimedeltaArray,
)
class TestEmpty:
def test_empty_categorical(self):
ci = CategoricalIndex(["a", "b", "c"], ordered=True)
dtype = ci.dtype
# case with int8 codes
shape = (4,)
result = Categorical._empty(shape, dtype=dtype)
assert isinstance(result, Categorical)
assert result.shape == shape
assert result._ndarray.dtype == np.int8
# case where repr would segfault if we didn't override base implementation
result = Categorical._empty((4096,), dtype=dtype)
assert isinstance(result, Categorical)
assert result.shape == (4096,)
assert result._ndarray.dtype == np.int8
repr(result)
# case with int16 codes
ci = CategoricalIndex(list(range(512)) * 4, ordered=False)
dtype = ci.dtype
result = Categorical._empty(shape, dtype=dtype)
assert isinstance(result, Categorical)
assert result.shape == shape
assert result._ndarray.dtype == np.int16
def test_empty_dt64tz(self):
dti = date_range("2016-01-01", periods=2, tz="Asia/Tokyo")
dtype = dti.dtype
shape = (0,)
result = DatetimeArray._empty(shape, dtype=dtype)
assert result.dtype == dtype
assert isinstance(result, DatetimeArray)
assert result.shape == shape
def test_empty_dt64(self):
shape = (3, 9)
result = DatetimeArray._empty(shape, dtype="datetime64[ns]")
assert isinstance(result, DatetimeArray)
assert result.shape == shape
def test_empty_td64(self):
shape = (3, 9)
result = TimedeltaArray._empty(shape, dtype="m8[ns]")
assert isinstance(result, TimedeltaArray)
assert result.shape == shape
def test_empty_pandas_array(self):
arr = PandasArray(np.array([1, 2]))
dtype = arr.dtype
shape = (3, 9)
result = PandasArray._empty(shape, dtype=dtype)
assert isinstance(result, PandasArray)
assert result.dtype == dtype
assert result.shape == shape

View File

@ -0,0 +1,295 @@
"""
Additional tests for PandasArray that aren't covered by
the interface tests.
"""
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import PandasDtype
import pandas as pd
import pandas._testing as tm
from pandas.arrays import PandasArray
@pytest.fixture(
params=[
np.array(["a", "b"], dtype=object),
np.array([0, 1], dtype=float),
np.array([0, 1], dtype=int),
np.array([0, 1 + 2j], dtype=complex),
np.array([True, False], dtype=bool),
np.array([0, 1], dtype="datetime64[ns]"),
np.array([0, 1], dtype="timedelta64[ns]"),
]
)
def any_numpy_array(request):
"""
Parametrized fixture for NumPy arrays with different dtypes.
This excludes string and bytes.
"""
return request.param
# ----------------------------------------------------------------------------
# PandasDtype
@pytest.mark.parametrize(
"dtype, expected",
[
("bool", True),
("int", True),
("uint", True),
("float", True),
("complex", True),
("str", False),
("bytes", False),
("datetime64[ns]", False),
("object", False),
("void", False),
],
)
def test_is_numeric(dtype, expected):
dtype = PandasDtype(dtype)
assert dtype._is_numeric is expected
@pytest.mark.parametrize(
"dtype, expected",
[
("bool", True),
("int", False),
("uint", False),
("float", False),
("complex", False),
("str", False),
("bytes", False),
("datetime64[ns]", False),
("object", False),
("void", False),
],
)
def test_is_boolean(dtype, expected):
dtype = PandasDtype(dtype)
assert dtype._is_boolean is expected
def test_repr():
dtype = PandasDtype(np.dtype("int64"))
assert repr(dtype) == "PandasDtype('int64')"
def test_constructor_from_string():
result = PandasDtype.construct_from_string("int64")
expected = PandasDtype(np.dtype("int64"))
assert result == expected
def test_dtype_univalent(any_numpy_dtype):
dtype = PandasDtype(any_numpy_dtype)
result = PandasDtype(dtype)
assert result == dtype
# ----------------------------------------------------------------------------
# Construction
def test_constructor_no_coercion():
with pytest.raises(ValueError, match="NumPy array"):
PandasArray([1, 2, 3])
def test_series_constructor_with_copy():
ndarray = np.array([1, 2, 3])
ser = pd.Series(PandasArray(ndarray), copy=True)
assert ser.values is not ndarray
def test_series_constructor_with_astype():
ndarray = np.array([1, 2, 3])
result = pd.Series(PandasArray(ndarray), dtype="float64")
expected = pd.Series([1.0, 2.0, 3.0], dtype="float64")
tm.assert_series_equal(result, expected)
def test_from_sequence_dtype():
arr = np.array([1, 2, 3], dtype="int64")
result = PandasArray._from_sequence(arr, dtype="uint64")
expected = PandasArray(np.array([1, 2, 3], dtype="uint64"))
tm.assert_extension_array_equal(result, expected)
def test_constructor_copy():
arr = np.array([0, 1])
result = PandasArray(arr, copy=True)
assert not tm.shares_memory(result, arr)
def test_constructor_with_data(any_numpy_array):
nparr = any_numpy_array
arr = PandasArray(nparr)
assert arr.dtype.numpy_dtype == nparr.dtype
# ----------------------------------------------------------------------------
# Conversion
def test_to_numpy():
arr = PandasArray(np.array([1, 2, 3]))
result = arr.to_numpy()
assert result is arr._ndarray
result = arr.to_numpy(copy=True)
assert result is not arr._ndarray
result = arr.to_numpy(dtype="f8")
expected = np.array([1, 2, 3], dtype="f8")
tm.assert_numpy_array_equal(result, expected)
# ----------------------------------------------------------------------------
# Setitem
def test_setitem_series():
ser = pd.Series([1, 2, 3])
ser.array[0] = 10
expected = pd.Series([10, 2, 3])
tm.assert_series_equal(ser, expected)
def test_setitem(any_numpy_array):
nparr = any_numpy_array
arr = PandasArray(nparr, copy=True)
arr[0] = arr[1]
nparr[0] = nparr[1]
tm.assert_numpy_array_equal(arr.to_numpy(), nparr)
# ----------------------------------------------------------------------------
# Reductions
def test_bad_reduce_raises():
arr = np.array([1, 2, 3], dtype="int64")
arr = PandasArray(arr)
msg = "cannot perform not_a_method with type int"
with pytest.raises(TypeError, match=msg):
arr._reduce(msg)
def test_validate_reduction_keyword_args():
arr = PandasArray(np.array([1, 2, 3]))
msg = "the 'keepdims' parameter is not supported .*all"
with pytest.raises(ValueError, match=msg):
arr.all(keepdims=True)
def test_np_max_nested_tuples():
# case where checking in ufunc.nout works while checking for tuples
# does not
vals = [
(("j", "k"), ("l", "m")),
(("l", "m"), ("o", "p")),
(("o", "p"), ("j", "k")),
]
ser = pd.Series(vals)
arr = ser.array
assert arr.max() is arr[2]
assert ser.max() is arr[2]
result = np.maximum.reduce(arr)
assert result == arr[2]
result = np.maximum.reduce(ser)
assert result == arr[2]
def test_np_reduce_2d():
raw = np.arange(12).reshape(4, 3)
arr = PandasArray(raw)
res = np.maximum.reduce(arr, axis=0)
tm.assert_extension_array_equal(res, arr[-1])
alt = arr.max(axis=0)
tm.assert_extension_array_equal(alt, arr[-1])
# ----------------------------------------------------------------------------
# Ops
@pytest.mark.parametrize("ufunc", [np.abs, np.negative, np.positive])
def test_ufunc_unary(ufunc):
arr = PandasArray(np.array([-1.0, 0.0, 1.0]))
result = ufunc(arr)
expected = PandasArray(ufunc(arr._ndarray))
tm.assert_extension_array_equal(result, expected)
def test_ufunc():
arr = PandasArray(np.array([-1.0, 0.0, 1.0]))
r1, r2 = np.divmod(arr, np.add(arr, 2))
e1, e2 = np.divmod(arr._ndarray, np.add(arr._ndarray, 2))
e1 = PandasArray(e1)
e2 = PandasArray(e2)
tm.assert_extension_array_equal(r1, e1)
tm.assert_extension_array_equal(r2, e2)
def test_basic_binop():
# Just a basic smoke test. The EA interface tests exercise this
# more thoroughly.
x = PandasArray(np.array([1, 2, 3]))
result = x + x
expected = PandasArray(np.array([2, 4, 6]))
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("dtype", [None, object])
def test_setitem_object_typecode(dtype):
arr = PandasArray(np.array(["a", "b", "c"], dtype=dtype))
arr[0] = "t"
expected = PandasArray(np.array(["t", "b", "c"], dtype=dtype))
tm.assert_extension_array_equal(arr, expected)
def test_setitem_no_coercion():
# https://github.com/pandas-dev/pandas/issues/28150
arr = PandasArray(np.array([1, 2, 3]))
with pytest.raises(ValueError, match="int"):
arr[0] = "a"
# With a value that we do coerce, check that we coerce the value
# and not the underlying array.
arr[0] = 2.5
assert isinstance(arr[0], (int, np.integer)), type(arr[0])
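# Sketch spelling out the coercion checked above (helper name is illustrative;
# the exact truncation is an assumption about plain NumPy int casting): the
# value is cast to the existing int64 dtype, so 2.5 is stored as 2.
def _sketch_setitem_value_is_coerced():
    arr = PandasArray(np.array([1, 2, 3], dtype="int64"))
    arr[0] = 2.5
    assert arr[0] == 2
    assert arr.dtype == "int64"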
def test_setitem_preserves_views():
# GH#28150, see also extension test of the same name
arr = PandasArray(np.array([1, 2, 3]))
view1 = arr.view()
view2 = arr[:]
view3 = np.asarray(arr)
arr[0] = 9
assert view1[0] == 9
assert view2[0] == 9
assert view3[0] == 9
arr[-1] = 2.5
view1[-1] = 5
assert arr[-1] == 5

View File

@ -0,0 +1,162 @@
import numpy as np
import pytest
from pandas._libs.tslibs import iNaT
from pandas._libs.tslibs.period import IncompatibleFrequency
from pandas.core.dtypes.base import _registry as registry
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
PeriodArray,
period_array,
)
# ----------------------------------------------------------------------------
# Dtype
def test_registered():
assert PeriodDtype in registry.dtypes
result = registry.find("Period[D]")
expected = PeriodDtype("D")
assert result == expected
# ----------------------------------------------------------------------------
# period_array
def test_asi8():
result = period_array(["2000", "2001", None], freq="D").asi8
expected = np.array([10957, 11323, iNaT])
tm.assert_numpy_array_equal(result, expected)
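# Small sketch making the ordinals above concrete (not part of the original
# file; helper name is illustrative): with daily frequency, asi8 counts days
# since the 1970-01-01 epoch.
def _sketch_period_ordinal_epoch():
    assert pd.Period("2000-01-01", freq="D").ordinal == 10957
    assert pd.Period("2001-01-01", freq="D").ordinal == 11323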
def test_take_raises():
arr = period_array(["2000", "2001"], freq="D")
with pytest.raises(IncompatibleFrequency, match="freq"):
arr.take([0, -1], allow_fill=True, fill_value=pd.Period("2000", freq="W"))
msg = "value should be a 'Period' or 'NaT'. Got 'str' instead"
with pytest.raises(TypeError, match=msg):
arr.take([0, -1], allow_fill=True, fill_value="foo")
def test_fillna_raises():
arr = period_array(["2000", "2001", "2002"], freq="D")
with pytest.raises(ValueError, match="Length"):
arr.fillna(arr[:2])
def test_fillna_copies():
arr = period_array(["2000", "2001", "2002"], freq="D")
result = arr.fillna(pd.Period("2000", "D"))
assert result is not arr
# ----------------------------------------------------------------------------
# setitem
@pytest.mark.parametrize(
"key, value, expected",
[
([0], pd.Period("2000", "D"), [10957, 1, 2]),
([0], None, [iNaT, 1, 2]),
([0], np.nan, [iNaT, 1, 2]),
([0, 1, 2], pd.Period("2000", "D"), [10957] * 3),
(
[0, 1, 2],
[pd.Period("2000", "D"), pd.Period("2001", "D"), pd.Period("2002", "D")],
[10957, 11323, 11688],
),
],
)
def test_setitem(key, value, expected):
arr = PeriodArray(np.arange(3), freq="D")
expected = PeriodArray(expected, freq="D")
arr[key] = value
tm.assert_period_array_equal(arr, expected)
def test_setitem_raises_incompatible_freq():
arr = PeriodArray(np.arange(3), freq="D")
with pytest.raises(IncompatibleFrequency, match="freq"):
arr[0] = pd.Period("2000", freq="A")
other = period_array(["2000", "2001"], freq="A")
with pytest.raises(IncompatibleFrequency, match="freq"):
arr[[0, 1]] = other
def test_setitem_raises_length():
arr = PeriodArray(np.arange(3), freq="D")
with pytest.raises(ValueError, match="length"):
arr[[0, 1]] = [pd.Period("2000", freq="D")]
def test_setitem_raises_type():
arr = PeriodArray(np.arange(3), freq="D")
with pytest.raises(TypeError, match="int"):
arr[0] = 1
# ----------------------------------------------------------------------------
# Ops
def test_sub_period():
arr = period_array(["2000", "2001"], freq="D")
other = pd.Period("2000", freq="M")
with pytest.raises(IncompatibleFrequency, match="freq"):
arr - other
# ----------------------------------------------------------------------------
# Methods
@pytest.mark.parametrize(
"other",
[pd.Period("2000", freq="H"), period_array(["2000", "2001", "2000"], freq="H")],
)
def test_where_different_freq_raises(other):
ser = pd.Series(period_array(["2000", "2001", "2002"], freq="D"))
cond = np.array([True, False, True])
with pytest.raises(IncompatibleFrequency, match="freq"):
ser.where(cond, other)
# ----------------------------------------------------------------------------
# Printing
def test_repr_small():
arr = period_array(["2000", "2001"], freq="D")
result = str(arr)
expected = (
"<PeriodArray>\n['2000-01-01', '2001-01-01']\nLength: 2, dtype: period[D]"
)
assert result == expected
def test_repr_large():
arr = period_array(["2000", "2001"] * 500, freq="D")
result = str(arr)
expected = (
"<PeriodArray>\n"
"['2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', "
"'2000-01-01',\n"
" '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', "
"'2001-01-01',\n"
" ...\n"
" '2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', "
"'2000-01-01',\n"
" '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', "
"'2001-01-01']\n"
"Length: 1000, dtype: period[D]"
)
assert result == expected

View File

@ -0,0 +1,126 @@
import numpy as np
import pytest
import pandas as pd
from pandas import Timedelta
import pandas._testing as tm
from pandas.core.arrays import TimedeltaArray
class TestTimedeltaArray:
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
def test_astype_int(self, dtype):
arr = TimedeltaArray._from_sequence([Timedelta("1H"), Timedelta("2H")])
result = arr.astype(dtype)
if np.dtype(dtype).kind == "u":
expected_dtype = np.dtype("uint64")
else:
expected_dtype = np.dtype("int64")
expected = arr.astype(expected_dtype)
assert result.dtype == expected_dtype
tm.assert_numpy_array_equal(result, expected)
def test_setitem_clears_freq(self):
a = TimedeltaArray(pd.timedelta_range("1H", periods=2, freq="H"))
a[0] = Timedelta("1H")
assert a.freq is None
@pytest.mark.parametrize(
"obj",
[
Timedelta(seconds=1),
Timedelta(seconds=1).to_timedelta64(),
Timedelta(seconds=1).to_pytimedelta(),
],
)
def test_setitem_objects(self, obj):
# make sure we accept timedelta64 and timedelta in addition to Timedelta
tdi = pd.timedelta_range("2 Days", periods=4, freq="H")
arr = TimedeltaArray(tdi, freq=tdi.freq)
arr[0] = obj
assert arr[0] == Timedelta(seconds=1)
@pytest.mark.parametrize(
"other",
[
1,
np.int64(1),
1.0,
np.datetime64("NaT"),
pd.Timestamp("2021-01-01"),
"invalid",
np.arange(10, dtype="i8") * 24 * 3600 * 10**9,
(np.arange(10) * 24 * 3600 * 10**9).view("datetime64[ns]"),
pd.Timestamp("2021-01-01").to_period("D"),
],
)
@pytest.mark.parametrize("index", [True, False])
def test_searchsorted_invalid_types(self, other, index):
data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
arr = TimedeltaArray(data, freq="D")
if index:
arr = pd.Index(arr)
msg = "|".join(
[
"searchsorted requires compatible dtype or scalar",
"value should be a 'Timedelta', 'NaT', or array of those. Got",
]
)
with pytest.raises(TypeError, match=msg):
arr.searchsorted(other)
class TestUnaryOps:
def test_abs(self):
vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
arr = TimedeltaArray(vals)
evals = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
expected = TimedeltaArray(evals)
result = abs(arr)
tm.assert_timedelta_array_equal(result, expected)
result2 = np.abs(arr)
tm.assert_timedelta_array_equal(result2, expected)
def test_pos(self):
vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
arr = TimedeltaArray(vals)
result = +arr
tm.assert_timedelta_array_equal(result, arr)
assert not tm.shares_memory(result, arr)
result2 = np.positive(arr)
tm.assert_timedelta_array_equal(result2, arr)
assert not tm.shares_memory(result2, arr)
def test_neg(self):
vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
arr = TimedeltaArray(vals)
evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]")
expected = TimedeltaArray(evals)
result = -arr
tm.assert_timedelta_array_equal(result, expected)
result2 = np.negative(arr)
tm.assert_timedelta_array_equal(result2, expected)
def test_neg_freq(self):
tdi = pd.timedelta_range("2 Days", periods=4, freq="H")
arr = TimedeltaArray(tdi, freq=tdi.freq)
expected = TimedeltaArray(-tdi._data, freq=-tdi.freq)
result = -arr
tm.assert_timedelta_array_equal(result, expected)
result2 = np.negative(arr)
tm.assert_timedelta_array_equal(result2, expected)

View File

@ -0,0 +1,63 @@
import numpy as np
import pytest
from pandas.core.arrays import TimedeltaArray
class TestTimedeltaArrayConstructor:
def test_only_1dim_accepted(self):
# GH#25282
arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")
with pytest.raises(ValueError, match="Only 1-dimensional"):
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
TimedeltaArray(arr.reshape(2, 2, 1))
with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
TimedeltaArray(arr[[0]].squeeze())
def test_freq_validation(self):
# ensure that the public constructor cannot create an invalid instance
arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10**9
msg = (
"Inferred frequency None from passed values does not "
"conform to passed frequency D"
)
with pytest.raises(ValueError, match=msg):
TimedeltaArray(arr.view("timedelta64[ns]"), freq="D")
def test_non_array_raises(self):
with pytest.raises(ValueError, match="list"):
TimedeltaArray([1, 2, 3])
def test_other_type_raises(self):
with pytest.raises(ValueError, match="dtype bool cannot be converted"):
TimedeltaArray(np.array([1, 2, 3], dtype="bool"))
def test_incorrect_dtype_raises(self):
# TODO: why TypeError for 'category' but ValueError for i8?
with pytest.raises(
ValueError, match=r"category cannot be converted to timedelta64\[ns\]"
):
TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category")
with pytest.raises(
ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]"
):
TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64"))
def test_copy(self):
data = np.array([1, 2, 3], dtype="m8[ns]")
arr = TimedeltaArray(data, copy=False)
assert arr._data is data
arr = TimedeltaArray(data, copy=True)
assert arr._data is not data
assert arr._data.base is not data
def test_from_sequence_dtype(self):
msg = "dtype .*object.* cannot be converted to timedelta64"
with pytest.raises(ValueError, match=msg):
TimedeltaArray._from_sequence([], dtype=object)

View File

@ -0,0 +1,214 @@
import numpy as np
import pytest
import pandas as pd
from pandas import Timedelta
import pandas._testing as tm
from pandas.core import nanops
from pandas.core.arrays import TimedeltaArray
class TestReductions:
@pytest.mark.parametrize("name", ["std", "min", "max", "median", "mean"])
@pytest.mark.parametrize("skipna", [True, False])
def test_reductions_empty(self, name, skipna):
tdi = pd.TimedeltaIndex([])
arr = tdi.array
result = getattr(tdi, name)(skipna=skipna)
assert result is pd.NaT
result = getattr(arr, name)(skipna=skipna)
assert result is pd.NaT
@pytest.mark.parametrize("skipna", [True, False])
def test_sum_empty(self, skipna):
tdi = pd.TimedeltaIndex([])
arr = tdi.array
result = tdi.sum(skipna=skipna)
assert isinstance(result, Timedelta)
assert result == Timedelta(0)
result = arr.sum(skipna=skipna)
assert isinstance(result, Timedelta)
assert result == Timedelta(0)
def test_min_max(self):
arr = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"])
result = arr.min()
expected = Timedelta("2H")
assert result == expected
result = arr.max()
expected = Timedelta("5H")
assert result == expected
result = arr.min(skipna=False)
assert result is pd.NaT
result = arr.max(skipna=False)
assert result is pd.NaT
def test_sum(self):
tdi = pd.TimedeltaIndex(["3H", "3H", "NaT", "2H", "5H", "4H"])
arr = tdi.array
result = arr.sum(skipna=True)
expected = Timedelta(hours=17)
assert isinstance(result, Timedelta)
assert result == expected
result = tdi.sum(skipna=True)
assert isinstance(result, Timedelta)
assert result == expected
result = arr.sum(skipna=False)
assert result is pd.NaT
result = tdi.sum(skipna=False)
assert result is pd.NaT
result = arr.sum(min_count=9)
assert result is pd.NaT
result = tdi.sum(min_count=9)
assert result is pd.NaT
result = arr.sum(min_count=1)
assert isinstance(result, Timedelta)
assert result == expected
result = tdi.sum(min_count=1)
assert isinstance(result, Timedelta)
assert result == expected
def test_npsum(self):
# GH#25282, GH#25335 np.sum should return a Timedelta, not timedelta64
tdi = pd.TimedeltaIndex(["3H", "3H", "2H", "5H", "4H"])
arr = tdi.array
result = np.sum(tdi)
expected = Timedelta(hours=17)
assert isinstance(result, Timedelta)
assert result == expected
result = np.sum(arr)
assert isinstance(result, Timedelta)
assert result == expected
def test_sum_2d_skipna_false(self):
arr = np.arange(8).astype(np.int64).view("m8[s]").astype("m8[ns]").reshape(4, 2)
arr[-1, -1] = "Nat"
tda = TimedeltaArray(arr)
result = tda.sum(skipna=False)
assert result is pd.NaT
result = tda.sum(axis=0, skipna=False)
expected = pd.TimedeltaIndex([Timedelta(seconds=12), pd.NaT])._values
tm.assert_timedelta_array_equal(result, expected)
result = tda.sum(axis=1, skipna=False)
expected = pd.TimedeltaIndex(
[
Timedelta(seconds=1),
Timedelta(seconds=5),
Timedelta(seconds=9),
pd.NaT,
]
)._values
tm.assert_timedelta_array_equal(result, expected)
# Adding a Timestamp makes this a test for DatetimeArray.std
@pytest.mark.parametrize(
"add",
[
Timedelta(0),
pd.Timestamp("2021-01-01"),
pd.Timestamp("2021-01-01", tz="UTC"),
pd.Timestamp("2021-01-01", tz="Asia/Tokyo"),
],
)
def test_std(self, add):
tdi = pd.TimedeltaIndex(["0H", "4H", "NaT", "4H", "0H", "2H"]) + add
arr = tdi.array
result = arr.std(skipna=True)
expected = Timedelta(hours=2)
assert isinstance(result, Timedelta)
assert result == expected
result = tdi.std(skipna=True)
assert isinstance(result, Timedelta)
assert result == expected
if getattr(arr, "tz", None) is None:
result = nanops.nanstd(np.asarray(arr), skipna=True)
assert isinstance(result, Timedelta)
assert result == expected
result = arr.std(skipna=False)
assert result is pd.NaT
result = tdi.std(skipna=False)
assert result is pd.NaT
if getattr(arr, "tz", None) is None:
result = nanops.nanstd(np.asarray(arr), skipna=False)
assert result is pd.NaT
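    # note (assumption): adding a Timestamp above turns the TimedeltaIndex into
    # a DatetimeIndex, so the same parametrized assertions also exercise
    # DatetimeArray.std for naive and tz-aware data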
def test_median(self):
tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"])
arr = tdi.array
result = arr.median(skipna=True)
expected = Timedelta(hours=2)
assert isinstance(result, Timedelta)
assert result == expected
result = tdi.median(skipna=True)
assert isinstance(result, Timedelta)
assert result == expected
result = arr.median(skipna=False)
assert result is pd.NaT
result = tdi.median(skipna=False)
assert result is pd.NaT
def test_mean(self):
tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"])
arr = tdi._data
# manually verified result
expected = Timedelta(arr.dropna()._ndarray.mean())
result = arr.mean()
assert result == expected
result = arr.mean(skipna=False)
assert result is pd.NaT
result = arr.dropna().mean(skipna=False)
assert result == expected
result = arr.mean(axis=0)
assert result == expected
def test_mean_2d(self):
tdi = pd.timedelta_range("14 days", periods=6)
tda = tdi._data.reshape(3, 2)
result = tda.mean(axis=0)
expected = tda[1]
tm.assert_timedelta_array_equal(result, expected)
result = tda.mean(axis=1)
expected = tda[:, 0] + Timedelta(hours=12)
tm.assert_timedelta_array_equal(result, expected)
result = tda.mean(axis=None)
expected = tdi.mean()
assert result == expected