first commit

This commit is contained in:
Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

View File

@ -0,0 +1,159 @@
import string
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import (
SparseArray,
SparseDtype,
)
class TestSeriesAccessor:
    """Tests for the ``Series.sparse`` accessor."""

    # TODO: collect other Series accessor tests
    def test_to_dense(self):
        """``Series.sparse.to_dense`` yields the same values as a dense Series."""
        data = [0, 1, 0, 10]
        sparse_ser = pd.Series(data, dtype="Sparse[int64]")
        tm.assert_series_equal(sparse_ser.sparse.to_dense(), pd.Series(data))
class TestFrameAccessor:
def test_accessor_raises(self):
df = pd.DataFrame({"A": [0, 1]})
with pytest.raises(AttributeError, match="sparse"):
df.sparse
@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
@pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
@pytest.mark.parametrize("dtype", ["float64", "int64"])
@td.skip_if_no_scipy
def test_from_spmatrix(self, format, labels, dtype):
import scipy.sparse
sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item())
mat = scipy.sparse.eye(10, format=format, dtype=dtype)
result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)
expected = pd.DataFrame(
np.eye(10, dtype=dtype), index=labels, columns=labels
).astype(sp_dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
@td.skip_if_no_scipy
def test_from_spmatrix_including_explicit_zero(self, format):
import scipy.sparse
mat = scipy.sparse.random(10, 2, density=0.5, format=format)
mat.data[0] = 0
result = pd.DataFrame.sparse.from_spmatrix(mat)
dtype = SparseDtype("float64", 0.0)
expected = pd.DataFrame(mat.todense()).astype(dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"columns",
[["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]],
)
@td.skip_if_no_scipy
def test_from_spmatrix_columns(self, columns):
import scipy.sparse
dtype = SparseDtype("float64", 0.0)
mat = scipy.sparse.random(10, 2, density=0.5)
result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)
expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
)
@td.skip_if_no_scipy
def test_to_coo(self, colnames):
import scipy.sparse
df = pd.DataFrame(
{colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]"
)
result = df.sparse.to_coo()
expected = scipy.sparse.coo_matrix(np.asarray(df))
assert (result != expected).nnz == 0
@pytest.mark.parametrize("fill_value", [1, np.nan])
@td.skip_if_no_scipy
def test_to_coo_nonzero_fill_val_raises(self, fill_value):
df = pd.DataFrame(
{
"A": SparseArray(
[fill_value, fill_value, fill_value, 2], fill_value=fill_value
),
"B": SparseArray(
[fill_value, 2, fill_value, fill_value], fill_value=fill_value
),
}
)
with pytest.raises(ValueError, match="fill value must be 0"):
df.sparse.to_coo()
def test_to_dense(self):
df = pd.DataFrame(
{
"A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)),
"B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)),
"C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)),
},
index=["b", "a"],
)
result = df.sparse.to_dense()
expected = pd.DataFrame(
{"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=["b", "a"]
)
tm.assert_frame_equal(result, expected)
def test_density(self):
df = pd.DataFrame(
{
"A": SparseArray([1, 0, 2, 1], fill_value=0),
"B": SparseArray([0, 1, 1, 1], fill_value=0),
}
)
res = df.sparse.density
expected = 0.75
assert res == expected
@pytest.mark.parametrize("dtype", ["int64", "float64"])
@pytest.mark.parametrize("dense_index", [True, False])
@td.skip_if_no_scipy
def test_series_from_coo(self, dtype, dense_index):
import scipy.sparse
A = scipy.sparse.eye(3, format="coo", dtype=dtype)
result = pd.Series.sparse.from_coo(A, dense_index=dense_index)
index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)])
expected = pd.Series(SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index)
if dense_index:
expected = expected.reindex(pd.MultiIndex.from_product(index.levels))
tm.assert_series_equal(result, expected)
@td.skip_if_no_scipy
def test_series_from_coo_incorrect_format_raises(self):
# gh-26554
import scipy.sparse
m = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 0]]))
with pytest.raises(
TypeError, match="Expected coo_matrix. Got csr_matrix instead."
):
pd.Series.sparse.from_coo(m)
def test_with_column_named_sparse(self):
# https://github.com/pandas-dev/pandas/issues/30758
df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])})
assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor)

View File

@ -0,0 +1,530 @@
import operator
import numpy as np
import pytest
from pandas.compat import np_version_under1p20
import pandas as pd
import pandas._testing as tm
from pandas.core import ops
from pandas.core.arrays.sparse import (
SparseArray,
SparseDtype,
)
@pytest.fixture(params=["integer", "block"])
def kind(request):
    """Supply each ``kind`` keyword value ("integer", "block") in turn."""
    sparse_kind = request.param
    return sparse_kind
@pytest.fixture(params=[True, False])
def mix(request):
    """Supply True for op(sparse, dense) and False for op(sparse, sparse)."""
    use_dense_operand = request.param
    return use_dense_operand
class TestSparseArrayArithmetics:
_base = np.array
_klass = SparseArray
def _assert(self, a, b):
# We have to use tm.assert_sp_array_equal. See GH #45126
tm.assert_numpy_array_equal(a, b)
def _check_numeric_ops(self, a, b, a_dense, b_dense, mix: bool, op):
# Check that arithmetic behavior matches non-Sparse Series arithmetic
if isinstance(a_dense, np.ndarray):
expected = op(pd.Series(a_dense), b_dense).values
elif isinstance(b_dense, np.ndarray):
expected = op(a_dense, pd.Series(b_dense)).values
else:
raise NotImplementedError
with np.errstate(invalid="ignore", divide="ignore"):
if mix:
result = op(a, b_dense).to_dense()
else:
result = op(a, b).to_dense()
self._assert(result, expected)
def _check_bool_result(self, res):
assert isinstance(res, self._klass)
assert isinstance(res.dtype, SparseDtype)
assert res.dtype.subtype == np.bool_
assert isinstance(res.fill_value, bool)
def _check_comparison_ops(self, a, b, a_dense, b_dense):
with np.errstate(invalid="ignore"):
# Unfortunately, trying to wrap the computation of each expected
# value is with np.errstate() is too tedious.
#
# sparse & sparse
self._check_bool_result(a == b)
self._assert((a == b).to_dense(), a_dense == b_dense)
self._check_bool_result(a != b)
self._assert((a != b).to_dense(), a_dense != b_dense)
self._check_bool_result(a >= b)
self._assert((a >= b).to_dense(), a_dense >= b_dense)
self._check_bool_result(a <= b)
self._assert((a <= b).to_dense(), a_dense <= b_dense)
self._check_bool_result(a > b)
self._assert((a > b).to_dense(), a_dense > b_dense)
self._check_bool_result(a < b)
self._assert((a < b).to_dense(), a_dense < b_dense)
# sparse & dense
self._check_bool_result(a == b_dense)
self._assert((a == b_dense).to_dense(), a_dense == b_dense)
self._check_bool_result(a != b_dense)
self._assert((a != b_dense).to_dense(), a_dense != b_dense)
self._check_bool_result(a >= b_dense)
self._assert((a >= b_dense).to_dense(), a_dense >= b_dense)
self._check_bool_result(a <= b_dense)
self._assert((a <= b_dense).to_dense(), a_dense <= b_dense)
self._check_bool_result(a > b_dense)
self._assert((a > b_dense).to_dense(), a_dense > b_dense)
self._check_bool_result(a < b_dense)
self._assert((a < b_dense).to_dense(), a_dense < b_dense)
def _check_logical_ops(self, a, b, a_dense, b_dense):
# sparse & sparse
self._check_bool_result(a & b)
self._assert((a & b).to_dense(), a_dense & b_dense)
self._check_bool_result(a | b)
self._assert((a | b).to_dense(), a_dense | b_dense)
# sparse & dense
self._check_bool_result(a & b_dense)
self._assert((a & b_dense).to_dense(), a_dense & b_dense)
self._check_bool_result(a | b_dense)
self._assert((a | b_dense).to_dense(), a_dense | b_dense)
@pytest.mark.parametrize("scalar", [0, 1, 3])
@pytest.mark.parametrize("fill_value", [None, 0, 2])
def test_float_scalar(
self, kind, mix, all_arithmetic_functions, fill_value, scalar, request
):
op = all_arithmetic_functions
if np_version_under1p20:
if op in [operator.floordiv, ops.rfloordiv]:
if op is operator.floordiv and scalar != 0:
pass
elif op is ops.rfloordiv and scalar == 0:
pass
else:
mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172")
request.node.add_marker(mark)
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = self._klass(values, kind=kind, fill_value=fill_value)
self._check_numeric_ops(a, scalar, values, scalar, mix, op)
def test_float_scalar_comparison(self, kind):
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = self._klass(values, kind=kind)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
a = self._klass(values, kind=kind, fill_value=0)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
a = self._klass(values, kind=kind, fill_value=2)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
def test_float_same_index_without_nans(self, kind, mix, all_arithmetic_functions):
# when sp_index are the same
op = all_arithmetic_functions
values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_same_index_with_nans(
self, kind, mix, all_arithmetic_functions, request
):
# when sp_index are the same
op = all_arithmetic_functions
if (
np_version_under1p20
and op is ops.rfloordiv
and not (mix and kind == "block")
):
mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172")
request.node.add_marker(mark)
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_same_index_comparison(self, kind):
# when sp_index are the same
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
def test_float_array(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_array_different_kind(self, mix, all_arithmetic_functions):
op = all_arithmetic_functions
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind="integer")
b = self._klass(rvalues, kind="block")
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind="integer", fill_value=0)
b = self._klass(rvalues, kind="block")
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind="integer", fill_value=0)
b = self._klass(rvalues, kind="block", fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind="integer", fill_value=1)
b = self._klass(rvalues, kind="block", fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_array_comparison(self, kind):
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
def test_int_array(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
# have to specify dtype explicitly until fixing GH 667
dtype = np.int64
values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
a = self._klass(values, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, fill_value=0, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, fill_value=0, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, fill_value=1, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype, fill_value=1)
b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_int_array_comparison(self, kind):
dtype = "int64"
# int32 NI ATM
values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
a = self._klass(values, dtype=dtype, kind=kind)
b = self._klass(rvalues, dtype=dtype, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=0)
b = self._klass(rvalues, dtype=dtype, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=0)
b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=1)
b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
@pytest.mark.parametrize("fill_value", [True, False, np.nan])
def test_bool_same_index(self, kind, fill_value):
# GH 14000
# when sp_index are the same
values = self._base([True, False, True, True], dtype=np.bool_)
rvalues = self._base([True, False, True, True], dtype=np.bool_)
a = self._klass(values, kind=kind, dtype=np.bool_, fill_value=fill_value)
b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
@pytest.mark.parametrize("fill_value", [True, False, np.nan])
def test_bool_array_logical(self, kind, fill_value):
# GH 14000
# when sp_index are the same
values = self._base([True, False, True, False, True, True], dtype=np.bool_)
rvalues = self._base([True, False, False, True, False, True], dtype=np.bool_)
a = self._klass(values, kind=kind, dtype=np.bool_, fill_value=fill_value)
b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request):
op = all_arithmetic_functions
if np_version_under1p20 and op in [operator.floordiv, ops.rfloordiv] and mix:
mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172")
request.node.add_marker(mark)
rdtype = "int64"
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_mixed_array_comparison(self, kind):
rdtype = "int64"
# int32 NI ATM
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
def test_xor(self):
s = SparseArray([True, True, False, False])
t = SparseArray([True, False, True, False])
result = s ^ t
sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
expected = SparseArray([False, True, True], sparse_index=sp_index)
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("op", [operator.eq, operator.add])
def test_with_list(op):
    """A plain list operand gives the same result as a SparseArray operand."""
    sparse = SparseArray([0, 1], fill_value=0)
    from_list = op(sparse, [0, 1])
    from_sparse = op(sparse, SparseArray([0, 1]))
    tm.assert_sp_array_equal(from_list, from_sparse)
def test_with_dataframe():
    """``SparseArray.__add__`` returns NotImplemented for a DataFrame operand."""
    # GH#27910
    frame = pd.DataFrame([[1, 2], [3, 4]])
    sparse = SparseArray([0, 1], fill_value=0)
    assert sparse.__add__(frame) is NotImplemented
def test_with_zerodim_ndarray():
    """Multiplying by a zero-dimensional ndarray matches multiplying by 2."""
    # GH#27910
    sparse = SparseArray([0, 1], fill_value=0)
    by_zerodim = sparse * np.array(2)
    by_scalar = sparse * 2
    tm.assert_sp_array_equal(by_zerodim, by_scalar)
@pytest.mark.parametrize("ufunc", [np.abs, np.exp])
@pytest.mark.parametrize(
    "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])]
)
def test_ufuncs(ufunc, arr):
    """A unary ufunc is applied to both the values and the fill value."""
    result = ufunc(arr)

    dense_result = ufunc(np.asarray(arr))
    expected = SparseArray(dense_result, fill_value=ufunc(arr.fill_value))
    tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize(
    "a, b",
    [
        # The original list repeated the fill_value=1 case four times; each
        # distinct case is listed once so no identical test runs repeatedly.
        (SparseArray([0, 0, 0]), np.array([0, 1, 2])),
        (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
    ],
)
@pytest.mark.parametrize("ufunc", [np.add, np.greater])
def test_binary_ufuncs(ufunc, a, b):
    """A binary ufunc on (sparse, dense) returns a SparseArray with dense-equal values.

    Parameters
    ----------
    ufunc : callable
        Binary NumPy ufunc under test.
    a : SparseArray
        Left operand.
    b : numpy.ndarray
        Right operand.
    """
    # can't say anything about fill value here.
    result = ufunc(a, b)
    expected = ufunc(np.asarray(a), np.asarray(b))
    assert isinstance(result, SparseArray)
    tm.assert_numpy_array_equal(np.asarray(result), expected)
def test_ndarray_inplace():
    """In-place ``+=`` of a SparseArray into an ndarray keeps it an ndarray."""
    target = np.array([0, 1, 2, 3])
    addend = SparseArray([0, 2, 0, 0])
    target += addend
    tm.assert_numpy_array_equal(target, np.array([0, 3, 2, 3]))
def test_sparray_inplace():
    """In-place ``+=`` of an ndarray into a SparseArray keeps it sparse."""
    target = SparseArray([0, 2, 0, 0])
    addend = np.array([0, 1, 2, 3])
    target += addend
    tm.assert_sp_array_equal(target, SparseArray([0, 3, 2, 3], fill_value=0))
@pytest.mark.parametrize("fill_value", [True, False])
def test_invert(fill_value):
    """``~`` flips values and fill value for the array, a Series and a DataFrame."""
    dense = np.array([True, False, False, True])
    sparray = SparseArray(dense, fill_value=fill_value)

    inverted = SparseArray(~dense, fill_value=not fill_value)
    tm.assert_sp_array_equal(~sparray, inverted)

    inverted_series = pd.Series(inverted)
    tm.assert_series_equal(~pd.Series(sparray), inverted_series)

    # The expected frame wraps the expected Series, as in the original test.
    inverted_frame = pd.DataFrame({"A": inverted_series})
    tm.assert_frame_equal(~pd.DataFrame({"A": sparray}), inverted_frame)
@pytest.mark.parametrize("fill_value", [0, np.nan])
@pytest.mark.parametrize("op", [operator.pos, operator.neg])
def test_unary_op(op, fill_value):
    """Unary ``+``/``-`` apply to both the values and the fill value."""
    dense = np.array([0, 1, np.nan, 2])
    result = op(SparseArray(dense, fill_value=fill_value))
    expected = SparseArray(op(dense), fill_value=op(fill_value))
    tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("cons", [list, np.array, SparseArray])
def test_mismatched_length_cmp_op(cons):
    """``&`` between operands of different lengths raises ValueError."""
    shorter = SparseArray([True, True])
    longer = cons([True, True, True])
    expected_msg = "operands have mismatched length"
    with pytest.raises(ValueError, match=expected_msg):
        shorter & longer

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,62 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestSparseArrayConcat:
    """Tests for ``SparseArray._concat_same_type``."""

    @pytest.mark.parametrize("kind", ["integer", "block"])
    def test_basic(self, kind):
        """Concatenating two arrays of one kind keeps that kind."""
        pieces = [
            SparseArray([1, 0, 0, 2], kind=kind),
            SparseArray([1, 0, 2, 2], kind=kind),
        ]

        result = SparseArray._concat_same_type(pieces)

        # Only the stored values and the kind are checked. Nothing can be
        # asserted about the sparse index itself, because sparse blocks are
        # not merged across the arrays being concatenated.
        stored = np.array([1, 2, 1, 2, 2], dtype="int64")
        tm.assert_numpy_array_equal(result.sp_values, stored)
        assert result.kind == kind

    @pytest.mark.parametrize("kind", ["integer", "block"])
    def test_uses_first_kind(self, kind):
        """With mixed kinds the result takes the kind of the first array."""
        opposite = {"block": "integer"}.get(kind, "block")
        pieces = [
            SparseArray([1, 0, 0, 2], kind=kind),
            SparseArray([1, 0, 2, 2], kind=opposite),
        ]

        result = SparseArray._concat_same_type(pieces)

        stored = np.array([1, 2, 1, 2, 2], dtype="int64")
        tm.assert_numpy_array_equal(result.sp_values, stored)
        assert result.kind == kind
@pytest.mark.parametrize(
    "other, expected_dtype",
    [
        # compatible dtype -> preserve sparse
        (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
        # NOTE(review): the nullable "Int64" case below is disabled; the
        # reason is not visible in this file.
        # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
        # incompatible dtype -> Sparse[common dtype]
        (pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)),
        # incompatible dtype -> Sparse[object] dtype
        (pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)),
        # categorical with compatible categories -> dtype of the categories
        (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
        (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
        # categorical with incompatible categories -> object dtype
        (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
    ],
)
def test_concat_with_non_sparse(other, expected_dtype):
    """Concatenating sparse with non-sparse gives ``expected_dtype`` in both orders."""
    # https://github.com/pandas-dev/pandas/issues/34336
    s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0))

    # Sparse first, then sparse last.
    for first, second in ((s_sparse, other), (other, s_sparse)):
        result = pd.concat([first, second], ignore_index=True)
        combined = pd.Series(list(first) + list(second))
        tm.assert_series_equal(result, combined.astype(expected_dtype))

View File

@ -0,0 +1,209 @@
import re
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.sparse import SparseDtype
@pytest.mark.parametrize(
    "dtype, fill_value",
    [
        ("int", 0),
        ("float", np.nan),
        ("bool", False),
        ("object", np.nan),
        ("datetime64[ns]", np.datetime64("NaT", "ns")),
        ("timedelta64[ns]", np.timedelta64("NaT", "ns")),
    ],
)
def test_inferred_dtype(dtype, fill_value):
    """The default fill value inferred from the subtype matches ``fill_value``."""
    inferred = SparseDtype(dtype).fill_value
    if not pd.isna(fill_value):
        assert inferred == fill_value
    else:
        # Missing values never compare equal, so check NA-ness and exact type.
        assert pd.isna(inferred) and type(inferred) is type(fill_value)
def test_from_sparse_dtype():
    """Building a SparseDtype from another one keeps its fill value."""
    original = SparseDtype("float", 0)
    rebuilt = SparseDtype(original)
    assert rebuilt.fill_value == 0
def test_from_sparse_dtype_fill_value():
    """An explicit ``fill_value`` overrides the one on the source SparseDtype."""
    source = SparseDtype("int", 1)
    overridden = SparseDtype(source, fill_value=2)
    assert overridden == SparseDtype("int", 2)
@pytest.mark.parametrize(
    "dtype, fill_value",
    [
        ("int", None),
        ("float", None),
        ("bool", None),
        ("object", None),
        ("datetime64[ns]", None),
        ("timedelta64[ns]", None),
        ("int", np.nan),
        ("float", 0),
    ],
)
def test_equal(dtype, fill_value):
    """Two SparseDtypes built from the same arguments are equal both ways."""
    left = SparseDtype(dtype, fill_value)
    right = SparseDtype(dtype, fill_value)
    assert left == right
    assert right == left
def test_nans_equal():
    """NaN fill values from ``float("nan")`` and ``np.nan`` compare equal."""
    from_builtin = SparseDtype(float, float("nan"))
    from_numpy = SparseDtype(float, np.nan)
    assert from_builtin == from_numpy
    assert from_numpy == from_builtin
@pytest.mark.parametrize(
    "a, b",
    [
        (SparseDtype("float64"), SparseDtype("float32")),
        (SparseDtype("float64"), SparseDtype("float64", 0)),
        (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
        (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
        (SparseDtype("float64"), np.dtype("float64")),
    ],
)
def test_not_equal(a, b):
    """Each parametrized pair of dtypes compares unequal."""
    differs = a != b
    assert differs
def test_construct_from_string_raises():
    """``construct_from_string`` raises TypeError for an unparsable string."""
    expected_msg = "Cannot construct a 'SparseDtype' from 'not a dtype'"
    with pytest.raises(TypeError, match=expected_msg):
        SparseDtype.construct_from_string("not a dtype")
@pytest.mark.parametrize(
    "dtype, expected",
    [
        (SparseDtype(int), True),
        (SparseDtype(float), True),
        (SparseDtype(bool), True),
        (SparseDtype(object), False),
        (SparseDtype(str), False),
    ],
)
def test_is_numeric(dtype, expected):
    """``_is_numeric`` is exactly the expected bool for each subtype."""
    is_numeric = dtype._is_numeric
    assert is_numeric is expected
def test_str_uses_object():
    """A ``str`` subtype is stored as the object dtype."""
    subtype = SparseDtype(str).subtype
    assert subtype == np.dtype("object")
@pytest.mark.parametrize(
    "string, expected",
    [
        ("Sparse[float64]", SparseDtype(np.dtype("float64"))),
        ("Sparse[float32]", SparseDtype(np.dtype("float32"))),
        ("Sparse[int]", SparseDtype(np.dtype("int"))),
        ("Sparse[str]", SparseDtype(np.dtype("str"))),
        ("Sparse[datetime64[ns]]", SparseDtype(np.dtype("datetime64[ns]"))),
        ("Sparse", SparseDtype(np.dtype("float"), np.nan)),
    ],
)
def test_construct_from_string(string, expected):
    """Each dtype string parses to the expected SparseDtype."""
    parsed = SparseDtype.construct_from_string(string)
    assert parsed == expected
@pytest.mark.parametrize(
    "a, b, expected",
    [
        (SparseDtype(float, 0.0), SparseDtype(np.dtype("float"), 0.0), True),
        (SparseDtype(int, 0), SparseDtype(int, 0), True),
        (SparseDtype(float, float("nan")), SparseDtype(float, np.nan), True),
        (SparseDtype(float, 0), SparseDtype(float, np.nan), False),
        (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False),
    ],
)
def test_hash_equal(a, b, expected):
    """Equality and hash equality both agree with ``expected``."""
    same_value = a == b
    assert same_value is expected

    same_hash = hash(a) == hash(b)
    assert same_hash is expected
@pytest.mark.parametrize(
    "string, expected",
    [
        ("Sparse[int]", "int"),
        ("Sparse[int, 0]", "int"),
        ("Sparse[int64]", "int64"),
        ("Sparse[int64, 0]", "int64"),
        ("Sparse[datetime64[ns], 0]", "datetime64[ns]"),
    ],
)
def test_parse_subtype(string, expected):
    """``_parse_subtype`` returns the subtype name as its first element."""
    parsed = SparseDtype._parse_subtype(string)
    # The second element of the pair is not checked by this test.
    subtype, _ = parsed
    assert subtype == expected
@pytest.mark.parametrize(
    "string", ["Sparse[int, 1]", "Sparse[float, 0.0]", "Sparse[bool, True]"]
)
def test_construct_from_string_fill_value_raises(string):
    """A dtype string carrying one of these fill values raises TypeError."""
    expected_msg = "fill_value in the string is not"
    with pytest.raises(TypeError, match=expected_msg):
        SparseDtype.construct_from_string(string)
@pytest.mark.parametrize(
    "original, dtype, expected",
    [
        (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
        (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
        (SparseDtype(int, 1), str, SparseDtype(object, "1")),
        (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
    ],
)
def test_update_dtype(original, dtype, expected):
    """``update_dtype`` produces the expected subtype and fill value."""
    updated = original.update_dtype(dtype)
    assert updated == expected
@pytest.mark.parametrize(
    "original, dtype, expected_error_msg",
    [
        (
            SparseDtype(float, np.nan),
            int,
            re.escape("Cannot convert non-finite values (NA or inf) to integer"),
        ),
        (
            SparseDtype(str, "abc"),
            int,
            re.escape("invalid literal for int() with base 10: 'abc'"),
        ),
    ],
)
def test_update_dtype_raises(original, dtype, expected_error_msg):
    """``update_dtype`` raises ValueError when the fill value cannot be cast."""
    raises_value_error = pytest.raises(ValueError, match=expected_error_msg)
    with raises_value_error:
        original.update_dtype(dtype)
def test_repr():
    """``str(SparseDtype)`` shows the subtype and the repr of the fill value."""
    # GH-34352
    cases = (
        (SparseDtype("int64", fill_value=0), "Sparse[int64, 0]"),
        (SparseDtype(object, fill_value="0"), "Sparse[object, '0']"),
    )
    for dtype, text in cases:
        assert str(dtype) == text

View File

@ -0,0 +1,618 @@
import operator
import numpy as np
import pytest
import pandas._libs.sparse as splib
import pandas.util._test_decorators as td
from pandas import Series
import pandas._testing as tm
from pandas.core.arrays.sparse import (
BlockIndex,
IntIndex,
make_sparse_index,
)
# Length passed as the first argument to every BlockIndex built in these tests.
TEST_LENGTH = 20

# Shared block layouts consumed by ``check_cases``. Each dict is unpacked in
# the order xloc, xlen, yloc, ylen, intersect_loc, intersect_len.
# NOTE(review): "loc"/"len" presumably hold block start positions and block
# lengths for the x and y indexes, and "intersect_*" the expected
# intersection - the numbers below are consistent with that reading.

# Three x blocks and three y blocks that overlap pairwise.
plain_case = {
    "xloc": [0, 7, 15],
    "xlen": [3, 5, 5],
    "yloc": [2, 9, 14],
    "ylen": [2, 3, 5],
    "intersect_loc": [2, 9, 15],
    "intersect_len": [1, 3, 4],
}
# Only the first x block overlaps the single y block.
delete_blocks = {
    "xloc": [0, 5],
    "xlen": [4, 4],
    "yloc": [1],
    "ylen": [4],
    "intersect_loc": [1],
    "intersect_len": [3],
}
# One long x block is covered by two y blocks.
split_blocks = {
    "xloc": [0],
    "xlen": [10],
    "yloc": [0, 5],
    "ylen": [3, 7],
    "intersect_loc": [0, 5],
    "intersect_len": [3, 5],
}
# The first y block lies entirely before the x block.
skip_block = {
    "xloc": [10],
    "xlen": [5],
    "yloc": [0, 12],
    "ylen": [5, 3],
    "intersect_loc": [12],
    "intersect_len": [3],
}
# x and y blocks never overlap, so the expected intersection is empty.
no_intersect = {
    "xloc": [0, 10],
    "xlen": [4, 6],
    "yloc": [5, 17],
    "ylen": [4, 2],
    "intersect_loc": [],
    "intersect_len": [],
}
def check_cases(_check_case):
    """Call ``_check_case`` with every shared block layout and two empty cases.

    ``_check_case`` receives six positional arguments in the order
    xloc, xlen, yloc, ylen, intersect_loc, intersect_len.
    """
    argument_order = (
        "xloc",
        "xlen",
        "yloc",
        "ylen",
        "intersect_loc",
        "intersect_len",
    )
    shared_cases = (plain_case, delete_blocks, split_blocks, skip_block, no_intersect)
    for case in shared_cases:
        _check_case(*[case[key] for key in argument_order])

    # one or both is empty
    _check_case([0], [5], [], [], [], [])
    _check_case([], [], [], [], [], [])
class TestSparseIndexUnion:
def test_index_make_union(self):
    """``make_union`` gives the expected blocks for BlockIndex and IntIndex."""

    def _verify_union(xloc, xlen, yloc, ylen, eloc, elen):
        x_blocks = BlockIndex(TEST_LENGTH, xloc, xlen)
        y_blocks = BlockIndex(TEST_LENGTH, yloc, ylen)

        block_union = x_blocks.make_union(y_blocks)
        assert isinstance(block_union, BlockIndex)
        tm.assert_numpy_array_equal(
            block_union.blocs, np.array(eloc, dtype=np.int32)
        )
        tm.assert_numpy_array_equal(
            block_union.blengths, np.array(elen, dtype=np.int32)
        )

        # The IntIndex union must cover the same positions as the block union.
        int_union = x_blocks.to_int_index().make_union(y_blocks.to_int_index())
        assert isinstance(int_union, IntIndex)
        tm.assert_numpy_array_equal(
            int_union.indices, block_union.to_int_index().indices
        )

    # Each row: (xloc, xlen, yloc, ylen, expected loc, expected len)
    cases = (
        # y starts exactly where x ends: one merged block
        ([0], [5], [5], [4], [0], [9]),
        # first pair overlaps, second pair stays separate
        ([0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]),
        # single overlapping pair
        ([1], [5], [3], [5], [1], [7]),
        # one y block bridges the gap between two x blocks
        ([2, 10], [4, 4], [4], [8], [2], [12]),
        # y starts with the first x block and reaches into the second
        ([0, 5], [3, 5], [0], [7], [0], [10]),
        # chained overlaps collapse into one block
        ([2, 10], [4, 4], [4, 13], [8, 4], [2], [15]),
        # every y block lies inside the single x block
        ([2], [15], [4, 9, 14], [3, 2, 2], [2], [15]),
        # fully disjoint, interleaved blocks
        ([0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]),
    )
    for xloc, xlen, yloc, ylen, eloc, elen in cases:
        _verify_union(xloc, xlen, yloc, ylen, eloc, elen)
def test_int_index_make_union(self):
a = IntIndex(5, np.array([0, 3, 4], dtype=np.int32))
b = IntIndex(5, np.array([0, 2], dtype=np.int32))
res = a.make_union(b)
exp = IntIndex(5, np.array([0, 2, 3, 4], np.int32))
assert res.equals(exp)
a = IntIndex(5, np.array([], dtype=np.int32))
b = IntIndex(5, np.array([0, 2], dtype=np.int32))
res = a.make_union(b)
exp = IntIndex(5, np.array([0, 2], np.int32))
assert res.equals(exp)
a = IntIndex(5, np.array([], dtype=np.int32))
b = IntIndex(5, np.array([], dtype=np.int32))
res = a.make_union(b)
exp = IntIndex(5, np.array([], np.int32))
assert res.equals(exp)
a = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
b = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
res = a.make_union(b)
exp = IntIndex(5, np.array([0, 1, 2, 3, 4], np.int32))
assert res.equals(exp)
a = IntIndex(5, np.array([0, 1], dtype=np.int32))
b = IntIndex(4, np.array([0, 1], dtype=np.int32))
msg = "Indices must reference same underlying length"
with pytest.raises(ValueError, match=msg):
a.make_union(b)
class TestSparseIndexIntersect:
    """Tests for ``intersect`` on ``BlockIndex`` and ``IntIndex``."""

    @td.skip_if_windows
    def test_intersect(self):
        """Intersect every shared layout, as block and as integer indexes."""

        def _check_correct(a, b, expected):
            assert a.intersect(b).equals(expected)

        def _check_length_exc(a, longer):
            # Intersecting with an index of a different length must raise.
            with pytest.raises(
                Exception, match="Indices must reference same underlying length"
            ):
                a.intersect(longer)

        def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
            x_block = BlockIndex(TEST_LENGTH, xloc, xlen)
            y_block = BlockIndex(TEST_LENGTH, yloc, ylen)
            expected_block = BlockIndex(TEST_LENGTH, eloc, elen)
            # Same blocks as y, but over an underlying length one larger.
            y_longer = BlockIndex(TEST_LENGTH + 1, yloc, ylen)

            _check_correct(x_block, y_block, expected_block)
            _check_correct(
                x_block.to_int_index(),
                y_block.to_int_index(),
                expected_block.to_int_index(),
            )

            _check_length_exc(x_block, y_longer)
            _check_length_exc(x_block.to_int_index(), y_longer.to_int_index())

        check_cases(_check_case)

    def test_intersect_empty(self):
        """Intersecting with an empty index yields the empty index."""

        def assert_empty_either_way(empty, populated):
            assert empty.intersect(populated).equals(empty)
            assert populated.intersect(empty).equals(empty)

        empty_int = IntIndex(4, np.array([], dtype=np.int32))
        populated_int = IntIndex(4, np.array([2, 3], dtype=np.int32))
        assert_empty_either_way(empty_int, populated_int)

        # Repeat with the block representation of the same indexes.
        assert_empty_either_way(
            empty_int.to_block_index(), populated_int.to_block_index()
        )

    def test_intersect_identical(self):
        """An index intersected with itself equals itself."""
        int_indexes = [
            IntIndex(5, np.array([1, 2], dtype=np.int32)),
            IntIndex(5, np.array([0, 2, 4], dtype=np.int32)),
            IntIndex(0, np.array([], dtype=np.int32)),
            IntIndex(5, np.array([], dtype=np.int32)),
        ]

        for int_index in int_indexes:
            assert int_index.intersect(int_index).equals(int_index)
            block_index = int_index.to_block_index()
            assert block_index.intersect(block_index).equals(block_index)
class TestSparseIndexCommon:
    """Tests that exercise both index kinds built via ``make_sparse_index``."""

    def test_int_internal(self):
        """``kind="integer"`` yields an ``IntIndex`` holding the given indices."""
        idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 2
        tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))

        idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 0
        tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))

        idx = make_sparse_index(
            4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
        )
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 4
        tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))

    def test_block_internal(self):
        """``kind="block"`` yields a ``BlockIndex`` with the expected blocks."""
        idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 2
        tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))

        idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 0
        tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))

        idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 4
        tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))

        idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 3
        tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))

    def test_lookup(self):
        """Scalar ``lookup`` gives the stored position, or -1 when absent."""
        for kind in ["integer", "block"]:
            idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
            assert idx.lookup(-1) == -1
            assert idx.lookup(0) == -1
            assert idx.lookup(1) == -1
            assert idx.lookup(2) == 0
            assert idx.lookup(3) == 1
            assert idx.lookup(4) == -1

            # An empty index never finds anything, in or out of range.
            idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
            for i in range(-1, 5):
                assert idx.lookup(i) == -1

            idx = make_sparse_index(
                4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind
            )
            assert idx.lookup(-1) == -1
            assert idx.lookup(0) == 0
            assert idx.lookup(1) == 1
            assert idx.lookup(2) == 2
            assert idx.lookup(3) == 3
            assert idx.lookup(4) == -1

            idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
            assert idx.lookup(-1) == -1
            assert idx.lookup(0) == 0
            assert idx.lookup(1) == -1
            assert idx.lookup(2) == 1
            assert idx.lookup(3) == 2
            assert idx.lookup(4) == -1

    def test_lookup_array(self):
        """Vectorised ``lookup_array`` agrees with the scalar expectations."""
        for kind in ["integer", "block"]:
            idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)

            res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
            exp = np.array([-1, -1, 0], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
            exp = np.array([-1, 0, -1, 1], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
            res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32))
            exp = np.array([-1, -1, -1, -1], dtype=np.int32)
            # Previously computed but never compared, which left the
            # empty-index scenario unverified.
            tm.assert_numpy_array_equal(res, exp)

            idx = make_sparse_index(
                4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind
            )
            res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
            exp = np.array([-1, 0, 2], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
            exp = np.array([-1, 2, 1, 3], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
            res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32))
            exp = np.array([1, -1, 2, 0], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32))
            exp = np.array([-1, -1, 1, -1], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

    @pytest.mark.parametrize(
        "idx, expected",
        [
            [0, -1],
            [5, 0],
            [7, 2],
            [8, -1],
            [9, -1],
            [10, -1],
            [11, -1],
            [12, 3],
            [17, 8],
            [18, -1],
        ],
    )
    def test_lookup_basics(self, idx, expected):
        """``lookup`` agrees between a ``BlockIndex`` and its ``IntIndex`` form."""
        bindex = BlockIndex(20, [5, 12], [3, 6])
        assert bindex.lookup(idx) == expected

        iindex = bindex.to_int_index()
        assert iindex.lookup(idx) == expected
class TestBlockIndex:
    """Tests specific to ``BlockIndex``."""

    def test_block_internal(self):
        """``kind="block"`` groups consecutive indices into blocks."""
        # (indices, expected npoints, expected blocs, expected blengths)
        layouts = [
            ([2, 3], 2, [2], [2]),
            ([], 0, [], []),
            ([0, 1, 2, 3], 4, [0], [4]),
            ([0, 2, 3], 3, [0, 2], [1, 2]),
        ]
        for indices, npoints, blocs, blengths in layouts:
            idx = make_sparse_index(4, np.array(indices, dtype=np.int32), kind="block")
            assert isinstance(idx, BlockIndex)
            assert idx.npoints == npoints
            tm.assert_numpy_array_equal(idx.blocs, np.array(blocs, dtype=np.int32))
            tm.assert_numpy_array_equal(
                idx.blengths, np.array(blengths, dtype=np.int32)
            )

    def test_make_block_boundary(self):
        """Every second position becomes its own block of length one."""
        for length in (5, 10, 100, 101):
            idx = make_sparse_index(
                length, np.arange(0, length, 2, dtype=np.int32), kind="block"
            )

            expected_blocs = np.arange(0, length, 2, dtype=np.int32)
            expected_blengths = np.ones(len(expected_blocs), dtype=np.int32)
            tm.assert_numpy_array_equal(idx.blocs, expected_blocs)
            tm.assert_numpy_array_equal(idx.blengths, expected_blengths)

    def test_equals(self):
        """An index equals itself but not one with a different block length."""
        index = BlockIndex(10, [0, 4], [2, 5])
        different = BlockIndex(10, [0, 4], [2, 6])
        assert index.equals(index)
        assert not index.equals(different)

    def test_check_integrity(self):
        """Constructor accepts empty layouts and rejects invalid blocks."""
        # 0-length OK
        BlockIndex(0, [], [])

        # also OK even though empty
        BlockIndex(1, [], [])

        with pytest.raises(ValueError, match="Block 0 extends beyond end"):
            BlockIndex(10, [5], [10])

        with pytest.raises(ValueError, match="Block 0 overlaps"):
            BlockIndex(10, [2, 5], [5, 3])

    def test_to_int_index(self):
        """``to_int_index`` expands each block into its individual positions."""
        block = BlockIndex(20, [0, 10], [4, 6])
        expected = np.array([0, 1, 2, 3, 10, 11, 12, 13, 14, 15], dtype=np.int32)
        tm.assert_numpy_array_equal(block.to_int_index().indices, expected)

    def test_to_block_index(self):
        """``to_block_index`` on a ``BlockIndex`` returns the same object."""
        index = BlockIndex(10, [0, 5], [4, 5])
        assert index.to_block_index() is index
class TestIntIndex:
    """Tests specific to ``IntIndex``."""

    def test_check_integrity(self):
        """Constructor rejects index arrays that violate its invariants."""
        # More indices than the declared length allows.
        msg = "Too many indices"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=1, indices=[1, 2, 3])

        # No index can be negative.
        msg = "No index can be less than zero"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, -2, 3])

        # All indices must be less than the length.
        msg = "All indices must be less than the length"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 2, 5])

        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 2, 6])

        # Indices must be strictly ascending.
        msg = "Indices must be strictly increasing"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 3, 2])

        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 3, 3])

    def test_int_internal(self):
        """``kind="integer"`` yields an ``IntIndex`` holding the given indices."""
        idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 2
        tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))

        idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 0
        tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))

        idx = make_sparse_index(
            4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
        )
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 4
        tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))

    def test_equals(self):
        """An index equals itself but not one with fewer indices."""
        index = IntIndex(10, [0, 1, 2, 3, 4])
        assert index.equals(index)
        assert not index.equals(IntIndex(10, [0, 1, 2, 3]))

    def test_to_block_index(self):
        """Block -> int -> block conversion reproduces the original layout."""

        def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
            xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
            yindex = BlockIndex(TEST_LENGTH, yloc, ylen)

            # see if survive the round trip
            xbindex = xindex.to_int_index().to_block_index()
            ybindex = yindex.to_int_index().to_block_index()
            assert isinstance(xbindex, BlockIndex)
            assert xbindex.equals(xindex)
            assert ybindex.equals(yindex)

        check_cases(_check_case)

    def test_to_int_index(self):
        """``to_int_index`` on an ``IntIndex`` returns the same object."""
        index = IntIndex(10, [2, 3, 4, 5, 6])
        assert index.to_int_index() is index
class TestSparseOperators:
    """Tests for the float64 arithmetic functions in ``pandas._libs.sparse``."""

    def _op_tests(self, sparse_op, python_op):
        """
        Compare ``sparse_op`` with ``python_op`` on every shared layout.

        ``sparse_op`` is run once with block indexes and once with integer
        indexes. The two results must agree with each other and with
        ``python_op`` applied to Series that were reindexed to
        ``TEST_LENGTH`` positions and filled with each side's fill value.
        """

        def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
            x_block = BlockIndex(TEST_LENGTH, xloc, xlen)
            y_block = BlockIndex(TEST_LENGTH, yloc, ylen)
            x_int = x_block.to_int_index()
            y_int = y_block.to_int_index()

            x_vals = np.arange(x_block.npoints) * 10.0 + 1
            y_vals = np.arange(y_block.npoints) * 100.0 + 1
            x_fill, y_fill = 0, 2

            block_vals, block_index, block_fill = sparse_op(
                x_vals, x_block, x_fill, y_vals, y_block, y_fill
            )
            int_vals, int_index, int_fill = sparse_op(
                x_vals, x_int, x_fill, y_vals, y_int, y_fill
            )

            assert block_index.to_int_index().equals(int_index)
            tm.assert_numpy_array_equal(block_vals, int_vals)
            assert block_fill == int_fill

            # check versus Series...
            def densify(values, index, fill):
                sparse_series = Series(values, index.indices)
                return sparse_series.reindex(np.arange(TEST_LENGTH)).fillna(fill)

            dense_x = densify(x_vals, x_int, x_fill)
            dense_y = densify(y_vals, y_int, y_fill)

            expected = python_op(dense_x, dense_y)
            expected = expected.reindex(int_index.indices)

            tm.assert_numpy_array_equal(block_vals, expected.values)
            tm.assert_numpy_array_equal(int_vals, expected.values)

        check_cases(_check_case)

    @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"])
    def test_op(self, opname):
        """Check ``sparse_<opname>_float64`` against ``operator.<opname>``."""
        self._op_tests(
            getattr(splib, f"sparse_{opname}_float64"),
            getattr(operator, opname),
        )