2022-05-23 00:16:32 +04:00

492 lines
17 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import sys
import weakref
import numpy as np
import pyarrow as pa
try:
from scipy.sparse import csr_matrix, coo_matrix
except ImportError:
coo_matrix = None
csr_matrix = None
try:
import sparse
except ImportError:
sparse = None
# Each entry pairs a NumPy dtype string with the Arrow data type that the
# tests parametrized with 'dtype_str,arrow_type' compare against the sparse
# tensor's ``type``.
tensor_type_pairs = [
    (numpy_code, arrow_factory())
    for numpy_code, arrow_factory in (
        # signed integers
        ('i1', pa.int8),
        ('i2', pa.int16),
        ('i4', pa.int32),
        ('i8', pa.int64),
        # unsigned integers
        ('u1', pa.uint8),
        ('u2', pa.uint16),
        ('u4', pa.uint32),
        ('u8', pa.uint64),
        # floating point
        ('f2', pa.float16),
        ('f4', pa.float32),
        ('f8', pa.float64),
    )
]
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    """Check the basic attributes of a sparse tensor made from a dense array.

    The tensor is built with ``from_dense_numpy`` from a 4x6 array holding
    six non-zero values.  A weak reference is then used to check that
    deleting the only name bound to the tensor releases it.
    """
    names = ('x', 'y')
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    tensor = sparse_tensor_type.from_dense_numpy(dense, names)

    # Dimensions and size are compared against the dense input.
    assert tensor.ndim == 2
    assert tensor.size == 24
    assert tensor.shape == dense.shape
    assert tensor.is_mutable

    # Dimension names are available one at a time and as a whole.
    assert tensor.dim_name(0) == names[0]
    assert tensor.dim_names == names
    assert tensor.non_zero_length == 6

    # The weak reference must die as soon as the single strong one is gone.
    tensor_ref = weakref.ref(tensor)
    assert tensor_ref() is not None
    del tensor
    assert tensor_ref() is None
def test_sparse_coo_tensor_base_object():
    """Check reference counting around ``SparseCOOTensor.to_numpy``.

    The test expects the tensor's reference count to grow by two after
    ``to_numpy()`` hands back its two arrays, and it still compares those
    arrays after the local name of the tensor has been rebound to ``None``.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T
    want_coords = np.array([
        [0, 0, 1, 2, 3, 3],
        [0, 2, 5, 0, 4, 5],
    ]).T

    sparse_tensor = pa.SparseCOOTensor.from_dense_numpy(dense)
    refs_before = sys.getrefcount(sparse_tensor)
    got_values, got_coords = sparse_tensor.to_numpy()
    assert sparse_tensor.has_canonical_format
    assert sys.getrefcount(sparse_tensor) == refs_before + 2

    # Drop the local reference before looking at the returned arrays.
    sparse_tensor = None
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_coords, got_coords)
    assert got_coords.flags.c_contiguous  # row-major
def test_sparse_csr_matrix_base_object():
    """Check reference counting around ``SparseCSRMatrix.to_numpy``.

    The test expects the matrix's reference count to grow by three after
    ``to_numpy()`` hands back its three arrays, and it still compares those
    arrays after the local name of the matrix has been rebound to ``None``.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T
    want_indptr = np.array([0, 2, 3, 4, 6])
    want_indices = np.array([0, 2, 5, 0, 4, 5])

    sparse_tensor = pa.SparseCSRMatrix.from_dense_numpy(dense)
    refs_before = sys.getrefcount(sparse_tensor)
    got_values, got_indptr, got_indices = sparse_tensor.to_numpy()
    assert sys.getrefcount(sparse_tensor) == refs_before + 3

    # Drop the local reference before looking at the returned arrays.
    sparse_tensor = None
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_indptr, got_indptr)
    assert np.array_equal(want_indices, got_indices)
def test_sparse_csf_tensor_base_object():
    """Check reference counting around ``SparseCSFTensor.to_numpy``.

    The test expects the tensor's reference count to grow by four after
    ``to_numpy()``, and it still compares the returned data, the first
    indptr array and the first two indices arrays after the local name of
    the tensor has been rebound to ``None``.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T
    want_indptr = [np.array([0, 2, 3, 4, 6])]
    want_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]

    sparse_tensor = pa.SparseCSFTensor.from_dense_numpy(dense)
    refs_before = sys.getrefcount(sparse_tensor)
    got_values, got_indptr, got_indices = sparse_tensor.to_numpy()
    assert sys.getrefcount(sparse_tensor) == refs_before + 4

    # Drop the local reference before looking at the returned arrays.
    sparse_tensor = None
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_indptr[0], got_indptr[0])
    for level in (0, 1):
        assert np.array_equal(want_indices[level], got_indices[level])
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_equals(sparse_tensor_type):
    """Check ``equals``, ``==`` and ``!=`` between sparse tensors.

    Two tensors built from the same values (once from a strided view, once
    from a contiguous copy) must compare equal; after one element of the
    dense data is overwritten, the tensors must compare unequal.
    """
    def eq(a, b):
        # All three spellings of equality have to agree.
        assert a.equals(b)
        assert a == b
        assert not (a != b)

    def ne(a, b):
        # All three spellings of inequality have to agree.
        assert not a.equals(b)
        assert not (a == b)
        assert a != b

    # A locally seeded generator keeps the input identical on every run, so
    # a failure can be reproduced, and leaves the global NumPy random state
    # untouched.
    rng = np.random.RandomState(42)

    # Take every other column so the first tensor is built from a
    # non-contiguous view.
    data = rng.randn(10, 6)[::, ::2]
    sparse_tensor1 = sparse_tensor_type.from_dense_numpy(data)
    sparse_tensor2 = sparse_tensor_type.from_dense_numpy(
        np.ascontiguousarray(data))
    eq(sparse_tensor1, sparse_tensor2)

    # Work on a copy so the data behind sparse_tensor1 is not modified.
    data = data.copy()
    data[9, 0] = 1.0
    sparse_tensor2 = sparse_tensor_type.from_dense_numpy(
        np.ascontiguousarray(data))
    ne(sparse_tensor1, sparse_tensor2)
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_from_dense(dtype_str, arrow_type):
    """Build a SparseCOOTensor from dense input and check its contents.

    The same checks run for a tensor created from a NumPy array and for one
    created from a dense ``pa.Tensor``: the Arrow type must match
    ``arrow_type`` and ``to_numpy()`` must give the expected data and
    coordinates.
    """
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    want_coords = np.array([
        [0, 0, 1, 2, 3, 3],
        [0, 2, 5, 0, 4, 5],
    ]).T
    tensor = pa.Tensor.from_numpy(dense)

    # First from the numpy array, then from the Tensor.
    construction_paths = (
        (pa.SparseCOOTensor.from_dense_numpy, dense),
        (pa.SparseCOOTensor.from_tensor, tensor),
    )
    for build, source in construction_paths:
        sparse_tensor = build(source)
        repr(sparse_tensor)
        got_values, got_coords = sparse_tensor.to_numpy()
        assert sparse_tensor.type == arrow_type
        assert np.array_equal(want_values, got_values)
        assert np.array_equal(want_coords, got_coords)
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_from_dense(dtype_str, arrow_type):
    """Build a SparseCSRMatrix from dense input and check its contents.

    The same checks run for a matrix created from a NumPy array and for one
    created from a dense ``pa.Tensor``: the Arrow type must match
    ``arrow_type`` and ``to_numpy()`` must give the expected data, indptr
    and indices arrays.
    """
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    want_indptr = np.array([0, 2, 3, 4, 6])
    want_indices = np.array([0, 2, 5, 0, 4, 5])
    tensor = pa.Tensor.from_numpy(dense)

    # First from the numpy array, then from the Tensor.
    construction_paths = (
        (pa.SparseCSRMatrix.from_dense_numpy, dense),
        (pa.SparseCSRMatrix.from_tensor, tensor),
    )
    for build, source in construction_paths:
        sparse_tensor = build(source)
        repr(sparse_tensor)
        got_values, got_indptr, got_indices = sparse_tensor.to_numpy()
        assert sparse_tensor.type == arrow_type
        assert np.array_equal(want_values, got_values)
        assert np.array_equal(want_indptr, got_indptr)
        assert np.array_equal(want_indices, got_indices)
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_from_dense_numpy(dtype_str, arrow_type):
    """Build a SparseCSFTensor from a dense NumPy array and check it.

    The Arrow type must match ``arrow_type`` and ``to_numpy()`` must give
    the expected data, first indptr array and first two indices arrays.
    """
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    want_indptr = [np.array([0, 2, 3, 4, 6])]
    want_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]

    # Test from numpy array
    sparse_tensor = pa.SparseCSFTensor.from_dense_numpy(dense)
    repr(sparse_tensor)
    got_values, got_indptr, got_indices = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_indptr[0], got_indptr[0])
    for level in (0, 1):
        assert np.array_equal(want_indices[level], got_indices[level])
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_from_dense_tensor(dtype_str, arrow_type):
    """Build a SparseCSFTensor from a dense ``pa.Tensor`` and check it.

    The Arrow type must match ``arrow_type`` and ``to_numpy()`` must give
    the expected data, first indptr array and first two indices arrays.
    """
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    want_indptr = [np.array([0, 2, 3, 4, 6])]
    want_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]
    tensor = pa.Tensor.from_numpy(dense)

    # Test from Tensor
    sparse_tensor = pa.SparseCSFTensor.from_tensor(tensor)
    repr(sparse_tensor)
    got_values, got_indptr, got_indices = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_indptr[0], got_indptr[0])
    for level in (0, 1):
        assert np.array_equal(want_indices[level], got_indices[level])
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip data and coordinates through ``SparseCOOTensor``.

    The tensor is created with ``from_numpy`` and read back with
    ``to_numpy``; the arrays, the Arrow type and the dimension names must
    all come back as given.
    """
    names = ('x', 'y')
    extent = (4, 6)
    values = np.array([[1, 2, 3, 4, 5, 6]]).T.astype(np.dtype(dtype_str))
    positions = np.array([
        [0, 0, 2, 3, 1, 3],
        [0, 2, 0, 4, 5, 5],
    ]).T

    sparse_tensor = pa.SparseCOOTensor.from_numpy(
        values, positions, extent, names)
    repr(sparse_tensor)
    got_values, got_positions = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(values, got_values)
    assert np.array_equal(positions, got_positions)
    assert sparse_tensor.dim_names == names
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip data, indptr and indices through ``SparseCSRMatrix``.

    The matrix is created with ``from_numpy`` and read back with
    ``to_numpy``; the arrays, the Arrow type and the dimension names must
    all come back as given.
    """
    names = ('x', 'y')
    extent = (4, 6)
    values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np.dtype(dtype_str))
    row_ptr = np.array([0, 2, 3, 4, 6])
    col_idx = np.array([0, 2, 5, 0, 4, 5])

    sparse_tensor = pa.SparseCSRMatrix.from_numpy(
        values, row_ptr, col_idx, extent, names)
    repr(sparse_tensor)
    got_values, got_row_ptr, got_col_idx = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(values, got_values)
    assert np.array_equal(row_ptr, got_row_ptr)
    assert np.array_equal(col_idx, got_col_idx)
    assert sparse_tensor.dim_names == names
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip data, indptr and indices through ``SparseCSFTensor``.

    The tensor is created with ``from_numpy`` (axis order ``(0, 1)``) and
    read back with ``to_numpy``; the data, the first indptr array, the
    first two indices arrays, the Arrow type and the dimension names must
    all come back as given.
    """
    names = ('x', 'y')
    extent = (4, 6)
    order = (0, 1)
    values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np.dtype(dtype_str))
    ptr_arrays = [np.array([0, 2, 3, 4, 6])]
    idx_arrays = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]

    sparse_tensor = pa.SparseCSFTensor.from_numpy(
        values, ptr_arrays, idx_arrays, extent, order, names)
    repr(sparse_tensor)
    got_values, got_ptr_arrays, got_idx_arrays = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(values, got_values)
    assert np.array_equal(ptr_arrays[0], got_ptr_arrays[0])
    for level in (0, 1):
        assert np.array_equal(idx_arrays[level], got_idx_arrays[level])
    assert sparse_tensor.dim_names == names
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
    """Convert dense -> sparse -> dense and compare with the input.

    A sparse tensor is built from a 4x4 NumPy array, turned back into a
    dense tensor with ``to_tensor()`` and then into NumPy again.  Types,
    dimension names and values are checked.
    """
    names = ('x', 'y')
    dense_in = np.array([
        [4, 0, 9, 0],
        [0, 7, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 5],
    ]).astype(np.dtype(dtype_str))

    sparse_tensor = sparse_tensor_type.from_dense_numpy(dense_in, names)
    dense_tensor = sparse_tensor.to_tensor()
    dense_out = dense_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert dense_tensor.type == arrow_type
    assert sparse_tensor.dim_names == names
    assert np.array_equal(dense_in, dense_out)
@pytest.mark.skipif(coo_matrix is None, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
    """Round-trip a scipy ``coo_matrix`` through ``SparseCOOTensor``.

    The round trip is first done with a matrix that is not in canonical
    format, checking type, dimension names, data, row/col arrays and the
    dense values.  After ``sum_duplicates()`` it is repeated to check that
    the canonical-format flag is carried along as well.
    """
    values = np.array([1, 2, 3, 4, 5, 6]).astype(np.dtype(dtype_str))
    rows = np.array([0, 0, 2, 3, 1, 3])
    cols = np.array([0, 2, 0, 4, 5, 5])
    extent = (4, 6)
    names = ('x', 'y')

    # non-canonical sparse coo matrix
    scipy_in = coo_matrix((values, (rows, cols)), shape=extent)
    sparse_tensor = pa.SparseCOOTensor.from_scipy(scipy_in, dim_names=names)
    scipy_out = sparse_tensor.to_scipy()

    assert not scipy_in.has_canonical_format
    assert not sparse_tensor.has_canonical_format
    assert not scipy_out.has_canonical_format
    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == names
    assert scipy_in.dtype == scipy_out.dtype
    assert np.array_equal(scipy_in.data, scipy_out.data)
    assert np.array_equal(scipy_in.row, scipy_out.row)
    assert np.array_equal(scipy_in.col, scipy_out.col)

    if dtype_str != 'f2':
        want_dense = scipy_in.toarray()
    else:
        # float16 goes through float32 for toarray() and is cast back
        # afterwards (presumably toarray() cannot handle float16 directly
        # -- TODO confirm).
        as_float32 = scipy_in.astype(np.float32)
        want_dense = as_float32.toarray().astype(np.float16)
    assert np.array_equal(want_dense, sparse_tensor.to_tensor().to_numpy())

    # canonical sparse coo matrix
    scipy_in.sum_duplicates()
    sparse_tensor = pa.SparseCOOTensor.from_scipy(scipy_in, dim_names=names)
    scipy_out = sparse_tensor.to_scipy()

    assert scipy_in.has_canonical_format
    assert sparse_tensor.has_canonical_format
    assert scipy_out.has_canonical_format
@pytest.mark.skipif(csr_matrix is None, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
    """Round-trip a scipy ``csr_matrix`` through ``SparseCSRMatrix``.

    Checks the Arrow type, the dimension names, the dtype and the
    data/indptr/indices arrays of the matrix that comes back, and finally
    the dense values obtained through ``to_tensor()``.
    """
    values = np.array([8, 2, 5, 3, 4, 6]).astype(np.dtype(dtype_str))
    row_ptr = np.array([0, 2, 3, 4, 6])
    col_idx = np.array([0, 2, 5, 0, 4, 5])
    extent = (4, 6)
    names = ('x', 'y')

    scipy_in = csr_matrix((values, col_idx, row_ptr), shape=extent)
    sparse_tensor = pa.SparseCSRMatrix.from_scipy(scipy_in, dim_names=names)
    scipy_out = sparse_tensor.to_scipy()

    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == names
    assert scipy_in.dtype == scipy_out.dtype
    assert np.array_equal(scipy_in.data, scipy_out.data)
    assert np.array_equal(scipy_in.indptr, scipy_out.indptr)
    assert np.array_equal(scipy_in.indices, scipy_out.indices)

    if dtype_str != 'f2':
        want_dense = scipy_in.toarray()
    else:
        # float16 goes through float32 for toarray() and is cast back
        # afterwards (presumably toarray() cannot handle float16 directly
        # -- TODO confirm).
        as_float32 = scipy_in.astype(np.float32)
        want_dense = as_float32.toarray().astype(np.float16)
    assert np.array_equal(want_dense, sparse_tensor.to_tensor().to_numpy())
@pytest.mark.skipif(sparse is None, reason="requires pydata/sparse")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type):
    """Round-trip a pydata/sparse ``COO`` array through ``SparseCOOTensor``.

    Checks the Arrow type, the dimension names, the dtype and the
    data/coords arrays of the array that comes back, and finally the dense
    values obtained through ``to_tensor()``.
    """
    values = np.array([1, 2, 3, 4, 5, 6]).astype(np.dtype(dtype_str))
    positions = np.array([
        [0, 0, 2, 3, 1, 3],
        [0, 2, 0, 4, 5, 5],
    ])
    extent = (4, 6)
    names = ("x", "y")

    coo_in = sparse.COO(data=values, coords=positions, shape=extent)
    sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(
        coo_in, dim_names=names)
    coo_out = sparse_tensor.to_pydata_sparse()

    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == names
    assert coo_in.dtype == coo_out.dtype
    assert np.array_equal(coo_in.data, coo_out.data)
    assert np.array_equal(coo_in.coords, coo_out.coords)

    want_dense = coo_in.todense()
    assert np.array_equal(want_dense, sparse_tensor.to_tensor().to_numpy())