# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest
import sys
import weakref

import numpy as np
import pyarrow as pa

try:
    from scipy.sparse import csr_matrix, coo_matrix
except ImportError:
    coo_matrix = None
    csr_matrix = None

try:
    import sparse
except ImportError:
    sparse = None


# Pairs of (numpy dtype string, Arrow data type) used to parametrize the
# dtype-dependent tests in this module.
tensor_type_pairs = [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
]


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    """Check the basic attributes of a sparse tensor built from a dense array.

    Also checks, through a weak reference, that the tensor object is gone
    once its only local name has been deleted.
    """
    dim_names = ('x', 'y')
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    tensor = sparse_tensor_type.from_dense_numpy(dense, dim_names)

    assert tensor.ndim == 2
    assert tensor.size == 24
    assert tensor.shape == dense.shape
    assert tensor.is_mutable
    assert tensor.dim_name(0) == dim_names[0]
    assert tensor.dim_names == dim_names
    assert tensor.non_zero_length == 6

    # The weak reference must die together with the last strong reference.
    tensor_ref = weakref.ref(tensor)
    assert tensor_ref() is not None
    del tensor
    assert tensor_ref() is None


def test_sparse_coo_tensor_base_object():
    """Check reference counting and results of ``SparseCOOTensor.to_numpy``.

    The reference count of the tensor must grow by two after the call, and
    the returned arrays must still compare equal to the expected values
    after the local name of the tensor has been rebound to ``None``.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    expected_coords = np.array([
        [0, 0, 1, 2, 3, 3],
        [0, 2, 5, 0, 4, 5],
    ]).T
    expected_data = np.array([[8, 2, 5, 3, 4, 6]]).T

    sparse_tensor = pa.SparseCOOTensor.from_dense_numpy(dense)
    refs_before = sys.getrefcount(sparse_tensor)
    result_data, result_coords = sparse_tensor.to_numpy()
    assert sparse_tensor.has_canonical_format
    assert sys.getrefcount(sparse_tensor) == refs_before + 2

    # Drop the local reference; the results are inspected afterwards.
    sparse_tensor = None
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_coords, result_coords)
    assert result_coords.flags.c_contiguous  # row-major


def test_sparse_csr_matrix_base_object():
    """Check reference counting and results of ``SparseCSRMatrix.to_numpy``.

    The reference count of the matrix must grow by three after the call,
    and the returned arrays must still compare equal to the expected values
    after the local name of the matrix has been rebound to ``None``.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    expected_data = np.array([[8, 2, 5, 3, 4, 6]]).T
    expected_indptr = np.array([0, 2, 3, 4, 6])
    expected_indices = np.array([0, 2, 5, 0, 4, 5])

    sparse_tensor = pa.SparseCSRMatrix.from_dense_numpy(dense)
    refs_before = sys.getrefcount(sparse_tensor)
    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
    assert sys.getrefcount(sparse_tensor) == refs_before + 3

    # Drop the local reference; the results are inspected afterwards.
    sparse_tensor = None
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_indptr, result_indptr)
    assert np.array_equal(expected_indices, result_indices)


def test_sparse_csf_tensor_base_object():
    """Check reference counting and results of ``SparseCSFTensor.to_numpy``.

    The reference count of the tensor must grow by four after the call, and
    the returned arrays must still compare equal to the expected values
    after the local name of the tensor has been rebound to ``None``.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    expected_data = np.array([[8, 2, 5, 3, 4, 6]]).T
    expected_indptr = [np.array([0, 2, 3, 4, 6])]
    expected_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5])
    ]

    sparse_tensor = pa.SparseCSFTensor.from_dense_numpy(dense)
    refs_before = sys.getrefcount(sparse_tensor)
    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
    assert sys.getrefcount(sparse_tensor) == refs_before + 4

    # Drop the local reference; the results are inspected afterwards.
    sparse_tensor = None
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_indptr[0], result_indptr[0])
    assert np.array_equal(expected_indices[0], result_indices[0])
    assert np.array_equal(expected_indices[1], result_indices[1])


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_equals(sparse_tensor_type):
    """Check ``equals``, ``==`` and ``!=`` on equal and unequal tensors."""

    def assert_same(left, right):
        assert left.equals(right)
        assert left == right
        assert not (left != right)

    def assert_differ(left, right):
        assert not left.equals(right)
        assert not (left == right)
        assert left != right

    # Every second column of a random matrix: a strided view rather than
    # a contiguous array.
    strided = np.random.randn(10, 6)[:, ::2]
    from_view = sparse_tensor_type.from_dense_numpy(strided)
    from_copy = sparse_tensor_type.from_dense_numpy(
        np.ascontiguousarray(strided))
    assert_same(from_view, from_copy)

    # Changing a single element must make the tensors compare unequal.
    modified = strided.copy()
    modified[9, 0] = 1.0
    from_modified = sparse_tensor_type.from_dense_numpy(
        np.ascontiguousarray(modified))
    assert_differ(from_view, from_modified)


@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_from_dense(dtype_str, arrow_type):
    """Build a SparseCOOTensor from a dense ndarray and from a Tensor."""
    dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(dtype)
    expected_data = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(dtype)
    expected_coords = np.array([
        [0, 0, 1, 2, 3, 3],
        [0, 2, 5, 0, 4, 5],
    ]).T
    dense_tensor = pa.Tensor.from_numpy(dense)

    # The same checks run for both construction paths: first from the
    # numpy array, then from the dense Tensor.
    builders = (
        (pa.SparseCOOTensor.from_dense_numpy, dense),
        (pa.SparseCOOTensor.from_tensor, dense_tensor),
    )
    for build, source in builders:
        sparse_tensor = build(source)
        repr(sparse_tensor)
        result_data, result_coords = sparse_tensor.to_numpy()

        assert sparse_tensor.type == arrow_type
        assert np.array_equal(expected_data, result_data)
        assert np.array_equal(expected_coords, result_coords)


@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_from_dense(dtype_str, arrow_type):
    """Build a SparseCSRMatrix from a dense ndarray and from a Tensor."""
    dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(dtype)
    expected_data = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(dtype)
    expected_indptr = np.array([0, 2, 3, 4, 6])
    expected_indices = np.array([0, 2, 5, 0, 4, 5])
    dense_tensor = pa.Tensor.from_numpy(dense)

    # The same checks run for both construction paths: first from the
    # numpy array, then from the dense Tensor.
    builders = (
        (pa.SparseCSRMatrix.from_dense_numpy, dense),
        (pa.SparseCSRMatrix.from_tensor, dense_tensor),
    )
    for build, source in builders:
        sparse_tensor = build(source)
        repr(sparse_tensor)
        result_data, result_indptr, result_indices = sparse_tensor.to_numpy()

        assert sparse_tensor.type == arrow_type
        assert np.array_equal(expected_data, result_data)
        assert np.array_equal(expected_indptr, result_indptr)
        assert np.array_equal(expected_indices, result_indices)


@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_from_dense_numpy(dtype_str, arrow_type):
    """Build a SparseCSFTensor from a dense ndarray and check its parts."""
    dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(dtype)
    expected_data = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(dtype)
    expected_indptr = [np.array([0, 2, 3, 4, 6])]
    expected_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5])
    ]

    # Test from numpy array
    sparse_tensor = pa.SparseCSFTensor.from_dense_numpy(dense)
    repr(sparse_tensor)
    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_indptr[0], result_indptr[0])
    assert np.array_equal(expected_indices[0], result_indices[0])
    assert np.array_equal(expected_indices[1], result_indices[1])


@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_from_dense_tensor(dtype_str, arrow_type):
    """Build a SparseCSFTensor from a dense Tensor and check its parts."""
    dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(dtype)
    expected_data = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(dtype)
    expected_indptr = [np.array([0, 2, 3, 4, 6])]
    expected_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5])
    ]
    dense_tensor = pa.Tensor.from_numpy(dense)

    # Test from Tensor
    sparse_tensor = pa.SparseCSFTensor.from_tensor(dense_tensor)
    repr(sparse_tensor)
    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_indptr[0], result_indptr[0])
    assert np.array_equal(expected_indices[0], result_indices[0])
    assert np.array_equal(expected_indices[1], result_indices[1])


@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip COO data and coordinates through ``SparseCOOTensor``."""
    dtype = np.dtype(dtype_str)
    shape = (4, 6)
    dim_names = ('x', 'y')
    values = np.array([[1, 2, 3, 4, 5, 6]]).T.astype(dtype)
    coords = np.array([
        [0, 0, 2, 3, 1, 3],
        [0, 2, 0, 4, 5, 5],
    ]).T

    sparse_tensor = pa.SparseCOOTensor.from_numpy(
        values, coords, shape, dim_names)
    repr(sparse_tensor)
    result_data, result_coords = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(values, result_data)
    assert np.array_equal(coords, result_coords)
    assert sparse_tensor.dim_names == dim_names


@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip CSR data, indptr and indices through ``SparseCSRMatrix``."""
    dtype = np.dtype(dtype_str)
    shape = (4, 6)
    dim_names = ('x', 'y')
    values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(dtype)
    indptr = np.array([0, 2, 3, 4, 6])
    indices = np.array([0, 2, 5, 0, 4, 5])

    sparse_tensor = pa.SparseCSRMatrix.from_numpy(
        values, indptr, indices, shape, dim_names)
    repr(sparse_tensor)
    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(values, result_data)
    assert np.array_equal(indptr, result_indptr)
    assert np.array_equal(indices, result_indices)
    assert sparse_tensor.dim_names == dim_names
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip CSF data, indptr and indices through ``SparseCSFTensor``."""
    dtype = np.dtype(dtype_str)
    shape = (4, 6)
    axis_order = (0, 1)
    dim_names = ('x', 'y')
    values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(dtype)
    indptr = [np.array([0, 2, 3, 4, 6])]
    indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5])
    ]

    sparse_tensor = pa.SparseCSFTensor.from_numpy(
        values, indptr, indices, shape, axis_order, dim_names)
    repr(sparse_tensor)
    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert np.array_equal(values, result_data)
    assert np.array_equal(indptr[0], result_indptr[0])
    assert np.array_equal(indices[0], result_indices[0])
    assert np.array_equal(indices[1], result_indices[1])
    assert sparse_tensor.dim_names == dim_names


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
    """Convert dense -> sparse -> dense and compare with the input array."""
    dtype = np.dtype(dtype_str)
    dim_names = ('x', 'y')
    dense = np.array([
        [4, 0, 9, 0],
        [0, 7, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 5],
    ]).astype(dtype)

    sparse_tensor = sparse_tensor_type.from_dense_numpy(dense, dim_names)
    dense_tensor = sparse_tensor.to_tensor()
    roundtripped = dense_tensor.to_numpy()

    assert sparse_tensor.type == arrow_type
    assert dense_tensor.type == arrow_type
    assert sparse_tensor.dim_names == dim_names
    assert np.array_equal(dense, roundtripped)


@pytest.mark.skipif(not coo_matrix, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
    """Round-trip a scipy ``coo_matrix`` through ``SparseCOOTensor``.

    The matrix is first converted in non-canonical form and then again
    after ``sum_duplicates()``; the ``has_canonical_format`` flag is
    expected to agree on the input, the Arrow tensor and the output.
    """
    dtype = np.dtype(dtype_str)
    shape = (4, 6)
    dim_names = ('x', 'y')
    values = np.array([1, 2, 3, 4, 5, 6]).astype(dtype)
    rows = np.array([0, 0, 2, 3, 1, 3])
    cols = np.array([0, 2, 0, 4, 5, 5])

    # non-canonical sparse coo matrix
    scipy_matrix = coo_matrix((values, (rows, cols)), shape=shape)
    sparse_tensor = pa.SparseCOOTensor.from_scipy(
        scipy_matrix, dim_names=dim_names)
    out_scipy_matrix = sparse_tensor.to_scipy()

    assert not scipy_matrix.has_canonical_format
    assert not sparse_tensor.has_canonical_format
    assert not out_scipy_matrix.has_canonical_format
    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == dim_names
    assert scipy_matrix.dtype == out_scipy_matrix.dtype
    assert np.array_equal(scipy_matrix.data, out_scipy_matrix.data)
    assert np.array_equal(scipy_matrix.row, out_scipy_matrix.row)
    assert np.array_equal(scipy_matrix.col, out_scipy_matrix.col)

    if dtype_str != 'f2':
        expected_dense = scipy_matrix.toarray()
    else:
        # NOTE(review): float16 is densified through float32 and cast
        # back -- presumably scipy cannot densify float16 directly; confirm.
        expected_dense = \
            scipy_matrix.astype(np.float32).toarray().astype(np.float16)
    assert np.array_equal(expected_dense,
                          sparse_tensor.to_tensor().to_numpy())

    # canonical sparse coo matrix
    scipy_matrix.sum_duplicates()
    sparse_tensor = pa.SparseCOOTensor.from_scipy(
        scipy_matrix, dim_names=dim_names)
    out_scipy_matrix = sparse_tensor.to_scipy()

    assert scipy_matrix.has_canonical_format
    assert sparse_tensor.has_canonical_format
    assert out_scipy_matrix.has_canonical_format


@pytest.mark.skipif(not csr_matrix, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
    """Round-trip a scipy ``csr_matrix`` through ``SparseCSRMatrix``."""
    dtype = np.dtype(dtype_str)
    shape = (4, 6)
    dim_names = ('x', 'y')
    values = np.array([8, 2, 5, 3, 4, 6]).astype(dtype)
    indptr = np.array([0, 2, 3, 4, 6])
    indices = np.array([0, 2, 5, 0, 4, 5])

    scipy_matrix = csr_matrix((values, indices, indptr), shape=shape)
    sparse_tensor = pa.SparseCSRMatrix.from_scipy(
        scipy_matrix, dim_names=dim_names)
    out_scipy_matrix = sparse_tensor.to_scipy()

    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == dim_names
    assert scipy_matrix.dtype == out_scipy_matrix.dtype
    assert np.array_equal(scipy_matrix.data, out_scipy_matrix.data)
    assert np.array_equal(scipy_matrix.indptr, out_scipy_matrix.indptr)
    assert np.array_equal(scipy_matrix.indices, out_scipy_matrix.indices)

    if dtype_str != 'f2':
        expected_dense = scipy_matrix.toarray()
    else:
        # NOTE(review): float16 is densified through float32 and cast
        # back -- presumably scipy cannot densify float16 directly; confirm.
        expected_dense = \
            scipy_matrix.astype(np.float32).toarray().astype(np.float16)
    assert np.array_equal(expected_dense,
                          sparse_tensor.to_tensor().to_numpy())


@pytest.mark.skipif(not sparse, reason="requires pydata/sparse")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type):
    """Round-trip a pydata/sparse ``COO`` array through ``SparseCOOTensor``."""
    dtype = np.dtype(dtype_str)
    shape = (4, 6)
    dim_names = ("x", "y")
    values = np.array([1, 2, 3, 4, 5, 6]).astype(dtype)
    coords = np.array([
        [0, 0, 2, 3, 1, 3],
        [0, 2, 0, 4, 5, 5],
    ])

    pydata_array = sparse.COO(data=values, coords=coords, shape=shape)
    sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(
        pydata_array, dim_names=dim_names)
    out_pydata_array = sparse_tensor.to_pydata_sparse()

    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == dim_names
    assert pydata_array.dtype == out_pydata_array.dtype
    assert np.array_equal(pydata_array.data, out_pydata_array.data)
    assert np.array_equal(pydata_array.coords, out_pydata_array.coords)
    assert np.array_equal(pydata_array.todense(),
                          sparse_tensor.to_tensor().to_numpy())