mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-21 18:23:35 +00:00
492 lines
17 KiB
Python
492 lines
17 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import pytest
|
|
import sys
|
|
import weakref
|
|
|
|
import numpy as np
|
|
import pyarrow as pa
|
|
|
|
try:
|
|
from scipy.sparse import csr_matrix, coo_matrix
|
|
except ImportError:
|
|
coo_matrix = None
|
|
csr_matrix = None
|
|
|
|
try:
|
|
import sparse
|
|
except ImportError:
|
|
sparse = None
|
|
|
|
|
|
# Pairs of (numpy dtype string, Arrow data type) used to parametrize the
# dtype-dependent tests in this module.  Each Arrow type is instantiated
# once, in the order listed.
tensor_type_pairs = [
    (numpy_code, make_arrow_type())
    for numpy_code, make_arrow_type in (
        # signed integers
        ('i1', pa.int8),
        ('i2', pa.int16),
        ('i4', pa.int32),
        ('i8', pa.int64),
        # unsigned integers
        ('u1', pa.uint8),
        ('u2', pa.uint16),
        ('u4', pa.uint32),
        ('u8', pa.uint64),
        # floating point
        ('f2', pa.float16),
        ('f4', pa.float32),
        ('f8', pa.float64),
    )
]
|
|
|
|
|
|
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    """Check the basic attributes of a sparse tensor built from a dense array.

    A 4x6 array with six non-zero entries is converted with
    ``from_dense_numpy`` and the reported dimensions, size, shape,
    mutability, dimension names and non-zero count are verified.  The test
    ends by checking that the object can be weakly referenced and that the
    weak reference is cleared once the only strong reference is deleted.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    names = ('x', 'y')
    tensor = sparse_tensor_type.from_dense_numpy(dense, names)

    assert tensor.ndim == 2
    assert tensor.size == 24
    assert tensor.shape == dense.shape
    assert tensor.is_mutable
    assert tensor.dim_name(0) == names[0]
    assert tensor.dim_names == names
    assert tensor.non_zero_length == 6

    # `tensor` is the only strong reference, so deleting the name must
    # leave the weak reference dead.
    tensor_ref = weakref.ref(tensor)
    assert tensor_ref() is not None
    del tensor
    assert tensor_ref() is None
|
|
|
|
|
|
def test_sparse_coo_tensor_base_object():
    """Check reference counting and results of ``SparseCOOTensor.to_numpy``.

    After ``to_numpy()`` returns its two arrays, the reference count of the
    sparse tensor must have grown by two (presumably one per returned array
    keeping the tensor alive -- confirm against the binding code).  The
    arrays must still hold the expected values after the local name for the
    tensor has been released.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    # Non-zero values as a column vector, in row-major order.
    expected_data = np.array([[8], [2], [5], [3], [4], [6]])
    # One (row, column) pair per non-zero value.
    expected_coords = np.array([
        [0, 0],
        [0, 2],
        [1, 5],
        [2, 0],
        [3, 4],
        [3, 5],
    ])

    coo = pa.SparseCOOTensor.from_dense_numpy(dense)
    refs_before = sys.getrefcount(coo)
    result_data, result_coords = coo.to_numpy()
    assert coo.has_canonical_format
    assert sys.getrefcount(coo) == refs_before + 2

    # Drop the local reference; the results must remain usable.
    coo = None
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_coords, result_coords)
    assert result_coords.flags.c_contiguous  # row-major
|
|
|
|
|
|
def test_sparse_csr_matrix_base_object():
    """Check reference counting and results of ``SparseCSRMatrix.to_numpy``.

    ``to_numpy()`` returns three arrays (data, indptr, indices); afterwards
    the reference count of the sparse matrix must have grown by three.  The
    arrays must still hold the expected values after the local name for the
    matrix has been released.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    expected_data = np.array([[8], [2], [5], [3], [4], [6]])
    expected_indptr = np.array([0, 2, 3, 4, 6])
    expected_indices = np.array([0, 2, 5, 0, 4, 5])

    csr = pa.SparseCSRMatrix.from_dense_numpy(dense)
    refs_before = sys.getrefcount(csr)
    result_data, result_indptr, result_indices = csr.to_numpy()
    assert sys.getrefcount(csr) == refs_before + 3

    # Drop the local reference; the results must remain usable.
    csr = None
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_indptr, result_indptr)
    assert np.array_equal(expected_indices, result_indices)
|
|
|
|
|
|
def test_sparse_csf_tensor_base_object():
    """Check reference counting and results of ``SparseCSFTensor.to_numpy``.

    After ``to_numpy()`` the reference count of the sparse tensor must have
    grown by four (presumably the data array, one indptr array and two
    indices arrays -- confirm against the binding code).  The results must
    still hold the expected values after the local name for the tensor has
    been released.
    """
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    expected_data = np.array([[8], [2], [5], [3], [4], [6]])
    expected_indptr = [np.array([0, 2, 3, 4, 6])]
    expected_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]

    csf = pa.SparseCSFTensor.from_dense_numpy(dense)
    refs_before = sys.getrefcount(csf)
    result_data, result_indptr, result_indices = csf.to_numpy()
    assert sys.getrefcount(csf) == refs_before + 4

    # Drop the local reference; the results must remain usable.
    csf = None
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_indptr[0], result_indptr[0])
    for level in (0, 1):
        assert np.array_equal(expected_indices[level], result_indices[level])
|
|
|
|
|
|
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_equals(sparse_tensor_type):
    """Check ``equals``, ``==`` and ``!=`` on sparse tensors.

    Two tensors built from the same values (one from a strided view, one
    from a contiguous copy) must compare equal; after one element of the
    source data is changed the tensors must compare unequal.
    """
    def assert_same(left, right):
        # All three comparison entry points must agree on equality.
        assert left.equals(right)
        assert left == right
        assert not (left != right)

    def assert_different(left, right):
        # All three comparison entry points must agree on inequality.
        assert not left.equals(right)
        assert not (left == right)
        assert left != right

    # Every other column of a random 10x6 array: a non-contiguous view.
    strided = np.random.randn(10, 6)[:, ::2]
    from_view = sparse_tensor_type.from_dense_numpy(strided)
    from_copy = sparse_tensor_type.from_dense_numpy(
        np.ascontiguousarray(strided))
    assert_same(from_view, from_copy)

    # Change a single element in a copy and rebuild the second tensor.
    modified = strided.copy()
    modified[9, 0] = 1.0
    from_copy = sparse_tensor_type.from_dense_numpy(
        np.ascontiguousarray(modified))
    assert_different(from_view, from_copy)
|
|
|
|
|
|
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_from_dense(dtype_str, arrow_type):
    """Build a ``SparseCOOTensor`` from dense input for every dtype.

    The same 4x6 array is converted once from a numpy array and once from
    a ``pa.Tensor``; in both cases the Arrow type, the non-zero values and
    their coordinates are verified.
    """
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    expected_data = np.array([[8], [2], [5], [3], [4], [6]]).astype(np_dtype)
    # One (row, column) pair per non-zero value.
    expected_coords = np.array([
        [0, 0],
        [0, 2],
        [1, 5],
        [2, 0],
        [3, 4],
        [3, 5],
    ])
    dense_tensor = pa.Tensor.from_numpy(dense)

    def check(sparse_tensor):
        # repr() is only exercised here; its output is not inspected.
        repr(sparse_tensor)
        result_data, result_coords = sparse_tensor.to_numpy()
        assert sparse_tensor.type == arrow_type
        assert np.array_equal(expected_data, result_data)
        assert np.array_equal(expected_coords, result_coords)

    # Test from numpy array
    check(pa.SparseCOOTensor.from_dense_numpy(dense))

    # Test from Tensor
    check(pa.SparseCOOTensor.from_tensor(dense_tensor))
|
|
|
|
|
|
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_from_dense(dtype_str, arrow_type):
    """Build a ``SparseCSRMatrix`` from dense input for every dtype.

    The same 4x6 array is converted once from a numpy array and once from
    a ``pa.Tensor``; in both cases the Arrow type and the data, indptr and
    indices arrays are verified.
    """
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    expected_data = np.array([[8], [2], [5], [3], [4], [6]]).astype(np_dtype)
    expected_indptr = np.array([0, 2, 3, 4, 6])
    expected_indices = np.array([0, 2, 5, 0, 4, 5])
    dense_tensor = pa.Tensor.from_numpy(dense)

    def check(sparse_tensor):
        # repr() is only exercised here; its output is not inspected.
        repr(sparse_tensor)
        result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
        assert sparse_tensor.type == arrow_type
        assert np.array_equal(expected_data, result_data)
        assert np.array_equal(expected_indptr, result_indptr)
        assert np.array_equal(expected_indices, result_indices)

    # Test from numpy array
    check(pa.SparseCSRMatrix.from_dense_numpy(dense))

    # Test from Tensor
    check(pa.SparseCSRMatrix.from_tensor(dense_tensor))
|
|
|
|
|
|
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_from_dense_numpy(dtype_str, arrow_type):
    """Build a ``SparseCSFTensor`` from a dense numpy array for every dtype.

    Verifies the Arrow type, the non-zero values, the single indptr array
    and both indices arrays returned by ``to_numpy()``.
    """
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    expected_data = np.array([[8], [2], [5], [3], [4], [6]]).astype(np_dtype)
    expected_indptr = [np.array([0, 2, 3, 4, 6])]
    expected_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]

    # Test from numpy array
    csf = pa.SparseCSFTensor.from_dense_numpy(dense)
    # repr() is only exercised here; its output is not inspected.
    repr(csf)
    result_data, result_indptr, result_indices = csf.to_numpy()
    assert csf.type == arrow_type
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_indptr[0], result_indptr[0])
    for level in (0, 1):
        assert np.array_equal(expected_indices[level], result_indices[level])
|
|
|
|
|
|
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_from_dense_tensor(dtype_str, arrow_type):
    """Build a ``SparseCSFTensor`` from a ``pa.Tensor`` for every dtype.

    Verifies the Arrow type, the non-zero values, the single indptr array
    and both indices arrays returned by ``to_numpy()``.
    """
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    expected_data = np.array([[8], [2], [5], [3], [4], [6]]).astype(np_dtype)
    expected_indptr = [np.array([0, 2, 3, 4, 6])]
    expected_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]
    dense_tensor = pa.Tensor.from_numpy(dense)

    # Test from Tensor
    csf = pa.SparseCSFTensor.from_tensor(dense_tensor)
    # repr() is only exercised here; its output is not inspected.
    repr(csf)
    result_data, result_indptr, result_indices = csf.to_numpy()
    assert csf.type == arrow_type
    assert np.array_equal(expected_data, result_data)
    assert np.array_equal(expected_indptr[0], result_indptr[0])
    for level in (0, 1):
        assert np.array_equal(expected_indices[level], result_indices[level])
|
|
|
|
|
|
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip COO components through ``SparseCOOTensor`` for every dtype.

    The tensor is built with ``from_numpy`` from a data column and a
    coordinate array, then ``to_numpy()`` must return the same values and
    coordinates, and the dimension names must be preserved.
    """
    values = np.array([[1, 2, 3, 4, 5, 6]]).T.astype(np.dtype(dtype_str))
    # Transposed so that each row is one (row, column) coordinate pair.
    positions = np.array([
        [0, 0, 2, 3, 1, 3],
        [0, 2, 0, 4, 5, 5],
    ]).T
    tensor_shape = (4, 6)
    names = ('x', 'y')

    coo = pa.SparseCOOTensor.from_numpy(values, positions, tensor_shape,
                                        names)
    # repr() is only exercised here; its output is not inspected.
    repr(coo)
    result_data, result_coords = coo.to_numpy()

    assert coo.type == arrow_type
    assert np.array_equal(values, result_data)
    assert np.array_equal(positions, result_coords)
    assert coo.dim_names == names
|
|
|
|
|
|
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip CSR components through ``SparseCSRMatrix`` for every dtype.

    The matrix is built with ``from_numpy`` from data, indptr and indices
    arrays, then ``to_numpy()`` must return the same three arrays, and the
    dimension names must be preserved.
    """
    values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np.dtype(dtype_str))
    row_pointers = np.array([0, 2, 3, 4, 6])
    column_indices = np.array([0, 2, 5, 0, 4, 5])
    matrix_shape = (4, 6)
    names = ('x', 'y')

    csr = pa.SparseCSRMatrix.from_numpy(values, row_pointers, column_indices,
                                        matrix_shape, names)
    # repr() is only exercised here; its output is not inspected.
    repr(csr)
    result_data, result_indptr, result_indices = csr.to_numpy()

    assert csr.type == arrow_type
    assert np.array_equal(values, result_data)
    assert np.array_equal(row_pointers, result_indptr)
    assert np.array_equal(column_indices, result_indices)
    assert csr.dim_names == names
|
|
|
|
|
|
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Round-trip CSF components through ``SparseCSFTensor`` for every dtype.

    The tensor is built with ``from_numpy`` from data, a list of indptr
    arrays, a list of indices arrays, a shape, an axis order and dimension
    names; ``to_numpy()`` must then return matching arrays and the
    dimension names must be preserved.
    """
    values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np.dtype(dtype_str))
    indptr_levels = [np.array([0, 2, 3, 4, 6])]
    indices_levels = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]
    tensor_shape = (4, 6)
    axes = (0, 1)
    names = ('x', 'y')

    csf = pa.SparseCSFTensor.from_numpy(values, indptr_levels,
                                        indices_levels, tensor_shape, axes,
                                        names)
    # repr() is only exercised here; its output is not inspected.
    repr(csf)
    result_data, result_indptr, result_indices = csf.to_numpy()

    assert csf.type == arrow_type
    assert np.array_equal(values, result_data)
    assert np.array_equal(indptr_levels[0], result_indptr[0])
    for level in (0, 1):
        assert np.array_equal(indices_levels[level], result_indices[level])
    assert csf.dim_names == names
|
|
|
|
|
|
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
    """Convert dense -> sparse -> dense for every format and dtype.

    A 4x4 array is converted to the given sparse format, back to a
    ``pa.Tensor`` with ``to_tensor()`` and then to numpy; the Arrow types,
    the dimension names and the final array contents are verified.
    """
    original = np.array([
        [4, 0, 9, 0],
        [0, 7, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 5],
    ]).astype(np.dtype(dtype_str))
    names = ('x', 'y')

    sparse_form = sparse_tensor_type.from_dense_numpy(original, names)
    dense_form = sparse_form.to_tensor()
    roundtripped = dense_form.to_numpy()

    assert sparse_form.type == arrow_type
    assert dense_form.type == arrow_type
    assert sparse_form.dim_names == names
    assert np.array_equal(original, roundtripped)
|
|
|
|
|
|
@pytest.mark.skipif(not coo_matrix, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
    """Round-trip a scipy ``coo_matrix`` through ``SparseCOOTensor``.

    First a matrix whose entries are not in canonical order is converted
    with ``from_scipy`` and back with ``to_scipy``; type, dimension names,
    dtype, components and the dense form are compared.  Then, after
    ``sum_duplicates()``, the canonical-format flag must be reported by the
    scipy matrix, the Arrow tensor and the matrix converted back.
    """
    values = np.array([1, 2, 3, 4, 5, 6]).astype(np.dtype(dtype_str))
    rows = np.array([0, 0, 2, 3, 1, 3])
    cols = np.array([0, 2, 0, 4, 5, 5])
    matrix_shape = (4, 6)
    names = ('x', 'y')

    # non-canonical sparse coo matrix
    scipy_matrix = coo_matrix((values, (rows, cols)), shape=matrix_shape)
    sparse_tensor = pa.SparseCOOTensor.from_scipy(scipy_matrix,
                                                  dim_names=names)
    out_scipy_matrix = sparse_tensor.to_scipy()

    assert not scipy_matrix.has_canonical_format
    assert not sparse_tensor.has_canonical_format
    assert not out_scipy_matrix.has_canonical_format
    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == names
    assert scipy_matrix.dtype == out_scipy_matrix.dtype
    for component in ('data', 'row', 'col'):
        assert np.array_equal(getattr(scipy_matrix, component),
                              getattr(out_scipy_matrix, component))

    if dtype_str != 'f2':
        dense_array = scipy_matrix.toarray()
    else:
        # float16 is densified via float32 and cast back.
        # NOTE(review): presumably scipy cannot densify float16 directly
        # -- confirm.
        as_float32 = scipy_matrix.astype(np.float32)
        dense_array = as_float32.toarray().astype(np.float16)
    assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy())

    # canonical sparse coo matrix
    scipy_matrix.sum_duplicates()
    sparse_tensor = pa.SparseCOOTensor.from_scipy(scipy_matrix,
                                                  dim_names=names)
    out_scipy_matrix = sparse_tensor.to_scipy()

    assert scipy_matrix.has_canonical_format
    assert sparse_tensor.has_canonical_format
    assert out_scipy_matrix.has_canonical_format
|
|
|
|
|
|
@pytest.mark.skipif(not csr_matrix, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
    """Round-trip a scipy ``csr_matrix`` through ``SparseCSRMatrix``.

    The matrix is converted with ``from_scipy`` and back with ``to_scipy``;
    the Arrow type, dimension names, dtype, the data/indptr/indices arrays
    and the dense form are compared.
    """
    values = np.array([8, 2, 5, 3, 4, 6]).astype(np.dtype(dtype_str))
    row_pointers = np.array([0, 2, 3, 4, 6])
    column_indices = np.array([0, 2, 5, 0, 4, 5])
    matrix_shape = (4, 6)
    names = ('x', 'y')

    scipy_matrix = csr_matrix((values, column_indices, row_pointers),
                              shape=matrix_shape)
    sparse_tensor = pa.SparseCSRMatrix.from_scipy(scipy_matrix,
                                                  dim_names=names)
    out_scipy_matrix = sparse_tensor.to_scipy()

    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == names
    assert scipy_matrix.dtype == out_scipy_matrix.dtype
    for component in ('data', 'indptr', 'indices'):
        assert np.array_equal(getattr(scipy_matrix, component),
                              getattr(out_scipy_matrix, component))

    if dtype_str != 'f2':
        dense_array = scipy_matrix.toarray()
    else:
        # float16 is densified via float32 and cast back.
        # NOTE(review): presumably scipy cannot densify float16 directly
        # -- confirm.
        as_float32 = scipy_matrix.astype(np.float32)
        dense_array = as_float32.toarray().astype(np.float16)
    assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy())
|
|
|
|
|
|
@pytest.mark.skipif(not sparse, reason="requires pydata/sparse")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type):
    """Round-trip a pydata/sparse ``COO`` array through ``SparseCOOTensor``.

    The array is converted with ``from_pydata_sparse`` and back with
    ``to_pydata_sparse``; the Arrow type, dimension names, dtype, the data
    and coords arrays and the dense form are compared.
    """
    values = np.array([1, 2, 3, 4, 5, 6]).astype(np.dtype(dtype_str))
    # First row holds the row numbers, second row the column numbers.
    positions = np.array([
        [0, 0, 2, 3, 1, 3],
        [0, 2, 0, 4, 5, 5],
    ])
    array_shape = (4, 6)
    names = ("x", "y")

    pydata_array = sparse.COO(data=values, coords=positions,
                              shape=array_shape)
    sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(pydata_array,
                                                          dim_names=names)
    out_pydata_array = sparse_tensor.to_pydata_sparse()

    assert sparse_tensor.type == arrow_type
    assert sparse_tensor.dim_names == names
    assert pydata_array.dtype == out_pydata_array.dtype
    for component in ('data', 'coords'):
        assert np.array_equal(getattr(pydata_array, component),
                              getattr(out_pydata_array, component))
    assert np.array_equal(pydata_array.todense(),
                          sparse_tensor.to_tensor().to_numpy())
|