mirror of https://github.com/aykhans/AzSuicideDataVisualization.git (synced 2025-07-04 07:08:05 +00:00)
first commit
225 .venv/Lib/site-packages/pandas/tests/extension/arrow/arrays.py Normal file
@@ -0,0 +1,225 @@
"""
Rudimentary Apache Arrow-backed ExtensionArray.

At the moment, just a boolean array / type is implemented.
Eventually, we'll want to parametrize the type and support
multiple dtypes. Not all methods are implemented yet, and the
current implementation is not efficient.
"""
from __future__ import annotations

import copy
import itertools
import operator

import numpy as np
import pyarrow as pa

from pandas._typing import type_t

import pandas as pd
from pandas.api.extensions import (
    ExtensionArray,
    ExtensionDtype,
    register_extension_dtype,
    take,
)
from pandas.api.types import is_scalar
from pandas.core.arraylike import OpsMixin
from pandas.core.construction import extract_array


@register_extension_dtype
class ArrowBoolDtype(ExtensionDtype):

    type = np.bool_
    kind = "b"
    name = "arrow_bool"
    na_value = pa.NULL

    @classmethod
    def construct_array_type(cls) -> type_t[ArrowBoolArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return ArrowBoolArray

    @property
    def _is_boolean(self) -> bool:
        return True


@register_extension_dtype
class ArrowStringDtype(ExtensionDtype):

    type = str
    kind = "U"
    name = "arrow_string"
    na_value = pa.NULL

    @classmethod
    def construct_array_type(cls) -> type_t[ArrowStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return ArrowStringArray


class ArrowExtensionArray(OpsMixin, ExtensionArray):
    _data: pa.ChunkedArray

    @classmethod
    def from_scalars(cls, values):
        if isinstance(values, cls):
            # in particular for empty cases the pa.array(np.asarray(...))
            # does not round-trip
            return cls(values._data)

        elif not len(values):
            if isinstance(values, list):
                dtype = bool if cls is ArrowBoolArray else str
                values = np.array([], dtype=dtype)

        arr = pa.chunked_array([pa.array(np.asarray(values))])
        return cls(arr)

    @classmethod
    def from_array(cls, arr):
        assert isinstance(arr, pa.Array)
        return cls(pa.chunked_array([arr]))

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        return cls.from_scalars(scalars)

    def __repr__(self):
        return f"{type(self).__name__}({repr(self._data)})"

    def __contains__(self, obj) -> bool:
        if obj is None or obj is self.dtype.na_value:
            # None -> EA.__contains__ only checks for self._dtype.na_value, not
            #  any compatible NA value.
            # self.dtype.na_value -> <pa.NullScalar:None> isn't recognized by pd.isna
            return bool(self.isna().any())
        return bool(super().__contains__(obj))

    def __getitem__(self, item):
        if is_scalar(item):
            return self._data.to_pandas()[item]
        else:
            vals = self._data.to_pandas()[item]
            return type(self).from_scalars(vals)

    def __len__(self):
        return len(self._data)

    def astype(self, dtype, copy=True):
        # needed to fix this astype for the Series constructor.
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        return super().astype(dtype, copy)

    @property
    def dtype(self):
        return self._dtype

    def _logical_method(self, other, op):
        if not isinstance(other, type(self)):
            raise NotImplementedError()

        result = op(np.array(self._data), np.array(other._data))
        return ArrowBoolArray(
            pa.chunked_array([pa.array(result, mask=pd.isna(self._data.to_pandas()))])
        )

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            # TODO: use some pyarrow function here?
            return np.asarray(self).__eq__(other)

        return self._logical_method(other, operator.eq)

    @property
    def nbytes(self) -> int:
        return sum(
            x.size
            for chunk in self._data.chunks
            for x in chunk.buffers()
            if x is not None
        )

    def isna(self):
        nas = pd.isna(self._data.to_pandas())
        return type(self).from_scalars(nas)

    def take(self, indices, allow_fill=False, fill_value=None):
        data = self._data.to_pandas()
        data = extract_array(data, extract_numpy=True)

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
        return self._from_sequence(result, dtype=self.dtype)

    def copy(self):
        return type(self)(copy.copy(self._data))

    @classmethod
    def _concat_same_type(cls, to_concat):
        chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat))
        arr = pa.chunked_array(chunks)
        return cls(arr)

    def __invert__(self):
        return type(self).from_scalars(~self._data.to_pandas())

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        if skipna:
            arr = self[~self.isna()]
        else:
            arr = self

        try:
            op = getattr(arr, name)
        except AttributeError as err:
            raise TypeError from err
        return op(**kwargs)

    def any(self, axis=0, out=None):
        # Explicitly return a plain bool to reproduce GH-34660
        return bool(self._data.to_pandas().any())

    def all(self, axis=0, out=None):
        # Explicitly return a plain bool to reproduce GH-34660
        return bool(self._data.to_pandas().all())


class ArrowBoolArray(ArrowExtensionArray):
    def __init__(self, values):
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError

        assert values.type == pa.bool_()
        self._data = values
        self._dtype = ArrowBoolDtype()


class ArrowStringArray(ArrowExtensionArray):
    def __init__(self, values):
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError

        assert values.type == pa.string()
        self._data = values
        self._dtype = ArrowStringDtype()
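For orientation, a minimal usage sketch (an addition here, not part of the vendored file), assuming pyarrow is installed and the module above is importable as pandas.tests.extension.arrow.arrays:

import pandas as pd

from pandas.tests.extension.arrow.arrays import ArrowBoolArray

# from_scalars wraps the values in a pyarrow ChunkedArray; None becomes a null.
arr = ArrowBoolArray.from_scalars([True, False, None])
# length, total buffer size in bytes, and whether any value is missing
print(len(arr), arr.nbytes, arr.isna().any())

# Because ArrowBoolDtype is registered, pandas keeps the extension dtype when
# the array is wrapped in a Series; str(ser.dtype) is the registered name.
ser = pd.Series(arr)
print(ser.dtype)  # arrow_bool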
@@ -0,0 +1,113 @@
import numpy as np
import pytest

from pandas.compat import (
    is_ci_environment,
    is_platform_windows,
)

import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_bool_dtype
from pandas.tests.extension import base

pytest.importorskip("pyarrow", minversion="1.0.1")

from pandas.tests.extension.arrow.arrays import (  # isort:skip
    ArrowBoolArray,
    ArrowBoolDtype,
)


@pytest.fixture
def dtype():
    return ArrowBoolDtype()


@pytest.fixture
def data():
    values = np.random.randint(0, 2, size=100, dtype=bool)
    values[1] = ~values[0]
    return ArrowBoolArray.from_scalars(values)


@pytest.fixture
def data_missing():
    return ArrowBoolArray.from_scalars([None, True])


def test_basic_equals(data):
    # https://github.com/pandas-dev/pandas/issues/34660
    assert pd.Series(data).equals(pd.Series(data))


class BaseArrowTests:
    pass


class TestDtype(BaseArrowTests, base.BaseDtypeTests):
    pass


class TestInterface(BaseArrowTests, base.BaseInterfaceTests):
    def test_copy(self, data):
        # __setitem__ does not work, so we only have a smoke-test
        data.copy()

    def test_view(self, data):
        # __setitem__ does not work, so we only have a smoke-test
        data.view()

    @pytest.mark.xfail(
        raises=AssertionError,
        reason="Doesn't recognize data._na_value as NA",
    )
    def test_contains(self, data, data_missing):
        super().test_contains(data, data_missing)


class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
    # seems like some bug in isna on empty BoolArray returning floats.
    @pytest.mark.xfail(reason="bad is-na for empty data")
    def test_from_sequence_from_cls(self, data):
        super().test_from_sequence_from_cls(data)

    @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899")
    def test_series_constructor_no_data_with_index(self, dtype, na_value):
        # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
        super().test_series_constructor_no_data_with_index(dtype, na_value)

    @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899")
    def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
        # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
        super().test_series_constructor_scalar_na_with_index(dtype, na_value)

    @pytest.mark.xfail(reason="ufunc 'invert' not supported for the input types")
    def test_construct_empty_dataframe(self, dtype):
        super().test_construct_empty_dataframe(dtype)

    @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
    def test_empty(self, dtype):
        super().test_empty(dtype)


class TestReduce(base.BaseNoReduceTests):
    def test_reduce_series_boolean(self):
        pass


@pytest.mark.skipif(
    is_ci_environment() and is_platform_windows(),
    reason="Causes stack overflow on Windows CI",
)
class TestReduceBoolean(base.BaseBooleanReduceTests):
    pass


def test_is_bool_dtype(data):
    assert is_bool_dtype(data)
    assert pd.core.common.is_bool_indexer(data)
    s = pd.Series(range(len(data)))
    result = s[data]
    expected = s[np.asarray(data)]
    tm.assert_series_equal(result, expected)
@@ -0,0 +1,12 @@
import pytest

import pandas as pd

pytest.importorskip("pyarrow", minversion="1.0.0")


def test_constructor_from_list():
    # GH 27673
    result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow"))
    assert isinstance(result.dtype, pd.StringDtype)
    assert result.dtype.storage == "pyarrow"
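A related sketch (an addition here, not part of the test file): the same pyarrow-backed string dtype can also be requested through the "string[pyarrow]" alias, assuming pandas >= 1.3 with pyarrow installed:

import pandas as pd

# "string[pyarrow]" is shorthand for pd.StringDtype(storage="pyarrow").
s = pd.Series(["E"], dtype="string[pyarrow]")
assert s.dtype == pd.StringDtype(storage="pyarrow")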
@@ -0,0 +1,60 @@
from __future__ import annotations

import datetime

import pytest

from pandas._typing import type_t

import pandas as pd
from pandas.api.extensions import (
    ExtensionDtype,
    register_extension_dtype,
)

pytest.importorskip("pyarrow", minversion="1.0.1")

import pyarrow as pa  # isort:skip

from pandas.tests.extension.arrow.arrays import ArrowExtensionArray  # isort:skip


@register_extension_dtype
class ArrowTimestampUSDtype(ExtensionDtype):

    type = datetime.datetime
    kind = "M"
    name = "arrow_timestamp_us"
    na_value = pa.NULL

    @classmethod
    def construct_array_type(cls) -> type_t[ArrowTimestampUSArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return ArrowTimestampUSArray


class ArrowTimestampUSArray(ArrowExtensionArray):
    def __init__(self, values):
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError

        assert values.type == pa.timestamp("us")
        self._data = values
        self._dtype = ArrowTimestampUSDtype()


def test_constructor_extensionblock():
    # GH 34986
    pd.DataFrame(
        {
            "timestamp": ArrowTimestampUSArray.from_scalars(
                [None, datetime.datetime(2010, 9, 8, 7, 6, 5, 4)]
            )
        }
    )
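As a follow-on sketch (an addition here, not part of the original test): the DataFrame built in test_constructor_extensionblock keeps the registered extension dtype on the column rather than coercing to datetime64[ns]:

import datetime

import pandas as pd

# Reuses ArrowTimestampUSArray defined above; None becomes a null timestamp.
df = pd.DataFrame(
    {
        "timestamp": ArrowTimestampUSArray.from_scalars(
            [None, datetime.datetime(2010, 9, 8, 7, 6, 5, 4)]
        )
    }
)
assert str(df["timestamp"].dtype) == "arrow_timestamp_us"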