first commit

2025-12-15 22:09:19 +00:00 · 2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions
--- a/.venv/Lib/site-packages/pandas/tests/extension/arrow/__init__.py
+++ b/.venv/Lib/site-packages/pandas/tests/extension/arrow/__init__.py
--- a/.venv/Lib/site-packages/pandas/tests/extension/arrow/arrays.py
+++ b/.venv/Lib/site-packages/pandas/tests/extension/arrow/arrays.py
@@ -0,0 +1,225 @@
+"""
+Rudimentary Apache Arrow-backed ExtensionArray.
+
+At the moment, only boolean and string arrays / dtypes are implemented here.
+Eventually, we'll want to parametrize the type and support
+multiple dtypes. Not all methods are implemented yet, and the
+current implementation is not efficient.
+"""
+from __future__ import annotations
+
+import copy
+import itertools
+import operator
+
+import numpy as np
+import pyarrow as pa
+
+from pandas._typing import type_t
+
+import pandas as pd
+from pandas.api.extensions import (
+    ExtensionArray,
+    ExtensionDtype,
+    register_extension_dtype,
+    take,
+)
+from pandas.api.types import is_scalar
+from pandas.core.arraylike import OpsMixin
+from pandas.core.construction import extract_array
+
+
+@register_extension_dtype
+class ArrowBoolDtype(ExtensionDtype):
+
+    type = np.bool_
+    kind = "b"
+    name = "arrow_bool"
+    na_value = pa.NULL
+
+    @classmethod
+    def construct_array_type(cls) -> type_t[ArrowBoolArray]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return ArrowBoolArray
+
+    @property
+    def _is_boolean(self) -> bool:
+        return True
+
+
+@register_extension_dtype
+class ArrowStringDtype(ExtensionDtype):
+
+    type = str
+    kind = "U"
+    name = "arrow_string"
+    na_value = pa.NULL
+
+    @classmethod
+    def construct_array_type(cls) -> type_t[ArrowStringArray]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return ArrowStringArray
+
+
+class ArrowExtensionArray(OpsMixin, ExtensionArray):
+    """
+    Shared base for the pyarrow-backed test arrays in this module.
+
+    Values are held in ``self._data`` as a ``pa.ChunkedArray``. This class
+    does not assign ``_data`` or ``_dtype`` itself; the concrete subclasses
+    set both in their ``__init__``. Most operations round-trip through
+    ``self._data.to_pandas()`` instead of working on the Arrow data directly.
+    """
+
+    # Backing Arrow storage, assigned by the subclass constructor.
+    _data: pa.ChunkedArray
+
+    @classmethod
+    def from_scalars(cls, values):
+        """
+        Build an instance of ``cls`` from a sequence of scalars.
+
+        Parameters
+        ----------
+        values : sequence or instance of ``cls``
+            An existing instance is re-wrapped around its own ``_data``.
+            An empty ``list`` is first turned into an empty NumPy array
+            (``bool`` when ``cls`` is ``ArrowBoolArray``, ``str`` otherwise).
+
+        Returns
+        -------
+        cls
+            Backed by a single-chunk ``pa.ChunkedArray`` built from
+            ``np.asarray(values)``, or by the input's ``_data`` when
+            ``values`` is already an instance of ``cls``.
+        """
+        if isinstance(values, cls):
+            # in particular for empty cases the pa.array(np.asarray(...))
+            #  does not round-trip
+            return cls(values._data)
+
+        elif not len(values):
+            if isinstance(values, list):
+                # Give the empty array an explicit dtype; np.asarray([]) alone
+                # would default to float64.
+                dtype = bool if cls is ArrowBoolArray else str
+                values = np.array([], dtype=dtype)
+
+        arr = pa.chunked_array([pa.array(np.asarray(values))])
+        return cls(arr)
+
+    @classmethod
+    def from_array(cls, arr):
+        """
+        Wrap a single ``pa.Array`` as a one-chunk instance of ``cls``.
+
+        Raises
+        ------
+        AssertionError
+            If ``arr`` is not a ``pa.Array``.
+        """
+        assert isinstance(arr, pa.Array)
+        return cls(pa.chunked_array([arr]))
+
+    @classmethod
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        """
+        Construct from ``scalars`` by delegating to ``from_scalars``.
+
+        ``dtype`` and ``copy`` are accepted but not used.
+        """
+        return cls.from_scalars(scalars)
+
+    def __repr__(self):
+        # Class name followed by the repr of the underlying ChunkedArray.
+        return f"{type(self).__name__}({repr(self._data)})"
+
+    def __contains__(self, obj) -> bool:
+        """
+        Membership test that treats ``None`` and the dtype's ``na_value`` as
+        "is any element missing"; everything else goes to the parent class.
+        """
+        if obj is None or obj is self.dtype.na_value:
+            # None -> EA.__contains__ only checks for self._dtype.na_value, not
+            #  any compatible NA value.
+            # self.dtype.na_value -> <pa.NullScalar:None> isn't recognized by pd.isna
+            return bool(self.isna().any())
+        return bool(super().__contains__(obj))
+
+    def __getitem__(self, item):
+        """
+        Index via the pandas conversion of the data.
+
+        A scalar ``item`` returns the selected element; any other indexer
+        returns a new instance of the same class built with ``from_scalars``.
+        """
+        if is_scalar(item):
+            return self._data.to_pandas()[item]
+        else:
+            vals = self._data.to_pandas()[item]
+            return type(self).from_scalars(vals)
+
+    def __len__(self):
+        # Length of the underlying ChunkedArray.
+        return len(self._data)
+
+    def astype(self, dtype, copy=True):
+        """
+        Cast to ``dtype``.
+
+        When ``dtype`` is an instance of this array's own dtype class and
+        compares equal to it, return ``self`` (or ``self.copy()`` if ``copy``);
+        otherwise defer to the parent implementation.
+        """
+        # needed to fix this astype for the Series constructor.
+        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+            if copy:
+                return self.copy()
+            return self
+        return super().astype(dtype, copy)
+
+    @property
+    def dtype(self):
+        # ``_dtype`` is assigned by the subclass constructor.
+        return self._dtype
+
+    def _logical_method(self, other, op):
+        """
+        Apply the binary operator ``op`` element-wise against ``other``.
+
+        Parameters
+        ----------
+        other : same class as ``self``
+        op : callable
+            Called as ``op(np.array(self._data), np.array(other._data))``.
+
+        Returns
+        -------
+        ArrowBoolArray
+
+        Raises
+        ------
+        NotImplementedError
+            If ``other`` is not an instance of ``type(self)``.
+        """
+        if not isinstance(other, type(self)):
+            raise NotImplementedError()
+
+        result = op(np.array(self._data), np.array(other._data))
+        # NOTE(review): the mask is computed from ``self`` only, so positions
+        # that are missing only in ``other`` are not masked - confirm intended.
+        return ArrowBoolArray(
+            pa.chunked_array([pa.array(result, mask=pd.isna(self._data.to_pandas()))])
+        )
+
+    def __eq__(self, other):
+        """
+        Element-wise equality.
+
+        Same-class operands go through ``_logical_method``; anything else is
+        compared via the NumPy conversion of ``self``.
+        """
+        if not isinstance(other, type(self)):
+            # TODO: use some pyarrow function here?
+            return np.asarray(self).__eq__(other)
+
+        return self._logical_method(other, operator.eq)
+
+    @property
+    def nbytes(self) -> int:
+        # Sum of the sizes of every non-None buffer of every chunk.
+        return sum(
+            x.size
+            for chunk in self._data.chunks
+            for x in chunk.buffers()
+            if x is not None
+        )
+
+    def isna(self):
+        """
+        Return the result of ``pd.isna`` on the pandas conversion of the data,
+        wrapped with ``type(self).from_scalars``.
+        """
+        nas = pd.isna(self._data.to_pandas())
+        # NOTE(review): the boolean result is wrapped in ``type(self)``, not
+        # ``ArrowBoolArray``; for a non-boolean subclass this presumably trips
+        # that subclass's type assertion - confirm.
+        return type(self).from_scalars(nas)
+
+    def take(self, indices, allow_fill=False, fill_value=None):
+        """
+        Select elements by position using ``pandas.api.extensions.take``.
+
+        Parameters
+        ----------
+        indices : sequence of int
+        allow_fill : bool, default False
+        fill_value : object, optional
+            When ``allow_fill`` is true and this is None, the dtype's
+            ``na_value`` is used.
+
+        Returns
+        -------
+        Same class as ``self``, rebuilt through ``_from_sequence``.
+        """
+        data = self._data.to_pandas()
+        data = extract_array(data, extract_numpy=True)
+
+        if allow_fill and fill_value is None:
+            fill_value = self.dtype.na_value
+
+        result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
+        return self._from_sequence(result, dtype=self.dtype)
+
+    def copy(self):
+        # Inside this method ``copy`` is the module imported at the top of the
+        # file, so this applies the standard library's ``copy.copy`` to
+        # ``self._data`` and wraps the result in a new instance.
+        # NOTE(review): ``copy.copy`` is a shallow copy - confirm sufficient.
+        return type(self)(copy.copy(self._data))
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        """
+        Concatenate arrays by chaining the chunks of each input's ``_data``
+        into one new ``pa.ChunkedArray``.
+        """
+        chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat))
+        arr = pa.chunked_array(chunks)
+        return cls(arr)
+
+    def __invert__(self):
+        # ``~`` is applied to the pandas conversion, then re-wrapped.
+        return type(self).from_scalars(~self._data.to_pandas())
+
+    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+        """
+        Run the reduction called ``name`` by looking it up as a method.
+
+        Parameters
+        ----------
+        name : str
+            Attribute name of the reduction on this array (e.g. ``any``).
+        skipna : bool, default True
+            If true, elements flagged by ``isna`` are dropped first.
+        **kwargs
+            Forwarded to the looked-up method.
+
+        Raises
+        ------
+        TypeError
+            If the array has no attribute called ``name``.
+        """
+        if skipna:
+            arr = self[~self.isna()]
+        else:
+            arr = self
+
+        try:
+            op = getattr(arr, name)
+        except AttributeError as err:
+            raise TypeError from err
+        return op(**kwargs)
+
+    def any(self, axis=0, out=None):
+        # ``axis`` and ``out`` are accepted but not used.
+        # Explicitly return a plain bool to reproduce GH-34660
+        return bool(self._data.to_pandas().any())
+
+    def all(self, axis=0, out=None):
+        # ``axis`` and ``out`` are accepted but not used.
+        # Explicitly return a plain bool to reproduce GH-34660
+        return bool(self._data.to_pandas().all())
+
+
+class ArrowBoolArray(ArrowExtensionArray):
+    def __init__(self, values):
+        if not isinstance(values, pa.ChunkedArray):
+            raise ValueError
+
+        assert values.type == pa.bool_()
+        self._data = values
+        self._dtype = ArrowBoolDtype()
+
+
+class ArrowStringArray(ArrowExtensionArray):
+    def __init__(self, values):
+        if not isinstance(values, pa.ChunkedArray):
+            raise ValueError
+
+        assert values.type == pa.string()
+        self._data = values
+        self._dtype = ArrowStringDtype()
--- a/.venv/Lib/site-packages/pandas/tests/extension/arrow/test_bool.py
+++ b/.venv/Lib/site-packages/pandas/tests/extension/arrow/test_bool.py
@@ -0,0 +1,113 @@
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    is_ci_environment,
+    is_platform_windows,
+)
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.types import is_bool_dtype
+from pandas.tests.extension import base
+
+pytest.importorskip("pyarrow", minversion="1.0.1")
+
+from pandas.tests.extension.arrow.arrays import (  # isort:skip
+    ArrowBoolArray,
+    ArrowBoolDtype,
+)
+
+
+@pytest.fixture
+def dtype():
+    """Fixture: a fresh ``ArrowBoolDtype`` instance."""
+    return ArrowBoolDtype()
+
+
+@pytest.fixture
+def data():
+    values = np.random.randint(0, 2, size=100, dtype=bool)
+    values[1] = ~values[0]
+    return ArrowBoolArray.from_scalars(values)
+
+
+@pytest.fixture
+def data_missing():
+    """Fixture: a two-element ``ArrowBoolArray`` built from ``[None, True]``."""
+    return ArrowBoolArray.from_scalars([None, True])
+
+
+def test_basic_equals(data):
+    # https://github.com/pandas-dev/pandas/issues/34660
+    assert pd.Series(data).equals(pd.Series(data))
+
+
+class BaseArrowTests:
+    """Empty mixin listed first in the bases of the Arrow test classes below."""
+
+    pass
+
+
+class TestDtype(BaseArrowTests, base.BaseDtypeTests):
+    """Runs the inherited ``base.BaseDtypeTests`` suite without overrides."""
+
+    pass
+
+
+class TestInterface(BaseArrowTests, base.BaseInterfaceTests):
+    """
+    ``base.BaseInterfaceTests`` with ``test_copy`` and ``test_view`` reduced
+    to smoke tests and ``test_contains`` marked as an expected failure.
+    """
+
+    def test_copy(self, data):
+        # __setitem__ does not work, so we only have a smoke-test
+        data.copy()
+
+    def test_view(self, data):
+        # __setitem__ does not work, so we only have a smoke-test
+        data.view()
+
+    # Runs the inherited test unchanged; it is expected to fail with an
+    # AssertionError for the reason given in the marker.
+    @pytest.mark.xfail(
+        raises=AssertionError,
+        reason="Doesn't recognize data._na_value as NA",
+    )
+    def test_contains(self, data, data_missing):
+        super().test_contains(data, data_missing)
+
+
+class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
+    """
+    ``base.BaseConstructorsTests`` with several inherited tests re-declared
+    only to attach an ``xfail`` marker; each override simply calls the
+    parent implementation.
+    """
+
+    # seems like some bug in isna on empty BoolArray returning floats.
+    @pytest.mark.xfail(reason="bad is-na for empty data")
+    def test_from_sequence_from_cls(self, data):
+        super().test_from_sequence_from_cls(data)
+
+    @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899")
+    def test_series_constructor_no_data_with_index(self, dtype, na_value):
+        # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
+        super().test_series_constructor_no_data_with_index(dtype, na_value)
+
+    @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899")
+    def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
+        # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
+        super().test_series_constructor_scalar_na_with_index(dtype, na_value)
+
+    @pytest.mark.xfail(reason="ufunc 'invert' not supported for the input types")
+    def test_construct_empty_dataframe(self, dtype):
+        super().test_construct_empty_dataframe(dtype)
+
+    @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
+    def test_empty(self, dtype):
+        super().test_empty(dtype)
+
+
+class TestReduce(base.BaseNoReduceTests):
+    """``base.BaseNoReduceTests`` with the boolean-reduction test emptied out."""
+
+    def test_reduce_series_boolean(self):
+        # Intentionally a no-op.
+        # NOTE(review): presumably this disables a same-named inherited test
+        # because boolean reductions are covered by TestReduceBoolean - confirm.
+        pass
+
+
+# Skipped only when both conditions hold: running in CI and on Windows.
+@pytest.mark.skipif(
+    is_ci_environment() and is_platform_windows(),
+    reason="Causes stack overflow on Windows CI",
+)
+class TestReduceBoolean(base.BaseBooleanReduceTests):
+    """Runs the inherited ``base.BaseBooleanReduceTests`` suite unchanged."""
+
+    pass
+
+
+def test_is_bool_dtype(data):
+    assert is_bool_dtype(data)
+    assert pd.core.common.is_bool_indexer(data)
+    s = pd.Series(range(len(data)))
+    result = s[data]
+    expected = s[np.asarray(data)]
+    tm.assert_series_equal(result, expected)
--- a/.venv/Lib/site-packages/pandas/tests/extension/arrow/test_string.py
+++ b/.venv/Lib/site-packages/pandas/tests/extension/arrow/test_string.py
@@ -0,0 +1,12 @@
+import pytest
+
+import pandas as pd
+
+pytest.importorskip("pyarrow", minversion="1.0.0")
+
+
+def test_constructor_from_list():
+    # GH 27673
+    result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow"))
+    assert isinstance(result.dtype, pd.StringDtype)
+    assert result.dtype.storage == "pyarrow"
--- a/.venv/Lib/site-packages/pandas/tests/extension/arrow/test_timestamp.py
+++ b/.venv/Lib/site-packages/pandas/tests/extension/arrow/test_timestamp.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+import datetime
+
+import pytest
+
+from pandas._typing import type_t
+
+import pandas as pd
+from pandas.api.extensions import (
+    ExtensionDtype,
+    register_extension_dtype,
+)
+
+pytest.importorskip("pyarrow", minversion="1.0.1")
+
+import pyarrow as pa  # isort:skip
+
+from pandas.tests.extension.arrow.arrays import ArrowExtensionArray  # isort:skip
+
+
+@register_extension_dtype
+class ArrowTimestampUSDtype(ExtensionDtype):
+
+    type = datetime.datetime
+    kind = "M"
+    name = "arrow_timestamp_us"
+    na_value = pa.NULL
+
+    @classmethod
+    def construct_array_type(cls) -> type_t[ArrowTimestampUSArray]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return ArrowTimestampUSArray
+
+
+class ArrowTimestampUSArray(ArrowExtensionArray):
+    def __init__(self, values):
+        if not isinstance(values, pa.ChunkedArray):
+            raise ValueError
+
+        assert values.type == pa.timestamp("us")
+        self._data = values
+        self._dtype = ArrowTimestampUSDtype()
+
+
+def test_constructor_extensionblock():
+    # GH 34986
+    pd.DataFrame(
+        {
+            "timestamp": ArrowTimestampUSArray.from_scalars(
+                [None, datetime.datetime(2010, 9, 8, 7, 6, 5, 4)]
+            )
+        }
+    )