mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-21 18:23:35 +00:00
2733 lines
100 KiB
Python
2733 lines
100 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
||
# or more contributor license agreements. See the NOTICE file
|
||
# distributed with this work for additional information
|
||
# regarding copyright ownership. The ASF licenses this file
|
||
# to you under the Apache License, Version 2.0 (the
|
||
# "License"); you may not use this file except in compliance
|
||
# with the License. You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing,
|
||
# software distributed under the License is distributed on an
|
||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||
# KIND, either express or implied. See the License for the
|
||
# specific language governing permissions and limitations
|
||
# under the License.
|
||
|
||
from datetime import datetime
|
||
from functools import lru_cache, partial
|
||
import inspect
|
||
import os
|
||
import pickle
|
||
import pytest
|
||
import random
|
||
import sys
|
||
import textwrap
|
||
|
||
import numpy as np
|
||
|
||
try:
|
||
import pandas as pd
|
||
except ImportError:
|
||
pd = None
|
||
|
||
import pyarrow as pa
|
||
import pyarrow.compute as pc
|
||
|
||
# (type, values) pairs used to build sample arrays of many different Arrow
# types.  The type is given either as a type-name string or as a DataType.
all_array_types = [
    # Boolean
    ("bool", [True, False, False, True, True]),
    # Integers of every width and signedness
    ("uint8", np.arange(5)),
    ("int8", np.arange(5)),
    ("uint16", np.arange(5)),
    ("int16", np.arange(5)),
    ("uint32", np.arange(5)),
    ("int32", np.arange(5)),
    ("uint64", np.arange(5, 10)),
    ("int64", np.arange(5, 10)),
    # Floating point
    ("float", np.arange(0, 0.5, 0.1)),
    ("double", np.arange(0, 0.5, 0.1)),
    # Variable-size and fixed-size string / binary data
    ("string", ["a", "b", None, "ddd", "ee"]),
    ("binary", [b"a", b"b", b"c", b"ddd", b"ee"]),
    (pa.binary(3), [b"abc", b"bcd", b"cde", b"def", b"efg"]),
    # Nested types
    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
    (
        pa.struct([("a", pa.int8()), ("b", pa.int8())]),
        [
            {"a": 1, "b": 2},
            None,
            {"a": 3, "b": 4},
            None,
            {"a": 5, "b": 6},
        ],
    ),
]
|
||
|
||
# Every attribute of pyarrow.compute that carries the
# ``__arrow_compute_function__`` marker, ordered by attribute name.
exported_functions = [
    obj
    for _attr_name, obj in sorted(pc.__dict__.items())
    if hasattr(obj, "__arrow_compute_function__")
]
|
||
|
||
# Every strict subclass of pc.FunctionOptions found in the pyarrow.compute
# namespace (the base class itself is excluded), ordered by attribute name.
exported_option_classes = [
    candidate
    for _attr_name, candidate in sorted(pc.__dict__.items())
    if isinstance(candidate, type)
    and candidate is not pc.FunctionOptions
    and issubclass(candidate, pc.FunctionOptions)
]
|
||
|
||
# Numeric Arrow types used to parametrize the aggregation tests.
# NOTE(review): int32 / uint32 are absent from this list -- confirm whether
# that omission is intentional.
numerical_arrow_types = [
    # signed integers
    pa.int8(),
    pa.int16(),
    pa.int64(),
    # unsigned integers
    pa.uint8(),
    pa.uint16(),
    pa.uint64(),
    # floating point
    pa.float32(),
    pa.float64(),
]
|
||
|
||
|
||
def test_exported_functions():
    """Call each exported wrapper with the right number of dummy arguments.

    Every call is expected to be rejected with a TypeError complaining
    about the ``object`` argument type.  Unregistered functions (e.g. with
    a mismatching name) would raise KeyError instead.
    """
    assert len(exported_functions) >= 10

    expected_message = ("Got unexpected argument type "
                        "<class 'object'> for compute function")

    for wrapper in exported_functions:
        meta = wrapper.__arrow_compute_function__
        if meta['options_required']:
            # Without an options instance these fail with a different
            # error message, so they are not checked here.
            continue

        n_args = meta['arity']
        if n_args == 0:
            continue

        # Varargs functions (arity is Ellipsis) get three dummy arguments.
        dummy_args = [object()] * (3 if n_args is Ellipsis else n_args)

        with pytest.raises(TypeError, match=expected_message):
            wrapper(*dummy_args)
|
||
|
||
|
||
def test_hash_aggregate_not_exported():
    """No exported wrapper may resolve to a "hash_aggregate" function."""
    # Hash aggregate functions are not callable by themselves, so they
    # must not leak into the exported wrappers.
    for wrapper in exported_functions:
        registered_name = wrapper.__arrow_compute_function__["name"]
        registered = pc.get_function(registered_name)
        assert registered.kind != "hash_aggregate"
|
||
|
||
|
||
def test_exported_option_classes():
    """Option classes must expose an explicit constructor signature."""
    assert len(exported_option_classes) >= 10

    for options_cls in exported_option_classes:
        # The signature has to be introspectable, and it should not
        # contain any *args or **kwargs catch-all.
        parameters = inspect.signature(options_cls).parameters
        for param in parameters.values():
            catch_all_kinds = (param.VAR_POSITIONAL, param.VAR_KEYWORD)
            assert param.kind not in catch_all_kinds
|
||
|
||
|
||
def test_option_class_equality():
    """Check equality, repr and serialization round-trip of option objects.

    One instance per option class is built explicitly; exported option
    classes missing from that list are instantiated without arguments, and
    the test fails if such a class cannot be built that way.
    """
    samples = [
        pc.ArraySortOptions(),
        pc.AssumeTimezoneOptions("UTC"),
        pc.CastOptions.safe(pa.int8()),
        pc.CountOptions(),
        pc.DayOfWeekOptions(count_from_zero=False, week_start=0),
        pc.DictionaryEncodeOptions(),
        pc.ElementWiseAggregateOptions(skip_nulls=True),
        pc.ExtractRegexOptions("pattern"),
        pc.FilterOptions(),
        pc.IndexOptions(pa.scalar(1)),
        pc.JoinOptions(),
        pc.MakeStructOptions(
            ["field", "names"],
            field_nullability=[True, True],
            field_metadata=[
                pa.KeyValueMetadata({"a": "1"}),
                pa.KeyValueMetadata({"b": "2"}),
            ],
        ),
        pc.MapLookupOptions(pa.scalar(1), "first"),
        pc.MatchSubstringOptions("pattern"),
        pc.ModeOptions(),
        pc.NullOptions(),
        pc.PadOptions(5),
        pc.PartitionNthOptions(1, null_placement="at_start"),
        pc.QuantileOptions(),
        pc.RandomOptions(10),
        pc.ReplaceSliceOptions(0, 1, "a"),
        pc.ReplaceSubstringOptions("a", "b"),
        pc.RoundOptions(2, "towards_infinity"),
        pc.RoundTemporalOptions(1, "second", True),
        pc.RoundToMultipleOptions(100, "towards_infinity"),
        pc.ScalarAggregateOptions(),
        pc.SelectKOptions(0, sort_keys=[("b", "ascending")]),
        pc.SetLookupOptions(pa.array([1])),
        pc.SliceOptions(0, 1, 1),
        pc.SortOptions([("dummy", "descending")], null_placement="at_start"),
        pc.SplitOptions(),
        pc.SplitPatternOptions("pattern"),
        pc.StrftimeOptions(),
        pc.StrptimeOptions("%Y", "s", True),
        pc.StructFieldOptions(indices=[]),
        pc.TakeOptions(),
        pc.TDigestOptions(),
        pc.TrimOptions(" "),
        pc.Utf8NormalizeOptions("NFKC"),
        pc.VarianceOptions(),
        pc.WeekOptions(week_starts_monday=True, count_from_zero=False,
                       first_week_is_fully_in_year=False),
    ]
    # TODO: We should test on windows once ARROW-13168 is resolved.
    # Timezone database is not available on Windows yet
    if sys.platform != 'win32':
        samples.append(pc.AssumeTimezoneOptions("Europe/Ljubljana"))

    covered_classes = {type(sample) for sample in samples}

    for options_cls in exported_option_classes:
        # Timezone database is not available on Windows yet
        # NOTE(review): as written, no uncovered class is checked at all on
        # win32 -- confirm that this is the intent.
        if (options_cls in covered_classes
                or sys.platform == 'win32'
                or options_cls == pc.AssumeTimezoneOptions):
            continue
        try:
            samples.append(options_cls())
        except TypeError:
            pytest.fail(f"Options class is not tested: {options_cls}")

    for sample in samples:
        assert sample == sample
        assert repr(sample).startswith(sample.__class__.__name__)
        # Serialization followed by deserialization must be lossless.
        payload = sample.serialize()
        restored = pc.FunctionOptions.deserialize(payload)
        assert sample == restored
        assert repr(sample) == repr(restored)

    # Neighbouring entries of the list must compare unequal.
    for left, right in zip(samples, samples[1:]):
        assert left != right

    index_repr = repr(pc.IndexOptions(pa.scalar(1)))
    assert index_repr == "IndexOptions(value=int64:1)"
    sort_repr = repr(pc.ArraySortOptions())
    assert sort_repr == \
        "ArraySortOptions(order=Ascending, null_placement=AtEnd)"
|
||
|
||
|
||
def test_list_functions():
    """The function listing is non-trivial and contains "add"."""
    names = pc.list_functions()
    assert len(names) > 10
    assert "add" in names
|
||
|
||
|
||
def _check_get_function(name, expected_func_cls, expected_ker_cls,
                        min_num_kernels=1):
    """Look up function *name* and validate its class and its kernels.

    Parameters
    ----------
    name : str
        Name passed to ``pc.get_function``.
    expected_func_cls : type
        Class the returned function must be an instance of.
    expected_ker_cls : type
        Class every kernel of the function must be an instance of.
    min_num_kernels : int, default 1
        Lower bound for ``num_kernels``.
    """
    func = pc.get_function(name)
    assert isinstance(func, expected_func_cls)

    kernel_count = func.num_kernels
    assert kernel_count >= min_num_kernels
    assert kernel_count == len(func.kernels)

    for kernel in func.kernels:
        assert isinstance(kernel, expected_ker_cls)
|
||
|
||
|
||
def test_get_function_scalar():
    """"add" is a scalar function with at least 8 scalar kernels."""
    _check_get_function("add", pc.ScalarFunction, pc.ScalarKernel,
                        min_num_kernels=8)
|
||
|
||
|
||
def test_get_function_vector():
    """"unique" is a vector function with at least 8 vector kernels."""
    _check_get_function("unique", pc.VectorFunction, pc.VectorKernel,
                        min_num_kernels=8)
|
||
|
||
|
||
def test_get_function_scalar_aggregate():
    """"mean" is a scalar aggregate function with at least 8 kernels."""
    _check_get_function(
        "mean",
        pc.ScalarAggregateFunction,
        pc.ScalarAggregateKernel,
        min_num_kernels=8,
    )
|
||
|
||
|
||
def test_get_function_hash_aggregate():
    """"hash_sum" is a hash aggregate function with at least 1 kernel."""
    _check_get_function(
        "hash_sum",
        pc.HashAggregateFunction,
        pc.HashAggregateKernel,
        min_num_kernels=1,
    )
|
||
|
||
|
||
def test_call_function_with_memory_pool():
    """"take" gives the same result through all three call styles."""
    strings = pa.array(["foo", "bar", "baz"])
    positions = np.array([2, 2, 1])
    expected = pa.array(["baz", "baz", "bar"])

    # Array method
    via_method = strings.take(positions)
    # Generic call_function() with an explicit memory pool
    via_call_function = pc.call_function(
        'take', [strings, positions], memory_pool=pa.default_memory_pool())
    assert via_method.equals(expected)
    assert via_call_function.equals(expected)

    # Generated wrapper with an explicit memory pool
    via_wrapper = pc.take(strings, positions,
                          memory_pool=pa.default_memory_pool())
    assert via_wrapper.equals(expected)
|
||
|
||
|
||
def test_pickle_functions():
    """Registered functions keep type and attributes through pickling."""
    for name in pc.list_functions():
        original = pc.get_function(name)
        clone = pickle.loads(pickle.dumps(original))

        assert type(clone) is type(original)
        for attr in ("name", "arity", "num_kernels"):
            assert getattr(clone, attr) == getattr(original, attr)
|
||
|
||
|
||
def test_pickle_global_functions():
    """Global wrappers (manual or automatic) unpickle to the same object."""
    for name in pc.list_functions():
        if not hasattr(pc, name):
            # hash_aggregate functions are not exported as callables.
            continue
        wrapper = getattr(pc, name)
        assert pickle.loads(pickle.dumps(wrapper)) is wrapper
|
||
|
||
|
||
def test_function_attributes():
    """Sanity-check the attributes of every registered function."""
    for name in pc.list_functions():
        func = pc.get_function(name)
        assert isinstance(func, pc.Function)
        assert func.name == name

        kernels = func.kernels
        assert func.num_kernels == len(kernels)
        for kernel in kernels:
            assert isinstance(kernel, pc.Kernel)

        # repr() of the function and of each kernel must not raise.
        repr(func)
        for kernel in kernels:
            repr(kernel)
|
||
|
||
|
||
def test_input_type_conversion():
    """Plain Python lists and scalars are accepted as compute inputs."""
    # Python lists are converted to arrays
    summed = pc.add([1, 2], [4, None])
    assert summed.to_pylist() == [5, None]

    # A Python int is converted to a scalar
    summed = pc.add([1, 2], 4)
    assert summed.to_pylist() == [5, 6]

    # The same works for another scalar type (str)
    matches = pc.equal(["foo", "bar", None], "foo")
    assert matches.to_pylist() == [True, False, None]
|
||
|
||
|
||
@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_sum_array(arrow_type):
    """Check "sum" through both ``Array.sum`` and ``pc.sum``.

    Covers an array without nulls, an array with a trailing null, an
    all-null array and an empty array.  For the last two, the default call
    is expected to give a null result and ``min_count=0`` to give 0.
    """
    arr = pa.array([1, 2, 3, 4], type=arrow_type)
    assert arr.sum().as_py() == 10
    assert pc.sum(arr).as_py() == 10

    # The null is expected not to affect the total.
    arr = pa.array([1, 2, 3, 4, None], type=arrow_type)
    assert arr.sum().as_py() == 10
    assert pc.sum(arr).as_py() == 10

    arr = pa.array([None], type=arrow_type)
    assert arr.sum().as_py() is None
    assert pc.sum(arr).as_py() is None
    assert arr.sum(min_count=0).as_py() == 0
    assert pc.sum(arr, min_count=0).as_py() == 0

    arr = pa.array([], type=arrow_type)
    assert arr.sum().as_py() is None
    # Same check through the pc.sum wrapper, mirroring the all-null case.
    assert pc.sum(arr).as_py() is None
    assert arr.sum(min_count=0).as_py() == 0
    assert pc.sum(arr, min_count=0).as_py() == 0
|
||
|
||
|
||
@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_sum_chunked_array(arrow_type):
    """Check ``pc.sum`` on chunked arrays with various chunk layouts."""
    # One chunk, two chunks, and two chunks around an empty one.
    layouts = [
        [[1, 2, 3, 4]],
        [[1, 2], [3, 4]],
        [[1, 2], [], [3, 4]],
    ]
    for layout in layouts:
        chunks = [pa.array(values, type=arrow_type) for values in layout]
        assert pc.sum(pa.chunked_array(chunks)).as_py() == 10

    # No chunks at all
    chunkless = pa.chunked_array((), type=arrow_type)
    assert chunkless.num_chunks == 0
    assert pc.sum(chunkless).as_py() is None
    assert pc.sum(chunkless, min_count=0).as_py() == 0
|
||
|
||
|
||
def test_mode_array():
    """Check ``pc.mode`` on plain arrays (ARROW-9917)."""
    most_common = {"mode": 1, "count": 2}
    second_most_common = {"mode": 3, "count": 2}

    arr = pa.array([1, 1, 3, 4, 3, 5], type='int64')

    # Default: only the single most common value is reported
    modes = pc.mode(arr)
    assert len(modes) == 1
    assert modes[0].as_py() == most_common

    # n=2: the two most common values are reported
    modes = pc.mode(arr, n=2)
    assert len(modes) == 2
    assert modes[0].as_py() == most_common
    assert modes[1].as_py() == second_most_common

    # Empty input gives an empty result
    arr = pa.array([], type='int64')
    assert len(pc.mode(arr)) == 0

    # With a null in the input, each of these option combinations is
    # expected to give an empty result.
    arr = pa.array([1, 1, 3, 4, 3, None], type='int64')
    assert len(pc.mode(arr, skip_nulls=False)) == 0
    assert len(pc.mode(arr, min_count=6)) == 0
    assert len(pc.mode(arr, skip_nulls=False, min_count=5)) == 0
|
||
|
||
|
||
def test_mode_chunked_array():
    """Check ``pc.mode`` on chunked arrays (ARROW-9917)."""
    most_common = {"mode": 1, "count": 2}
    second_most_common = {"mode": 3, "count": 2}

    chunked = pa.chunked_array(
        [pa.array([1, 1, 3, 4, 3, 5], type='int64')])

    modes = pc.mode(chunked)
    assert len(modes) == 1
    assert modes[0].as_py() == most_common

    modes = pc.mode(chunked, n=2)
    assert len(modes) == 2
    assert modes[0].as_py() == most_common
    assert modes[1].as_py() == second_most_common

    # A chunked array without any chunk gives an empty result
    chunkless = pa.chunked_array((), type='int64')
    assert chunkless.num_chunks == 0
    assert len(pc.mode(chunkless)) == 0
|
||
|
||
|
||
def test_variance():
    """Check ``pc.variance`` with the default and with explicit ddof."""
    values = [1, 2, 3, 4, 5, 6, 7, 8]

    default_variance = pc.variance(values)
    assert default_variance.as_py() == 5.25

    # ddof=0 gives the same value as the default call above
    assert pc.variance(values, ddof=0).as_py() == 5.25
    assert pc.variance(values, ddof=1).as_py() == 6.0
|
||
|
||
|
||
def test_count_substring():
    """Check ``pc.count_substring`` for string and large_string input."""
    values = ["ab", "cab", "abcab", "ba", "AB", None]
    # Each string type is paired with the expected type of the counts.
    type_pairs = (
        (pa.string(), pa.int32()),
        (pa.large_string(), pa.int64()),
    )

    for string_type, count_type in type_pairs:
        arr = pa.array(values, type=string_type)

        # Case-sensitive by default: "AB" is not counted
        counts = pc.count_substring(arr, "ab")
        assert pa.array([1, 1, 2, 0, 0, None], type=count_type) == counts

        counts = pc.count_substring(arr, "ab", ignore_case=True)
        assert pa.array([1, 1, 2, 0, 1, None], type=count_type) == counts
|
||
|
||
|
||
def test_count_substring_regex():
    """Check ``pc.count_substring_regex`` for string and large_string."""
    values = ["ab", "cab", "baAacaa", "ba", "AB", None]
    # Each string type is paired with the expected type of the counts.
    type_pairs = (
        (pa.string(), pa.int32()),
        (pa.large_string(), pa.int64()),
    )

    for string_type, count_type in type_pairs:
        arr = pa.array(values, type=string_type)

        counts = pc.count_substring_regex(arr, "a+")
        wanted = pa.array([1, 1, 3, 1, 0, None], type=count_type)
        assert wanted.equals(counts)

        counts = pc.count_substring_regex(arr, "a+", ignore_case=True)
        wanted = pa.array([1, 1, 2, 1, 1, None], type=count_type)
        assert wanted.equals(counts)
|
||
|
||
|
||
def test_find_substring():
    """Check ``find_substring`` / ``find_substring_regex`` positions.

    The expectations use -1 where the pattern is not found and None for
    null inputs.
    """
    tested_types = (
        pa.string(),
        pa.binary(),
        pa.large_string(),
        pa.large_binary(),
    )

    for value_type in tested_types:
        arr = pa.array(["ab", "cab", "ba", None], type=value_type)

        positions = pc.find_substring(arr, "ab")
        assert positions.to_pylist() == [0, 1, -1, None]

        positions = pc.find_substring_regex(arr, "a?b")
        assert positions.to_pylist() == [0, 1, 0, None]

        # Case-insensitive variants
        arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=value_type)

        positions = pc.find_substring(arr, "aB*", ignore_case=True)
        assert positions.to_pylist() == [0, 1, -1, -1]

        positions = pc.find_substring_regex(arr, "a?b", ignore_case=True)
        assert positions.to_pylist() == [0, 1, 0, 0]
|
||
|
||
|
||
def test_match_like():
    """Check ``pc.match_like`` with and without ``ignore_case``."""
    like_pattern = r"_a\%%"

    arr = pa.array(["ab", "ba%", "ba", "ca%d", None])
    matched = pc.match_like(arr, like_pattern)
    assert pa.array([False, True, False, True, None]).equals(matched)

    # Mixed-case input
    arr = pa.array(["aB", "bA%", "ba", "ca%d", None])

    matched = pc.match_like(arr, like_pattern, ignore_case=True)
    assert pa.array([False, True, False, True, None]).equals(matched)

    matched = pc.match_like(arr, like_pattern, ignore_case=False)
    assert pa.array([False, False, False, True, None]).equals(matched)
|
||
|
||
|
||
def test_match_substring():
    """Check ``pc.match_substring`` with and without ``ignore_case``."""
    arr = pa.array(["ab", "abc", "ba", None])
    matched = pc.match_substring(arr, "ab")
    assert pa.array([True, True, False, None]).equals(matched)

    # Non-ASCII, mixed-case input
    arr = pa.array(["áB", "Ábc", "ba", None])

    matched = pc.match_substring(arr, "áb", ignore_case=True)
    assert pa.array([True, True, False, None]).equals(matched)

    matched = pc.match_substring(arr, "áb", ignore_case=False)
    assert pa.array([False, False, False, None]).equals(matched)
|
||
|
||
|
||
def test_match_substring_regex():
    """Check ``pc.match_substring_regex`` with and without ``ignore_case``."""
    regex = "^a?b"

    arr = pa.array(["ab", "abc", "ba", "c", None])
    matched = pc.match_substring_regex(arr, regex)
    assert pa.array([True, True, True, False, None]).equals(matched)

    # Mixed-case input
    arr = pa.array(["aB", "Abc", "BA", "c", None])

    matched = pc.match_substring_regex(arr, regex, ignore_case=True)
    assert pa.array([True, True, True, False, None]).equals(matched)

    matched = pc.match_substring_regex(arr, regex, ignore_case=False)
    assert pa.array([False, False, False, False, None]).equals(matched)
|
||
|
||
|
||
def test_trim():
    """Check the whitespace-trimming and character-trimming functions."""
    # \u3000 is unicode whitespace
    raw_values = [" foo", None, " \u3000foo bar \t"]

    # UTF-8 aware trimming removes \u3000 as well
    trimmed = pc.utf8_trim_whitespace(pa.array(raw_values))
    assert pa.array(["foo", None, "foo bar"]).equals(trimmed)

    # ASCII trimming leaves \u3000 in place
    trimmed = pc.ascii_trim_whitespace(pa.array(raw_values))
    assert pa.array(["foo", None, "\u3000foo bar"]).equals(trimmed)

    # Explicit set of characters to trim, passed by keyword
    arr = pa.array(raw_values)
    wanted = pa.array(["oo", None, "oo bar \t"])
    trimmed = pc.utf8_trim(arr, characters=' f\u3000')
    assert wanted.equals(trimmed)

    # Positional option
    trimmed = pc.utf8_trim(arr, ' f\u3000')
    assert pa.array(["oo", None, "oo bar \t"]).equals(trimmed)
|
||
|
||
|
||
def test_slice_compatibility():
    """``utf8_slice_codeunits`` must agree with Python string slicing."""
    arr = pa.array(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])
    bounds = range(-6, 6)
    steps = [-3, -2, -1, 1, 2, 3]

    for start in bounds:
        for stop in bounds:
            for step in steps:
                # Reference result computed with Python's own slicing
                py_sliced = [item.as_py()[start:stop:step] for item in arr]
                expected = pa.array(py_sliced)

                result = pc.utf8_slice_codeunits(
                    arr, start=start, stop=stop, step=step)
                assert expected.equals(result)

                # Positional options
                positional = pc.utf8_slice_codeunits(arr, start, stop, step)
                assert positional == result
|
||
|
||
|
||
def test_split_pattern():
    """Check ``pc.split_pattern`` with max_splits and reverse options."""
    arr = pa.array(["-foo---bar--", "---foo---b"])

    # Unlimited splits
    pieces = pc.split_pattern(arr, pattern="---")
    assert pa.array([["-foo", "bar--"], ["", "foo", "b"]]).equals(pieces)

    # At most one split, starting from the left
    pieces = pc.split_pattern(arr, "---", max_splits=1)
    assert pa.array([["-foo", "bar--"], ["", "foo---b"]]).equals(pieces)

    # At most one split, starting from the right
    pieces = pc.split_pattern(arr, "---", max_splits=1, reverse=True)
    assert pa.array([["-foo", "bar--"], ["---foo", "b"]]).equals(pieces)
|
||
|
||
|
||
def test_split_whitespace_utf8():
    """Check ``pc.utf8_split_whitespace`` with max_splits and reverse."""
    # NOTE(review): this file was recovered from a scrape that collapses
    # runs of spaces -- verify the spacing inside these string literals.
    arr = pa.array(["foo bar", " foo \u3000\tb"])

    # Unlimited splits
    pieces = pc.utf8_split_whitespace(arr)
    assert pa.array([["foo", "bar"], ["", "foo", "b"]]).equals(pieces)

    # At most one split, starting from the left
    pieces = pc.utf8_split_whitespace(arr, max_splits=1)
    wanted = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]])
    assert wanted.equals(pieces)

    # At most one split, starting from the right
    pieces = pc.utf8_split_whitespace(arr, max_splits=1, reverse=True)
    assert pa.array([["foo", "bar"], [" foo", "b"]]).equals(pieces)
|
||
|
||
|
||
def test_split_whitespace_ascii():
    """Check ``pc.ascii_split_whitespace`` with max_splits and reverse."""
    # NOTE(review): this file was recovered from a scrape that collapses
    # runs of spaces -- verify the spacing inside these string literals.
    arr = pa.array(["foo bar", " foo \u3000\tb"])

    # Unlimited splits: \u3000 is kept as a piece of its own
    pieces = pc.ascii_split_whitespace(arr)
    wanted = pa.array([["foo", "bar"], ["", "foo", "\u3000", "b"]])
    assert wanted.equals(pieces)

    # At most one split, starting from the left
    pieces = pc.ascii_split_whitespace(arr, max_splits=1)
    wanted = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]])
    assert wanted.equals(pieces)

    # At most one split, starting from the right
    pieces = pc.ascii_split_whitespace(arr, max_splits=1, reverse=True)
    wanted = pa.array([["foo", "bar"], [" foo \u3000", "b"]])
    assert wanted.equals(pieces)
|
||
|
||
|
||
def test_split_pattern_regex():
    """Check ``pc.split_pattern_regex``, including the reverse error."""
    arr = pa.array(["-foo---bar--", "---foo---b"])

    # Unlimited splits
    pieces = pc.split_pattern_regex(arr, pattern="-+")
    wanted = pa.array([["", "foo", "bar", ""], ["", "foo", "b"]])
    assert wanted.equals(pieces)

    # At most one split
    pieces = pc.split_pattern_regex(arr, "-+", max_splits=1)
    wanted = pa.array([["", "foo---bar--"], ["", "foo---b"]])
    assert wanted.equals(pieces)

    # Reverse splitting is expected to be rejected
    reverse_error = "Cannot split in reverse with regex"
    with pytest.raises(NotImplementedError, match=reverse_error):
        pc.split_pattern_regex(
            arr, pattern="---", max_splits=1, reverse=True)
|
||
|
||
|
||
def test_min_max():
    """Exercise every way of passing options to a generated wrapper."""
    # An example generated function wrapper with possible options
    data = [4, 5, 6, None, 1]
    nulls_skipped = {'min': 1, 'max': 6}
    nulls_propagated = {'min': None, 'max': None}

    # No options at all
    assert pc.min_max(data).as_py() == nulls_skipped

    # Options passed as a ScalarAggregateOptions instance
    opts = pc.ScalarAggregateOptions()
    assert pc.min_max(data, options=opts).as_py() == nulls_skipped
    opts = pc.ScalarAggregateOptions(skip_nulls=True)
    assert pc.min_max(data, options=opts).as_py() == nulls_skipped
    opts = pc.ScalarAggregateOptions(skip_nulls=False)
    assert pc.min_max(data, options=opts).as_py() == nulls_propagated

    # Options as dict of kwargs
    outcome = pc.min_max(data, options={'skip_nulls': False})
    assert outcome.as_py() == nulls_propagated

    # Options as named functions arguments
    outcome = pc.min_max(data, skip_nulls=False)
    assert outcome.as_py() == nulls_propagated

    # Both options and named arguments
    with pytest.raises(TypeError):
        pc.min_max(
            data, options=pc.ScalarAggregateOptions(), skip_nulls=False)

    # Wrong options type
    wrong_options = pc.TakeOptions()
    with pytest.raises(TypeError):
        pc.min_max(data, options=wrong_options)

    # Missing argument
    with pytest.raises(TypeError, match="min_max takes 1 positional"):
        pc.min_max()
|
||
|
||
|
||
def test_any():
    """Check ``pc.any`` with default and null-propagating options."""
    # ARROW-1846

    keep_nulls = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0)

    empty = pa.array([], type='bool')
    assert pc.any(empty).as_py() is None
    assert pc.any(empty, min_count=0).as_py() is False
    assert pc.any(empty, options=keep_nulls).as_py() is False

    # A True value decides the result even next to a null
    with_true = pa.array([False, None, True])
    assert pc.any(with_true).as_py() is True
    assert pc.any(with_true, options=keep_nulls).as_py() is True

    # Without a True value, the null gives a null result under keep_nulls
    without_true = pa.array([False, None, False])
    assert pc.any(without_true).as_py() is False
    assert pc.any(without_true, options=keep_nulls).as_py() is None
|
||
|
||
|
||
def test_all():
    """Check ``pc.all`` with default and null-propagating options."""
    # ARROW-10301

    keep_nulls = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0)

    empty = pa.array([], type='bool')
    assert pc.all(empty).as_py() is None
    assert pc.all(empty, min_count=0).as_py() is True
    assert pc.all(empty, options=keep_nulls).as_py() is True

    # Each entry: (input, result by default, result under keep_nulls)
    cases = [
        (pa.array([False, True]), False, False),
        (pa.array([True, None]), True, None),
        (pa.chunked_array([[True], [True, None]]), True, None),
        (pa.chunked_array([[True], [False]]), False, False),
    ]
    for values, by_default, with_keep_nulls in cases:
        assert pc.all(values).as_py() is by_default
        assert pc.all(values, options=keep_nulls).as_py() is with_keep_nulls
|
||
|
||
|
||
def test_is_valid():
    """A wrapper without options rejects the ``options`` keyword."""
    # An example generated function wrapper without options
    values = [4, 5, None]
    validity = pc.is_valid(values)
    assert validity.to_pylist() == [True, True, False]

    with pytest.raises(TypeError):
        pc.is_valid(values, options=None)
|
||
|
||
|
||
def test_generated_docstrings():
    """Compare the generated docstrings of a few wrappers verbatim."""
    # NOTE(review): this file was recovered from a scrape that stripped
    # indentation and collapsed runs of spaces.  The layout of the expected
    # docstrings below (4-space indented descriptions, 2-space indented
    # array elements) is reconstructed -- verify it against pyarrow.

    # With options
    expected_min_max = textwrap.dedent("""\
        Compute the minimum and maximum values of a numeric array.

        Null values are ignored by default.
        This can be changed through ScalarAggregateOptions.

        Parameters
        ----------
        array : Array-like
            Argument to compute function.
        skip_nulls : bool, default True
            Whether to skip (ignore) nulls in the input.
            If False, any null in the input forces the output to null.
        min_count : int, default 1
            Minimum number of non-null values in the input. If the number
            of non-null values is below `min_count`, the output is null.
        options : pyarrow.compute.ScalarAggregateOptions, optional
            Alternative way of passing options.
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """)
    assert pc.min_max.__doc__ == expected_min_max

    # Without options
    expected_add = textwrap.dedent("""\
        Add the arguments element-wise.

        Results will wrap around on integer overflow.
        Use function "add_checked" if you want overflow
        to return an error.

        Parameters
        ----------
        x : Array-like or scalar-like
            Argument to compute function.
        y : Array-like or scalar-like
            Argument to compute function.
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """)
    assert pc.add.__doc__ == expected_add

    # Varargs with options
    expected_min_element_wise = textwrap.dedent("""\
        Find the element-wise minimum value.

        Nulls are ignored (by default) or propagated.
        NaN is preferred over null, but not over any valid value.

        Parameters
        ----------
        *args : Array-like or scalar-like
            Argument to compute function.
        skip_nulls : bool, default True
            Whether to skip (ignore) nulls in the input.
            If False, any null in the input forces the output to null.
        options : pyarrow.compute.ElementWiseAggregateOptions, optional
            Alternative way of passing options.
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """)
    assert pc.min_element_wise.__doc__ == expected_min_element_wise

    # Nullary with options.  The two longest lines are assembled by
    # concatenation so that the source lines stay short.
    expected_random = textwrap.dedent("""\
        Generate numbers in the range [0, 1).

        Generated values are uniformly-distributed, double-precision """ +
                                      """in range [0, 1).
        Length of generated data, algorithm and seed can be changed """ +
                                      """via RandomOptions.

        Parameters
        ----------
        length : int
            Number of random values to generate.
        initializer : int or str
            How to initialize the underlying random generator.
            If an integer is given, it is used as a seed.
            If "system" is given, the random generator is initialized with
            a system-specific source of (hopefully true) randomness.
            Other values are invalid.
        options : pyarrow.compute.RandomOptions, optional
            Alternative way of passing options.
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """)
    assert pc.random.__doc__ == expected_random

    # With custom examples
    expected_filter = textwrap.dedent("""\
        Filter with a boolean selection filter.

        The output is populated with values from the input at positions
        where the selection filter is non-zero. Nulls in the selection filter
        are handled based on FilterOptions.

        Parameters
        ----------
        input : Array-like or scalar-like
            Argument to compute function.
        selection_filter : Array-like or scalar-like
            Argument to compute function.
        null_selection_behavior : str, default "drop"
            How to handle nulls in the selection filter.
            Accepted values are "drop", "emit_null".
        options : pyarrow.compute.FilterOptions, optional
            Alternative way of passing options.
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.

        Examples
        --------
        >>> import pyarrow as pa
        >>> arr = pa.array(["a", "b", "c", None, "e"])
        >>> mask = pa.array([True, False, None, False, True])
        >>> arr.filter(mask)
        <pyarrow.lib.StringArray object at 0x7fa826df9200>
        [
          "a",
          "e"
        ]
        >>> arr.filter(mask, null_selection_behavior='emit_null')
        <pyarrow.lib.StringArray object at 0x7fa826df9200>
        [
          "a",
          null,
          "e"
        ]
        """)
    assert pc.filter.__doc__ == expected_filter
|
||
|
||
|
||
def test_generated_signatures():
    """Compare the rendered signatures of a few generated wrappers."""
    # The self-documentation provided by signatures should show acceptable
    # options and their default values.

    def rendered(func):
        # Text form of the signature, e.g. "(x, y, /, *, memory_pool=None)"
        return str(inspect.signature(func))

    # Without options
    assert rendered(pc.add) == "(x, y, /, *, memory_pool=None)"

    # With options
    assert rendered(pc.min_max) == (
        "(array, /, *, skip_nulls=True, min_count=1, "
        "options=None, memory_pool=None)")

    # With positional options
    assert rendered(pc.quantile) == (
        "(array, /, q=0.5, *, interpolation='linear', "
        "skip_nulls=True, min_count=0, "
        "options=None, memory_pool=None)")

    # Varargs with options
    assert rendered(pc.binary_join_element_wise) == (
        "(*strings, null_handling='emit_null', "
        "null_replacement='', options=None, "
        "memory_pool=None)")

    # Varargs without options
    assert rendered(pc.choose) == "(indices, /, *values, memory_pool=None)"

    # Nullary with options
    assert rendered(pc.random) == (
        "(length, *, initializer='system', "
        "options=None, memory_pool=None)")
|
||
|
||
|
||
# isprintable is used to detect codepoints that Python does not know about
# but utf8proc does (or, in a future version of Python, the other way
# around).  Such codepoints cannot be compared between Arrow and the
# Python implementation.
@lru_cache()
def find_new_unicode_codepoints():
    """Return the codepoints whose printability differs in Arrow and Python.

    Returns
    -------
    set of int
        Every scanned codepoint for which ``pc.utf8_is_printable`` and
        ``str.isprintable`` give different answers.  The result is cached.
    """
    # Scan 0x80 up to (excluding) 0x11000, leaving out 0xD800-0xDFFF.
    # NOTE(review): the upper bound is 0x11000, not 0x110000 -- confirm
    # that the smaller range is intended.
    candidates = [chr(codepoint) for codepoint in range(0x80, 0x11000)
                  if codepoint < 0xD800 or codepoint >= 0xE000]
    arrow_flags = pc.utf8_is_printable(pa.array(candidates)).to_pylist()
    return {ord(char)
            for char, arrow_says in zip(candidates, arrow_flags)
            if arrow_says != char.isprintable()}
|
||
|
||
|
||
# Python claims these codepoints are not alpha (not sure why); they are in
# gc='Other Letter': https://graphemica.com/%E1%B3%B2
unknown_issue_is_alpha = {0x1cf2, 0x1cf3}
# utf8proc does not know if codepoints are lower case
utf8proc_issue_is_lower = {
    0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4,
    0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0,
    0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d,
    0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33,
    0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39,
    0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f,
    0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45,
    0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b,
    0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51,
    0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57,
    0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d,
    0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63,
    0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69,
    0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e,
    0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4,
    0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa,
    0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0,
    0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6,
    0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc,
    0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090,
    0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096,
    0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c,
    0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8,
    0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, }
|
||
# utf8proc does not store if a codepoint is numeric.
# NOTE: this set used to be defined twice; the second (shorter) definition
# silently replaced this one and dropped 0x10fc5-0x10fcb. Only the superset
# is kept.
numeric_info_missing = {
    0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
    0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96,
    0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70,
    0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341,
    0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2,
    0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a,
    0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10,
    0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e,
    0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621,
    0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973,
    0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5,
    0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca,
    0x10fcb, }
# utf8proc has no digit/numeric information
digit_info_missing = {
    0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c,
    0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070,
    0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080,
    0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087,
    0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464,
    0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476,
    0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488,
    0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f,
    0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9,
    0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777,
    0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e,
    0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786,
    0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e,
    0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41,
    0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63,
    0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, }
|
||
|
||
# Maps a predicate name to the set of codepoints that are skipped when
# comparing that predicate between Arrow and Python.
codepoints_ignore = dict(
    is_alnum=set().union(numeric_info_missing, digit_info_missing,
                         unknown_issue_is_alpha),
    is_alpha=unknown_issue_is_alpha,
    is_digit=digit_info_missing,
    is_numeric=numeric_info_missing,
    is_lower=utf8proc_issue_is_lower,
)
|
||
|
||
|
||
@pytest.mark.parametrize('function_name', ['is_alnum', 'is_alpha',
                                           'is_ascii', 'is_decimal',
                                           'is_digit', 'is_lower',
                                           'is_numeric', 'is_printable',
                                           'is_space', 'is_upper', ])
@pytest.mark.parametrize('variant', ['ascii', 'utf8'])
def test_string_py_compat_boolean(function_name, variant):
    """Compare Arrow string predicates with the equivalent ``str`` methods.

    Parameters
    ----------
    function_name : str
        Predicate name such as ``'is_alpha'``; the Python counterpart is
        the same name without the underscore (``str.isalpha``).
    variant : str
        ``'ascii'`` checks codepoints below 128, ``'utf8'`` checks every
        codepoint below 0x11000.

    The previous bound was ``128 if ascii else 0x11000``, which tested the
    builtin ``ascii`` function (always truthy), so the utf8 variant never
    looked beyond codepoint 127. Widening the utf8 range as intended may
    surface Arrow/Python discrepancies that need adding to
    ``codepoints_ignore``.
    """
    arrow_name = variant + "_" + function_name
    py_name = function_name.replace('_', '')
    # Nothing to compare when Arrow lacks the kernel; "is_space" is excluded
    # because the functions are known to be incompatible.
    if function_name == 'is_space' or not hasattr(pc, arrow_name):
        return
    arrow_func = getattr(pc, arrow_name)
    # the issues we know of, we skip
    ignore = codepoints_ignore.get(function_name, set()) | \
        find_new_unicode_codepoints()
    limit = 128 if variant == 'ascii' else 0x11000
    # bug? pyarrow doesn't allow utf16 surrogates
    characters = [chr(i) for i in range(limit)
                  if not (0xD800 <= i < 0xE000) and i not in ignore]
    # One kernel call over all characters instead of one call per codepoint
    arrow_results = arrow_func(pa.array(characters)).to_pylist()
    mismatches = [hex(ord(c))
                  for c, arrow_result in zip(characters, arrow_results)
                  if arrow_result != getattr(c, py_name)()]
    assert not mismatches, (
        f"{arrow_name} disagrees with str.{py_name} for {mismatches}")
|
||
|
||
|
||
def test_pad():
    """Check centering and left/right padding of ASCII and UTF-8 strings.

    Each kernel is called with ``width`` given by keyword and by position.
    Strings shorter than the width are padded up to it (a one-character
    string therefore gains two pad characters for lpad/rpad), longer
    strings are left untouched and nulls stay null.
    """
    arr = pa.array([None, 'a', 'abcd'])
    assert pc.ascii_center(arr, width=3).tolist() == [None, ' a ', 'abcd']
    assert pc.ascii_lpad(arr, width=3).tolist() == [None, '  a', 'abcd']
    assert pc.ascii_rpad(arr, width=3).tolist() == [None, 'a  ', 'abcd']
    assert pc.ascii_center(arr, 3).tolist() == [None, ' a ', 'abcd']
    assert pc.ascii_lpad(arr, 3).tolist() == [None, '  a', 'abcd']
    assert pc.ascii_rpad(arr, 3).tolist() == [None, 'a  ', 'abcd']

    # Width is counted in characters for the utf8 kernels, so the two-byte
    # 'á' is padded exactly like the ASCII 'a' above.
    arr = pa.array([None, 'á', 'abcd'])
    assert pc.utf8_center(arr, width=3).tolist() == [None, ' á ', 'abcd']
    assert pc.utf8_lpad(arr, width=3).tolist() == [None, '  á', 'abcd']
    assert pc.utf8_rpad(arr, width=3).tolist() == [None, 'á  ', 'abcd']
    assert pc.utf8_center(arr, 3).tolist() == [None, ' á ', 'abcd']
    assert pc.utf8_lpad(arr, 3).tolist() == [None, '  á', 'abcd']
    assert pc.utf8_rpad(arr, 3).tolist() == [None, 'á  ', 'abcd']
|
||
|
||
|
||
@pytest.mark.pandas
def test_replace_slice():
    """Compare slice-replacement kernels with pandas ``str.slice_replace``.

    Every (start, stop) pair with both bounds in ``range(-3, 4)`` is tried,
    first on ASCII strings with ``binary_replace_slice`` and then on
    non-ASCII strings with ``utf8_replace_slice``.
    """
    bounds = [(start, stop)
              for start in range(-3, 4)
              for stop in range(-3, 4)]

    ascii_arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde'])
    ascii_series = ascii_arr.to_pandas()
    for start, stop in bounds:
        reference = ascii_series.str.slice_replace(start, stop, 'XX')
        by_keyword = pc.binary_replace_slice(
            ascii_arr, start=start, stop=stop, replacement='XX')
        assert by_keyword.tolist() == reference.tolist()
        # Positional options
        by_position = pc.binary_replace_slice(ascii_arr, start, stop, 'XX')
        assert by_position == by_keyword

    unicode_arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde'])
    unicode_series = unicode_arr.to_pandas()
    for start, stop in bounds:
        reference = unicode_series.str.slice_replace(start, stop, 'XX')
        replaced = pc.utf8_replace_slice(
            unicode_arr, start=start, stop=stop, replacement='XX')
        assert replaced.tolist() == reference.tolist()
|
||
|
||
|
||
def test_replace_plain():
    """Check plain substring replacement, with and without a replacement cap.

    Each configuration is exercised with the pattern and replacement given
    by keyword and then by position.
    """
    strings = pa.array(['foozfoo', 'food', None])
    configurations = [
        ({}, ['barzbar', 'bard', None]),
        ({'max_replacements': 1}, ['barzfoo', 'bard', None]),
    ]
    for extra_options, expected in configurations:
        by_keyword = pc.replace_substring(
            strings, pattern='foo', replacement='bar', **extra_options)
        assert by_keyword.tolist() == expected
        by_position = pc.replace_substring(
            strings, 'foo', 'bar', **extra_options)
        assert by_position.tolist() == expected
|
||
|
||
|
||
def test_replace_regex():
    """Check regex replacement with keyword, mixed and positional options."""
    strings = pa.array(['foo', 'mood', None])
    expected = ['f00', 'm00d', None]
    call_forms = [
        ((), {'pattern': '(.)oo', 'replacement': r'\100'}),
        (('(.)oo',), {'replacement': r'\100'}),
        (('(.)oo', r'\100'), {}),
    ]
    for args, kwargs in call_forms:
        replaced = pc.replace_substring_regex(strings, *args, **kwargs)
        assert replaced.tolist() == expected
|
||
|
||
|
||
def test_extract_regex():
    """Check named-group extraction with the pattern by keyword and position."""
    strings = pa.array(['a1', 'zb2z'])
    regex = r'(?P<letter>[ab])(?P<digit>\d)'
    expected = [{'letter': 'a', 'digit': '1'}, {'letter': 'b', 'digit': '2'}]

    assert pc.extract_regex(strings, pattern=regex).tolist() == expected
    assert pc.extract_regex(strings, regex).tolist() == expected
|
||
|
||
|
||
def test_binary_join():
    """Check ``binary_join`` with a scalar separator and a separator array."""
    string_lists = pa.array([['foo', 'bar'], None, []])
    joined = pc.binary_join(string_lists, '-')
    assert joined.equals(pa.array(['foo-bar', None, '']))

    separators = pa.array(['1', '2'], type=pa.binary())
    binary_lists = pa.array([['a', 'b'], ['c', 'd']],
                            type=pa.list_(pa.binary()))
    joined = pc.binary_join(binary_lists, separators)
    assert joined.equals(pa.array(['a1b', 'c2d'], type=pa.binary()))
|
||
|
||
|
||
def test_binary_join_element_wise():
    """Check ``binary_join_element_wise`` under each null-handling mode.

    The default mode, ``'skip'`` and ``'replace'`` are each exercised on
    array inputs and on scalar inputs, including a null value and a null
    separator (the last argument).
    """
    join = pc.binary_join_element_wise
    null_string = pa.scalar(None, type=pa.string())
    columns = [[None, 'a', 'b'], ['c', None, 'd'], [None, '-', '--']]

    def join_scalars(*args, **kwargs):
        return join(*args, **kwargs).as_py()

    # Default null handling
    assert join(*columns).to_pylist() == \
        [None, None, 'b--d']
    assert join_scalars('a', 'b', '-') == 'a-b'
    assert join_scalars('a', null_string, '-') is None
    assert join_scalars('a', 'b', null_string) is None

    # Null values are skipped
    skip_nulls = pc.JoinOptions(null_handling='skip')
    assert join(*columns, options=skip_nulls).to_pylist() == \
        [None, 'a', 'b--d']
    assert join_scalars('a', 'b', '-', options=skip_nulls) == 'a-b'
    assert join_scalars('a', null_string, '-', options=skip_nulls) == 'a'
    assert join_scalars('a', 'b', null_string, options=skip_nulls) is None

    # Null values are replaced
    replace_nulls = pc.JoinOptions(null_handling='replace',
                                   null_replacement='spam')
    assert join(*columns, options=replace_nulls).to_pylist() == \
        [None, 'a-spam', 'b--d']
    assert join_scalars('a', 'b', '-', options=replace_nulls) == 'a-b'
    assert join_scalars(
        'a', null_string, '-', options=replace_nulls) == 'a-spam'
    assert join_scalars(
        'a', 'b', null_string, options=replace_nulls) is None
|
||
|
||
|
||
@pytest.mark.parametrize(('ty', 'values'), all_array_types)
def test_take(ty, values):
    """Check ``Array.take`` with valid, empty and invalid indices."""
    source = pa.array(values, type=ty)
    for index_type in (pa.int8(), pa.int64()):
        positions = pa.array([0, 4, 2, None], type=index_type)
        taken = source.take(positions)
        taken.validate()
        wanted = pa.array([values[0], values[4], values[2], None], type=ty)
        assert taken.equals(wanted)

        # empty indices
        positions = pa.array([], type=index_type)
        taken = source.take(positions)
        taken.validate()
        assert taken.equals(pa.array([], type=ty))

    # An index past the end and a negative index both raise IndexError
    for bad_positions in ([2, 5], [2, -1]):
        positions = pa.array(bad_positions)
        with pytest.raises(IndexError):
            source.take(positions)
|
||
|
||
|
||
def test_take_indices_types():
    """Check which index types ``take`` accepts.

    Every integer type works; floating-point indices raise
    ``NotImplementedError``.
    """
    source = pa.array(range(5))

    integer_type_names = ['uint8', 'int8', 'uint16', 'int16',
                          'uint32', 'int32', 'uint64', 'int64']
    for type_name in integer_type_names:
        positions = pa.array([0, 4, 2, None], type=type_name)
        taken = source.take(positions)
        taken.validate()
        assert taken.equals(pa.array([0, 4, 2, None]))

    for float_type in (pa.float32(), pa.float64()):
        positions = pa.array([0, 4, 2], type=float_type)
        with pytest.raises(NotImplementedError):
            source.take(positions)
|
||
|
||
|
||
def test_take_on_chunked_array():
    """Check ``ChunkedArray.take`` with numpy and chunked indices."""
    # ARROW-9504
    chunked = pa.chunked_array([
        ["a", "b", "c", "d", "e"],
        ["f", "g", "h", "i", "j"],
    ])

    # Indices given as a numpy array
    taken = chunked.take(np.array([0, 5, 1, 6, 9, 2]))
    wanted = pa.chunked_array([["a", "f", "b", "g", "j", "c"]])
    assert taken.equals(wanted)

    # Indices given as a chunked array
    taken = chunked.take(pa.chunked_array([[1], [9, 2]]))
    wanted = pa.chunked_array([["b"], ["j", "c"]])
    assert taken.equals(wanted)
|
||
|
||
|
||
@pytest.mark.parametrize('ordered', [False, True])
def test_take_dictionary(ordered):
    """Check that ``take`` keeps a dictionary array's dictionary and order flag."""
    codes = [0, 1, 2, 0, 1, 2]
    categories = ['a', 'b', 'c']
    dict_arr = pa.DictionaryArray.from_arrays(codes, categories,
                                              ordered=ordered)
    taken = dict_arr.take(pa.array([0, 1, 3]))
    taken.validate()
    assert taken.to_pylist() == ['a', 'b', 'a']
    assert taken.dictionary.to_pylist() == ['a', 'b', 'c']
    assert taken.type.ordered is ordered
|
||
|
||
|
||
def test_take_null_type():
    """Check ``take`` on null-typed arrays, chunked arrays, batches and tables."""
    # ARROW-10027
    nulls = pa.array([None] * 10)
    chunked_nulls = pa.chunked_array([[None] * 5] * 2)
    batch = pa.record_batch([nulls], names=['a'])
    table = pa.table({'a': nulls})

    positions = pa.array([1, 3, 7, None])
    for column_like in (nulls, chunked_nulls):
        assert len(column_like.take(positions)) == 4
    for tabular in (batch, table):
        assert len(tabular.take(positions).column(0)) == 4
|
||
|
||
|
||
@pytest.mark.parametrize(('ty', 'values'), all_array_types)
def test_drop_null(ty, values):
    """Check that ``drop_null`` equals taking the positions of valid values."""
    source = pa.array(values, type=ty)
    dropped = source.drop_null()
    dropped.validate(full=True)

    valid_positions = []
    for position in range(len(source)):
        if source[position].is_valid:
            valid_positions.append(position)
    assert dropped.equals(source.take(pa.array(valid_positions)))
|
||
|
||
|
||
def test_drop_null_chunked_array():
    """Check ``drop_null`` on a chunked array with null-only and empty chunks."""
    chunked = pa.chunked_array([["a", None], ["c", "d", None], [None], []])
    without_nulls = pa.chunked_array([["a"], ["c", "d"], [], []])

    assert chunked.drop_null().equals(without_nulls)
|
||
|
||
|
||
def test_drop_null_record_batch():
    """Check ``RecordBatch.drop_null`` with one and with two columns.

    With two columns, only rows that are valid in both columns are kept.
    """
    first_column = pa.array(["a", None, "c", "d", None])
    single = pa.record_batch([first_column], names=["a'"])
    dropped = single.drop_null()
    wanted = pa.record_batch([pa.array(["a", "c", "d"])], names=["a'"])
    assert dropped.equals(wanted)

    column_names = ["a'", "b'"]
    double = pa.record_batch(
        [pa.array(["a", None, "c", "d", None]),
         pa.array([None, None, "c", None, "e"])],
        names=column_names)

    dropped = double.drop_null()
    wanted = pa.record_batch(
        [pa.array(["c"]), pa.array(["c"])], names=column_names)
    assert dropped.equals(wanted)
|
||
|
||
|
||
def test_drop_null_table():
    """Check ``Table.drop_null`` on single-column and multi-chunk tables."""
    single = pa.table([pa.array(["a", None, "c", "d", None])], names=["a"])
    wanted = pa.table([pa.array(["a", "c", "d"])], names=["a"])
    assert single.drop_null().equals(wanted)

    column_names = ["a", "b", "c"]

    # Columns with different chunk layouts, nulls in every column
    chunked_columns = [
        pa.chunked_array([["a", None], ["c", "d", None]]),
        pa.chunked_array([["a", None], [None, "d", None]]),
        pa.chunked_array([["a"], ["b"], [None], ["d", None]]),
    ]
    multi = pa.table(chunked_columns, names=column_names)
    wanted = pa.table([pa.array(["a", "d"]),
                       pa.array(["a", "d"]),
                       pa.array(["a", "d"])],
                      names=column_names)
    assert multi.drop_null().equals(wanted)

    # First column has no nulls at all
    chunked_columns = [
        pa.chunked_array([["a", "b"], ["c", "d", "e"]]),
        pa.chunked_array([["A"], ["B"], [None], ["D", None]]),
        pa.chunked_array([["a`", None], ["c`", "d`", None]]),
    ]
    multi = pa.table(chunked_columns, names=column_names)
    wanted = pa.table([pa.array(["a", "d"]),
                       pa.array(["A", "D"]),
                       pa.array(["a`", "d`"])],
                      names=column_names)
    assert multi.drop_null().equals(wanted)
|
||
|
||
|
||
def test_drop_null_null_type():
    """Check that ``drop_null`` empties every null-typed container."""
    nulls = pa.array([None] * 10)
    chunked_nulls = pa.chunked_array([[None] * 5] * 2)
    batch = pa.record_batch([nulls], names=['a'])
    table = pa.table({'a': nulls})

    for column_like in (nulls, chunked_nulls):
        assert len(column_like.drop_null()) == 0
    for tabular in (batch, table):
        assert len(tabular.drop_null().column(0)) == 0
|
||
|
||
|
||
@pytest.mark.parametrize(('ty', 'values'), all_array_types)
def test_filter(ty, values):
    """Check ``Array.filter`` for both null-selection modes and bad masks."""
    source = pa.array(values, type=ty)

    selection = pa.array([True, False, False, True, None])
    kept_by_behavior = [
        ('drop', [values[0], values[3]]),
        ('emit_null', [values[0], values[3], None]),
    ]
    for behavior, kept in kept_by_behavior:
        filtered = source.filter(selection, null_selection_behavior=behavior)
        filtered.validate()
        assert filtered.equals(pa.array(kept, type=ty))

    # non-boolean dtype
    integer_mask = pa.array([0, 1, 0, 1, 0])
    with pytest.raises(NotImplementedError):
        source.filter(integer_mask)

    # wrong length
    short_mask = pa.array([True, False, True])
    with pytest.raises(ValueError, match="must all be the same length"):
        source.filter(short_mask)
|
||
|
||
|
||
def test_filter_chunked_array():
    """Check ``ChunkedArray.filter`` with array, chunked and list masks."""
    chunked = pa.chunked_array([["a", None], ["c", "d", "e"]])
    dropped = pa.chunked_array([["a"], ["e"]])
    with_nulls = pa.chunked_array([["a"], [None, "e"]])

    mask_values = [True, False, None, False, True]
    candidate_masks = (
        # mask is array
        pa.array(mask_values),
        # mask is chunked array
        pa.chunked_array([[True, False, None], [False, True]]),
        # mask is python object
        mask_values,
    )
    for mask in candidate_masks:
        assert chunked.filter(mask).equals(dropped)
        emitted = chunked.filter(mask, null_selection_behavior="emit_null")
        assert emitted.equals(with_nulls)
|
||
|
||
|
||
def test_filter_record_batch():
    """Check ``RecordBatch.filter`` in 'drop' (default) and 'emit_null' modes."""
    column_names = ["a'"]
    batch = pa.record_batch(
        [pa.array(["a", None, "c", "d", "e"])], names=column_names)

    # mask is array
    selection = pa.array([True, False, None, False, True])

    filtered = batch.filter(selection)
    wanted = pa.record_batch([pa.array(["a", "e"])], names=column_names)
    assert filtered.equals(wanted)

    filtered = batch.filter(selection, null_selection_behavior="emit_null")
    wanted = pa.record_batch([pa.array(["a", None, "e"])],
                             names=column_names)
    assert filtered.equals(wanted)
|
||
|
||
|
||
def test_filter_table():
    """Check ``Table.filter`` with array, chunked and list masks."""
    def single_column_table(values):
        return pa.table([pa.array(values)], names=["a"])

    table = single_column_table(["a", None, "c", "d", "e"])
    dropped = single_column_table(["a", "e"])
    with_nulls = single_column_table(["a", None, "e"])

    mask_values = [True, False, None, False, True]
    candidate_masks = (
        # mask is array
        pa.array(mask_values),
        # mask is chunked array
        pa.chunked_array([[True, False], [None, False, True]]),
        # mask is python object
        mask_values,
    )
    for mask in candidate_masks:
        assert table.filter(mask).equals(dropped)
        emitted = table.filter(mask, null_selection_behavior="emit_null")
        assert emitted.equals(with_nulls)
|
||
|
||
|
||
def test_filter_errors():
    """Check the errors raised by ``filter`` for invalid masks.

    A chunked array, a record batch and a table are each filtered with a
    non-boolean mask and with a mask of the wrong length.
    """
    values = ["a", None, "c", "d", "e"]
    chunked = pa.chunked_array([["a", None], ["c", "d", "e"]])
    batch = pa.record_batch([pa.array(values)], names=["a'"])
    table = pa.table([pa.array(values)], names=["a"])

    for filterable in (chunked, batch, table):
        # non-boolean dtype
        integer_mask = pa.array([0, 1, 0, 1, 0])
        with pytest.raises(NotImplementedError):
            filterable.filter(integer_mask)

        # wrong length
        short_mask = pa.array([True, False, True])
        with pytest.raises(pa.ArrowInvalid,
                           match="must all be the same length"):
            filterable.filter(short_mask)
|
||
|
||
|
||
def test_filter_null_type():
    """Check ``filter`` on null-typed arrays, chunked arrays, batches and tables."""
    # ARROW-10027
    nulls = pa.array([None] * 10)
    chunked_nulls = pa.chunked_array([[None] * 5] * 2)
    batch = pa.record_batch([nulls], names=['a'])
    table = pa.table({'a': nulls})

    every_other = pa.array([True, False] * 5)
    for column_like in (nulls, chunked_nulls):
        assert len(column_like.filter(every_other)) == 5
    for tabular in (batch, table):
        assert len(tabular.filter(every_other).column(0)) == 5
|
||
|
||
|
||
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_array(typ):
    """Check element-wise comparisons between two arrays or chunked arrays."""
    if typ == "array":
        build = pa.array
    else:
        def build(values):
            return pa.chunked_array([values])

    lhs = build([1, 2, 3, 4, None])
    rhs = build([1, 1, 4, None, 4])

    # A null on either side yields a null result
    comparisons = [
        (pc.equal, [True, False, False, None, None]),
        (pc.not_equal, [False, True, True, None, None]),
        (pc.less, [False, False, True, None, None]),
        (pc.less_equal, [True, False, True, None, None]),
        (pc.greater, [False, True, False, None, None]),
        (pc.greater_equal, [True, True, False, None, None]),
    ]
    for compare, expected in comparisons:
        assert compare(lhs, rhs).equals(build(expected))
|
||
|
||
|
||
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_string_scalar(typ):
    """Check comparisons between a string (chunked) array and a scalar."""
    if typ == "array":
        build = pa.array
    else:
        def build(values):
            return pa.chunked_array([values])

    strings = build(['a', 'b', 'c', None])
    pivot = pa.scalar('b')

    assert pc.equal(strings, pivot).equals(build([False, True, False, None]))

    if typ == "array":
        # Comparing against a null scalar yields nulls everywhere
        null_scalar = pa.scalar(None, type="string")
        all_null = pc.is_null(pc.equal(strings, null_scalar))
        assert all_null.equals(build([True, True, True, True]))

    comparisons = [
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]
    for compare, expected in comparisons:
        assert compare(strings, pivot).equals(build(expected))
|
||
|
||
|
||
@pytest.mark.parametrize("typ", ["array", "chunked_array"])
def test_compare_scalar(typ):
    """Check comparisons between an integer (chunked) array and a scalar."""
    if typ == "array":
        build = pa.array
    else:
        def build(values):
            return pa.chunked_array([values])

    numbers = build([1, 2, 3, None])
    pivot = pa.scalar(2)

    assert pc.equal(numbers, pivot).equals(build([False, True, False, None]))

    if typ == "array":
        # Comparing against a null scalar yields nulls everywhere
        null_scalar = pa.scalar(None, type="int64")
        compared = pc.equal(numbers, null_scalar)
        assert compared.to_pylist() == [None, None, None, None]

    comparisons = [
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]
    for compare, expected in comparisons:
        assert compare(numbers, pivot).equals(build(expected))
|
||
|
||
|
||
def test_compare_chunked_array_mixed():
    """Check ``equal`` across arrays and chunked arrays with differing chunks."""
    flat = pa.array([1, 2, 3, 4, None])
    chunked_3_2 = pa.chunked_array([[1, 2, 3], [4, None]])
    chunked_2_3 = pa.chunked_array([[1, 2], [3, 4, None]])

    all_equal = pa.chunked_array([[True, True, True, True, None]])

    operand_pairs = [
        (flat, chunked_3_2),
        (chunked_3_2, flat),
        (chunked_3_2, chunked_2_3),
    ]
    for lhs, rhs in operand_pairs:
        assert pc.equal(lhs, rhs).equals(all_equal)
|
||
|
||
|
||
def test_arithmetic_add():
    """Check element-wise addition of two integer arrays."""
    lhs = pa.array([1, 2, 3, 4, 5])
    rhs = pa.array([0, -1, 1, 2, 3])
    total = pc.add(lhs, rhs)
    assert total.equals(pa.array([1, 1, 4, 6, 8]))
|
||
|
||
|
||
def test_arithmetic_subtract():
    """Check element-wise subtraction of two integer arrays."""
    lhs = pa.array([1, 2, 3, 4, 5])
    rhs = pa.array([0, -1, 1, 2, 3])
    difference = pc.subtract(lhs, rhs)
    assert difference.equals(pa.array([1, 3, 2, 2, 2]))
|
||
|
||
|
||
def test_arithmetic_multiply():
    """Check element-wise multiplication of two integer arrays."""
    lhs = pa.array([1, 2, 3, 4, 5])
    rhs = pa.array([0, -1, 1, 2, 3])
    product = pc.multiply(lhs, rhs)
    assert product.equals(pa.array([0, -2, 3, 8, 15]))
|
||
|
||
|
||
@pytest.mark.parametrize("ty", ["round", "round_to_multiple"])
def test_round_to_integer(ty):
    """Check every rounding mode when rounding to whole numbers.

    ``round`` is used with ``ndigits=0`` and ``round_to_multiple`` with
    ``multiple=1``; both are expected to give the same results.
    """
    if ty == "round":
        rounding_func = pc.round
        make_options = partial(pc.RoundOptions, ndigits=0)
    elif ty == "round_to_multiple":
        rounding_func = pc.round_to_multiple
        make_options = partial(pc.RoundToMultipleOptions, multiple=1)

    values = [3.2, 3.5, 3.7, 4.5, -3.2, -3.5, -3.7, None]
    expected_by_mode = [
        ("down", [3, 3, 3, 4, -4, -4, -4, None]),
        ("up", [4, 4, 4, 5, -3, -3, -3, None]),
        ("towards_zero", [3, 3, 3, 4, -3, -3, -3, None]),
        ("towards_infinity", [4, 4, 4, 5, -4, -4, -4, None]),
        ("half_down", [3, 3, 4, 4, -3, -4, -4, None]),
        ("half_up", [3, 4, 4, 5, -3, -3, -4, None]),
        ("half_towards_zero", [3, 3, 4, 4, -3, -3, -4, None]),
        ("half_towards_infinity", [3, 4, 4, 5, -3, -4, -4, None]),
        ("half_to_even", [3, 4, 4, 4, -3, -4, -4, None]),
        ("half_to_odd", [3, 3, 4, 5, -3, -3, -4, None]),
    ]
    for mode, expected in expected_by_mode:
        rounded = rounding_func(values,
                                options=make_options(round_mode=mode))
        np.testing.assert_array_equal(rounded, pa.array(expected))
|
||
|
||
|
||
def test_round():
    """Check ``round`` for several ``ndigits`` values.

    Each case is run via an options object, via keyword ``round_mode`` and
    via fully positional arguments.
    """
    mode = "half_towards_infinity"
    values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
    expected_by_ndigits = {
        -2: [300, 0, 0, 0, -0, -0, -0, None],
        -1: [320, 0, 0, 0, -0, -40, -0, None],
        0: [320, 4, 3, 5, -3, -35, -3, None],
        1: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None],
        2: [320, 3.5, 3.08, 4.5, -3.21, -35.12, -3.05, None],
    }
    for ndigits, expected in expected_by_ndigits.items():
        via_options = pc.round(values,
                               options=pc.RoundOptions(ndigits, mode))
        np.testing.assert_allclose(via_options, pa.array(expected),
                                   equal_nan=True)
        via_keyword = pc.round(values, ndigits, round_mode=mode)
        assert via_keyword == via_options
        via_position = pc.round(values, ndigits, mode)
        assert via_position == via_options
|
||
|
||
|
||
def test_round_to_multiple():
    """Check ``round_to_multiple`` for valid and invalid multiples.

    Valid multiples are given as Python numbers and as Arrow scalars;
    non-positive multiples raise ``ArrowInvalid`` and unsupported Python
    objects raise ``TypeError``.
    """
    mode = "half_towards_infinity"
    values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None]
    expected_by_multiple = {
        0.05: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3.05, None],
        pa.scalar(0.1): [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None],
        2: [320, 4, 4, 4, -4, -36, -4, None],
        10: [320, 0, 0, 0, -0, -40, -0, None],
        pa.scalar(100, type=pa.decimal256(10, 4)):
            [300, 0, 0, 0, -0, -0, -0, None],
    }
    for multiple, expected in expected_by_multiple.items():
        via_options = pc.round_to_multiple(
            values, options=pc.RoundToMultipleOptions(multiple, mode))
        np.testing.assert_allclose(via_options, pa.array(expected),
                                   equal_nan=True)
        via_position = pc.round_to_multiple(values, multiple, mode)
        assert via_position == via_options

    for non_positive in (0, -2, pa.scalar(-10.4)):
        with pytest.raises(pa.ArrowInvalid,
                           match="Rounding multiple must be positive"):
            pc.round_to_multiple(values, multiple=non_positive)

    for unsupported in (object, 99999999999999999999999):
        with pytest.raises(TypeError, match="is not a valid multiple type"):
            pc.round_to_multiple(values, multiple=unsupported)
|
||
|
||
|
||
def test_is_null():
    """Check ``is_null`` / ``is_valid`` on arrays and chunked arrays.

    Also checks that NaN only counts as null when ``nan_is_null=True``.
    """
    ints = pa.array([1, 2, 3, None])
    null_flags = ints.is_null()
    assert null_flags.equals(pa.array([False, False, False, True]))
    assert null_flags.equals(pc.is_null(ints))
    valid_flags = ints.is_valid()
    assert valid_flags.equals(pa.array([True, True, True, False]))
    assert valid_flags.equals(pc.is_valid(ints))

    chunked = pa.chunked_array([[1, 2], [3, None]])
    null_flags = chunked.is_null()
    assert null_flags.equals(
        pa.chunked_array([[False, False], [False, True]]))
    valid_flags = chunked.is_valid()
    assert valid_flags.equals(
        pa.chunked_array([[True, True], [True, False]]))

    with_nan = pa.array([1, 2, 3, None, np.nan])
    null_flags = with_nan.is_null()
    assert null_flags.equals(pa.array([False, False, False, True, False]))

    null_flags = with_nan.is_null(nan_is_null=True)
    assert null_flags.equals(pa.array([False, False, False, True, True]))
|
||
|
||
|
||
def test_fill_null():
    """Check ``fill_null`` error handling and several fill-value types."""
    # An array fill value of a different length is rejected
    int8_arr = pa.array([1, 2, None, 4], type=pa.int8())
    one_element = pa.array([5], type=pa.int8())
    with pytest.raises(pa.ArrowInvalid,
                       match="Array arguments must all be the same length"):
        int8_arr.fill_null(one_element)

    # Null-typed array filled with a null scalar
    null_arr = pa.array([None, None, None, None], type=pa.null())
    filled = null_arr.fill_null(pa.scalar(None, type=pa.null()))
    assert filled.equals(pa.array([None, None, None, None]))

    # String array filled with a Python string
    strings = pa.array(['a', 'bb', None])
    filled = strings.fill_null('ccc')
    assert filled.equals(pa.array(['a', 'bb', 'ccc']))

    # Large binary array filled with a Python string
    blobs = pa.array([b'a', b'bb', None], type=pa.large_binary())
    filled = blobs.fill_null('ccc')
    assert filled.equals(
        pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary()))

    # Filling with None leaves the nulls in place
    strings = pa.array(['a', 'bb', None])
    filled = strings.fill_null(None)
    assert filled.equals(pa.array(['a', 'bb', None]))
|
||
|
||
|
||
@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_fill_null_array(arrow_type):
    """Check ``Array.fill_null`` for numeric types and implicit conversions."""
    source = pa.array([1, 2, None, 4], type=arrow_type)
    wanted = pa.array([1, 2, 5, 4], type=arrow_type)

    typed_fill = pa.scalar(5, type=arrow_type)
    assert source.fill_null(typed_fill).equals(wanted)

    # Implicit conversions
    assert source.fill_null(5).equals(wanted)

    # ARROW-9451: Unsigned integers allow this for some reason
    if not pa.types.is_unsigned_integer(source.type):
        with pytest.raises((ValueError, TypeError)):
            source.fill_null('5')

    int8_fill = pa.scalar(5, type='int8')
    assert source.fill_null(int8_fill).equals(wanted)
|
||
|
||
|
||
@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_fill_null_chunked_array(arrow_type):
    """Check ``ChunkedArray.fill_null`` for single and multiple chunks."""
    def chunked(*chunks):
        return pa.chunked_array(
            [pa.array(chunk, type=arrow_type) for chunk in chunks])

    typed_fill = pa.scalar(5, type=arrow_type)

    single = chunked([None, 2, 3, 4])
    filled = single.fill_null(typed_fill)
    assert filled.equals(chunked([5, 2, 3, 4]))

    # Several chunks, one of them empty
    multi = chunked([1, 2], [], [None, 4])
    wanted = chunked([1, 2], [], [5, 4])
    filled = multi.fill_null(typed_fill)
    assert filled.equals(wanted)

    # Implicit conversions
    filled = multi.fill_null(5)
    assert filled.equals(wanted)

    filled = multi.fill_null(pa.scalar(5, type='int8'))
    assert filled.equals(wanted)
|
||
|
||
|
||
def test_logical():
    """Check the boolean kernels, including Kleene-logic null handling."""
    a = pa.array([True, False, False, None])
    b = pa.array([True, True, False, True])

    binary_cases = [
        (pc.and_, [True, False, False, None]),
        (pc.and_kleene, [True, False, False, None]),
        (pc.or_, [True, True, False, None]),
        (pc.or_kleene, [True, True, False, True]),
        (pc.xor, [False, True, False, None]),
    ]
    for kernel, expected in binary_cases:
        assert kernel(a, b) == pa.array(expected)

    assert pc.invert(a) == pa.array([False, True, True, None])
|
||
|
||
|
||
def test_cast():
    """Check ``pc.cast`` for integers, timestamps and nested lists."""
    big = pa.array([2 ** 63 - 1], type='int64')

    # A safe cast must refuse a value that does not fit in int32 ...
    with pytest.raises(pa.ArrowInvalid):
        pc.cast(big, 'int32')

    # ... while an unsafe cast yields -1 for this input.
    truncated = pc.cast(big, 'int32', safe=False)
    assert truncated == pa.array([-1], type='int32')

    dates = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    as_millis = pa.array([1262304000000, 1420070400000],
                         type='timestamp[ms]')
    assert pc.cast(dates, 'timestamp[ms]') == as_millis

    nested = pa.array([[1, 2], [3, 4, 5]], type=pa.large_list(pa.int8()))
    as_strings = pa.array([["1", "2"], ["3", "4", "5"]],
                          type=pa.list_(pa.utf8()))
    assert pc.cast(nested, as_strings.type) == as_strings
|
||
|
||
|
||
def test_strptime():
    """Check ``pc.strptime`` parsing, null handling and error reporting."""
    seconds = pa.timestamp('s')
    arr = pa.array(["5/1/2020", None, "12/13/1900"])

    month_first = pc.strptime(arr, format='%m/%d/%Y', unit='s')
    assert month_first == pa.array(
        [datetime(2020, 5, 1), None, datetime(1900, 12, 13)], type=seconds)
    # Positional format
    assert pc.strptime(arr, '%m/%d/%Y', unit='s') == month_first

    # With error_is_null=True, values that fail to parse become null.
    day_first = pc.strptime(arr, format='%d/%m/%Y', unit='s',
                            error_is_null=True)
    assert day_first == pa.array(
        [datetime(2020, 1, 5), None, None], type=seconds)

    # Both an explicit error_is_null=False and the default must raise.
    for extra in ({'error_is_null': False}, {}):
        with pytest.raises(pa.ArrowInvalid,
                           match="Failed to parse string: '5/1/2020'"):
            pc.strptime(arr, format='%Y-%m-%d', unit='s', **extra)

    all_null = pc.strptime(arr, format='%Y-%m-%d', unit='s',
                           error_is_null=True)
    assert all_null == pa.array([None, None, None], type=seconds)
|
||
|
||
|
||
# TODO: Enable on Windows once ARROW-13168 is resolved.
@pytest.mark.pandas
@pytest.mark.skipif(sys.platform == 'win32',
                    reason="Timezone database is not available on Windows yet")
def test_strftime():
    """Compare ``pc.strftime`` with pandas' ``strftime`` output.

    Exercises a list of format directives across several timezones and
    units, the default format, an explicit locale, and timezone-naive
    timestamps (for which ``%Z`` / ``%z`` must raise).
    """
    from pyarrow.vendored.version import Version

    nat_needs_fix = Version(pd.__version__) < Version("1.0.0")

    def pandas_strings(index, pattern):
        # Format with pandas and wrap as an Arrow array; for pandas < 1.0
        # the "NaT" strings are replaced by pd.NaT first.
        formatted = index.strftime(pattern)
        if nat_needs_fix:
            formatted = formatted.to_series().replace("NaT", pd.NaT)
        return pa.array(formatted)

    times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
    formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H",
               "%I", "%p", "%M", "%z", "%Z", "%j", "%U", "%W", "%c", "%x",
               "%X", "%%", "%G", "%V", "%u"]
    iso_fmt = "%Y-%m-%dT%H:%M:%S"

    for timezone in ("CET", "UTC", "Europe/Ljubljana"):
        ts = pd.to_datetime(times).tz_localize(timezone)

        for unit in ("s", "ms", "us", "ns"):
            tsa = pa.array(ts, type=pa.timestamp(unit, timezone))
            for fmt in formats:
                result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
                assert result.equals(pandas_strings(ts, fmt))

        in_seconds = pa.array(ts, type=pa.timestamp("s", timezone))
        in_micros = pa.array(ts, type=pa.timestamp("us", timezone))

        # (arrow input, arrow options, equivalent pandas format)
        checks = [
            # Default format
            (in_seconds, pc.StrftimeOptions(), iso_fmt),
            # Default format plus timezone
            (in_seconds, pc.StrftimeOptions(iso_fmt + "%Z"),
             iso_fmt + "%Z"),
            # Pandas %S is equivalent to %S in arrow for unit="s"
            (in_seconds, pc.StrftimeOptions("%S"), "%S"),
            # Pandas %S.%f is equivalent to %S in arrow for unit="us"
            (in_micros, pc.StrftimeOptions("%S"), "%S.%f"),
            # Test setting locale
            (in_seconds, pc.StrftimeOptions(iso_fmt, locale="C"), iso_fmt),
        ]
        for tsa, options, pandas_fmt in checks:
            result = pc.strftime(tsa, options=options)
            assert result.equals(pandas_strings(ts, pandas_fmt))

    # Test timestamps without timezone
    naive = pd.to_datetime(times)
    naive_arr = pa.array(naive, type=pa.timestamp("s"))
    result = pc.strftime(naive_arr, options=pc.StrftimeOptions(iso_fmt))
    # Positional format
    assert pc.strftime(naive_arr, iso_fmt) == result
    assert result.equals(pandas_strings(naive, iso_fmt))

    for directive in ("%Z", "%z"):
        with pytest.raises(
                pa.ArrowInvalid,
                match="Timezone not present, cannot convert to string"):
            pc.strftime(naive_arr,
                        options=pc.StrftimeOptions(iso_fmt + directive))
|
||
|
||
|
||
def _check_datetime_components(timestamps, timezone=None):
    """Compare Arrow's temporal component kernels with pandas' ``.dt``.

    ``timestamps`` are parsed by pandas, localized to UTC and converted to
    ``timezone``; the resulting series is handed to Arrow as nanosecond
    timestamps and every extracted component is compared to pandas.
    """
    from pyarrow.vendored.version import Version

    utc_index = pd.to_datetime(timestamps).tz_localize("UTC")
    ts = utc_index.tz_convert(timezone).to_series()
    tsa = pa.array(ts, pa.timestamp("ns", tz=timezone))

    nanos_of_second = ts.dt.microsecond * 10 ** 3 + ts.dt.nanosecond
    subseconds = (nanos_of_second * 10 ** -9).round(9)

    if Version(pd.__version__) < Version("1.1.0"):
        # https://github.com/pandas-dev/pandas/issues/33206
        iso_year, iso_week, iso_day = (
            ts.map(lambda x, pos=pos: x.isocalendar()[pos]).astype("int64")
            for pos in range(3)
        )
    else:
        # Casting is required because pandas isocalendar returns int32
        # while arrow isocalendar returns int64.
        iso_year, iso_week, iso_day = (
            ts.dt.isocalendar()[name].astype("int64")
            for name in ("year", "week", "day")
        )

    iso_calendar = pa.StructArray.from_arrays(
        [iso_year, iso_week, iso_day],
        fields=[
            pa.field('iso_year', pa.int64()),
            pa.field('iso_week', pa.int64()),
            pa.field('iso_day_of_week', pa.int64()),
        ])

    # (arrow kernel, pandas reference values)
    component_checks = [
        (pc.year, ts.dt.year),
        (pc.is_leap_year, ts.dt.is_leap_year),
        (pc.month, ts.dt.month),
        (pc.day, ts.dt.day),
        (pc.day_of_week, ts.dt.dayofweek),
        (pc.day_of_year, ts.dt.dayofyear),
        (pc.iso_year, iso_year),
        (pc.iso_week, iso_week),
        (pc.quarter, ts.dt.quarter),
        (pc.hour, ts.dt.hour),
        (pc.minute, ts.dt.minute),
        (pc.second, ts.dt.second.values),
        (pc.millisecond, ts.dt.microsecond // 10 ** 3),
        (pc.microsecond, ts.dt.microsecond % 10 ** 3),
        (pc.nanosecond, ts.dt.nanosecond),
        (pc.subsecond, subseconds),
    ]
    for kernel, reference in component_checks:
        assert kernel(tsa).equals(pa.array(reference))

    assert pc.iso_calendar(tsa).equals(iso_calendar)

    if ts.dt.tz:
        in_dst = ts.apply(lambda x: x.dst().seconds > 0)
        assert pc.is_dst(tsa).equals(pa.array(in_dst))

    one_based_monday = pc.DayOfWeekOptions(
        count_from_zero=False, week_start=1)
    assert pc.day_of_week(tsa, options=one_based_monday).equals(
        pa.array(ts.dt.dayofweek + 1))

    iso_like_weeks = pc.WeekOptions(
        week_starts_monday=True, count_from_zero=False,
        first_week_is_fully_in_year=False)
    assert pc.week(tsa, options=iso_like_weeks).equals(pa.array(iso_week))
|
||
|
||
|
||
@pytest.mark.pandas
def test_extract_datetime_components():
    """Run the component checks on naive and timezone-aware timestamps."""
    from pyarrow.vendored.version import Version

    timestamps = [
        "1970-01-01T00:00:59.123456789",
        "2000-02-29T23:23:23.999999999",
        "2033-05-18T03:33:20.000000000",
        "2020-01-01T01:05:05.001",
        "2019-12-31T02:10:10.002",
        "2019-12-30T03:15:15.003",
        "2009-12-31T04:20:20.004132",
        "2010-01-01T05:25:25.005321",
        "2010-01-03T06:30:30.006163",
        "2010-01-04T07:35:35",
        "2006-01-01T08:40:40",
        "2005-12-31T09:45:45",
        "2008-12-28",
        "2008-12-29",
        "2012-01-01 01:02:03",
    ]

    # Test timezone naive timestamp array
    _check_datetime_components(timestamps)

    # Test timezone aware timestamp array (pytest.skip raises, so the
    # guards below end the test early).
    if sys.platform == 'win32':
        # TODO: We should test on windows once ARROW-13168 is resolved.
        pytest.skip('Timezone database is not available on Windows yet')
    if Version(pd.__version__) < Version('1.0.0'):
        pytest.skip('Pandas < 1.0 extracts time components incorrectly.')

    for timezone in ("UTC", "US/Central", "Asia/Kolkata",
                     "Etc/GMT-4", "Etc/GMT+4", "Australia/Broken_Hill"):
        _check_datetime_components(timestamps, timezone)
|
||
|
||
|
||
# TODO: We should test on Windows once ARROW-13168 is resolved.
@pytest.mark.pandas
@pytest.mark.skipif(sys.platform == 'win32',
                    reason="Timezone database is not available on Windows yet")
def test_assume_timezone():
    """Check ``pc.assume_timezone`` against pandas' ``tz_localize``.

    Covers plain localization (options object and positional timezone),
    rejection of already-zoned input and of an unknown timezone name, and
    the ``nonexistent`` / ``ambiguous`` policies for local times in
    Europe/Brussels. The results of the non-raising policies are asserted
    to equal the pandas reference.
    """
    from pyarrow.vendored.version import Version

    ts_type = pa.timestamp("ns")
    timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789",
                                 "2000-02-29T23:23:23.999999999",
                                 "2033-05-18T03:33:20.000000000",
                                 "2020-01-01T01:05:05.001",
                                 "2019-12-31T02:10:10.002",
                                 "2019-12-30T03:15:15.003",
                                 "2009-12-31T04:20:20.004132",
                                 "2010-01-01T05:25:25.005321",
                                 "2010-01-03T06:30:30.006163",
                                 "2010-01-04T07:35:35",
                                 "2006-01-01T08:40:40",
                                 "2005-12-31T09:45:45",
                                 "2008-12-28",
                                 "2008-12-29",
                                 "2012-01-01 01:02:03"])
    nonexistent = pd.to_datetime(["2015-03-29 02:30:00",
                                  "2015-03-29 03:30:00"])
    ambiguous = pd.to_datetime(["2018-10-28 01:20:00",
                                "2018-10-28 02:36:00",
                                "2018-10-28 03:46:00"])
    ambiguous_array = pa.array(ambiguous, type=ts_type)
    nonexistent_array = pa.array(nonexistent, type=ts_type)

    for timezone in ["UTC", "US/Central", "Asia/Kolkata"]:
        options = pc.AssumeTimezoneOptions(timezone)
        ta = pa.array(timestamps, type=ts_type)
        expected = timestamps.tz_localize(timezone)
        result = pc.assume_timezone(ta, options=options)
        assert result.equals(pa.array(expected))
        result = pc.assume_timezone(ta, timezone)  # Positional option
        assert result.equals(pa.array(expected))

        # Input that already carries a timezone must be rejected.
        ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone))
        with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"):
            pc.assume_timezone(ta_zoned, options=options)

    invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss")
    with pytest.raises(ValueError, match="not found in timezone database"):
        pc.assume_timezone(ta, options=invalid_options)

    timezone = "Europe/Brussels"

    # nonexistent parameter was introduced in Pandas 0.24.0
    if Version(pd.__version__) >= Version("0.24.0"):
        options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
        options_nonexistent_earliest = pc.AssumeTimezoneOptions(
            timezone, ambiguous="raise", nonexistent="earliest")
        options_nonexistent_latest = pc.AssumeTimezoneOptions(
            timezone, ambiguous="raise", nonexistent="latest")

        with pytest.raises(ValueError,
                           match="Timestamp doesn't exist in "
                           f"timezone '{timezone}'"):
            pc.assume_timezone(nonexistent_array,
                               options=options_nonexistent_raise)

        # Arrow "latest" is compared with pandas "shift_forward".
        expected = pa.array(nonexistent.tz_localize(
            timezone, nonexistent="shift_forward"))
        result = pc.assume_timezone(
            nonexistent_array, options=options_nonexistent_latest)
        assert expected.equals(result)

        # Arrow "earliest" is compared with pandas "shift_backward".
        expected = pa.array(nonexistent.tz_localize(
            timezone, nonexistent="shift_backward"))
        result = pc.assume_timezone(
            nonexistent_array, options=options_nonexistent_earliest)
        assert expected.equals(result)

    options_ambiguous_raise = pc.AssumeTimezoneOptions(timezone)
    options_ambiguous_latest = pc.AssumeTimezoneOptions(
        timezone, ambiguous="latest", nonexistent="raise")
    options_ambiguous_earliest = pc.AssumeTimezoneOptions(
        timezone, ambiguous="earliest", nonexistent="raise")

    with pytest.raises(ValueError,
                       match="Timestamp is ambiguous in "
                             f"timezone '{timezone}'"):
        pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise)

    # Arrow "earliest" is compared with pandas ambiguous=True.
    expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True])
    result = pc.assume_timezone(
        ambiguous_array, options=options_ambiguous_earliest)
    assert result.equals(pa.array(expected))

    # Arrow "latest" is compared with pandas ambiguous=False.
    expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False])
    result = pc.assume_timezone(
        ambiguous_array, options=options_ambiguous_latest)
    assert result.equals(pa.array(expected))
|
||
|
||
|
||
def _check_temporal_rounding(ts, values, unit):
    """Compare Arrow's ceil/floor/round temporal kernels with pandas.

    For every multiple in ``values`` the three kernels are run on ``ts``
    with ``RoundTemporalOptions(value, unit)`` and compared to the pandas
    ``.dt`` method using the matching frequency string. For ``unit ==
    "day"`` the keyword-only and fully defaulted call forms are checked
    as well.
    """
    unit_shorthand = {
        "nanosecond": "ns",
        "microsecond": "us",
        "millisecond": "L",
        "second": "s",
        "minute": "min",
        "hour": "H",
        "day": "D"
    }
    # (arrow kernel, name of the equivalent pandas ``.dt`` method)
    rounders = (
        (pc.ceil_temporal, "ceil"),
        (pc.floor_temporal, "floor"),
        (pc.round_temporal, "round"),
    )
    ta = pa.array(ts)

    def compare(frequency, **kernel_kwargs):
        # Run all three kernels with the given arguments and compare each
        # against pandas at ``frequency``.
        for kernel, pandas_method in rounders:
            result = kernel(ta, **kernel_kwargs).to_pandas()
            expected = getattr(ts.dt, pandas_method)(frequency)
            np.testing.assert_array_equal(result, expected)

    for value in values:
        frequency = str(value) + unit_shorthand[unit]
        compare(frequency, options=pc.RoundTemporalOptions(value, unit))

        # Check RoundTemporalOptions partial defaults
        if unit == "day":
            compare(frequency, multiple=value)

    # Check RoundTemporalOptions defaults
    if unit == "day":
        compare("1D")
|
||
|
||
|
||
# TODO: Enable on Windows once ARROW-13168 is resolved.
@pytest.mark.skipif(sys.platform == 'win32',
                    reason="Timezone database is not available on Windows yet")
@pytest.mark.parametrize('unit', ("nanosecond", "microsecond", "millisecond",
                                  "second", "minute", "hour", "day"))
@pytest.mark.pandas
def test_round_temporal(unit):
    """Check temporal rounding on naive and zoned timestamps for ``unit``."""
    from pyarrow.vendored.version import Version

    old_pandas = Version(pd.__version__) < Version('1.0.0')
    if old_pandas and unit in ("nanosecond", "microsecond"):
        pytest.skip('Pandas < 1.0 rounds zoned small units differently.')

    multiples = (1, 2, 3, 4, 5, 6, 7, 10, 15, 24, 60, 250, 500, 750)
    raw_timestamps = [
        "1923-07-07 08:52:35.203790336",
        "1931-03-17 10:45:00.641559040",
        "1932-06-16 01:16:42.911994368",
        "1941-05-27 11:46:43.822831872",
        "1943-12-14 07:32:05.424766464",
        "1954-04-12 04:31:50.699881472",
        "1966-02-12 17:41:28.693282560",
        "1967-02-26 05:56:46.922376960",
        "1975-11-01 10:55:37.016146432",
        "1982-01-21 18:43:44.517366784",
        "1999-12-04 05:55:34.794991104",
        "2026-10-26 08:39:00.316686848",
    ]
    naive = pd.Series([pd.Timestamp(x, unit="ns") for x in raw_timestamps])
    _check_temporal_rounding(naive, multiples, unit)

    # The same instants, viewed in a range of timezones.
    in_utc = naive.dt.tz_localize("UTC")
    for timezone in ("Asia/Kolkata", "America/New_York", "Etc/GMT-4",
                     "Etc/GMT+4", "Europe/Brussels", "Pacific/Marquesas",
                     "US/Central", "UTC"):
        zoned = in_utc.dt.tz_convert(timezone)
        _check_temporal_rounding(zoned, multiples, unit)
|
||
|
||
|
||
def test_count():
    """Check ``pc.count`` for each count mode and for an invalid mode."""
    arr = pa.array([1, 2, 3, None, None])

    # Default mode
    assert pc.count(arr).as_py() == 3

    for mode, expected in (('only_valid', 3), ('only_null', 2), ('all', 5)):
        assert pc.count(arr, mode=mode).as_py() == expected

    # Positional mode
    assert pc.count(arr, 'all').as_py() == 5

    with pytest.raises(ValueError,
                       match='"something else" is not a valid count mode'):
        pc.count(arr, 'something else')
|
||
|
||
|
||
def test_index():
    """Check ``pc.index`` / ``.index`` on plain and chunked arrays."""
    flat = pa.array([0, 1, None, 3, 4], type=pa.int64())

    function_cases = [
        (pa.scalar(0), 0),
        (pa.scalar(2, type=pa.int8()), -1),
        (4, 4),
    ]
    for needle, position in function_cases:
        assert pc.index(flat, needle).as_py() == position

    assert flat.index(3, start=2).as_py() == 3
    assert flat.index(None).as_py() == -1

    chunked = pa.chunked_array([[1, 2], [1, 3]], type=pa.int64())
    method_cases = [
        ({}, 0),
        ({'start': 2}, 2),
        ({'start': 1, 'end': 2}, -1),
    ]
    for bounds, position in method_cases:
        assert chunked.index(1, **bounds).as_py() == position
|
||
|
||
|
||
def check_partition_nth(data, indices, pivot, null_placement):
    """Assert that ``indices`` partitions ``data`` around position ``pivot``.

    ``indices`` must expose ``to_pylist()`` and be a permutation of
    ``range(len(data))``. Values placed before the pivot must not exceed
    the pivot value and values from the pivot onwards must not be below
    it; ``None`` entries are only tolerated on the side selected by
    ``null_placement`` (``"at_start"``, anything else means the end).
    """
    order = indices.to_pylist()
    size = len(data)
    assert len(order) == size
    assert sorted(order) == list(range(size))

    head = [data[order[pos]] for pos in range(pivot)]
    tail = [data[order[pos]] for pos in range(pivot, size)]
    pivot_value = data[order[pivot]]
    nulls_first = null_placement == "at_start"

    if pivot_value is None:
        # A null pivot means its whole null side consists of nulls.
        null_side = head if nulls_first else tail
        assert all(v is None for v in null_side)
        return

    if nulls_first:
        assert all(v is None or v <= pivot_value for v in head)
        assert all(v >= pivot_value for v in tail)
    else:
        assert all(v <= pivot_value for v in head)
        assert all(v is None or v >= pivot_value for v in tail)
|
||
|
||
|
||
def test_partition_nth():
    """Check ``pc.partition_nth_indices`` on shuffled integers."""
    values = list(range(100, 140))
    random.shuffle(values)
    nth = 10

    by_keyword = pc.partition_nth_indices(values, pivot=nth)
    check_partition_nth(values, by_keyword, nth, "at_end")

    # Positional pivot argument
    assert pc.partition_nth_indices(values, nth) == by_keyword

    with pytest.raises(
            ValueError,
            match="'partition_nth_indices' cannot be called without options"):
        pc.partition_nth_indices(values)
|
||
|
||
|
||
def test_partition_nth_null_placement():
    """Check ``partition_nth_indices`` with both null placements."""
    values = list(range(10)) + [None] * 10
    random.shuffle(values)

    combinations = [(pivot, placement)
                    for pivot in (0, 7, 13, 19)
                    for placement in ("at_start", "at_end")]
    for pivot, placement in combinations:
        partitioned = pc.partition_nth_indices(
            values, pivot=pivot, null_placement=placement)
        check_partition_nth(values, partitioned, pivot, placement)
|
||
|
||
|
||
def test_select_k_array():
    """Check ``select_k_unstable`` and its top/bottom wrappers on arrays."""
    def validate_select_k(select_k_indices, arr, order, stable_sort=False):
        # The selected indices must agree with the head of a full sort:
        # index-for-index if stable, otherwise by the values they select.
        fully_sorted = pc.sort_indices(arr, sort_keys=[("dummy", order)])
        reference = fully_sorted.slice(0, len(select_k_indices))
        if stable_sort:
            assert select_k_indices == reference
            return
        assert pc.take(arr, select_k_indices) == pc.take(arr, reference)

    arr = pa.array([1, 2, None, 0])
    for k in (0, 2, 4):
        for order in ("descending", "ascending"):
            selected = pc.select_k_unstable(
                arr, k=k, sort_keys=[("dummy", order)])
            validate_select_k(selected, arr, order)

        validate_select_k(pc.top_k_unstable(arr, k=k), arr, "descending")
        validate_select_k(pc.bottom_k_unstable(arr, k=k), arr, "ascending")

    # Same selection, passed through an explicit SelectKOptions object.
    for order in ("descending", "ascending"):
        options = pc.SelectKOptions(k=2, sort_keys=[("dummy", order)])
        result = pc.select_k_unstable(arr, options=options)
        validate_select_k(result, arr, order)

    # Position options; ``result`` holds the ascending selection here.
    ascending_keys = [("dummy", "ascending")]
    assert pc.select_k_unstable(arr, 2, sort_keys=ascending_keys) == result
    assert pc.select_k_unstable(arr, 2, ascending_keys) == result
|
||
|
||
|
||
def test_select_k_table():
    """Check ``select_k_unstable`` on tables, including invalid options."""
    def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False):
        # The selected indices must agree with the head of a full sort:
        # index-for-index if stable, otherwise by the rows they select.
        fully_sorted = pc.sort_indices(tbl, sort_keys=sort_keys)
        reference = fully_sorted.slice(0, len(select_k_indices))
        if stable_sort:
            assert select_k_indices == reference
            return
        assert pc.take(tbl, select_k_indices) == pc.take(tbl, reference)

    table = pa.table({"a": [1, 2, 0], "b": [1, 0, 1]})
    by_a = [("a", "ascending")]
    by_a_then_b = [("a", "ascending"), ("b", "ascending")]

    for k in (0, 2, 4):
        for keys in (by_a, by_a_then_b):
            selected = pc.select_k_unstable(table, k=k, sort_keys=keys)
            validate_select_k(selected, table, sort_keys=keys)

        top = pc.top_k_unstable(table, k=k, sort_keys=["a"])
        validate_select_k(top, table, sort_keys=[("a", "descending")])

        bottom = pc.bottom_k_unstable(table, k=k, sort_keys=["a", "b"])
        validate_select_k(bottom, table, sort_keys=by_a_then_b)

    with pytest.raises(
            ValueError,
            match="'select_k_unstable' cannot be called without options"):
        pc.select_k_unstable(table)

    with pytest.raises(ValueError,
                       match="select_k_unstable requires a nonnegative `k`"):
        pc.select_k_unstable(table, k=-1, sort_keys=[("a", "ascending")])

    with pytest.raises(ValueError,
                       match="select_k_unstable requires a "
                             "non-empty `sort_keys`"):
        pc.select_k_unstable(table, k=2, sort_keys=[])

    # ``k`` below is the value left over from the loop above.
    with pytest.raises(ValueError, match="not a valid sort order"):
        pc.select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")])

    with pytest.raises(ValueError,
                       match="Invalid sort key column: No match for.*unknown"):
        pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")])
|
||
|
||
|
||
def test_array_sort_indices():
    """Check ``pc.array_sort_indices`` ordering and null placement."""
    arr = pa.array([1, 2, None, 0])

    # (positional args, keyword args, expected indices)
    cases = [
        ((), {}, [3, 0, 1, 2]),
        ((), {'order': "ascending"}, [3, 0, 1, 2]),
        ((), {'order': "descending"}, [1, 0, 3, 2]),
        ((), {'order': "descending", 'null_placement': "at_start"},
         [2, 1, 0, 3]),
        (("descending",), {'null_placement': "at_start"}, [2, 1, 0, 3]),
    ]
    for args, kwargs, expected in cases:
        indices = pc.array_sort_indices(arr, *args, **kwargs)
        assert indices.to_pylist() == expected

    with pytest.raises(ValueError, match="not a valid sort order"):
        pc.array_sort_indices(arr, order="nonscending")
|
||
|
||
|
||
def test_sort_indices_array():
    """Check ``pc.sort_indices`` on an array via keywords and SortOptions."""
    arr = pa.array([1, 2, None, 0])
    descending = [("dummy", "descending")]

    # (positional args, keyword args, expected indices)
    cases = [
        ((), {}, [3, 0, 1, 2]),
        ((), {'sort_keys': [("dummy", "ascending")]}, [3, 0, 1, 2]),
        ((), {'sort_keys': descending}, [1, 0, 3, 2]),
        ((), {'sort_keys': descending, 'null_placement': "at_start"},
         [2, 1, 0, 3]),
        # Positional `sort_keys`
        ((descending,), {'null_placement': "at_start"}, [2, 1, 0, 3]),
        # Using SortOptions
        ((), {'options': pc.SortOptions(sort_keys=descending)},
         [1, 0, 3, 2]),
        ((), {'options': pc.SortOptions(sort_keys=descending,
                                        null_placement="at_start")},
         [2, 1, 0, 3]),
    ]
    for args, kwargs, expected in cases:
        indices = pc.sort_indices(arr, *args, **kwargs)
        assert indices.to_pylist() == expected
|
||
|
||
|
||
def test_sort_indices_table():
    """Check ``pc.sort_indices`` on a table, including invalid sort keys."""
    table = pa.table({"a": [1, 1, None, 0], "b": [1, 0, 0, 1]})
    a_ascending = [("a", "ascending")]
    a_desc_b_asc = [("a", "descending"), ("b", "ascending")]

    # (positional args, keyword args, expected indices)
    cases = [
        ((), {'sort_keys': a_ascending}, [3, 0, 1, 2]),
        ((), {'sort_keys': a_ascending, 'null_placement': "at_start"},
         [2, 3, 0, 1]),
        ((), {'sort_keys': a_desc_b_asc}, [1, 0, 3, 2]),
        ((), {'sort_keys': a_desc_b_asc, 'null_placement': "at_start"},
         [2, 1, 0, 3]),
        # Positional `sort_keys`
        ((a_desc_b_asc,), {'null_placement': "at_start"}, [2, 1, 0, 3]),
    ]
    for args, kwargs, expected in cases:
        indices = pc.sort_indices(table, *args, **kwargs)
        assert indices.to_pylist() == expected

    with pytest.raises(ValueError, match="Must specify one or more sort keys"):
        pc.sort_indices(table)

    with pytest.raises(ValueError, match="Nonexistent sort key column"):
        pc.sort_indices(table, sort_keys=[("unknown", "ascending")])

    with pytest.raises(ValueError, match="not a valid sort order"):
        pc.sort_indices(table, sort_keys=[("a", "nonscending")])
|
||
|
||
|
||
def test_is_in():
    """Check ``pc.is_in`` with and without nulls in the value set."""
    arr = pa.array([1, 2, None, 1, 2, 3])
    null_matched = [True, False, True, True, False, True]
    null_unmatched = [True, False, False, True, False, True]

    # (value set, extra keyword args, expected membership)
    cases = [
        ([1, 3, None], {}, null_matched),
        ([1, 3, None], {'skip_nulls': True}, null_unmatched),
        ([1, 3], {}, null_unmatched),
        ([1, 3], {'skip_nulls': True}, null_unmatched),
    ]
    for value_set, kwargs, expected in cases:
        result = pc.is_in(arr, value_set=pa.array(value_set), **kwargs)
        assert result.to_pylist() == expected
|
||
|
||
|
||
def test_index_in():
    """Check ``pc.index_in`` with and without nulls in the value set."""
    arr = pa.array([1, 2, None, 1, 2, 3])
    null_found = [0, None, 2, 0, None, 1]
    null_missing = [0, None, None, 0, None, 1]

    # (value set, extra keyword args, expected indices)
    cases = [
        ([1, 3, None], {}, null_found),
        ([1, 3, None], {'skip_nulls': True}, null_missing),
        ([1, 3], {}, null_missing),
        ([1, 3], {'skip_nulls': True}, null_missing),
    ]
    for value_set, kwargs, expected in cases:
        result = pc.index_in(arr, value_set=pa.array(value_set), **kwargs)
        assert result.to_pylist() == expected

    # Positional value_set
    result = pc.index_in(arr, pa.array([1, 3]), skip_nulls=True)
    assert result.to_pylist() == null_missing
|
||
|
||
|
||
def test_quantile():
    """Check ``pc.quantile`` for every interpolation and invalid input."""
    arr = pa.array([1, 2, 3, 4])

    # Default q and interpolation
    assert pc.quantile(arr).to_pylist() == [2.5]

    median_by_interpolation = [
        ('lower', [2]),
        ('higher', [3]),
        ('nearest', [3]),
        ('midpoint', [2.5]),
        ('linear', [2.5]),
    ]
    for interpolation, expected in median_by_interpolation:
        result = pc.quantile(arr, interpolation=interpolation)
        assert result.to_pylist() == expected

    arr = pa.array([1, 2])
    quartiles = [0.25, 0.5, 0.75]

    assert pc.quantile(arr, q=quartiles).to_pylist() == [1.25, 1.5, 1.75]

    quartiles_by_interpolation = [
        ('lower', [1, 1, 1]),
        ('higher', [2, 2, 2]),
        ('midpoint', [1.5, 1.5, 1.5]),
        ('nearest', [1, 1, 2]),
        ('linear', [1.25, 1.5, 1.75]),
    ]
    for interpolation, expected in quartiles_by_interpolation:
        result = pc.quantile(arr, q=quartiles, interpolation=interpolation)
        assert result.to_pylist() == expected

    # Positional `q`
    result = pc.quantile(arr, quartiles, interpolation='linear')
    assert result.to_pylist() == [1.25, 1.5, 1.75]

    with pytest.raises(ValueError, match="Quantile must be between 0 and 1"):
        pc.quantile(arr, q=1.1)
    with pytest.raises(ValueError, match="not a valid quantile interpolation"):
        pc.quantile(arr, interpolation='zzz')
|
||
|
||
|
||
def test_tdigest():
    """Check ``pc.tdigest`` on plain and chunked arrays."""
    def flat():
        return pa.array([1, 2, 3, 4])

    def chunked():
        return pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])])

    # Default quantile
    for make_input in (flat, chunked):
        assert pc.tdigest(make_input()).to_pylist() == [2.5]

    result = pc.tdigest(flat(), q=[0, 0.5, 1])
    assert result.to_pylist() == [1, 2.5, 4]

    result = pc.tdigest(chunked(), [0, 0.5, 1])  # positional `q`
    assert result.to_pylist() == [1, 2.5, 4]
|
||
|
||
|
||
def test_fill_null_segfault():
    """Regression test for ARROW-12672: cast the output of ``fill_null``."""
    # ARROW-12672
    filled = pa.array([None], pa.bool_()).fill_null(False)
    as_int8 = filled.cast(pa.int8())
    assert as_int8 == pa.array([0], pa.int8())
|
||
|
||
|
||
def test_min_max_element_wise():
    """Check ``max_element_wise`` / ``min_element_wise`` and null skipping."""
    arr1 = pa.array([1, 2, 3])
    arr2 = pa.array([3, 1, 2])
    arr3 = pa.array([2, 3, None])
    largest, smallest = pc.max_element_wise, pc.min_element_wise

    # (kernel, input arrays, keyword args, expected values)
    cases = [
        (largest, (arr1, arr2), {}, [3, 2, 3]),
        (smallest, (arr1, arr2), {}, [1, 1, 2]),
        (largest, (arr1, arr2, arr3), {}, [3, 3, 3]),
        (smallest, (arr1, arr2, arr3), {}, [1, 1, 2]),
        # with specifying the option
        (largest, (arr1, arr3), {'skip_nulls': True}, [2, 3, 3]),
        (smallest, (arr1, arr3), {'skip_nulls': True}, [1, 2, 3]),
        (largest, (arr1, arr3),
         {'options': pc.ElementWiseAggregateOptions()}, [2, 3, 3]),
        (smallest, (arr1, arr3),
         {'options': pc.ElementWiseAggregateOptions()}, [1, 2, 3]),
        # not skipping nulls
        (largest, (arr1, arr3), {'skip_nulls': False}, [2, 3, None]),
        (smallest, (arr1, arr3), {'skip_nulls': False}, [1, 2, None]),
    ]
    for kernel, arrays, kwargs, expected in cases:
        assert kernel(*arrays, **kwargs) == pa.array(expected)
|
||
|
||
|
||
def test_make_struct():
    """Check ``pc.make_struct`` on scalars, arrays and invalid input."""
    # Without field names the fields are named by position.
    unnamed = pc.make_struct(1, 'a')
    assert unnamed.as_py() == {'0': 1, '1': 'a'}

    named = pc.make_struct(1, 'a', field_names=['i', 's'])
    assert named.as_py() == {'i': 1, 's': 'a'}

    numbers = [1, 2, 3]
    letters = ['a', 'b', 'c']
    from_arrays = pa.StructArray.from_arrays(
        [numbers, letters], names=['0', '1'])
    assert pc.make_struct(numbers, letters) == from_arrays

    with pytest.raises(ValueError,
                       match="Array arguments must all be the same length"):
        pc.make_struct([1, 2, 3, 4], letters)

    with pytest.raises(ValueError, match="0 arguments but 2 field names"):
        pc.make_struct(field_names=['one', 'two'])
|
||
|
||
|
||
def test_map_lookup():
    """Look up key 'one' in a map array for each occurrence mode."""
    map_type = pa.map_(pa.utf8(), pa.int32())
    maps = pa.array(
        [
            [('one', 1), ('two', 2)],
            [('none', 3)],
            [],
            [('one', 5), ('one', 7)],
            None,
        ],
        type=map_type,
    )

    # Expected output per occurrence mode, checked in this order.
    expected_by_occurrence = (
        ('first', pa.array([1, None, None, 5, None], type=pa.int32())),
        ('last', pa.array([1, None, None, 7, None], type=pa.int32())),
        ('all', pa.array([[1], None, None, [5, 7], None],
                         type=pa.list_(pa.int32()))),
    )

    for occurrence, expected in expected_by_occurrence:
        query_key = pa.scalar('one', type=pa.utf8())
        assert pc.map_lookup(maps, query_key, occurrence) == expected
|
||
|
||
|
||
def test_struct_fields_options():
    """Extract (nested) struct children with ``pc.struct_field``."""
    ints = pa.array([4, 5, 6], type=pa.int64())
    strs = pa.array(["bar", None, ""])
    inner = pa.StructArray.from_arrays([ints, strs], ["a", "b"])
    outer = pa.StructArray.from_arrays([ints, inner], ["a", "c"])

    # Path [1, 1]: outer child "c", then its child "b"; the indices can be
    # passed by keyword or positionally.
    by_keyword = pc.struct_field(outer, indices=[1, 1])
    assert by_keyword == pa.array(["bar", None, ""])
    by_position = pc.struct_field(outer, [1, 1])
    assert by_position == pa.array(["bar", None, ""])

    top_level = pc.struct_field(outer, [0])
    assert top_level == pa.array([4, 5, 6], type=pa.int64())

    # An empty path returns the input unchanged.
    assert pc.struct_field(outer, []) == outer

    # Field names are not accepted as indices.
    with pytest.raises(TypeError, match="an integer is required"):
        pc.struct_field(outer, indices=['a'])

    # TODO: https://issues.apache.org/jira/browse/ARROW-14853
    # assert pc.struct_field(arr) == arr
|
||
|
||
|
||
def test_case_when():
    """Pick values per row with ``pc.case_when`` from a struct of conditions."""
    # Row 0 matches the first condition, row 1 the second, and row 2 has
    # only null conditions, so the expected output ends with a null.
    conditions = pc.make_struct([True, False, None], [False, True, None])
    chosen = pc.case_when(conditions, [1, 2, 3], [11, 12, 13])
    assert chosen == pa.array([1, 12, None])
|
||
|
||
|
||
def test_list_element():
    """Pull a fixed position out of every list of structs."""
    struct_type = pa.struct([('a', pa.float64()), ('b', pa.int8())])
    first_row = [
        {'a': .4, 'b': 2}, None, {'a': .2, 'b': 4}, None, {'a': 5.6, 'b': 6},
    ]
    second_row = [
        None, {'a': .52, 'b': 3}, {'a': .7, 'b': 4}, None, {'a': .6, 'b': 8},
    ]
    lists = pa.array([first_row, second_row], pa.list_(struct_type))

    # (position to extract, expected element taken from each row)
    cases = (
        (1, [None, {'a': 0.52, 'b': 3}]),
        (4, [{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}]),
    )
    for index, picked in cases:
        result = pa.compute.list_element(lists, index)
        expected = pa.array(picked, struct_type)
        assert result.equals(expected)
|
||
|
||
|
||
def test_count_distinct():
    """Count distinct values over 100 timestamps that are all different."""
    # Build the samples from a fixed calendar date instead of
    # ``datetime.now()``: replacing the year of a Feb 29 date with a
    # non-leap year raises ValueError, which made this test fail whenever
    # it ran on a leap day, and the data otherwise depended on the clock.
    samples = [datetime(year=y, month=1, day=1) for y in range(1992, 2092)]
    arr = pa.array(samples, pa.timestamp("ns"))
    assert pc.count_distinct(arr) == pa.scalar(len(samples), type=pa.int64())
|
||
|
||
|
||
def test_count_distinct_options():
    """Check every ``mode`` of ``pc.count_distinct`` on data with nulls."""
    values = pa.array([1, 2, 3, None, None])

    # No mode given: only the three valid values are counted.
    assert pc.count_distinct(values).as_py() == 3

    # Mode passed by keyword.
    expected_by_mode = {'only_valid': 3, 'only_null': 1, 'all': 4}
    for mode, expected in expected_by_mode.items():
        assert pc.count_distinct(values, mode=mode).as_py() == expected

    # Mode passed positionally.
    assert pc.count_distinct(values, 'all').as_py() == 4
|
||
|
||
|
||
def test_utf8_normalize():
    """Normalize a string containing a superscript digit in all four forms."""
    raw = pa.array(["01²3"])

    # Canonical forms are expected to leave the input as is, while the
    # compatibility forms turn the superscript into a plain digit.
    composed = pc.utf8_normalize(raw, form="NFC")
    assert composed == raw
    compat_composed = pc.utf8_normalize(raw, form="NFKC")
    assert compat_composed == pa.array(["0123"])
    decomposed = pc.utf8_normalize(raw, "NFD")
    assert decomposed == raw
    compat_decomposed = pc.utf8_normalize(raw, "NFKD")
    assert compat_decomposed == pa.array(["0123"])

    # Unknown form names are rejected.
    bad_form = '"NFZ" is not a valid Unicode normalization form'
    with pytest.raises(ValueError, match=bad_form):
        pc.utf8_normalize(raw, form="NFZ")
|
||
|
||
|
||
def test_random():
    """Check ``pc.random`` output for the supported initializer kinds."""
    # (note negative integer initializers are accepted)
    for initializer in ['system', 42, -42, b"abcdef"]:
        empty = pc.random(0, initializer=initializer)
        assert empty == pa.array([], type=pa.float64())

    # System random initialization => outputs all distinct
    draws = [tuple(pc.random(100).to_pylist()) for _ in range(10)]
    assert len(set(draws)) == len(draws)

    # Only seven different integer seeds are used, so only seven different
    # outputs are expected.
    draws = [
        tuple(pc.random(100, initializer=seed % 7).to_pylist())
        for seed in range(0, 100)
    ]
    assert len(set(draws)) == 7

    # Arbitrary hashable objects can be given as initializer
    initializers = [object(), (4, 5, 6), "foo"]
    initializers.extend(os.urandom(10) for _ in range(10))
    draws = [
        tuple(pc.random(100, initializer=candidate).to_pylist())
        for candidate in initializers
    ]
    assert len(set(draws)) == len(draws)

    # An unhashable initializer is rejected.
    unhashable_error = (r"initializer should be 'system', an integer, "
                        r"or a hashable object; got \[\]")
    with pytest.raises(TypeError, match=unhashable_error):
        pc.random(100, initializer=[])
|
||
|
||
|
||
def test_expression_serialization():
    """Round-trip a variety of compute expressions through pickle."""
    int_lit = pc.scalar(1)
    float_lit = pc.scalar(1.1)
    bool_lit = pc.scalar(True)
    str_lit = pc.scalar("string")
    null_lit = pc.scalar(None)
    dict_lit = pc.scalar({'a': 1})
    arrow_lit = pc.scalar(pa.scalar(1))
    numpy_lit = pc.scalar(np.int64(2))

    expressions = [
        # plain literals
        int_lit, float_lit, bool_lit, str_lit,
        null_lit, dict_lit, arrow_lit, numpy_lit,
        # comparisons and boolean combinations of literals
        int_lit == float_lit,
        int_lit > float_lit,
        int_lit & float_lit,
        int_lit | float_lit,
        ~bool_lit,
        # method-style calls (the unsafe cast is listed twice)
        str_lit.is_valid(),
        int_lit.cast(pa.int32(), safe=False),
        int_lit.cast(pa.int32(), safe=False),
        int_lit.isin([1, 2, 3]),
        # field references, including nested ones
        pc.field('i64') > 5,
        pc.field('i64') == 5,
        pc.field('i64') == 7,
        pc.field('i64').is_null(),
        pc.field(('foo', 'bar')) == 'value',
        pc.field('foo', 'bar') == 'value',
    ]

    for original in expressions:
        assert isinstance(original, pc.Expression)
        round_tripped = pickle.loads(pickle.dumps(original))
        assert original.equals(round_tripped)
|
||
|
||
|
||
def test_expression_construction():
    """Build expressions in several ways; only invalid input may raise."""
    lit_zero = pc.scalar(0)
    lit_one = pc.scalar(1)
    lit_true = pc.scalar(True)
    lit_false = pc.scalar(False)
    lit_string = pc.scalar("string")
    flat_ref = pc.field("field")
    nested_ref = pc.field(("nested", "field"))
    nested_ref_varargs = pc.field("nested", "field")

    # The bare expression statements below are intentional: the results are
    # discarded, the point is that building them does not raise.
    (lit_zero | lit_one) == lit_string
    (~lit_true) == lit_false
    for target_type in ("bool", pa.bool_()):
        flat_ref.cast(target_type) == lit_true

    flat_ref.isin([1, 2])
    nested_ref.isin(["foo", "bar"])
    nested_ref_varargs.isin(["foo", "bar"])

    # isin() needs a collection of values, not a single scalar.
    with pytest.raises(TypeError):
        flat_ref.isin(1)

    # An arbitrary Python object cannot be converted for the comparison.
    with pytest.raises(pa.ArrowInvalid):
        flat_ref != object()
|
||
|
||
|
||
def test_expression_boolean_operators():
    """Python truth-testing of an expression must raise ValueError."""
    # https://issues.apache.org/jira/browse/ARROW-11412
    true = pc.scalar(True)
    false = pc.scalar(False)

    # Each thunk forces a Python-level truth test on an expression.
    truth_tests = (
        lambda: true and false,
        lambda: true or false,
        lambda: bool(true),
        lambda: not true,
    )
    for truth_test in truth_tests:
        with pytest.raises(ValueError,
                           match="cannot be evaluated to python True"):
            truth_test()
|
||
|
||
|
||
def test_expression_call_function():
|
||
field = pc.field("field")
|
||
|
||
# no options
|
||
assert str(pc.hour(field)) == "hour(field)"
|
||
|
||
# default options
|
||
assert str(pc.round(field)) == "round(field)"
|
||
# specified options
|
||
assert str(pc.round(field, ndigits=1)) == \
|
||
"round(field, {ndigits=1, round_mode=HALF_TO_EVEN})"
|
||
|
||
# mixed types are not (yet) allowed
|
||
with pytest.raises(TypeError):
|
||
pc.add(field, 1)
|
||
|
||
with pytest.raises(TypeError):
|
||
pc.add(1, field)
|