AzSuicideDataVisualization/.venv/Lib/site-packages/pyarrow/tests/test_exec_plan.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest
import pyarrow as pa

try:
    import pyarrow.dataset as ds
    import pyarrow._exec_plan as ep
except ImportError:
    pass

pytestmark = pytest.mark.dataset


def test_joins_corner_cases():
    t1 = pa.Table.from_pydict({
        "colA": [1, 2, 3, 4, 5, 6],
        "col2": ["a", "b", "c", "d", "e", "f"]
    })

    t2 = pa.Table.from_pydict({
        "colB": [1, 2, 3, 4, 5],
        "col3": ["A", "B", "C", "D", "E"]
    })

    with pytest.raises(pa.ArrowInvalid):
        ep._perform_join("left outer", t1, "", t2, "")

    with pytest.raises(TypeError):
        ep._perform_join("left outer", None, "colA", t2, "colB")

    with pytest.raises(ValueError):
        ep._perform_join("super mario join", t1, "colA", t2, "colB")


@pytest.mark.parametrize("jointype,expected", [
    ("left semi", {
        "colA": [1, 2],
        "col2": ["a", "b"]
    }),
    ("right semi", {
        "colB": [1, 2],
        "col3": ["A", "B"]
    }),
    ("left anti", {
        "colA": [6],
        "col2": ["f"]
    }),
    ("right anti", {
        "colB": [99],
        "col3": ["Z"]
    }),
    ("inner", {
        "colA": [1, 2],
        "col2": ["a", "b"],
        "col3": ["A", "B"]
    }),
    ("left outer", {
        "colA": [1, 2, 6],
        "col2": ["a", "b", "f"],
        "col3": ["A", "B", None]
    }),
    ("right outer", {
        "col2": ["a", "b", None],
        "colB": [1, 2, 99],
        "col3": ["A", "B", "Z"]
    }),
    ("full outer", {
        "colA": [1, 2, 6, 99],
        "col2": ["a", "b", "f", None],
        "col3": ["A", "B", None, "Z"]
    })
])
@pytest.mark.parametrize("use_threads", [True, False])
@pytest.mark.parametrize("use_datasets", [False, True])
def test_joins(jointype, expected, use_threads, use_datasets):
    # Allocate table here instead of using parametrize
    # this prevents having arrow allocated memory forever around.
    expected = pa.table(expected)

    t1 = pa.Table.from_pydict({
        "colA": [1, 2, 6],
        "col2": ["a", "b", "f"]
    })

    t2 = pa.Table.from_pydict({
        "colB": [99, 2, 1],
        "col3": ["Z", "B", "A"]
    })

    if use_datasets:
        t1 = ds.dataset([t1])
        t2 = ds.dataset([t2])

    r = ep._perform_join(jointype, t1, "colA", t2, "colB",
                         use_threads=use_threads, coalesce_keys=True)
    r = r.combine_chunks()
    if "right" in jointype:
        r = r.sort_by("colB")
    else:
        r = r.sort_by("colA")
    assert r == expected


def test_table_join_collisions():
    t1 = pa.table({
        "colA": [1, 2, 6],
        "colB": [10, 20, 60],
        "colVals": ["a", "b", "f"]
    })

    t2 = pa.table({
        "colB": [99, 20, 10],
        "colVals": ["Z", "B", "A"],
        "colUniq": [100, 200, 300],
        "colA": [99, 2, 1],
    })

    result = ep._perform_join(
        "full outer", t1, ["colA", "colB"], t2, ["colA", "colB"])
    assert result.combine_chunks() == pa.table([
        [1, 2, 6, None],
        [10, 20, 60, None],
        ["a", "b", "f", None],
        [10, 20, None, 99],
        ["A", "B", None, "Z"],
        [300, 200, None, 100],
        [1, 2, None, 99],
    ], names=["colA", "colB", "colVals", "colB", "colVals", "colUniq", "colA"])

    result = ep._perform_join("full outer", t1, "colA",
                              t2, "colA", right_suffix="_r",
                              coalesce_keys=False)
    assert result.combine_chunks() == pa.table({
        "colA": [1, 2, 6, None],
        "colB": [10, 20, 60, None],
        "colVals": ["a", "b", "f", None],
        "colB_r": [10, 20, None, 99],
        "colVals_r": ["A", "B", None, "Z"],
        "colUniq": [300, 200, None, 100],
        "colA_r": [1, 2, None, 99],
    })

    result = ep._perform_join("full outer", t1, "colA",
                              t2, "colA", right_suffix="_r",
                              coalesce_keys=True)
    assert result.combine_chunks() == pa.table({
        "colA": [1, 2, 6, 99],
        "colB": [10, 20, 60, None],
        "colVals": ["a", "b", "f", None],
        "colB_r": [10, 20, None, 99],
        "colVals_r": ["A", "B", None, "Z"],
        "colUniq": [300, 200, None, 100]
    })


def test_table_join_keys_order():
    t1 = pa.table({
        "colB": [10, 20, 60],
        "colA": [1, 2, 6],
        "colVals": ["a", "b", "f"]
    })

    t2 = pa.table({
        "colVals": ["Z", "B", "A"],
        "colX": [99, 2, 1],
    })

    result = ep._perform_join("full outer", t1, "colA", t2, "colX",
                              left_suffix="_l", right_suffix="_r",
                              coalesce_keys=True)
    assert result.combine_chunks() == pa.table({
        "colB": [10, 20, 60, None],
        "colA": [1, 2, 6, 99],
        "colVals_l": ["a", "b", "f", None],
        "colVals_r": ["A", "B", None, "Z"],
    })