2022-05-23 00:16:32 +04:00

193 lines
5.5 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import pyarrow as pa
try:
import pyarrow.dataset as ds
import pyarrow._exec_plan as ep
except ImportError:
pass
# Module-level pytest mark: tags every test collected from this file with
# the ``dataset`` marker.
# NOTE(review): presumably this is what lets the suite skip these tests when
# the optional ``pyarrow.dataset`` / ``pyarrow._exec_plan`` imports above
# failed -- confirm against the project's conftest.
pytestmark = pytest.mark.dataset
def test_joins_corner_cases():
    """Check that malformed join requests raise the expected exceptions."""
    left = pa.Table.from_pydict({
        "colA": [1, 2, 3, 4, 5, 6],
        "col2": ["a", "b", "c", "d", "e", "f"],
    })
    right = pa.Table.from_pydict({
        "colB": [1, 2, 3, 4, 5],
        "col3": ["A", "B", "C", "D", "E"],
    })

    # Each entry pairs the exception that must be raised with the positional
    # arguments handed to ``ep._perform_join``.
    rejected_calls = (
        # Empty strings given as the key names on both sides.
        (pa.ArrowInvalid, ("left outer", left, "", right, "")),
        # ``None`` passed where the left operand is expected.
        (TypeError, ("left outer", None, "colA", right, "colB")),
        # A join type string that does not name a real join.
        (ValueError, ("super mario join", left, "colA", right, "colB")),
    )
    for error_type, call_args in rejected_calls:
        with pytest.raises(error_type):
            ep._perform_join(*call_args)
@pytest.mark.parametrize("jointype,expected", [
    ("left semi", {"colA": [1, 2], "col2": ["a", "b"]}),
    ("right semi", {"colB": [1, 2], "col3": ["A", "B"]}),
    ("left anti", {"colA": [6], "col2": ["f"]}),
    ("right anti", {"colB": [99], "col3": ["Z"]}),
    ("inner", {
        "colA": [1, 2],
        "col2": ["a", "b"],
        "col3": ["A", "B"],
    }),
    ("left outer", {
        "colA": [1, 2, 6],
        "col2": ["a", "b", "f"],
        "col3": ["A", "B", None],
    }),
    ("right outer", {
        "col2": ["a", "b", None],
        "colB": [1, 2, 99],
        "col3": ["A", "B", "Z"],
    }),
    ("full outer", {
        "colA": [1, 2, 6, 99],
        "col2": ["a", "b", "f", None],
        "col3": ["A", "B", None, "Z"],
    }),
])
@pytest.mark.parametrize("use_threads", [True, False])
@pytest.mark.parametrize("use_datasets", [False, True])
def test_joins(jointype, expected, use_threads, use_datasets):
    """Join two small inputs with ``jointype`` and compare to ``expected``.

    The test runs for every combination of join type, threaded or
    non-threaded execution, and in-memory tables or datasets wrapping them.
    The joined result is sorted on the key column that appears in the
    expected output for that join type before it is compared.
    """
    # The expected table is built inside the test rather than in the
    # parametrize list, so that Arrow-allocated memory does not stay
    # around forever.
    expected = pa.table(expected)

    left = pa.Table.from_pydict({
        "colA": [1, 2, 6],
        "col2": ["a", "b", "f"],
    })
    right = pa.Table.from_pydict({
        "colB": [99, 2, 1],
        "col3": ["Z", "B", "A"],
    })
    if use_datasets:
        left = ds.dataset([left])
        right = ds.dataset([right])

    joined = ep._perform_join(jointype, left, "colA", right, "colB",
                              use_threads=use_threads, coalesce_keys=True)

    # Right-flavoured joins are sorted on the right key, all others on the
    # left key.
    sort_key = "colB" if "right" in jointype else "colA"
    joined = joined.combine_chunks().sort_by(sort_key)
    assert joined == expected
def test_table_join_collisions():
    """Check how colliding column names are handled in full outer joins.

    Both inputs share the column names ``colA``, ``colB`` and ``colVals``.
    Three joins are checked:

    * a join on ``["colA", "colB"]`` with no suffix and no key coalescing
      requested, where the output keeps the duplicated names;
    * a join on ``colA`` with ``right_suffix="_r"`` and
      ``coalesce_keys=False``, where the right-hand duplicates (including
      the key) receive the suffix;
    * the same join with ``coalesce_keys=True``, where a single ``colA``
      key column is produced.

    Every result is sorted by ``colUniq`` before it is compared. That
    column name is unique in all three outputs, and sorting keeps the
    assertions from depending on the order in which the join emits rows.
    Rows without a ``colUniq`` value sort last.
    """
    t1 = pa.table({
        "colA": [1, 2, 6],
        "colB": [10, 20, 60],
        "colVals": ["a", "b", "f"]
    })
    t2 = pa.table({
        "colB": [99, 20, 10],
        "colVals": ["Z", "B", "A"],
        "colUniq": [100, 200, 300],
        "colA": [99, 2, 1],
    })

    # Join on both shared keys; duplicated names are kept as they are.
    result = ep._perform_join(
        "full outer", t1, ["colA", "colB"], t2, ["colA", "colB"])
    result = result.combine_chunks().sort_by("colUniq")
    assert result == pa.table([
        [None, 2, 1, 6],
        [None, 20, 10, 60],
        [None, "b", "a", "f"],
        [99, 20, 10, None],
        ["Z", "B", "A", None],
        [100, 200, 300, None],
        [99, 2, 1, None],
    ], names=["colA", "colB", "colVals", "colB", "colVals", "colUniq", "colA"])

    # Join on colA only, suffixing right-hand collisions and keeping both
    # key columns.
    result = ep._perform_join("full outer", t1, "colA",
                              t2, "colA", right_suffix="_r",
                              coalesce_keys=False)
    result = result.combine_chunks().sort_by("colUniq")
    assert result == pa.table({
        "colA": [None, 2, 1, 6],
        "colB": [None, 20, 10, 60],
        "colVals": [None, "b", "a", "f"],
        "colB_r": [99, 20, 10, None],
        "colVals_r": ["Z", "B", "A", None],
        "colUniq": [100, 200, 300, None],
        "colA_r": [99, 2, 1, None],
    })

    # Same join, but with the two key columns merged into a single colA.
    result = ep._perform_join("full outer", t1, "colA",
                              t2, "colA", right_suffix="_r",
                              coalesce_keys=True)
    result = result.combine_chunks().sort_by("colUniq")
    assert result == pa.table({
        "colA": [99, 2, 1, 6],
        "colB": [None, 20, 10, 60],
        "colVals": [None, "b", "a", "f"],
        "colB_r": [99, 20, 10, None],
        "colVals_r": ["Z", "B", "A", None],
        "colUniq": [100, 200, 300, None]
    })
def test_table_join_keys_order():
    """Check column order and naming when the key is not the first column.

    The left key ``colA`` is the second column of ``t1`` and the right key
    ``colX`` is the second column of ``t2``. With ``coalesce_keys=True``
    the expected output carries a single key column, ``colA``, in the
    position it has in ``t1``. The colliding ``colVals`` columns receive
    the ``_l`` / ``_r`` suffixes.

    The result is sorted by ``colA`` before it is compared, so the
    assertion does not depend on the order in which the join emits rows.
    """
    t1 = pa.table({
        "colB": [10, 20, 60],
        "colA": [1, 2, 6],
        "colVals": ["a", "b", "f"]
    })
    t2 = pa.table({
        "colVals": ["Z", "B", "A"],
        "colX": [99, 2, 1],
    })

    result = ep._perform_join("full outer", t1, "colA", t2, "colX",
                              left_suffix="_l", right_suffix="_r",
                              coalesce_keys=True)
    # The coalesced key has no nulls, so sorting on it gives a total order.
    result = result.combine_chunks().sort_by("colA")
    assert result == pa.table({
        "colB": [10, 20, 60, None],
        "colA": [1, 2, 6, 99],
        "colVals_l": ["a", "b", "f", None],
        "colVals_r": ["A", "B", None, "Z"],
    })