mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-07-02 14:27:31 +00:00
first commit
This commit is contained in:
883
.venv/Lib/site-packages/pandas/tests/reshape/merge/test_join.py
Normal file
883
.venv/Lib/site-packages/pandas/tests/reshape/merge/test_join.py
Normal file
@ -0,0 +1,883 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
concat,
|
||||
merge,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.reshape.merge.test_merge import (
|
||||
NGROUPS,
|
||||
N,
|
||||
get_test_data,
|
||||
)
|
||||
|
||||
a_ = np.array
|
||||
|
||||
|
||||
class TestJoin:
|
||||
def setup_method(self, method):
|
||||
# aggregate multiple columns
|
||||
self.df = DataFrame(
|
||||
{
|
||||
"key1": get_test_data(),
|
||||
"key2": get_test_data(),
|
||||
"data1": np.random.randn(N),
|
||||
"data2": np.random.randn(N),
|
||||
}
|
||||
)
|
||||
|
||||
# exclude a couple keys for fun
|
||||
self.df = self.df[self.df["key2"] > 1]
|
||||
|
||||
self.df2 = DataFrame(
|
||||
{
|
||||
"key1": get_test_data(n=N // 5),
|
||||
"key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5),
|
||||
"value": np.random.randn(N // 5),
|
||||
}
|
||||
)
|
||||
|
||||
index, data = tm.getMixedTypeDict()
|
||||
self.target = DataFrame(data, index=index)
|
||||
|
||||
# Join on string value
|
||||
self.source = DataFrame(
|
||||
{"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"]
|
||||
)
|
||||
|
||||
def test_left_outer_join(self):
|
||||
joined_key2 = merge(self.df, self.df2, on="key2")
|
||||
_check_join(self.df, self.df2, joined_key2, ["key2"], how="left")
|
||||
|
||||
joined_both = merge(self.df, self.df2)
|
||||
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left")
|
||||
|
||||
def test_right_outer_join(self):
|
||||
joined_key2 = merge(self.df, self.df2, on="key2", how="right")
|
||||
_check_join(self.df, self.df2, joined_key2, ["key2"], how="right")
|
||||
|
||||
joined_both = merge(self.df, self.df2, how="right")
|
||||
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right")
|
||||
|
||||
def test_full_outer_join(self):
|
||||
joined_key2 = merge(self.df, self.df2, on="key2", how="outer")
|
||||
_check_join(self.df, self.df2, joined_key2, ["key2"], how="outer")
|
||||
|
||||
joined_both = merge(self.df, self.df2, how="outer")
|
||||
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer")
|
||||
|
||||
def test_inner_join(self):
|
||||
joined_key2 = merge(self.df, self.df2, on="key2", how="inner")
|
||||
_check_join(self.df, self.df2, joined_key2, ["key2"], how="inner")
|
||||
|
||||
joined_both = merge(self.df, self.df2, how="inner")
|
||||
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner")
|
||||
|
||||
def test_handle_overlap(self):
|
||||
joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar"))
|
||||
|
||||
assert "key1.foo" in joined
|
||||
assert "key1.bar" in joined
|
||||
|
||||
def test_handle_overlap_arbitrary_key(self):
|
||||
joined = merge(
|
||||
self.df,
|
||||
self.df2,
|
||||
left_on="key2",
|
||||
right_on="key1",
|
||||
suffixes=(".foo", ".bar"),
|
||||
)
|
||||
assert "key1.foo" in joined
|
||||
assert "key2.bar" in joined
|
||||
|
||||
def test_join_on(self):
|
||||
target = self.target
|
||||
source = self.source
|
||||
|
||||
merged = target.join(source, on="C")
|
||||
tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
|
||||
tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False)
|
||||
|
||||
# join with duplicates (fix regression from DataFrame/Matrix merge)
|
||||
df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
|
||||
df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
|
||||
joined = df.join(df2, on="key")
|
||||
expected = DataFrame(
|
||||
{"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]}
|
||||
)
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
# Test when some are missing
|
||||
df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
|
||||
df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
|
||||
df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
|
||||
joined = df_a.join(df_b, on="one")
|
||||
joined = joined.join(df_c, on="one")
|
||||
assert np.isnan(joined["two"]["c"])
|
||||
assert np.isnan(joined["three"]["c"])
|
||||
|
||||
# merge column not p resent
|
||||
with pytest.raises(KeyError, match="^'E'$"):
|
||||
target.join(source, on="E")
|
||||
|
||||
# overlap
|
||||
source_copy = source.copy()
|
||||
source_copy["A"] = 0
|
||||
msg = (
|
||||
"You are trying to merge on float64 and object columns. If "
|
||||
"you wish to proceed you should use pd.concat"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
target.join(source_copy, on="A")
|
||||
|
||||
def test_join_on_fails_with_different_right_index(self):
|
||||
df = DataFrame(
|
||||
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
|
||||
)
|
||||
df2 = DataFrame(
|
||||
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
|
||||
index=tm.makeCustomIndex(10, 2),
|
||||
)
|
||||
msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
merge(df, df2, left_on="a", right_index=True)
|
||||
|
||||
def test_join_on_fails_with_different_left_index(self):
|
||||
df = DataFrame(
|
||||
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)},
|
||||
index=tm.makeCustomIndex(3, 2),
|
||||
)
|
||||
df2 = DataFrame(
|
||||
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}
|
||||
)
|
||||
msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
merge(df, df2, right_on="b", left_index=True)
|
||||
|
||||
def test_join_on_fails_with_different_column_counts(self):
|
||||
df = DataFrame(
|
||||
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
|
||||
)
|
||||
df2 = DataFrame(
|
||||
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
|
||||
index=tm.makeCustomIndex(10, 2),
|
||||
)
|
||||
msg = r"len\(right_on\) must equal len\(left_on\)"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
merge(df, df2, right_on="a", left_on=["a", "b"])
|
||||
|
||||
@pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
|
||||
def test_join_on_fails_with_wrong_object_type(self, wrong_type):
|
||||
# GH12081 - original issue
|
||||
|
||||
# GH21220 - merging of Series and DataFrame is now allowed
|
||||
# Edited test to remove the Series object from test parameters
|
||||
|
||||
df = DataFrame({"a": [1, 1]})
|
||||
msg = (
|
||||
"Can only merge Series or DataFrame objects, "
|
||||
f"a {type(wrong_type)} was passed"
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
merge(wrong_type, df, left_on="a", right_on="a")
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
merge(df, wrong_type, left_on="a", right_on="a")
|
||||
|
||||
def test_join_on_pass_vector(self):
|
||||
expected = self.target.join(self.source, on="C")
|
||||
del expected["C"]
|
||||
|
||||
join_col = self.target.pop("C")
|
||||
result = self.target.join(self.source, on=join_col)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_with_len0(self):
|
||||
# nothing to merge
|
||||
merged = self.target.join(self.source.reindex([]), on="C")
|
||||
for col in self.source:
|
||||
assert col in merged
|
||||
assert merged[col].isna().all()
|
||||
|
||||
merged2 = self.target.join(self.source.reindex([]), on="C", how="inner")
|
||||
tm.assert_index_equal(merged2.columns, merged.columns)
|
||||
assert len(merged2) == 0
|
||||
|
||||
def test_join_on_inner(self):
|
||||
df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
|
||||
df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])
|
||||
|
||||
joined = df.join(df2, on="key", how="inner")
|
||||
|
||||
expected = df.join(df2, on="key")
|
||||
expected = expected[expected["value"].notna()]
|
||||
tm.assert_series_equal(joined["key"], expected["key"])
|
||||
tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
|
||||
tm.assert_index_equal(joined.index, expected.index)
|
||||
|
||||
def test_join_on_singlekey_list(self):
|
||||
df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
|
||||
df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
|
||||
|
||||
# corner cases
|
||||
joined = df.join(df2, on=["key"])
|
||||
expected = df.join(df2, on="key")
|
||||
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
def test_join_on_series(self):
|
||||
result = self.target.join(self.source["MergedA"], on="C")
|
||||
expected = self.target.join(self.source[["MergedA"]], on="C")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_on_series_buglet(self):
|
||||
# GH #638
|
||||
df = DataFrame({"a": [1, 1]})
|
||||
ds = Series([2], index=[1], name="b")
|
||||
result = df.join(ds, on="a")
|
||||
expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_index_mixed(self, join_type):
|
||||
# no overlapping blocks
|
||||
df1 = DataFrame(index=np.arange(10))
|
||||
df1["bool"] = True
|
||||
df1["string"] = "foo"
|
||||
|
||||
df2 = DataFrame(index=np.arange(5, 15))
|
||||
df2["int"] = 1
|
||||
df2["float"] = 1.0
|
||||
|
||||
joined = df1.join(df2, how=join_type)
|
||||
expected = _join_by_hand(df1, df2, how=join_type)
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
joined = df2.join(df1, how=join_type)
|
||||
expected = _join_by_hand(df2, df1, how=join_type)
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
def test_join_index_mixed_overlap(self):
|
||||
df1 = DataFrame(
|
||||
{"A": 1.0, "B": 2, "C": "foo", "D": True},
|
||||
index=np.arange(10),
|
||||
columns=["A", "B", "C", "D"],
|
||||
)
|
||||
assert df1["B"].dtype == np.int64
|
||||
assert df1["D"].dtype == np.bool_
|
||||
|
||||
df2 = DataFrame(
|
||||
{"A": 1.0, "B": 2, "C": "foo", "D": True},
|
||||
index=np.arange(0, 10, 2),
|
||||
columns=["A", "B", "C", "D"],
|
||||
)
|
||||
|
||||
# overlap
|
||||
joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
|
||||
expected_columns = [
|
||||
"A_one",
|
||||
"B_one",
|
||||
"C_one",
|
||||
"D_one",
|
||||
"A_two",
|
||||
"B_two",
|
||||
"C_two",
|
||||
"D_two",
|
||||
]
|
||||
df1.columns = expected_columns[:4]
|
||||
df2.columns = expected_columns[4:]
|
||||
expected = _join_by_hand(df1, df2)
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
def test_join_empty_bug(self):
|
||||
# generated an exception in 0.4.3
|
||||
x = DataFrame()
|
||||
x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")
|
||||
|
||||
def test_join_unconsolidated(self):
|
||||
# GH #331
|
||||
a = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
|
||||
c = Series(np.random.randn(30))
|
||||
a["c"] = c
|
||||
d = DataFrame(np.random.randn(30, 1), columns=["q"])
|
||||
|
||||
# it works!
|
||||
a.join(d)
|
||||
d.join(a)
|
||||
|
||||
def test_join_multiindex(self):
|
||||
index1 = MultiIndex.from_arrays(
|
||||
[["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
|
||||
names=["first", "second"],
|
||||
)
|
||||
|
||||
index2 = MultiIndex.from_arrays(
|
||||
[["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
|
||||
names=["first", "second"],
|
||||
)
|
||||
|
||||
df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
|
||||
df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])
|
||||
|
||||
df1 = df1.sort_index(level=0)
|
||||
df2 = df2.sort_index(level=0)
|
||||
|
||||
joined = df1.join(df2, how="outer")
|
||||
ex_index = Index(index1.values).union(Index(index2.values))
|
||||
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
|
||||
expected.index.names = index1.names
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
assert joined.index.names == index1.names
|
||||
|
||||
df1 = df1.sort_index(level=1)
|
||||
df2 = df2.sort_index(level=1)
|
||||
|
||||
joined = df1.join(df2, how="outer").sort_index(level=0)
|
||||
ex_index = Index(index1.values).union(Index(index2.values))
|
||||
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
|
||||
expected.index.names = index1.names
|
||||
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
assert joined.index.names == index1.names
|
||||
|
||||
def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex):
|
||||
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
|
||||
key2 = [
|
||||
"two",
|
||||
"one",
|
||||
"three",
|
||||
"one",
|
||||
"two",
|
||||
"one",
|
||||
"two",
|
||||
"two",
|
||||
"three",
|
||||
"one",
|
||||
]
|
||||
|
||||
data = np.random.randn(len(key1))
|
||||
data = DataFrame({"key1": key1, "key2": key2, "data": data})
|
||||
|
||||
index = lexsorted_two_level_string_multiindex
|
||||
to_join = DataFrame(
|
||||
np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
|
||||
)
|
||||
|
||||
joined = data.join(to_join, on=["key1", "key2"], how="inner")
|
||||
expected = merge(
|
||||
data,
|
||||
to_join.reset_index(),
|
||||
left_on=["key1", "key2"],
|
||||
right_on=["first", "second"],
|
||||
how="inner",
|
||||
sort=False,
|
||||
)
|
||||
|
||||
expected2 = merge(
|
||||
to_join,
|
||||
data,
|
||||
right_on=["key1", "key2"],
|
||||
left_index=True,
|
||||
how="inner",
|
||||
sort=False,
|
||||
)
|
||||
tm.assert_frame_equal(joined, expected2.reindex_like(joined))
|
||||
|
||||
expected2 = merge(
|
||||
to_join,
|
||||
data,
|
||||
right_on=["key1", "key2"],
|
||||
left_index=True,
|
||||
how="inner",
|
||||
sort=False,
|
||||
)
|
||||
|
||||
expected = expected.drop(["first", "second"], axis=1)
|
||||
expected.index = joined.index
|
||||
|
||||
assert joined.index.is_monotonic
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
# _assert_same_contents(expected, expected2.loc[:, expected.columns])
|
||||
|
||||
def test_join_hierarchical_mixed(self):
|
||||
# GH 2024
|
||||
df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
|
||||
new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
|
||||
other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
|
||||
other_df.set_index("a", inplace=True)
|
||||
# GH 9455, 12219
|
||||
msg = "merging between different levels is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = merge(new_df, other_df, left_index=True, right_index=True)
|
||||
assert ("b", "mean") in result
|
||||
assert "b" in result
|
||||
|
||||
def test_join_float64_float32(self):
|
||||
|
||||
a = DataFrame(np.random.randn(10, 2), columns=["a", "b"], dtype=np.float64)
|
||||
b = DataFrame(np.random.randn(10, 1), columns=["c"], dtype=np.float32)
|
||||
joined = a.join(b)
|
||||
assert joined.dtypes["a"] == "float64"
|
||||
assert joined.dtypes["b"] == "float64"
|
||||
assert joined.dtypes["c"] == "float32"
|
||||
|
||||
a = np.random.randint(0, 5, 100).astype("int64")
|
||||
b = np.random.random(100).astype("float64")
|
||||
c = np.random.random(100).astype("float32")
|
||||
df = DataFrame({"a": a, "b": b, "c": c})
|
||||
xpdf = DataFrame({"a": a, "b": b, "c": c})
|
||||
s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
|
||||
rs = df.merge(s, left_on="a", right_index=True)
|
||||
assert rs.dtypes["a"] == "int64"
|
||||
assert rs.dtypes["b"] == "float64"
|
||||
assert rs.dtypes["c"] == "float32"
|
||||
assert rs.dtypes["md"] == "float32"
|
||||
|
||||
xp = xpdf.merge(s, left_on="a", right_index=True)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_join_many_non_unique_index(self):
|
||||
df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
|
||||
df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
|
||||
df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
|
||||
idf1 = df1.set_index(["a", "b"])
|
||||
idf2 = df2.set_index(["a", "b"])
|
||||
idf3 = df3.set_index(["a", "b"])
|
||||
|
||||
result = idf1.join([idf2, idf3], how="outer")
|
||||
|
||||
df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
|
||||
expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")
|
||||
|
||||
result = result.reset_index()
|
||||
expected = expected[result.columns]
|
||||
expected["a"] = expected.a.astype("int64")
|
||||
expected["b"] = expected.b.astype("int64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
|
||||
df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
|
||||
df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
|
||||
idf1 = df1.set_index(["a", "b"])
|
||||
idf2 = df2.set_index(["a", "b"])
|
||||
idf3 = df3.set_index(["a", "b"])
|
||||
result = idf1.join([idf2, idf3], how="inner")
|
||||
|
||||
df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
|
||||
expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")
|
||||
|
||||
result = result.reset_index()
|
||||
|
||||
tm.assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
# GH 11519
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
|
||||
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
|
||||
"C": np.random.randn(8),
|
||||
"D": np.random.randn(8),
|
||||
}
|
||||
)
|
||||
s = Series(
|
||||
np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST"
|
||||
)
|
||||
inner = df.join(s, how="inner")
|
||||
outer = df.join(s, how="outer")
|
||||
left = df.join(s, how="left")
|
||||
right = df.join(s, how="right")
|
||||
tm.assert_frame_equal(inner, outer)
|
||||
tm.assert_frame_equal(inner, left)
|
||||
tm.assert_frame_equal(inner, right)
|
||||
|
||||
def test_join_sort(self):
|
||||
left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
|
||||
right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
|
||||
|
||||
joined = left.join(right, on="key", sort=True)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": ["bar", "baz", "foo", "foo"],
|
||||
"value": [2, 3, 1, 4],
|
||||
"value2": ["a", "b", "c", "c"],
|
||||
},
|
||||
index=[1, 2, 0, 3],
|
||||
)
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
# smoke test
|
||||
joined = left.join(right, on="key", sort=False)
|
||||
tm.assert_index_equal(joined.index, Index(range(4)), exact=True)
|
||||
|
||||
def test_join_mixed_non_unique_index(self):
|
||||
# GH 12814, unorderable types in py3 with a non-unique index
|
||||
df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
|
||||
df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
|
||||
result = df1.join(df2)
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
|
||||
index=[1, 2, 3, 3, "a"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
|
||||
df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
|
||||
result = df3.join(df4)
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_non_unique_period_index(self):
|
||||
# GH #16871
|
||||
index = pd.period_range("2016-01-01", periods=16, freq="M")
|
||||
df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
|
||||
df2 = concat([df, df])
|
||||
result = df.join(df2, how="inner", rsuffix="_df2")
|
||||
expected = DataFrame(
|
||||
np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
|
||||
columns=["pnum", "pnum_df2"],
|
||||
index=df2.sort_index().index,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_mixed_type_join_with_suffix(self):
|
||||
# GH #916
|
||||
df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
|
||||
df.insert(0, "id", 0)
|
||||
df.insert(5, "dt", "foo")
|
||||
|
||||
grouped = df.groupby("id")
|
||||
mn = grouped.mean()
|
||||
cn = grouped.count()
|
||||
|
||||
# it works!
|
||||
mn.join(cn, rsuffix="_right")
|
||||
|
||||
def test_join_many(self):
|
||||
df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
|
||||
df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]
|
||||
|
||||
joined = df_list[0].join(df_list[1:])
|
||||
tm.assert_frame_equal(joined, df)
|
||||
|
||||
df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]]
|
||||
|
||||
def _check_diff_index(df_list, result, exp_index):
|
||||
reindexed = [x.reindex(exp_index) for x in df_list]
|
||||
expected = reindexed[0].join(reindexed[1:])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# different join types
|
||||
joined = df_list[0].join(df_list[1:], how="outer")
|
||||
_check_diff_index(df_list, joined, df.index)
|
||||
|
||||
joined = df_list[0].join(df_list[1:])
|
||||
_check_diff_index(df_list, joined, df_list[0].index)
|
||||
|
||||
joined = df_list[0].join(df_list[1:], how="inner")
|
||||
_check_diff_index(df_list, joined, df.index[2:8])
|
||||
|
||||
msg = "Joining multiple DataFrames only supported for joining on index"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df_list[0].join(df_list[1:], on="a")
|
||||
|
||||
def test_join_many_mixed(self):
|
||||
df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
|
||||
df["key"] = ["foo", "bar"] * 4
|
||||
df1 = df.loc[:, ["A", "B"]]
|
||||
df2 = df.loc[:, ["C", "D"]]
|
||||
df3 = df.loc[:, ["key"]]
|
||||
|
||||
result = df1.join([df2, df3])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
def test_join_dups(self):
|
||||
|
||||
# joining dups
|
||||
df = concat(
|
||||
[
|
||||
DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
|
||||
DataFrame(
|
||||
np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
|
||||
),
|
||||
],
|
||||
axis=1,
|
||||
)
|
||||
|
||||
expected = concat([df, df], axis=1)
|
||||
result = df.join(df, rsuffix="_2")
|
||||
result.columns = expected.columns
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# GH 4975, invalid join on dups
|
||||
w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
|
||||
x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
|
||||
y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
|
||||
z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
|
||||
|
||||
dta = x.merge(y, left_index=True, right_index=True).merge(
|
||||
z, left_index=True, right_index=True, how="outer"
|
||||
)
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
dta = dta.merge(w, left_index=True, right_index=True)
|
||||
expected = concat([x, y, z, w], axis=1)
|
||||
expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
|
||||
tm.assert_frame_equal(dta, expected)
|
||||
|
||||
def test_join_multi_to_multi(self, join_type):
|
||||
# GH 20475
|
||||
leftindex = MultiIndex.from_product(
|
||||
[list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]
|
||||
)
|
||||
left = DataFrame({"v1": range(12)}, index=leftindex)
|
||||
|
||||
rightindex = MultiIndex.from_product(
|
||||
[list("abc"), list("xy")], names=["abc", "xy"]
|
||||
)
|
||||
right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)
|
||||
|
||||
result = left.join(right, on=["abc", "xy"], how=join_type)
|
||||
expected = (
|
||||
left.reset_index()
|
||||
.merge(right.reset_index(), on=["abc", "xy"], how=join_type)
|
||||
.set_index(["abc", "xy", "num"])
|
||||
)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
left.join(right, on="xy", how=join_type)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
right.join(left, on=["abc", "xy"], how=join_type)
|
||||
|
||||
def test_join_on_tz_aware_datetimeindex(self):
|
||||
# GH 23931, 26335
|
||||
df1 = DataFrame(
|
||||
{
|
||||
"date": pd.date_range(
|
||||
start="2018-01-01", periods=5, tz="America/Chicago"
|
||||
),
|
||||
"vals": list("abcde"),
|
||||
}
|
||||
)
|
||||
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"date": pd.date_range(
|
||||
start="2018-01-03", periods=5, tz="America/Chicago"
|
||||
),
|
||||
"vals_2": list("tuvwx"),
|
||||
}
|
||||
)
|
||||
result = df1.join(df2.set_index("date"), on="date")
|
||||
expected = df1.copy()
|
||||
expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_datetime_string(self):
|
||||
# GH 5647
|
||||
dfa = DataFrame(
|
||||
[
|
||||
["2012-08-02", "L", 10],
|
||||
["2012-08-02", "J", 15],
|
||||
["2013-04-06", "L", 20],
|
||||
["2013-04-06", "J", 25],
|
||||
],
|
||||
columns=["x", "y", "a"],
|
||||
)
|
||||
dfa["x"] = pd.to_datetime(dfa["x"])
|
||||
dfb = DataFrame(
|
||||
[["2012-08-02", "J", 1], ["2013-04-06", "L", 2]],
|
||||
columns=["x", "y", "z"],
|
||||
index=[2, 4],
|
||||
)
|
||||
dfb["x"] = pd.to_datetime(dfb["x"])
|
||||
result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
|
||||
expected = DataFrame(
|
||||
[
|
||||
[Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
|
||||
[Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
|
||||
],
|
||||
index=[2, 4],
|
||||
columns=["x", "y", "z", "a"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"):
|
||||
|
||||
# some smoke tests
|
||||
for c in join_col:
|
||||
assert result[c].notna().all()
|
||||
|
||||
left_grouped = left.groupby(join_col)
|
||||
right_grouped = right.groupby(join_col)
|
||||
|
||||
for group_key, group in result.groupby(join_col):
|
||||
l_joined = _restrict_to_columns(group, left.columns, lsuffix)
|
||||
r_joined = _restrict_to_columns(group, right.columns, rsuffix)
|
||||
|
||||
try:
|
||||
lgroup = left_grouped.get_group(group_key)
|
||||
except KeyError as err:
|
||||
if how in ("left", "inner"):
|
||||
raise AssertionError(
|
||||
f"key {group_key} should not have been in the join"
|
||||
) from err
|
||||
|
||||
_assert_all_na(l_joined, left.columns, join_col)
|
||||
else:
|
||||
_assert_same_contents(l_joined, lgroup)
|
||||
|
||||
try:
|
||||
rgroup = right_grouped.get_group(group_key)
|
||||
except KeyError as err:
|
||||
if how in ("right", "inner"):
|
||||
raise AssertionError(
|
||||
f"key {group_key} should not have been in the join"
|
||||
) from err
|
||||
|
||||
_assert_all_na(r_joined, right.columns, join_col)
|
||||
else:
|
||||
_assert_same_contents(r_joined, rgroup)
|
||||
|
||||
|
||||
def _restrict_to_columns(group, columns, suffix):
|
||||
found = [
|
||||
c for c in group.columns if c in columns or c.replace(suffix, "") in columns
|
||||
]
|
||||
|
||||
# filter
|
||||
group = group.loc[:, found]
|
||||
|
||||
# get rid of suffixes, if any
|
||||
group = group.rename(columns=lambda x: x.replace(suffix, ""))
|
||||
|
||||
# put in the right order...
|
||||
group = group.loc[:, columns]
|
||||
|
||||
return group
|
||||
|
||||
|
||||
def _assert_same_contents(join_chunk, source):
|
||||
NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
|
||||
|
||||
jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
|
||||
svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
|
||||
|
||||
rows = {tuple(row) for row in jvalues}
|
||||
assert len(rows) == len(source)
|
||||
assert all(tuple(row) in rows for row in svalues)
|
||||
|
||||
|
||||
def _assert_all_na(join_chunk, source_columns, join_col):
|
||||
for c in source_columns:
|
||||
if c in join_col:
|
||||
continue
|
||||
assert join_chunk[c].isna().all()
|
||||
|
||||
|
||||
def _join_by_hand(a, b, how="left"):
|
||||
join_index = a.index.join(b.index, how=how)
|
||||
|
||||
a_re = a.reindex(join_index)
|
||||
b_re = b.reindex(join_index)
|
||||
|
||||
result_columns = a.columns.append(b.columns)
|
||||
|
||||
for col, s in b_re.items():
|
||||
a_re[col] = s
|
||||
return a_re.reindex(columns=result_columns)
|
||||
|
||||
|
||||
def test_join_inner_multiindex_deterministic_order():
|
||||
# GH: 36910
|
||||
left = DataFrame(
|
||||
data={"e": 5},
|
||||
index=MultiIndex.from_tuples([(1, 2, 4)], names=("a", "b", "d")),
|
||||
)
|
||||
right = DataFrame(
|
||||
data={"f": 6}, index=MultiIndex.from_tuples([(2, 3)], names=("b", "c"))
|
||||
)
|
||||
result = left.join(right, how="inner")
|
||||
expected = DataFrame(
|
||||
{"e": [5], "f": [6]},
|
||||
index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
|
||||
)
|
||||
def test_join_cross(input_col, output_cols):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, 3]})
|
||||
right = DataFrame({input_col: [3, 4]})
|
||||
result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y")
|
||||
expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_join_multiindex_one_level(join_type):
|
||||
# GH#36909
|
||||
left = DataFrame(
|
||||
data={"c": 3}, index=MultiIndex.from_tuples([(1, 2)], names=("a", "b"))
|
||||
)
|
||||
right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",)))
|
||||
result = left.join(right, how=join_type)
|
||||
expected = DataFrame(
|
||||
{"c": [3], "d": [4]},
|
||||
index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"categories, values",
|
||||
[
|
||||
(["Y", "X"], ["Y", "X", "X"]),
|
||||
([2, 1], [2, 1, 1]),
|
||||
([2.5, 1.5], [2.5, 1.5, 1.5]),
|
||||
(
|
||||
[Timestamp("2020-12-31"), Timestamp("2019-12-31")],
|
||||
[Timestamp("2020-12-31"), Timestamp("2019-12-31"), Timestamp("2019-12-31")],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_join_multiindex_not_alphabetical_categorical(categories, values):
|
||||
# GH#38502
|
||||
left = DataFrame(
|
||||
{
|
||||
"first": ["A", "A"],
|
||||
"second": Categorical(categories, categories=categories),
|
||||
"value": [1, 2],
|
||||
}
|
||||
).set_index(["first", "second"])
|
||||
right = DataFrame(
|
||||
{
|
||||
"first": ["A", "A", "B"],
|
||||
"second": Categorical(values, categories=categories),
|
||||
"value": [3, 4, 5],
|
||||
}
|
||||
).set_index(["first", "second"])
|
||||
result = left.join(right, lsuffix="_left", rsuffix="_right")
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"first": ["A", "A"],
|
||||
"second": Categorical(categories, categories=categories),
|
||||
"value_left": [1, 2],
|
||||
"value_right": [3, 4],
|
||||
}
|
||||
).set_index(["first", "second"])
|
||||
tm.assert_frame_equal(result, expected)
|
2631
.venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge.py
Normal file
2631
.venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge.py
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,98 @@
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.merge import (
|
||||
MergeError,
|
||||
merge,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_merge_cross(input_col, output_cols):
    # GH#5401: a cross merge produces the full cartesian product (left frame
    # order outermost) and must not mutate either input.
    left = DataFrame({"a": [1, 3]})
    right = DataFrame({input_col: [3, 4]})
    left_before = left.copy()
    right_before = right.copy()

    result = merge(left, right, how="cross")

    expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
    tm.assert_frame_equal(result, expected)
    # the inputs are untouched
    tm.assert_frame_equal(left, left_before)
    tm.assert_frame_equal(right, right_before)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs",
    [
        {"left_index": True},
        {"right_index": True},
        {"on": "a"},
        {"left_on": "a"},
        {"right_on": "b"},
    ],
)
def test_merge_cross_error_reporting(kwargs):
    # GH#5401: a cross merge has no join keys, so passing any key
    # specification must raise MergeError.
    expected_msg = (
        "Can not pass on, right_on, left_on or set right_index=True or "
        "left_index=True"
    )
    with pytest.raises(MergeError, match=expected_msg):
        merge(DataFrame({"a": [1, 3]}), DataFrame({"b": [3, 4]}), how="cross", **kwargs)
|
||||
|
||||
|
||||
def test_merge_cross_mixed_dtypes():
    # GH#5401: cross merge between an object column and an integer column
    # yields every pairing, left frame order outermost.
    strings = DataFrame(["a", "b", "c"], columns=["A"])
    numbers = DataFrame(range(2), columns=["B"])

    result = merge(strings, numbers, how="cross")

    expected = DataFrame(
        {"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]}
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_cross_more_than_one_column():
    # GH#5401: cross merge where both frames carry multiple columns keeps
    # all columns and pairs every left row with every right row.
    lhs = DataFrame({"A": list("ab"), "B": [2, 1]})
    rhs = DataFrame({"C": range(2), "D": range(4, 6)})

    result = merge(lhs, rhs, how="cross")

    expected = DataFrame(
        {
            "A": ["a", "a", "b", "b"],
            "B": [2, 2, 1, 1],
            "C": [0, 1, 0, 1],
            "D": [4, 5, 4, 5],
        }
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_cross_null_values(nulls_fixture):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, nulls_fixture]})
|
||||
right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]})
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [1, 1, nulls_fixture, nulls_fixture],
|
||||
"b": ["a", "b", "a", "b"],
|
||||
"c": [1.0, 2.0, 1.0, 2.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_join_cross_error_reporting():
    # GH#5401: DataFrame.join with how="cross" must reject an ``on`` key
    # with the same MergeError message as merge().
    expected_msg = (
        "Can not pass on, right_on, left_on or set right_index=True or "
        "left_index=True"
    )
    lhs = DataFrame({"a": [1, 3]})
    rhs = DataFrame({"a": [3, 4]})
    with pytest.raises(MergeError, match=expected_msg):
        lhs.join(rhs, how="cross", on="a")
|
@ -0,0 +1,189 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def df1():
    """Left input: 11 rows keyed by ('outer', 'inner') with values in 'v1'."""
    data = {
        "outer": [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
        "inner": [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
        "v1": np.linspace(0, 1, 11),
    }
    return DataFrame(data)
|
||||
|
||||
|
||||
@pytest.fixture
def df2():
    """Right input: 12 rows keyed by ('outer', 'inner') with values in 'v2'."""
    data = {
        "outer": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
        "inner": [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
        "v2": np.linspace(10, 11, 12),
    }
    return DataFrame(data)
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def left_df(request, df1):
    """Left test DataFrame with zero, one, or two of its key columns
    ('outer', 'inner') moved into the index.
    """
    index_levels = request.param
    return df1.set_index(index_levels) if index_levels else df1
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def right_df(request, df2):
    """Right test DataFrame with zero, one, or two of its key columns
    ('outer', 'inner') moved into the index.
    """
    index_levels = request.param
    return df2.set_index(index_levels) if index_levels else df2
|
||||
|
||||
|
||||
def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
    """
    Build the expected result of merging ``df_left`` with ``df_right`` on a
    mix of columns and named index levels.

    The expectation is derived without relying on merge's index-level
    support: named index levels not involved in the merge are dropped,
    levels that are merge keys are turned back into ordinary columns, a
    plain column-on-column merge is performed, and finally the key levels
    shared by both inputs are restored as the result's index.

    Parameters
    ----------
    df_left : DataFrame
        The left DataFrame (may have zero or more named index levels)
    df_right : DataFrame
        The right DataFrame (may have zero or more named index levels)
    on : list of str
        The on parameter to the merge operation
    left_on : list of str
        The left_on parameter to the merge operation
    right_on : list of str
        The right_on parameter to the merge operation
    how : str
        The how parameter to the merge operation

    Returns
    -------
    DataFrame
        The expected merge result
    """
    # ``on`` is shorthand for identical key lists on both sides
    if on is not None:
        left_on, right_on = on, on

    # Named index levels present on each input
    left_levels = [name for name in df_left.index.names if name is not None]
    right_levels = [name for name in df_right.index.names if name is not None]

    # Levels that will form the result's index: merge keys that are index
    # levels on *both* sides
    output_levels = [
        key for key in left_on if key in right_levels and key in left_levels
    ]

    # Discard index levels that play no part in the merge
    unused_left = [name for name in left_levels if name not in left_on]
    if unused_left:
        df_left = df_left.reset_index(unused_left, drop=True)

    unused_right = [name for name in right_levels if name not in right_on]
    if unused_right:
        df_right = df_right.reset_index(unused_right, drop=True)

    # Convert the remaining (key) levels back into columns
    key_levels_left = [name for name in left_levels if name in left_on]
    if key_levels_left:
        df_left = df_left.reset_index(level=key_levels_left)

    key_levels_right = [name for name in right_levels if name in right_on]
    if key_levels_right:
        df_right = df_right.reset_index(level=key_levels_right)

    # Plain column merge
    expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)

    # Restore the shared key levels as the index
    if output_levels:
        expected = expected.set_index(output_levels)

    return expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "on,how",
    [
        (["outer"], "inner"),
        (["inner"], "left"),
        (["outer", "inner"], "right"),
        (["inner", "outer"], "outer"),
    ],
)
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
    # Merging with ``on`` naming any mix of columns and index levels must
    # match the expectation computed via plain column merges.
    expected = compute_expected(left_df, right_df, on=on, how=how)

    result = left_df.merge(right_df, on=on, how=how)
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "left_on,right_on,how",
    [
        (["outer"], ["outer"], "inner"),
        (["inner"], ["inner"], "right"),
        (["outer", "inner"], ["outer", "inner"], "left"),
        (["inner", "outer"], ["inner", "outer"], "outer"),
    ],
)
def test_merge_indexes_and_columns_lefton_righton(
    left_df, right_df, left_on, right_on, how
):
    # Keys given separately per side (left_on/right_on) naming any mix of
    # columns and index levels must match the column-merge expectation.
    expected = compute_expected(
        left_df, right_df, left_on=left_on, right_on=right_on, how=how
    )

    result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
    # Joining on a mix of index levels and columns must agree with the same
    # join performed after resetting the left index.
    left_df = df1.set_index(left_index)
    right_df = df2.set_index(["outer", "inner"])

    join_kwargs = {
        "on": ["outer", "inner"],
        "how": join_type,
        "lsuffix": "_x",
        "rsuffix": "_y",
    }

    expected = (
        left_df.reset_index().join(right_df, **join_kwargs).set_index(left_index)
    )

    result = left_df.join(right_df, **join_kwargs)

    tm.assert_frame_equal(result, expected, check_like=True)
|
@ -0,0 +1,201 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
merge_ordered,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestMergeOrdered:
    """Tests for :func:`pandas.merge_ordered`."""

    def setup_method(self, method):
        # left and right share only the key "c"; the remaining keys interleave
        self.left = DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})
        self.right = DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})

    def test_basic(self):
        # keys from both sides are interleaved in order; missing sides are NaN
        result = merge_ordered(self.left, self.right, on="key")
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"],
                "lvalue": [1, np.nan, 2, np.nan, 3, np.nan],
                "rvalue": [np.nan, 1, 2, 3, np.nan, 4],
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_ffill(self):
        # forward fill propagates the last observed value into the gaps
        result = merge_ordered(self.left, self.right, on="key", fill_method="ffill")
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"],
                "lvalue": [1.0, 1, 2, 2, 3, 3.0],
                "rvalue": [np.nan, 1, 2, 3, 3, 4],
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_multigroup(self):
        # two stacked copies of ``left`` distinguished by a "group" column
        left = pd.concat([self.left, self.left], ignore_index=True)
        left["group"] = ["a"] * 3 + ["b"] * 3

        result = merge_ordered(
            left, self.right, on="key", left_by="group", fill_method="ffill"
        )
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"] * 2,
                "lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
                "rvalue": [np.nan, 1, 2, 3, 3, 4] * 2,
            }
        )
        expected["group"] = ["a"] * 6 + ["b"] * 6

        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # swapping the inputs and using right_by yields the same frame
        result2 = merge_ordered(
            self.right, left, on="key", right_by="group", fill_method="ffill"
        )
        tm.assert_frame_equal(result, result2.loc[:, result.columns])

        # without fill_method the group column must still be fully populated
        result = merge_ordered(left, self.right, on="key", left_by="group")
        assert result["group"].notna().all()

    def test_merge_type(self):
        # a merge on a DataFrame subclass must round-trip via _constructor
        class NotADataFrame(DataFrame):
            @property
            def _constructor(self):
                return NotADataFrame

        nad = NotADataFrame(self.left)
        result = nad.merge(self.right, on="key")

        assert isinstance(result, NotADataFrame)

    def test_empty_sequence_concat(self):
        # GH 9157
        empty_pat = "[Nn]o objects"
        none_pat = "objects.*None"
        bad_inputs = [
            ((), empty_pat),
            ([], empty_pat),
            ({}, empty_pat),
            ([None], none_pat),
            ([None, None], none_pat),
        ]
        for df_seq, pattern in bad_inputs:
            with pytest.raises(ValueError, match=pattern):
                pd.concat(df_seq)

        # these, by contrast, are all accepted
        pd.concat([DataFrame()])
        pd.concat([None, DataFrame()])
        pd.concat([DataFrame(), None])

    def test_doc_example(self):
        # the worked example from the merge_ordered docstring
        left = DataFrame(
            {
                "group": list("aaabbb"),
                "key": ["a", "c", "e", "a", "c", "e"],
                "lvalue": [1, 2, 3] * 2,
            }
        )

        right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})

        result = merge_ordered(left, right, fill_method="ffill", left_by="group")

        expected = DataFrame(
            {
                "group": list("aaaaabbbbb"),
                "key": ["a", "b", "c", "d", "e"] * 2,
                "lvalue": [1, 1, 2, 2, 3] * 2,
                "rvalue": [np.nan, 1, 2, 3, 3] * 2,
            }
        )

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "left, right, on, left_by, right_by, expected",
        [
            (
                DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
                DataFrame({"T": [2], "E": [1]}),
                ["T"],
                ["G", "H"],
                None,
                DataFrame(
                    {
                        "G": ["g"] * 3,
                        "H": ["h"] * 3,
                        "T": [1, 2, 3],
                        "E": [np.nan, 1.0, np.nan],
                    }
                ),
            ),
            (
                DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
                DataFrame({"T": [2], "E": [1]}),
                "T",
                ["G", "H"],
                None,
                DataFrame(
                    {
                        "G": ["g"] * 3,
                        "H": ["h"] * 3,
                        "T": [1, 2, 3],
                        "E": [np.nan, 1.0, np.nan],
                    }
                ),
            ),
            (
                DataFrame({"T": [2], "E": [1]}),
                DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
                ["T"],
                None,
                ["G", "H"],
                DataFrame(
                    {
                        "T": [1, 2, 3],
                        "E": [np.nan, 1.0, np.nan],
                        "G": ["g"] * 3,
                        "H": ["h"] * 3,
                    }
                ),
            ),
        ],
    )
    def test_list_type_by(self, left, right, on, left_by, right_by, expected):
        # GH 35269: left_by/right_by accept both a scalar and a list of keys
        result = merge_ordered(
            left=left,
            right=right,
            on=on,
            left_by=left_by,
            right_by=right_by,
        )

        tm.assert_frame_equal(result, expected)

    def test_left_by_length_equals_to_right_shape0(self):
        # GH 38166: left_by of the same length as len(right) must not confuse
        # the group machinery
        left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
        right = DataFrame([[2, 1]], columns=list("ET"))
        result = merge_ordered(left, right, on="E", left_by=["G", "H"])
        expected = DataFrame(
            {"G": ["g"] * 3, "H": ["h"] * 3, "E": [1, 2, 3], "T": [np.nan, 1.0, np.nan]}
        )

        tm.assert_frame_equal(result, expected)

    def test_elements_not_in_by_but_in_df(self):
        # GH 38167: a by-key absent from the frame raises KeyError
        left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
        right = DataFrame([[2, 1]], columns=list("ET"))
        msg = r"\{'h'\} not found in left columns"
        with pytest.raises(KeyError, match=msg):
            merge_ordered(left, right, on="E", left_by=["G", "h"])
|
909
.venv/Lib/site-packages/pandas/tests/reshape/merge/test_multi.py
Normal file
909
.venv/Lib/site-packages/pandas/tests/reshape/merge/test_multi.py
Normal file
@ -0,0 +1,909 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.merge import merge
|
||||
|
||||
|
||||
@pytest.fixture
def left():
    """Left (flat, not multi-indexed) DataFrame for the multi-index join tests."""
    # a small realistic example; some (key1, key2) pairs have no match on
    # the right, exercising NA handling
    key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
    key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]
    return DataFrame(
        {"key1": key1, "key2": key2, "data": np.random.randn(len(key1))}
    )
|
||||
|
||||
|
||||
@pytest.fixture
def right(multiindex_dataframe_random_data):
    """Right (multi-indexed) DataFrame for the multi-index join tests."""
    frame = multiindex_dataframe_random_data
    # rename the index levels and columns to line up with the ``left`` fixture
    frame.index.names = ["key1", "key2"]
    frame.columns = ["j_one", "j_two", "j_three"]
    return frame
|
||||
|
||||
|
||||
@pytest.fixture
def left_multi():
    """Trip table indexed by (Origin, Destination, Period, TripPurp)."""
    columns = ["Origin", "Destination", "Period", "TripPurp", "Trips"]
    data = {
        "Origin": ["A", "A", "B", "B", "C"],
        "Destination": ["A", "B", "A", "C", "A"],
        "Period": ["AM", "AM", "IP", "AM", "OP"],
        "TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"],
        "Trips": [1987, 3647, 2470, 4296, 4444],
    }
    # all columns except the value column form the index
    return DataFrame(data, columns=columns).set_index(columns[:-1])
|
||||
|
||||
|
||||
@pytest.fixture
def right_multi():
    """Link table indexed by (Origin, Destination, Period, LinkType)."""
    columns = ["Origin", "Destination", "Period", "LinkType", "Distance"]
    data = {
        "Origin": ["A", "A", "B", "B", "C", "C", "E"],
        "Destination": ["A", "B", "A", "B", "A", "B", "F"],
        "Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
        "LinkType": ["a", "b", "c", "b", "a", "b", "a"],
        "Distance": [100, 80, 90, 80, 75, 35, 55],
    }
    # all columns except the value column form the index
    return DataFrame(data, columns=columns).set_index(columns[:-1])
|
||||
|
||||
|
||||
@pytest.fixture
def on_cols_multi():
    """Index levels shared by ``left_multi`` and ``right_multi``."""
    return ["Origin", "Destination", "Period"]
|
||||
|
||||
|
||||
@pytest.fixture
def idx_cols_multi():
    """Union of the index levels of ``left_multi`` and ``right_multi``."""
    return ["Origin", "Destination", "Period", "TripPurp", "LinkType"]
|
||||
|
||||
|
||||
class TestMergeMulti:
|
||||
def test_merge_on_multikey(self, left, right, join_type):
|
||||
on_cols = ["key1", "key2"]
|
||||
result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)
|
||||
|
||||
expected = merge(left, right.reset_index(), on=on_cols, how=join_type)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
|
||||
drop=True
|
||||
)
|
||||
|
||||
expected = merge(
|
||||
left, right.reset_index(), on=on_cols, how=join_type, sort=True
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("sort", [False, True])
|
||||
def test_left_join_multi_index(self, left, right, sort):
|
||||
icols = ["1st", "2nd", "3rd"]
|
||||
|
||||
def bind_cols(df):
|
||||
iord = lambda a: 0 if a != a else ord(a)
|
||||
f = lambda ts: ts.map(iord) - ord("a")
|
||||
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4
|
||||
|
||||
def run_asserts(left, right, sort):
|
||||
res = left.join(right, on=icols, how="left", sort=sort)
|
||||
|
||||
assert len(left) < len(res) + 1
|
||||
assert not res["4th"].isna().any()
|
||||
assert not res["5th"].isna().any()
|
||||
|
||||
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
|
||||
result = bind_cols(res.iloc[:, :-2])
|
||||
tm.assert_series_equal(res["4th"], result, check_names=False)
|
||||
assert result.name is None
|
||||
|
||||
if sort:
|
||||
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
|
||||
|
||||
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
|
||||
|
||||
res.index = np.arange(len(res))
|
||||
tm.assert_frame_equal(out, res)
|
||||
|
||||
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
|
||||
left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"])
|
||||
left.insert(1, "2nd", np.random.randint(0, 1000, len(left)))
|
||||
|
||||
i = np.random.permutation(len(left))
|
||||
right = left.iloc[i].copy()
|
||||
|
||||
left["4th"] = bind_cols(left)
|
||||
right["5th"] = -bind_cols(right)
|
||||
right.set_index(icols, inplace=True)
|
||||
|
||||
run_asserts(left, right, sort)
|
||||
|
||||
# inject some nulls
|
||||
left.loc[1::23, "1st"] = np.nan
|
||||
left.loc[2::37, "2nd"] = np.nan
|
||||
left.loc[3::43, "3rd"] = np.nan
|
||||
left["4th"] = bind_cols(left)
|
||||
|
||||
i = np.random.permutation(len(left))
|
||||
right = left.iloc[i, :-1]
|
||||
right["5th"] = -bind_cols(right)
|
||||
right.set_index(icols, inplace=True)
|
||||
|
||||
run_asserts(left, right, sort)
|
||||
|
||||
@pytest.mark.parametrize("sort", [False, True])
|
||||
def test_merge_right_vs_left(self, left, right, sort):
|
||||
# compare left vs right merge with multikey
|
||||
on_cols = ["key1", "key2"]
|
||||
merged_left_right = left.merge(
|
||||
right, left_on=on_cols, right_index=True, how="left", sort=sort
|
||||
)
|
||||
|
||||
merge_right_left = right.merge(
|
||||
left, right_on=on_cols, left_index=True, how="right", sort=sort
|
||||
)
|
||||
|
||||
# Reorder columns
|
||||
merge_right_left = merge_right_left[merged_left_right.columns]
|
||||
|
||||
tm.assert_frame_equal(merged_left_right, merge_right_left)
|
||||
|
||||
def test_merge_multiple_cols_with_mixed_cols_index(self):
|
||||
# GH29522
|
||||
s = Series(
|
||||
range(6),
|
||||
MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]),
|
||||
name="Amount",
|
||||
)
|
||||
df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0})
|
||||
result = merge(df, s.reset_index(), on=["lev1", "lev2"])
|
||||
expected = DataFrame(
|
||||
{
|
||||
"lev1": list("AAABBB"),
|
||||
"lev2": [1, 2, 3, 1, 2, 3],
|
||||
"col": [0] * 6,
|
||||
"Amount": range(6),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_compress_group_combinations(self):
|
||||
|
||||
# ~ 40000000 possible unique groups
|
||||
key1 = tm.rands_array(10, 10000)
|
||||
key1 = np.tile(key1, 2)
|
||||
key2 = key1[::-1]
|
||||
|
||||
df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)})
|
||||
|
||||
df2 = DataFrame(
|
||||
{"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)}
|
||||
)
|
||||
|
||||
# just to hit the label compression code path
|
||||
merge(df, df2, how="outer")
|
||||
|
||||
def test_left_join_index_preserve_order(self):
|
||||
|
||||
on_cols = ["k1", "k2"]
|
||||
left = DataFrame(
|
||||
{
|
||||
"k1": [0, 1, 2] * 8,
|
||||
"k2": ["foo", "bar"] * 12,
|
||||
"v": np.array(np.arange(24), dtype=np.int64),
|
||||
}
|
||||
)
|
||||
|
||||
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
|
||||
right = DataFrame({"v2": [5, 7]}, index=index)
|
||||
|
||||
result = left.join(right, on=on_cols)
|
||||
|
||||
expected = left.copy()
|
||||
expected["v2"] = np.nan
|
||||
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
|
||||
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result.sort_values(on_cols, kind="mergesort", inplace=True)
|
||||
expected = left.join(right, on=on_cols, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# test join with multi dtypes blocks
|
||||
left = DataFrame(
|
||||
{
|
||||
"k1": [0, 1, 2] * 8,
|
||||
"k2": ["foo", "bar"] * 12,
|
||||
"k3": np.array([0, 1, 2] * 8, dtype=np.float32),
|
||||
"v": np.array(np.arange(24), dtype=np.int32),
|
||||
}
|
||||
)
|
||||
|
||||
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
|
||||
right = DataFrame({"v2": [5, 7]}, index=index)
|
||||
|
||||
result = left.join(right, on=on_cols)
|
||||
|
||||
expected = left.copy()
|
||||
expected["v2"] = np.nan
|
||||
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
|
||||
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = result.sort_values(on_cols, kind="mergesort")
|
||||
expected = left.join(right, on=on_cols, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_join_index_multi_match_multiindex(self):
|
||||
left = DataFrame(
|
||||
[
|
||||
["X", "Y", "C", "a"],
|
||||
["W", "Y", "C", "e"],
|
||||
["V", "Q", "A", "h"],
|
||||
["V", "R", "D", "i"],
|
||||
["X", "Y", "D", "b"],
|
||||
["X", "Y", "A", "c"],
|
||||
["W", "Q", "B", "f"],
|
||||
["W", "R", "C", "g"],
|
||||
["V", "Y", "C", "j"],
|
||||
["X", "Y", "B", "d"],
|
||||
],
|
||||
columns=["cola", "colb", "colc", "tag"],
|
||||
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
[
|
||||
["W", "R", "C", 0],
|
||||
["W", "Q", "B", 3],
|
||||
["W", "Q", "B", 8],
|
||||
["X", "Y", "A", 1],
|
||||
["X", "Y", "A", 4],
|
||||
["X", "Y", "B", 5],
|
||||
["X", "Y", "C", 6],
|
||||
["X", "Y", "C", 9],
|
||||
["X", "Q", "C", -6],
|
||||
["X", "R", "C", -9],
|
||||
["V", "Y", "C", 7],
|
||||
["V", "R", "D", 2],
|
||||
["V", "R", "D", -1],
|
||||
["V", "Q", "A", -3],
|
||||
],
|
||||
columns=["col1", "col2", "col3", "val"],
|
||||
).set_index(["col1", "col2", "col3"])
|
||||
|
||||
result = left.join(right, on=["cola", "colb", "colc"], how="left")
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
["X", "Y", "C", "a", 6],
|
||||
["X", "Y", "C", "a", 9],
|
||||
["W", "Y", "C", "e", np.nan],
|
||||
["V", "Q", "A", "h", -3],
|
||||
["V", "R", "D", "i", 2],
|
||||
["V", "R", "D", "i", -1],
|
||||
["X", "Y", "D", "b", np.nan],
|
||||
["X", "Y", "A", "c", 1],
|
||||
["X", "Y", "A", "c", 4],
|
||||
["W", "Q", "B", "f", 3],
|
||||
["W", "Q", "B", "f", 8],
|
||||
["W", "R", "C", "g", 0],
|
||||
["V", "Y", "C", "j", 7],
|
||||
["X", "Y", "B", "d", 5],
|
||||
],
|
||||
columns=["cola", "colb", "colc", "tag", "val"],
|
||||
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)
|
||||
|
||||
expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_join_index_multi_match(self):
|
||||
left = DataFrame(
|
||||
[["c", 0], ["b", 1], ["a", 2], ["b", 3]],
|
||||
columns=["tag", "val"],
|
||||
index=[2, 0, 1, 3],
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
[
|
||||
["a", "v"],
|
||||
["c", "w"],
|
||||
["c", "x"],
|
||||
["d", "y"],
|
||||
["a", "z"],
|
||||
["c", "r"],
|
||||
["e", "q"],
|
||||
["c", "s"],
|
||||
],
|
||||
columns=["tag", "char"],
|
||||
).set_index("tag")
|
||||
|
||||
result = left.join(right, on="tag", how="left")
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
["c", 0, "w"],
|
||||
["c", 0, "x"],
|
||||
["c", 0, "r"],
|
||||
["c", 0, "s"],
|
||||
["b", 1, np.nan],
|
||||
["a", 2, "v"],
|
||||
["a", 2, "z"],
|
||||
["b", 3, np.nan],
|
||||
],
|
||||
columns=["tag", "val", "char"],
|
||||
index=[2, 2, 2, 2, 0, 1, 1, 3],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on="tag", how="left", sort=True)
|
||||
expected2 = expected.sort_values("tag", kind="mergesort")
|
||||
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
|
||||
# GH7331 - maintain left frame order in left merge
|
||||
result = merge(left, right.reset_index(), how="left", on="tag")
|
||||
expected.index = np.arange(len(expected))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_merge_na_buglet(self):
|
||||
left = DataFrame(
|
||||
{
|
||||
"id": list("abcde"),
|
||||
"v1": np.random.randn(5),
|
||||
"v2": np.random.randn(5),
|
||||
"dummy": list("abcde"),
|
||||
"v3": np.random.randn(5),
|
||||
},
|
||||
columns=["id", "v1", "v2", "dummy", "v3"],
|
||||
)
|
||||
right = DataFrame(
|
||||
{
|
||||
"id": ["a", "b", np.nan, np.nan, np.nan],
|
||||
"sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
|
||||
}
|
||||
)
|
||||
|
||||
result = merge(left, right, on="id", how="left")
|
||||
|
||||
rdf = right.drop(["id"], axis=1)
|
||||
expected = left.join(rdf)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_merge_na_keys(self):
|
||||
data = [
|
||||
[1950, "A", 1.5],
|
||||
[1950, "B", 1.5],
|
||||
[1955, "B", 1.5],
|
||||
[1960, "B", np.nan],
|
||||
[1970, "B", 4.0],
|
||||
[1950, "C", 4.0],
|
||||
[1960, "C", np.nan],
|
||||
[1965, "C", 3.0],
|
||||
[1970, "C", 4.0],
|
||||
]
|
||||
|
||||
frame = DataFrame(data, columns=["year", "panel", "data"])
|
||||
|
||||
other_data = [
|
||||
[1960, "A", np.nan],
|
||||
[1970, "A", np.nan],
|
||||
[1955, "A", np.nan],
|
||||
[1965, "A", np.nan],
|
||||
[1965, "B", np.nan],
|
||||
[1955, "C", np.nan],
|
||||
]
|
||||
other = DataFrame(other_data, columns=["year", "panel", "data"])
|
||||
|
||||
result = frame.merge(other, how="outer")
|
||||
|
||||
expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
|
||||
expected = expected.replace(-999, np.nan)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
|
||||
def test_merge_datetime_index(self, klass):
|
||||
# see gh-19038
|
||||
df = DataFrame(
|
||||
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
|
||||
)
|
||||
df.index = pd.to_datetime(df.index)
|
||||
on_vector = df.index.year
|
||||
|
||||
if klass is not None:
|
||||
on_vector = klass(on_vector)
|
||||
|
||||
expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]})
|
||||
|
||||
result = df.merge(df, on=["a", on_vector], how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]}
|
||||
)
|
||||
|
||||
result = df.merge(df, on=[df.index.year], how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("merge_type", ["left", "right"])
|
||||
def test_merge_datetime_multi_index_empty_df(self, merge_type):
|
||||
# see gh-36895
|
||||
|
||||
left = DataFrame(
|
||||
data={
|
||||
"data": [1.5, 1.5],
|
||||
},
|
||||
index=MultiIndex.from_tuples(
|
||||
[[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
|
||||
names=["date", "panel"],
|
||||
),
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
index=MultiIndex.from_tuples([], names=["date", "panel"]), columns=["state"]
|
||||
)
|
||||
|
||||
expected_index = MultiIndex.from_tuples(
|
||||
[[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
|
||||
names=["date", "panel"],
|
||||
)
|
||||
|
||||
if merge_type == "left":
|
||||
expected = DataFrame(
|
||||
data={
|
||||
"data": [1.5, 1.5],
|
||||
"state": [None, None],
|
||||
},
|
||||
index=expected_index,
|
||||
)
|
||||
results_merge = left.merge(right, how="left", on=["date", "panel"])
|
||||
results_join = left.join(right, how="left")
|
||||
else:
|
||||
expected = DataFrame(
|
||||
data={
|
||||
"state": [None, None],
|
||||
"data": [1.5, 1.5],
|
||||
},
|
||||
index=expected_index,
|
||||
)
|
||||
results_merge = right.merge(left, how="right", on=["date", "panel"])
|
||||
results_join = right.join(left, how="right")
|
||||
|
||||
tm.assert_frame_equal(results_merge, expected)
|
||||
tm.assert_frame_equal(results_join, expected)
|
||||
|
||||
@pytest.fixture
|
||||
def household(self):
|
||||
household = DataFrame(
|
||||
{
|
||||
"household_id": [1, 2, 3],
|
||||
"male": [0, 1, 0],
|
||||
"wealth": [196087.3, 316478.7, 294750],
|
||||
},
|
||||
columns=["household_id", "male", "wealth"],
|
||||
).set_index("household_id")
|
||||
return household
|
||||
|
||||
@pytest.fixture
|
||||
def portfolio(self):
|
||||
portfolio = DataFrame(
|
||||
{
|
||||
"household_id": [1, 2, 2, 3, 3, 3, 4],
|
||||
"asset_id": [
|
||||
"nl0000301109",
|
||||
"nl0000289783",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
np.nan,
|
||||
],
|
||||
"name": [
|
||||
"ABN Amro",
|
||||
"Robeco",
|
||||
"Royal Dutch Shell",
|
||||
"Royal Dutch Shell",
|
||||
"AAB Eastern Europe Equity Fund",
|
||||
"Postbank BioTech Fonds",
|
||||
np.nan,
|
||||
],
|
||||
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
|
||||
},
|
||||
columns=["household_id", "asset_id", "name", "share"],
|
||||
).set_index(["household_id", "asset_id"])
|
||||
return portfolio
|
||||
|
||||
@pytest.fixture
|
||||
def expected(self):
|
||||
expected = (
|
||||
DataFrame(
|
||||
{
|
||||
"male": [0, 1, 1, 0, 0, 0],
|
||||
"wealth": [
|
||||
196087.3,
|
||||
316478.7,
|
||||
316478.7,
|
||||
294750.0,
|
||||
294750.0,
|
||||
294750.0,
|
||||
],
|
||||
"name": [
|
||||
"ABN Amro",
|
||||
"Robeco",
|
||||
"Royal Dutch Shell",
|
||||
"Royal Dutch Shell",
|
||||
"AAB Eastern Europe Equity Fund",
|
||||
"Postbank BioTech Fonds",
|
||||
],
|
||||
"share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
|
||||
"household_id": [1, 2, 2, 3, 3, 3],
|
||||
"asset_id": [
|
||||
"nl0000301109",
|
||||
"nl0000289783",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
],
|
||||
}
|
||||
)
|
||||
.set_index(["household_id", "asset_id"])
|
||||
.reindex(columns=["male", "wealth", "name", "share"])
|
||||
)
|
||||
return expected
|
||||
|
||||
def test_join_multi_levels(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
# GH 3662
|
||||
# merge multi-levels
|
||||
result = household.join(portfolio, how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
# equivalency
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
portfolio.reset_index(),
|
||||
on=["household_id"],
|
||||
how="inner",
|
||||
).set_index(["household_id", "asset_id"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_levels_outer(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
result = household.join(portfolio, how="outer")
|
||||
expected = concat(
|
||||
[
|
||||
expected,
|
||||
(
|
||||
DataFrame(
|
||||
{"share": [1.00]},
|
||||
index=MultiIndex.from_tuples(
|
||||
[(4, np.nan)], names=["household_id", "asset_id"]
|
||||
),
|
||||
)
|
||||
),
|
||||
],
|
||||
axis=0,
|
||||
sort=True,
|
||||
).reindex(columns=expected.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_levels_invalid(self, portfolio, household):
    """Invalid multi-level joins raise informative ValueErrors.

    Covers two failure modes: no shared index level name at all, and
    overlapping data columns joined without suffixes.
    """
    portfolio = portfolio.copy()
    household = household.copy()

    # No index name shared with portfolio's (household_id, asset_id) levels.
    household.index.name = "foo"

    with pytest.raises(
        ValueError, match="cannot join with no overlapping index names"
    ):
        household.join(portfolio, how="inner")

    portfolio2 = portfolio.copy()
    # BUG FIX: Index.set_names is not in-place — the original discarded the
    # returned index, leaving the names unchanged and never exercising the
    # intended partially-overlapping-names case. Assign the result back.
    portfolio2.index = portfolio2.index.set_names(["household_id", "foo"])

    # Both frames still carry "name"/"share" columns; joining with the
    # default empty suffixes must refuse the column collision.
    with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
        portfolio2.join(portfolio, how="inner")
|
||||
|
||||
def test_join_multi_levels2(self):
|
||||
|
||||
# some more advanced merges
|
||||
# GH6360
|
||||
household = DataFrame(
|
||||
{
|
||||
"household_id": [1, 2, 2, 3, 3, 3, 4],
|
||||
"asset_id": [
|
||||
"nl0000301109",
|
||||
"nl0000301109",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
np.nan,
|
||||
],
|
||||
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
|
||||
},
|
||||
columns=["household_id", "asset_id", "share"],
|
||||
).set_index(["household_id", "asset_id"])
|
||||
|
||||
log_return = DataFrame(
|
||||
{
|
||||
"asset_id": [
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
],
|
||||
"t": [233, 234, 235, 180, 181],
|
||||
"log_return": [
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
],
|
||||
}
|
||||
).set_index(["asset_id", "t"])
|
||||
|
||||
expected = (
|
||||
DataFrame(
|
||||
{
|
||||
"household_id": [2, 2, 2, 3, 3, 3, 3, 3],
|
||||
"asset_id": [
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
],
|
||||
"t": [233, 234, 235, 233, 234, 235, 180, 181],
|
||||
"share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
|
||||
"log_return": [
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
],
|
||||
}
|
||||
)
|
||||
.set_index(["household_id", "asset_id", "t"])
|
||||
.reindex(columns=["share", "log_return"])
|
||||
)
|
||||
|
||||
# this is the equivalency
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
log_return.reset_index(),
|
||||
on=["asset_id"],
|
||||
how="inner",
|
||||
).set_index(["household_id", "asset_id", "t"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = (
|
||||
DataFrame(
|
||||
{
|
||||
"household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
|
||||
"asset_id": [
|
||||
"nl0000301109",
|
||||
"nl0000301109",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
None,
|
||||
],
|
||||
"t": [
|
||||
None,
|
||||
None,
|
||||
233,
|
||||
234,
|
||||
235,
|
||||
233,
|
||||
234,
|
||||
235,
|
||||
180,
|
||||
181,
|
||||
None,
|
||||
None,
|
||||
],
|
||||
"share": [
|
||||
1.0,
|
||||
0.4,
|
||||
0.6,
|
||||
0.6,
|
||||
0.6,
|
||||
0.15,
|
||||
0.15,
|
||||
0.15,
|
||||
0.6,
|
||||
0.6,
|
||||
0.25,
|
||||
1.0,
|
||||
],
|
||||
"log_return": [
|
||||
None,
|
||||
None,
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
None,
|
||||
None,
|
||||
],
|
||||
}
|
||||
)
|
||||
.set_index(["household_id", "asset_id", "t"])
|
||||
.reindex(columns=["share", "log_return"])
|
||||
)
|
||||
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
log_return.reset_index(),
|
||||
on=["asset_id"],
|
||||
how="outer",
|
||||
).set_index(["household_id", "asset_id", "t"])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestJoinMultiMulti:
    """Joins between two MultiIndexed frames and merges on derived key vectors."""

    def test_join_multi_multi(
        self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
    ):
        # join on shared index levels must match the reset_index + merge route
        left_flat = left_multi.reset_index()
        right_flat = right_multi.reset_index()
        expected = (
            merge(left_flat, right_flat, how=join_type, on=on_cols_multi)
            .set_index(idx_cols_multi)
            .sort_index()
        )

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    def test_join_multi_empty_frames(
        self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
    ):
        # same equivalence when the frames carry only index levels (no columns)
        left_multi = left_multi.drop(columns=left_multi.columns)
        right_multi = right_multi.drop(columns=right_multi.columns)

        left_flat = left_multi.reset_index()
        right_flat = right_multi.reset_index()
        expected = (
            merge(left_flat, right_flat, how=join_type, on=on_cols_multi)
            .set_index(idx_cols_multi)
            .sort_index()
        )

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
    def test_merge_datetime_index(self, box):
        # see gh-19038
        frame = DataFrame(
            [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
        )
        frame.index = pd.to_datetime(frame.index)

        key_vec = frame.index.year
        if box is not None:
            key_vec = box(key_vec)

        # merging on a column plus an anonymous vector labels the key "key_1"
        expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]})
        result = frame.merge(frame, on=["a", key_vec], how="inner")
        tm.assert_frame_equal(result, expected)

        # merging on the vector alone labels it "key_0" and suffixes "a"
        expected = DataFrame(
            {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]}
        )
        result = frame.merge(frame, on=[frame.index.year], how="inner")
        tm.assert_frame_equal(result, expected)

    def test_single_common_level(self):
        # only the "key" level is shared between the two MultiIndexes
        idx_left = MultiIndex.from_tuples(
            [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
        )
        left = DataFrame(
            {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=idx_left
        )

        idx_right = MultiIndex.from_tuples(
            [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")],
            names=["key", "Y"],
        )
        right = DataFrame(
            {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
            index=idx_right,
        )

        result = left.join(right)
        expected = merge(
            left.reset_index(), right.reset_index(), on=["key"], how="inner"
        ).set_index(["key", "X", "Y"])

        tm.assert_frame_equal(result, expected)

    def test_join_multi_wrong_order(self):
        # GH 25760, GH 28956: right index lists its levels in reversed order
        midx_left = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
        midx_right = MultiIndex.from_tuples(
            [(4, 1), (3, 2), (3, 1)], names=["b", "a"]
        )

        left = DataFrame(index=midx_left, data={"x": [10, 20, 30, 40]})
        right = DataFrame(index=midx_right, data={"y": ["foo", "bar", "fing"]})

        result = left.join(right)

        expected = DataFrame(
            index=midx_left,
            data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
        )

        tm.assert_frame_equal(result, expected)
|
Reference in New Issue
Block a user