From 8eea4df63bb3cb3dcc8731d0840415b7031af620 Mon Sep 17 00:00:00 2001 From: itholic Date: Mon, 27 Sep 2021 16:38:27 +0900 Subject: [PATCH 1/5] Implement MultiIndex.equal_levels --- .../reference/pyspark.pandas/indexing.rst | 1 + python/pyspark/pandas/indexes/multi.py | 37 +++++++++++++++++++ .../pyspark/pandas/tests/indexes/test_base.py | 7 ++++ .../pandas/tests/test_ops_on_diff_frames.py | 37 +++++++++++++++++++ 4 files changed, 82 insertions(+) diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst index 05353fde71daa..7e796c69dc27e 100644 --- a/python/docs/source/reference/pyspark.pandas/indexing.rst +++ b/python/docs/source/reference/pyspark.pandas/indexing.rst @@ -269,6 +269,7 @@ MultiIndex Modifying and computations :toctree: api/ MultiIndex.equals + MultiIndex.equal_levels MultiIndex.identical MultiIndex.insert MultiIndex.drop diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index fb0208099fd6d..7fc183fb91f24 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIn ) return cast(MultiIndex, DataFrame(internal).index) + def equal_levels(self, other: "MultiIndex") -> bool: + """ + Return True if the levels of both MultiIndex objects are the same + + Notes + ----- + This API can be expensive since it has logic to sort and compare the values of + all levels of indices that belong to MultiIndex. + + Examples + -------- + >>> from pyspark.pandas.config import set_option, reset_option + >>> set_option("compute.ops_on_diff_frames", True) + + >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + >>> psmidx1.equal_levels(psmidx2) + True + + >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) + >>> psmidx1.equal_levels(psmidx2) + False + + >>> reset_option("compute.ops_on_diff_frames") + """ + nlevels = self.nlevels + if nlevels != other.nlevels: + return False + + for nlevel in range(nlevels): + self_values = self.get_level_values(nlevel).unique().sort_values() + other_values = other.get_level_values(nlevel).unique().sort_values() + if not self_values.equals(other_values): + return False + + return True + @property def hasnans(self) -> bool: raise NotImplementedError("hasnans is not defined for MultiIndex") diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index 605e3f8fc0686..4d58cb736b346 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -2369,6 +2369,13 @@ def test_map(self): lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}), ) + def test_multiindex_equal_levels(self): + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + if __name__ == "__main__": from pyspark.pandas.tests.indexes.test_base import * # noqa: F401 diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index 3e3bb0d231ec6..143e9b663c47d 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -1830,6 +1830,35 @@ def _test_cov(self, pser1, pser2): pscov = psser1.cov(psser2, min_periods=3) self.assert_eq(pcov, pscov, almost=True) + def test_multiindex_equal_levels(self): + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")]) + pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + class OpsOnDiffFramesDisabledTest(PandasOnSparkTestCase, SQLTestUtils): @classmethod @@ -2017,6 +2046,14 @@ def test_combine_first(self): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): psdf1.combine_first(psdf2) + def test_multiindex_equal_levels(self): + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): + psmidx1.equal_levels(psmidx2) + if __name__ == "__main__": from pyspark.pandas.tests.test_ops_on_diff_frames import * # noqa: F401 From 6306fb1c66233e7ec159def64eee0d51cdee026d Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 29 Sep 2021 13:42:43 +0900 Subject: [PATCH 2/5] Resolved comments --- python/pyspark/pandas/indexes/multi.py | 26 +++++++------ python/pyspark/pandas/missing/indexes.py | 1 - .../pyspark/pandas/tests/indexes/test_base.py | 28 ++++++++++++++ .../pandas/tests/test_ops_on_diff_frames.py | 37 ------------------- 4 files changed, 43 insertions(+), 49 deletions(-) diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index 136c876e33074..42405b34db6c0 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -16,7 +16,7 @@ # from distutils.version import LooseVersion -from functools import partial +from functools import partial, reduce from typing import Any, Callable, Iterator, List, Optional, Tuple, Union, cast, no_type_check import pandas as pd @@ -1141,11 +1141,6 @@ def equal_levels(self, other: "MultiIndex") -> bool: """ Return True if the levels of both MultiIndex objects are the same - Notes - ----- - This API can be expensive since it has logic to sort and compare the values of - all levels of indices that belong to MultiIndex. - Examples -------- >>> from pyspark.pandas.config import set_option, reset_option @@ -1166,13 +1161,22 @@ def equal_levels(self, other: "MultiIndex") -> bool: if nlevels != other.nlevels: return False + self_sdf = self._internal.spark_frame + other_sdf = other._internal.spark_frame + subtract_list = [] for nlevel in range(nlevels): - self_values = self.get_level_values(nlevel).unique().sort_values() - other_values = other.get_level_values(nlevel).unique().sort_values() - if not self_values.equals(other_values): - return False + self_index_scol = self._internal.index_spark_columns[nlevel] + other_index_scol = other._internal.index_spark_columns[nlevel] + self_subtract_other = self_sdf.select(self_index_scol).subtract( + other_sdf.select(other_index_scol) + ) + subtract_list.append(self_subtract_other) - return True + unioned_subtracts = reduce(lambda x, y: x.union(y), subtract_list) + if len(unioned_subtracts.head(1)) == 0: + return True + else: + return False @property def hasnans(self) -> bool: diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py index e81156fdd74af..4170aa70f7d4c 100644 --- a/python/pyspark/pandas/missing/indexes.py +++ b/python/pyspark/pandas/missing/indexes.py @@ -105,7 +105,6 @@ class MissingPandasLikeMultiIndex(object): # Functions argsort = _unsupported_function("argsort") asof_locs = _unsupported_function("asof_locs") - equal_levels = _unsupported_function("equal_levels") factorize = _unsupported_function("factorize") format = _unsupported_function("format") get_indexer = _unsupported_function("get_indexer") diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index 0ebec3272eafa..40039983c4c11 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -2389,6 +2389,34 @@ def test_map(self): ) def test_multiindex_equal_levels(self): + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")]) + pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")]) psmidx1 = ps.from_pandas(pmidx1) diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index 3d05c7004bf06..cd5d8347b12cf 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -1845,35 +1845,6 @@ def _test_cov(self, pser1, pser2): pscov = psser1.cov(psser2, min_periods=3) self.assert_eq(pcov, pscov, almost=True) - def test_multiindex_equal_levels(self): - pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")]) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")]) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")]) - pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")]) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - class OpsOnDiffFramesDisabledTest(PandasOnSparkTestCase, SQLTestUtils): @classmethod @@ -2068,14 +2039,6 @@ def test_combine_first(self): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): psdf1.combine_first(psdf2) - def test_multiindex_equal_levels(self): - pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) - with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - psmidx1.equal_levels(psmidx2) - if __name__ == "__main__": from pyspark.pandas.tests.test_ops_on_diff_frames import * # noqa: F401 From 1f5154100f37a3ecf574917bc0c76c852bd4bf23 Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 29 Sep 2021 13:44:07 +0900 Subject: [PATCH 3/5] Remove options --- python/pyspark/pandas/indexes/multi.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index 42405b34db6c0..d30d558685d8a 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -1143,9 +1143,6 @@ def equal_levels(self, other: "MultiIndex") -> bool: Examples -------- - >>> from pyspark.pandas.config import set_option, reset_option - >>> set_option("compute.ops_on_diff_frames", True) - >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) >>> psmidx1.equal_levels(psmidx2) @@ -1154,8 +1151,6 @@ def equal_levels(self, other: "MultiIndex") -> bool: >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) >>> psmidx1.equal_levels(psmidx2) False - - >>> reset_option("compute.ops_on_diff_frames") """ nlevels = self.nlevels if nlevels != other.nlevels: From f19e365c451bc921533e18fe82bfd5dc27fdcebe Mon Sep 17 00:00:00 2001 From: Haejoon Lee <44108233+itholic@users.noreply.github.com> Date: Wed, 29 Sep 2021 14:28:35 +0900 Subject: [PATCH 4/5] Update python/pyspark/pandas/indexes/multi.py Co-authored-by: Hyukjin Kwon --- python/pyspark/pandas/indexes/multi.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index d30d558685d8a..cf7f7619e698b 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -1168,10 +1168,7 @@ def equal_levels(self, other: "MultiIndex") -> bool: subtract_list.append(self_subtract_other) unioned_subtracts = reduce(lambda x, y: x.union(y), subtract_list) - if len(unioned_subtracts.head(1)) == 0: - return True - else: - return False + return len(unioned_subtracts.head(1)) == 0 @property def hasnans(self) -> bool: From 9f938e1ae8e821d604ab3a4692c4cfca1b2bb924 Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 1 Oct 2021 11:18:57 +0900 Subject: [PATCH 5/5] add version --- python/pyspark/pandas/indexes/multi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index d30d558685d8a..b22c7cb844044 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -1141,6 +1141,8 @@ def equal_levels(self, other: "MultiIndex") -> bool: """ Return True if the levels of both MultiIndex objects are the same + .. versionadded:: 3.3.0 + Examples -------- >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])