From 8eea4df63bb3cb3dcc8731d0840415b7031af620 Mon Sep 17 00:00:00 2001
From: itholic <haejoon.lee@databricks.com>
Date: Mon, 27 Sep 2021 16:38:27 +0900
Subject: [PATCH 1/5] Implement MultiIndex.equal_levels

---
 .../reference/pyspark.pandas/indexing.rst     |  1 +
 python/pyspark/pandas/indexes/multi.py        | 37 +++++++++++++++++++
 .../pyspark/pandas/tests/indexes/test_base.py |  7 ++++
 .../pandas/tests/test_ops_on_diff_frames.py   | 37 +++++++++++++++++++
 4 files changed, 82 insertions(+)

diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 05353fde71daa..7e796c69dc27e 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -269,6 +269,7 @@ MultiIndex Modifying and computations
    :toctree: api/
 
    MultiIndex.equals
+   MultiIndex.equal_levels
    MultiIndex.identical
    MultiIndex.insert
    MultiIndex.drop
diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index fb0208099fd6d..7fc183fb91f24 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIn
         )
         return cast(MultiIndex, DataFrame(internal).index)
 
+    def equal_levels(self, other: "MultiIndex") -> bool:
+        """
+        Return True if the levels of both MultiIndex objects are the same
+
+        Notes
+        -----
+        This API can be expensive since it has logic to sort and compare the values of
+        all levels of indices that belong to MultiIndex.
+
+        Examples
+        --------
+        >>> from pyspark.pandas.config import set_option, reset_option
+        >>> set_option("compute.ops_on_diff_frames", True)
+
+        >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+        >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
+        >>> psmidx1.equal_levels(psmidx2)
+        True
+
+        >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
+        >>> psmidx1.equal_levels(psmidx2)
+        False
+
+        >>> reset_option("compute.ops_on_diff_frames")
+        """
+        nlevels = self.nlevels
+        if nlevels != other.nlevels:
+            return False
+
+        for nlevel in range(nlevels):
+            self_values = self.get_level_values(nlevel).unique().sort_values()
+            other_values = other.get_level_values(nlevel).unique().sort_values()
+            if not self_values.equals(other_values):
+                return False
+
+        return True
+
     @property
     def hasnans(self) -> bool:
         raise NotImplementedError("hasnans is not defined for MultiIndex")
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index 605e3f8fc0686..4d58cb736b346 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -2369,6 +2369,13 @@ def test_map(self):
             lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}),
         )
 
+    def test_multiindex_equal_levels(self):
+        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")])
+        psmidx1 = ps.from_pandas(pmidx1)
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.indexes.test_base import *  # noqa: F401
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index 3e3bb0d231ec6..143e9b663c47d 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -1830,6 +1830,35 @@ def _test_cov(self, pser1, pser2):
         pscov = psser1.cov(psser2, min_periods=3)
         self.assert_eq(pcov, pscov, almost=True)
 
+    def test_multiindex_equal_levels(self):
+        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+        pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
+        psmidx1 = ps.from_pandas(pmidx1)
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")])
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")])
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")])
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")])
+        psmidx1 = ps.from_pandas(pmidx1)
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
 
 class OpsOnDiffFramesDisabledTest(PandasOnSparkTestCase, SQLTestUtils):
     @classmethod
@@ -2017,6 +2046,14 @@ def test_combine_first(self):
         with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
             psdf1.combine_first(psdf2)
 
+    def test_multiindex_equal_levels(self):
+        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+        pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
+        psmidx1 = ps.from_pandas(pmidx1)
+        psmidx2 = ps.from_pandas(pmidx2)
+        with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
+            psmidx1.equal_levels(psmidx2)
+
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.test_ops_on_diff_frames import *  # noqa: F401

From 6306fb1c66233e7ec159def64eee0d51cdee026d Mon Sep 17 00:00:00 2001
From: itholic <haejoon.lee@databricks.com>
Date: Wed, 29 Sep 2021 13:42:43 +0900
Subject: [PATCH 2/5] Resolved comments

---
 python/pyspark/pandas/indexes/multi.py        | 26 +++++++------
 python/pyspark/pandas/missing/indexes.py      |  1 -
 .../pyspark/pandas/tests/indexes/test_base.py | 28 ++++++++++++++
 .../pandas/tests/test_ops_on_diff_frames.py   | 37 -------------------
 4 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index 136c876e33074..42405b34db6c0 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -16,7 +16,7 @@
 #
 
 from distutils.version import LooseVersion
-from functools import partial
+from functools import partial, reduce
 from typing import Any, Callable, Iterator, List, Optional, Tuple, Union, cast, no_type_check
 
 import pandas as pd
@@ -1141,11 +1141,6 @@ def equal_levels(self, other: "MultiIndex") -> bool:
         """
         Return True if the levels of both MultiIndex objects are the same
 
-        Notes
-        -----
-        This API can be expensive since it has logic to sort and compare the values of
-        all levels of indices that belong to MultiIndex.
-
         Examples
         --------
         >>> from pyspark.pandas.config import set_option, reset_option
@@ -1166,13 +1161,22 @@ def equal_levels(self, other: "MultiIndex") -> bool:
         if nlevels != other.nlevels:
             return False
 
+        self_sdf = self._internal.spark_frame
+        other_sdf = other._internal.spark_frame
+        subtract_list = []
         for nlevel in range(nlevels):
-            self_values = self.get_level_values(nlevel).unique().sort_values()
-            other_values = other.get_level_values(nlevel).unique().sort_values()
-            if not self_values.equals(other_values):
-                return False
+            self_index_scol = self._internal.index_spark_columns[nlevel]
+            other_index_scol = other._internal.index_spark_columns[nlevel]
+            self_subtract_other = self_sdf.select(self_index_scol).subtract(
+                other_sdf.select(other_index_scol)
+            )
+            subtract_list.append(self_subtract_other)
 
-        return True
+        unioned_subtracts = reduce(lambda x, y: x.union(y), subtract_list)
+        if len(unioned_subtracts.head(1)) == 0:
+            return True
+        else:
+            return False
 
     @property
     def hasnans(self) -> bool:
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index e81156fdd74af..4170aa70f7d4c 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -105,7 +105,6 @@ class MissingPandasLikeMultiIndex(object):
     # Functions
     argsort = _unsupported_function("argsort")
     asof_locs = _unsupported_function("asof_locs")
-    equal_levels = _unsupported_function("equal_levels")
     factorize = _unsupported_function("factorize")
     format = _unsupported_function("format")
     get_indexer = _unsupported_function("get_indexer")
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index 0ebec3272eafa..40039983c4c11 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -2389,6 +2389,34 @@ def test_map(self):
         )
 
     def test_multiindex_equal_levels(self):
+        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+        pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
+        psmidx1 = ps.from_pandas(pmidx1)
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")])
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")])
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
+        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")])
+        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")])
+        psmidx1 = ps.from_pandas(pmidx1)
+        psmidx2 = ps.from_pandas(pmidx2)
+        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
+
         pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
         pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")])
         psmidx1 = ps.from_pandas(pmidx1)
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index 3d05c7004bf06..cd5d8347b12cf 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -1845,35 +1845,6 @@ def _test_cov(self, pser1, pser2):
         pscov = psser1.cov(psser2, min_periods=3)
         self.assert_eq(pcov, pscov, almost=True)
 
-    def test_multiindex_equal_levels(self):
-        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
-        pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
-        psmidx1 = ps.from_pandas(pmidx1)
-        psmidx2 = ps.from_pandas(pmidx2)
-        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
-
-        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
-        psmidx2 = ps.from_pandas(pmidx2)
-        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
-
-        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")])
-        psmidx2 = ps.from_pandas(pmidx2)
-        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
-
-        pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
-        psmidx2 = ps.from_pandas(pmidx2)
-        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
-
-        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")])
-        psmidx2 = ps.from_pandas(pmidx2)
-        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
-
-        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")])
-        pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")])
-        psmidx1 = ps.from_pandas(pmidx1)
-        psmidx2 = ps.from_pandas(pmidx2)
-        self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
-
 
 class OpsOnDiffFramesDisabledTest(PandasOnSparkTestCase, SQLTestUtils):
     @classmethod
@@ -2068,14 +2039,6 @@ def test_combine_first(self):
         with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
             psdf1.combine_first(psdf2)
 
-    def test_multiindex_equal_levels(self):
-        pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
-        pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
-        psmidx1 = ps.from_pandas(pmidx1)
-        psmidx2 = ps.from_pandas(pmidx2)
-        with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
-            psmidx1.equal_levels(psmidx2)
-
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.test_ops_on_diff_frames import *  # noqa: F401

From 1f5154100f37a3ecf574917bc0c76c852bd4bf23 Mon Sep 17 00:00:00 2001
From: itholic <haejoon.lee@databricks.com>
Date: Wed, 29 Sep 2021 13:44:07 +0900
Subject: [PATCH 3/5] Remove options

---
 python/pyspark/pandas/indexes/multi.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index 42405b34db6c0..d30d558685d8a 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -1143,9 +1143,6 @@ def equal_levels(self, other: "MultiIndex") -> bool:
 
         Examples
         --------
-        >>> from pyspark.pandas.config import set_option, reset_option
-        >>> set_option("compute.ops_on_diff_frames", True)
-
         >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
         >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")])
         >>> psmidx1.equal_levels(psmidx2)
@@ -1154,8 +1151,6 @@ def equal_levels(self, other: "MultiIndex") -> bool:
         >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")])
         >>> psmidx1.equal_levels(psmidx2)
         False
-
-        >>> reset_option("compute.ops_on_diff_frames")
         """
         nlevels = self.nlevels
         if nlevels != other.nlevels:

From f19e365c451bc921533e18fe82bfd5dc27fdcebe Mon Sep 17 00:00:00 2001
From: Haejoon Lee <44108233+itholic@users.noreply.github.com>
Date: Wed, 29 Sep 2021 14:28:35 +0900
Subject: [PATCH 4/5] Update python/pyspark/pandas/indexes/multi.py

Co-authored-by: Hyukjin Kwon <gurwls223@gmail.com>
---
 python/pyspark/pandas/indexes/multi.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index d30d558685d8a..cf7f7619e698b 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -1168,10 +1168,7 @@ def equal_levels(self, other: "MultiIndex") -> bool:
             subtract_list.append(self_subtract_other)
 
         unioned_subtracts = reduce(lambda x, y: x.union(y), subtract_list)
-        if len(unioned_subtracts.head(1)) == 0:
-            return True
-        else:
-            return False
+        return len(unioned_subtracts.head(1)) == 0
 
     @property
     def hasnans(self) -> bool:

From 9f938e1ae8e821d604ab3a4692c4cfca1b2bb924 Mon Sep 17 00:00:00 2001
From: itholic <haejoon.lee@databricks.com>
Date: Fri, 1 Oct 2021 11:18:57 +0900
Subject: [PATCH 5/5] add version

---
 python/pyspark/pandas/indexes/multi.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index d30d558685d8a..b22c7cb844044 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -1141,6 +1141,8 @@ def equal_levels(self, other: "MultiIndex") -> bool:
         """
         Return True if the levels of both MultiIndex objects are the same
 
+        .. versionadded:: 3.3.0
+
         Examples
         --------
         >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])