From de705b72a1d52cbe3b1881d172b58e9c63d1c032 Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 23 Sep 2020 15:40:07 +0900 Subject: [PATCH 1/5] Implemented MultiIndex.equal_levels --- databricks/koalas/indexes.py | 24 ++++++++++++++++++++++++ databricks/koalas/tests/test_indexes.py | 13 +++++++++++++ 2 files changed, 37 insertions(+) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index 4edc2f23df..ded80a641d 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -2973,6 +2973,30 @@ def item(self): """ return self._kdf.head(2)._to_internal_pandas().index.item() + def equal_levels(self, other): + """ + Return True if the levels of both MultiIndex objects are the same + + Examples + -------- + >>> kmidx1 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + >>> kmidx2 = ks.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + >>> kmidx1.equal_levels(kmidx2) + True + """ + nlevels = self.nlevels + if nlevels != other.nlevels: + return False + self = self.sort_values() + other = other.sort_values() + with ks.option_context("compute.ops_on_diff_frames", True): + for i in range(nlevels): + self_level_values = self.get_level_values(i) + other_level_values = other.get_level_values(i) + if not self_level_values.equals(other_level_values): + return False + return True + @property def inferred_type(self): """ diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 6615d1b670..54d2d2d8d6 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -1581,3 +1581,16 @@ def test_multiindex_is_unique(self): kdf = ks.from_pandas(pdf) self.assertEqual(kdf.index.is_unique, expected) + + def test_multiindex_equal_levels(self): + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + kmidx1 = ks.from_pandas(pmidx1) + kmidx2 = ks.from_pandas(pmidx2) + + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) + kmidx2 = ks.from_pandas(pmidx2) + + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) From d7d674388208d7896bb301afec24402d9e575a6a Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 23 Sep 2020 15:49:02 +0900 Subject: [PATCH 2/5] add doc --- databricks/koalas/missing/indexes.py | 1 - docs/source/reference/indexing.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py index 9e22ac0f45..42f041d745 100644 --- a/databricks/koalas/missing/indexes.py +++ b/databricks/koalas/missing/indexes.py @@ -106,7 +106,6 @@ class MissingPandasLikeMultiIndex(object): # Functions argsort = _unsupported_function("argsort") asof_locs = _unsupported_function("asof_locs") - equal_levels = _unsupported_function("equal_levels") factorize = _unsupported_function("factorize") format = _unsupported_function("format") get_indexer = _unsupported_function("get_indexer") diff --git a/docs/source/reference/indexing.rst b/docs/source/reference/indexing.rst index 1ed2227323..03b9c31860 100644 --- a/docs/source/reference/indexing.rst +++ b/docs/source/reference/indexing.rst @@ -215,6 +215,7 @@ MultiIndex Modifying and computations :toctree: api/ MultiIndex.equals + MultiIndex.equal_levels MultiIndex.identical MultiIndex.drop MultiIndex.copy From 75872abb1d4ef49bc35dc0d64e705b67ed0c69fd Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 24 Sep 2020 23:27:19 +0900 Subject: [PATCH 3/5] Minimize run Spark job --- databricks/koalas/indexes.py | 21 ++++++++-------- databricks/koalas/tests/test_indexes.py | 7 +++--- .../koalas/tests/test_ops_on_diff_frames.py | 24 +++++++++++++++++++ 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index ded80a641d..d5ca838957 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -52,6 +52,7 @@ from databricks.koalas.missing.indexes import MissingPandasLikeIndex, MissingPandasLikeMultiIndex from databricks.koalas.series import Series, first_series from databricks.koalas.utils import ( + combine_frames, compare_allow_null, compare_disallow_null, compare_null_first, @@ -2985,17 +2986,17 @@ def equal_levels(self, other): True """ nlevels = self.nlevels - if nlevels != other.nlevels: + if (nlevels != other.nlevels) or (len(self) != len(other)): return False - self = self.sort_values() - other = other.sort_values() - with ks.option_context("compute.ops_on_diff_frames", True): - for i in range(nlevels): - self_level_values = self.get_level_values(i) - other_level_values = other.get_level_values(i) - if not self_level_values.equals(other_level_values): - return False - return True + self_frame = self.sort_values().to_frame() + other_frame = other.sort_values().to_frame() + combined = combine_frames(self_frame, other_frame) + + sdf = combined._internal.spark_frame + that_index_name = "__that_{}".format(other._internal.index_spark_column_names[0]) + that_index_scol = scol_for(sdf, that_index_name) + + return len(sdf.filter(that_index_scol.isNull()).head(1)) == 0 @property def inferred_type(self): diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 54d2d2d8d6..3168081337 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -1584,13 +1584,12 @@ def test_multiindex_is_unique(self): def test_multiindex_equal_levels(self): pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) kmidx1 = ks.from_pandas(pmidx1) - kmidx2 = ks.from_pandas(pmidx2) - - self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) kmidx2 = ks.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) + pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")]) + kmidx2 = ks.from_pandas(pmidx2) self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py index b583932a98..cd506bf0e6 100644 --- a/databricks/koalas/tests/test_ops_on_diff_frames.py +++ b/databricks/koalas/tests/test_ops_on_diff_frames.py @@ -911,6 +911,17 @@ def test_series_repeat(self): else: self.assert_eq(kser1.repeat(kser2).sort_index(), pser1.repeat(pser2).sort_index()) + def test_multiindex_equal_levels(self): + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + kmidx1 = ks.from_pandas(pmidx1) + kmidx2 = ks.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) + kmidx2 = ks.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) + class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils): @classmethod @@ -1052,3 +1063,16 @@ def test_mask(self): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): kdf1.mask(kdf2 > -250) + + def test_multiindex_equal_levels(self): + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + kmidx1 = ks.from_pandas(pmidx1) + kmidx2 = ks.from_pandas(pmidx2) + with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) + kmidx2 = ks.from_pandas(pmidx2) + with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) From 73f5cc5abfddc1ff54ade1d8ed2f06f9c6aa4bb1 Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 25 Sep 2020 00:00:05 +0900 Subject: [PATCH 4/5] Fix doctest --- databricks/koalas/indexes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index 9e0f9238a1..af920a836b 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -2988,10 +2988,19 @@ def equal_levels(self, other): Examples -------- + >>> from databricks.koalas.config import set_option, reset_option + >>> set_option("compute.ops_on_diff_frames", True) + >>> kmidx1 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) >>> kmidx2 = ks.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) >>> kmidx1.equal_levels(kmidx2) True + + >>> kmidx2 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) + >>> kmidx1.equal_levels(kmidx2) + False + + >>> reset_option("compute.ops_on_diff_frames") """ nlevels = self.nlevels if (nlevels != other.nlevels) or (len(self) != len(other)): From 8b57c57ea8a3f88f679bf4858255dec2b882cd00 Mon Sep 17 00:00:00 2001 From: itholic Date: Sun, 27 Sep 2020 02:59:48 +0900 Subject: [PATCH 5/5] Addressed comments --- databricks/koalas/indexes.py | 7 +++--- databricks/koalas/tests/test_indexes.py | 11 +++++++++ .../koalas/tests/test_ops_on_diff_frames.py | 24 ------------------- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index af920a836b..9d848c779b 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -3003,14 +3003,15 @@ def equal_levels(self, other): >>> reset_option("compute.ops_on_diff_frames") """ nlevels = self.nlevels - if (nlevels != other.nlevels) or (len(self) != len(other)): + if nlevels != other.nlevels: return False self_frame = self.sort_values().to_frame() other_frame = other.sort_values().to_frame() - combined = combine_frames(self_frame, other_frame) + with option_context("compute.ops_on_diff_frames", True): + combined = combine_frames(self_frame, other_frame) sdf = combined._internal.spark_frame - that_index_name = "__that_{}".format(other._internal.index_spark_column_names[0]) + that_index_name = combined["that"]._internal.data_spark_column_names[0] that_index_scol = scol_for(sdf, that_index_name) return len(sdf.filter(that_index_scol.isNull()).head(1)) == 0 diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 393a0a934c..31c871450c 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -1588,7 +1588,18 @@ def test_multiindex_is_unique(self): def test_multiindex_equal_levels(self): pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) kmidx1 = ks.from_pandas(pmidx1) + kmidx2 = ks.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) + kmidx2 = ks.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")]) + kmidx2 = ks.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) kmidx2 = ks.from_pandas(pmidx2) diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py index cd506bf0e6..b583932a98 100644 --- a/databricks/koalas/tests/test_ops_on_diff_frames.py +++ b/databricks/koalas/tests/test_ops_on_diff_frames.py @@ -911,17 +911,6 @@ def test_series_repeat(self): else: self.assert_eq(kser1.repeat(kser2).sort_index(), pser1.repeat(pser2).sort_index()) - def test_multiindex_equal_levels(self): - pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) - kmidx1 = ks.from_pandas(pmidx1) - kmidx2 = ks.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) - kmidx2 = ks.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) - class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils): @classmethod @@ -1063,16 +1052,3 @@ def test_mask(self): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): kdf1.mask(kdf2 > -250) - - def test_multiindex_equal_levels(self): - pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) - kmidx1 = ks.from_pandas(pmidx1) - kmidx2 = ks.from_pandas(pmidx2) - with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) - kmidx2 = ks.from_pandas(pmidx2) - with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq(pmidx1.equal_levels(pmidx2), kmidx1.equal_levels(kmidx2))