From 336a269022c6d73b4ebc1abdebfd4ec66b0f59ab Mon Sep 17 00:00:00 2001 From: Bartosz Smoczynski Date: Mon, 30 Jun 2025 11:16:34 +0200 Subject: [PATCH] Optionally return match operations in get_all_paths --- editdistance/_edit_distance_osa.cpp | 26 +++++++++++++++++++------- editdistance/_edit_distance_osa.hpp | 3 ++- editdistance/edit_distance_osa.pyx | 24 +++++++++++++++--------- examples/osa_example.py | 17 +++++++++++------ tests/__init__.py | 0 tests/tests_osa.py | 8 ++------ 6 files changed, 49 insertions(+), 29 deletions(-) create mode 100644 tests/__init__.py diff --git a/editdistance/_edit_distance_osa.cpp b/editdistance/_edit_distance_osa.cpp index 632dee3..2fe9929 100644 --- a/editdistance/_edit_distance_osa.cpp +++ b/editdistance/_edit_distance_osa.cpp @@ -91,19 +91,30 @@ std::vector> backtrack_all_paths( all_paths.insert(all_paths.end(), paths.begin(), paths.end()); current_path.pop_back(); } - - if (i > 0 && j > 0) { - double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : replace_weight; - if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) { - std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]); - CppEditop op(REPLACE, i-1, j-1, sub_cost, out_char); + + if (i > 0 && j > 0 && a[i-1] != b[j-1]) { + if (std::abs((dp[i-1][j-1] + replace_weight) - current_cost) < tol) { + std::string out_char = std::string(1, b[j-1]); + CppEditop op(REPLACE, i-1, j-1, replace_weight, out_char); current_path.push_back(op); auto paths = backtrack_all_paths(a, b, dp, i-1, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); current_path.pop_back(); } } - + + if (i > 0 && j > 0 && a[i-1] == b[j-1]) { + double match_weight = 0.0; // We might want to make this non-zero in the future + if (std::abs((dp[i-1][j-1] + match_weight) - current_cost) < tol) { + std::string out_char = std::string(1, a[i-1]); + CppEditop op(MATCH, i-1, j-1, match_weight, out_char); + current_path.push_back(op); + auto paths = backtrack_all_paths(a, b, dp, i-1, j-1, current_path, replace_weight, insert_weight, delete_weight, swap_weight); + all_paths.insert(all_paths.end(), paths.begin(), paths.end()); + current_path.pop_back(); + } + } + if (i > 1 && j > 1 && a[i-1] == b[j-2] && a[i-2] == b[j-1] && std::abs((dp[i-2][j-2] + swap_weight) - current_cost) < tol) { @@ -161,6 +172,7 @@ std::string editop_name_to_string(CppEditopName name) { case DELETE: return "DELETE"; case REPLACE: return "REPLACE"; case SWAP: return "SWAP"; + case MATCH: return "MATCH"; default: return "UNKNOWN"; } } diff --git a/editdistance/_edit_distance_osa.hpp b/editdistance/_edit_distance_osa.hpp index b97d131..323c2fb 100644 --- a/editdistance/_edit_distance_osa.hpp +++ b/editdistance/_edit_distance_osa.hpp @@ -11,7 +11,8 @@ enum CppEditopName { INSERT, DELETE, REPLACE, - SWAP + SWAP, + MATCH }; struct CppEditop { diff --git a/editdistance/edit_distance_osa.pyx b/editdistance/edit_distance_osa.pyx index 68356e9..5daf5ac 100644 --- a/editdistance/edit_distance_osa.pyx +++ b/editdistance/edit_distance_osa.pyx @@ -1,6 +1,7 @@ # distutils: language = c++ # distutils: sources = ./editdistance/_edit_distance_osa.cpp +from libcpp cimport bool from libcpp.map cimport map from libcpp.string cimport string from libcpp.vector cimport vector @@ -14,6 +15,7 @@ cdef extern from "_edit_distance_osa.hpp": DELETE REPLACE SWAP + MATCH cdef struct CppEditop: CppEditopName name @@ -38,6 +40,7 @@ class EditopName(Enum): DELETE = 1 REPLACE = 2 SWAP = 3 + MATCH = 4 cdef class Editop: @@ -64,7 +67,8 @@ def get_all_paths( double replace_weight=1.0, double insert_weight=1.0, double delete_weight=1.0, - double swap_weight=1.0 + double swap_weight=1.0, + bool return_matches=False, ): cdef string cpp_a = a.encode("utf-8") cdef string cpp_b = b.encode("utf-8") @@ -76,8 +80,6 @@ def get_all_paths( for cpp_path in cpp_paths: python_path = [] for cpp_op in cpp_path: - if cpp_op.cost == 0: - continue if cpp_op.name == INSERT: py_name = EditopName.INSERT elif cpp_op.name == DELETE: @@ -86,6 +88,11 @@ def get_all_paths( py_name = EditopName.REPLACE elif cpp_op.name == SWAP: py_name = EditopName.SWAP + elif cpp_op.name == MATCH: + if return_matches: + py_name = EditopName.MATCH + else: + continue else: py_name = None python_path.append(Editop( @@ -99,12 +106,11 @@ def get_all_paths( return python_paths def apply_editops(src, dst, editops): + # assumes editops are sorted from left to right + # assumes match operations are included src_idx = 0 s = "" for op in editops: - while src_idx < op.src_idx: - s += src[src_idx] - src_idx += 1 if op.name == EditopName.INSERT: s += dst[op.dst_idx] elif op.name == EditopName.DELETE: @@ -116,9 +122,9 @@ def apply_editops(src, dst, editops): s += src[op.src_idx + 1] s += src[op.src_idx] src_idx += 2 - while src_idx < len(src): - s += src[src_idx] - src_idx += 1 + elif op.name == EditopName.MATCH: + s += src[op.src_idx] + src_idx += 1 return s diff --git a/examples/osa_example.py b/examples/osa_example.py index c9451d7..27d5ae9 100644 --- a/examples/osa_example.py +++ b/examples/osa_example.py @@ -5,10 +5,7 @@ """ try: - from editdistance.osa import ( - compute_distance, - get_all_paths, - ) + from editdistance.osa import compute_distance, get_all_paths def main(): # Test case from original Python code @@ -25,12 +22,20 @@ def main(): print(f"Distance: {distance}") paths = get_all_paths(source, target) + paths_with_matches = get_all_paths(source, target, return_matches=True) print(f"Number of optimal edit sequences: {len(paths)}") + print("Paths without match editops:") for i, path in enumerate(paths, 1): - print(f"Path {i}:") + print(f" Path {i}:") for op in path: - print(f" {op}") + print(f" {op}") + print() + print("Paths with match editops:") + for i, path in enumerate(paths_with_matches, 1): + print(f" Path {i}:") + for op in path: + print(f" {op}") print() if __name__ == "__main__": diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tests_osa.py b/tests/tests_osa.py index 5647022..96c7683 100644 --- a/tests/tests_osa.py +++ b/tests/tests_osa.py @@ -1,10 +1,6 @@ import unittest -from editdistance.osa import ( - apply_editops, - compute_distance, - get_all_paths, -) +from editdistance.osa import apply_editops, compute_distance, get_all_paths COMPUTE_DISTANCE_TEST_CASES = [ ("single character", "a", "b", 1.0), @@ -76,7 +72,7 @@ def test_get_all_paths(self): def test_editops_transform(self): for src, dst in EDITOPS_TRANSFORM_TEST_CASES: with self.subTest(src=src, dst=dst): - paths = get_all_paths(src, dst) + paths = get_all_paths(src, dst, return_matches=True) self.assertTrue(paths, f"No paths found for {src} -> {dst}") for path in paths: result = apply_editops(src, dst, path)