From 3a1b36404f4fe23ab4a6330c049fb45478ac4f30 Mon Sep 17 00:00:00 2001 From: Krzysztof Czajkowski Date: Thu, 12 Jun 2025 12:11:15 +0200 Subject: [PATCH] Add initial OSA code --- .flake8 | 13 +++ .github/workflows/ci.yml | 39 +++++++ .gitignore | 5 +- README.md | 58 +++++++++- editdistance/_edit_distance_osa.cpp | 165 ++++++++++++++++++++++++++++ editdistance/_edit_distance_osa.hpp | 72 ++++++++++++ editdistance/edit_distance_osa.pyx | 111 +++++++++++++++++++ examples/osa_example.py | 52 +++++++++ linters.sh | 12 ++ linters__fix.sh | 10 ++ pyproject.toml | 71 ++++++++++++ setup.py | 50 +++++++++ tests/tests_osa.py | 60 ++++++++++ 13 files changed, 716 insertions(+), 2 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/ci.yml create mode 100644 editdistance/_edit_distance_osa.cpp create mode 100644 editdistance/_edit_distance_osa.hpp create mode 100644 editdistance/edit_distance_osa.pyx create mode 100644 examples/osa_example.py create mode 100755 linters.sh create mode 100755 linters__fix.sh create mode 100644 pyproject.toml create mode 100644 setup.py create mode 100644 tests/tests_osa.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..23c91f8 --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +# see https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8 +extend-ignore = E203, E501, W503, E701, E704 +exclude = \ + .venv, + */.ipynb_checkpoints + +max-line-length=100 + +per-file-ignores = + # imported but unused + __init__.py: F401 + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..024e33f --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set 
up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Upgrade pip + run: python -m pip install --upgrade pip + + - name: Install development dependencies + run: pip install .[dev] + + - name: Ensure linters.sh is executable + run: chmod +x ./linters.sh + + - name: Run linters script + run: ./linters.sh + + - name: Run tests + run: python -m unittest discover -s tests -p "*.py" diff --git a/.gitignore b/.gitignore index 7b004e5..3263b77 100644 --- a/.gitignore +++ b/.gitignore @@ -191,4 +191,7 @@ cython_debug/ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data # refer to https://docs.cursor.com/context/ignore-files .cursorignore -.cursorindexingignore \ No newline at end of file +.cursorindexingignore + +# Edit Distance OSA file build by Cython +editdistance/edit_distance_osa.cpp \ No newline at end of file diff --git a/README.md b/README.md index 76bcc44..15c0b5f 100644 --- a/README.md +++ b/README.md @@ -1 +1,57 @@ -# edit-distance \ No newline at end of file +# edit-distance + +edit-distance is a Python package that provides an implementation of the Optimal String Alignment (OSA) algorithm for calculating edit distances. The package leverages C++ extensions via Cython for improved performance. + +## Features + +- Calculate edit distances using the OSA algorithm. +- Enables custom weights for each edit operation. +- Find all edit paths resulting in the minimal OSA distance between strings. +- High-performance implementation using C++ and Cython. +- Easy integration into Python projects. + +## Installation + +Ensure you have a C++ compiler installed. Then, clone the repository and install the package using: + +```sh +pip install . +``` + +Alternatively, call `setup.py` directly: + +```sh +python setup.py build_ext --inplace +python setup.py install +``` + +For more details on the setup, see [setup.py](setup.py). 
// ===== editdistance/_edit_distance_osa.hpp =====
#ifndef EDIT_DISTANCE_OSA_HPP
#define EDIT_DISTANCE_OSA_HPP

#include <iostream>
#include <map>
#include <string>
#include <vector>


/// Kind of a single edit step.
/// Deliberately an unscoped enum: the Cython wrapper declares it as a plain
/// `cdef enum` and refers to the enumerators by these unqualified names.
enum EditopName {
    INSERT,
    DELETE,
    REPLACE,
    TRANSPOSE
};

/// One step of an edit path.
///
/// Index convention as filled in by backtrack_all_paths(), where (i, j) is the
/// DP cell the step was taken from:
///   DELETE    -> src_idx = i-1, dst_idx = i-1
///   INSERT    -> src_idx = i,   dst_idx = i
///   REPLACE   -> src_idx = i-1, dst_idx = j-1   (also emitted, with cost 0, for a match)
///   TRANSPOSE -> src_idx = i-2, dst_idx = j-2
/// NOTE(review): DELETE/INSERT store a source position in dst_idx as well;
/// confirm that this is intended before relying on dst_idx for those steps.
struct Editop {
    EditopName name;
    int src_idx;
    int dst_idx;
    double cost;
    std::string output_string;

    Editop() : name(INSERT), src_idx(0), dst_idx(0), cost(0.0), output_string("") {}
    Editop(EditopName n, int si, int di, double c, const std::string& os)
        : name(n), src_idx(si), dst_idx(di), cost(c), output_string(os) {}
};


/// Builds the (|a|+1) x (|b|+1) table of optimal-string-alignment costs.
/// @throws std::out_of_range if a cost that is actually needed is missing.
std::vector<std::vector<double>> compute_dp_table(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
);


/// Returns the weighted OSA distance between a and b (bottom-right DP cell).
double cpp_compute_distance(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
);


/// Enumerates every minimum-cost edit path that ends in DP cell (i, j).
/// `current_path` holds the steps already taken (latest first) and is left
/// unchanged on normal return.
std::vector<std::vector<Editop>> backtrack_all_paths(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map,
    const std::vector<std::vector<double>>& dp,
    int i,
    int j,
    std::vector<Editop>& current_path
);


/// Returns all minimum-cost edit paths that turn a into b.
std::vector<std::vector<Editop>> cpp_compute_all_paths(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
);


/// Writes the distance and every optimal path to std::cout.
void cpp_print_all_paths(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
);


std::string editop_name_to_string(EditopName name);
std::ostream& operator<<(std::ostream& os, const Editop& op);

#endif // EDIT_DISTANCE_OSA_HPP

// ===== editdistance/_edit_distance_osa.cpp =====
// In the repository this file begins with: #include "_edit_distance_osa.hpp"
#include <algorithm>
#include <cmath>

namespace {

// Two path costs closer than this are treated as equal while backtracking.
const double kCostTolerance = 1e-6;

bool same_cost(double lhs, double rhs) {
    return std::abs(lhs - rhs) < kCostTolerance;
}

// Depth-first walk from cell (i, j) back to (0, 0). Every completed walk is
// appended to `out` exactly once, so finished paths are never re-copied on the
// way back up the recursion. Branch order (DELETE, INSERT, REPLACE, TRANSPOSE)
// fixes the order of the reported paths.
void collect_paths(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map,
    const std::vector<std::vector<double>>& dp,
    int i,
    int j,
    std::vector<Editop>& current_path,
    std::vector<std::vector<Editop>>& out
) {
    if (i == 0 && j == 0) {
        // Steps were pushed while walking backwards; store them front-to-back.
        out.emplace_back(current_path.rbegin(), current_path.rend());
        return;
    }

    const double current_cost = dp[i][j];

    // Costs are looked up only inside the branch that needs them so that a
    // partially filled cost map throws exactly where it did before.
    if (i > 0) {
        const double delete_cost = cost_map.at(DELETE);
        if (same_cost(dp[i-1][j] + delete_cost, current_cost)) {
            current_path.emplace_back(DELETE, i-1, i-1, delete_cost, std::string(1, a[i-1]));
            collect_paths(a, b, cost_map, dp, i-1, j, current_path, out);
            current_path.pop_back();
        }
    }

    if (j > 0) {
        const double insert_cost = cost_map.at(INSERT);
        if (same_cost(dp[i][j-1] + insert_cost, current_cost)) {
            current_path.emplace_back(INSERT, i, i, insert_cost, std::string(1, b[j-1]));
            collect_paths(a, b, cost_map, dp, i, j-1, current_path, out);
            current_path.pop_back();
        }
    }

    if (i > 0 && j > 0) {
        const double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE);
        if (same_cost(dp[i-1][j-1] + sub_cost, current_cost)) {
            // The produced character is always the target one. On a match it
            // equals a[i-1]; on a mismatch it must be b[j-1] even when the
            // configured REPLACE cost happens to be 0.
            current_path.emplace_back(REPLACE, i-1, j-1, sub_cost, std::string(1, b[j-1]));
            collect_paths(a, b, cost_map, dp, i-1, j-1, current_path, out);
            current_path.pop_back();
        }
    }

    if (i > 1 && j > 1 && a[i-1] == b[j-2] && a[i-2] == b[j-1]) {
        const double transpose_cost = cost_map.at(TRANSPOSE);
        if (same_cost(dp[i-2][j-2] + transpose_cost, current_cost)) {
            const std::string swapped = std::string(1, b[j-2]) + std::string(1, b[j-1]);
            current_path.emplace_back(TRANSPOSE, i-2, j-2, transpose_cost, swapped);
            collect_paths(a, b, cost_map, dp, i-2, j-2, current_path, out);
            current_path.pop_back();
        }
    }
}

}  // namespace

std::vector<std::vector<double>> compute_dp_table(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
) {
    const int len_a = static_cast<int>(a.length());
    const int len_b = static_cast<int>(b.length());

    // DELETE and INSERT are needed for the borders of every table, so they are
    // fetched once. REPLACE and TRANSPOSE stay lazy: they are only required
    // when the inputs actually contain a mismatch or an adjacent swap.
    const double delete_cost = cost_map.at(DELETE);
    const double insert_cost = cost_map.at(INSERT);

    std::vector<std::vector<double>> dp(len_a + 1, std::vector<double>(len_b + 1, 0.0));

    for (int i = 0; i <= len_a; ++i) {
        dp[i][0] = i * delete_cost;
    }
    for (int j = 0; j <= len_b; ++j) {
        dp[0][j] = j * insert_cost;
    }

    for (int i = 1; i <= len_a; ++i) {
        for (int j = 1; j <= len_b; ++j) {
            const double deletion = dp[i-1][j] + delete_cost;
            const double insertion = dp[i][j-1] + insert_cost;
            const double substitution_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE);
            const double substitution = dp[i-1][j-1] + substitution_cost;

            dp[i][j] = std::min({deletion, insertion, substitution});

            // OSA restriction: only adjacent characters may be swapped, and the
            // swapped pair is not edited again (hence the jump to [i-2][j-2]).
            if (i > 1 && j > 1 &&
                a[i-1] == b[j-2] && a[i-2] == b[j-1]) {
                dp[i][j] = std::min(dp[i][j],
                                    dp[i-2][j-2] + cost_map.at(TRANSPOSE));
            }
        }
    }

    return dp;
}


double cpp_compute_distance(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
) {
    const auto dp = compute_dp_table(a, b, cost_map);
    return dp[a.length()][b.length()];
}

std::vector<std::vector<Editop>> backtrack_all_paths(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map,
    const std::vector<std::vector<double>>& dp,
    int i,
    int j,
    std::vector<Editop>& current_path
) {
    std::vector<std::vector<Editop>> all_paths;
    collect_paths(a, b, cost_map, dp, i, j, current_path, all_paths);
    return all_paths;
}


std::vector<std::vector<Editop>> cpp_compute_all_paths(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
) {
    const auto dp = compute_dp_table(a, b, cost_map);
    std::vector<Editop> current_path;
    return backtrack_all_paths(a, b, cost_map, dp,
                               static_cast<int>(a.length()),
                               static_cast<int>(b.length()),
                               current_path);
}


void cpp_print_all_paths(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
) {
    // Build the table once; both the distance and the paths are read from it.
    const auto dp = compute_dp_table(a, b, cost_map);
    const double distance = dp[a.length()][b.length()];
    std::vector<Editop> current_path;
    const auto paths = backtrack_all_paths(a, b, cost_map, dp,
                                           static_cast<int>(a.length()),
                                           static_cast<int>(b.length()),
                                           current_path);

    std::cout << "OSA Distance from '" << a << "' to '" << b << "': " << distance << '\n';
    std::cout << "Number of optimal edit sequences: " << paths.size() << '\n';
    std::cout << '\n';

    for (size_t i = 0; i < paths.size(); ++i) {
        std::cout << "Path " << (i + 1) << ":" << '\n';
        for (const auto& op : paths[i]) {
            std::cout << " " << op << '\n';
        }
        std::cout << '\n';
    }
    // One flush at the end instead of one per line; the text is still fully
    // written out by the time the function returns.
    std::cout << std::flush;
}

std::string editop_name_to_string(EditopName name) {
    switch (name) {
        case INSERT: return "INSERT";
        case DELETE: return "DELETE";
        case REPLACE: return "REPLACE";
        case TRANSPOSE: return "TRANSPOSE";
        default: return "UNKNOWN";
    }
}

std::ostream& operator<<(std::ostream& os, const Editop& op) {
    os << "Editop(name=" << editop_name_to_string(op.name)
       << ", src_idx=" << op.src_idx
       << ", dst_idx=" << op.dst_idx
       << ", cost=" << op.cost
       << ", output_string='" << op.output_string << "')";
    return os;
}
# distutils: language = c++
# distutils: sources = ./editdistance/_edit_distance_osa.cpp

from libcpp.map cimport map
from libcpp.string cimport string
from libcpp.vector cimport vector

from enum import Enum


cdef extern from "_edit_distance_osa.hpp":
    cdef enum EditopName:
        INSERT
        DELETE
        REPLACE
        TRANSPOSE

    cdef struct Editop:
        EditopName name
        int src_idx
        int dst_idx
        double cost
        string output_string

    # `except +` turns C++ exceptions into Python exceptions. Without it, the
    # std::out_of_range thrown by `cost_map.at(...)` for an incomplete cost map
    # would terminate the interpreter instead of raising.
    vector[vector[Editop]] cpp_compute_all_paths(const string& a, const string& b, const map[EditopName, double]& cost_map) except +
    void cpp_print_all_paths(const string& a, const string& b, const map[EditopName, double]& cost_map) except +
    double cpp_compute_distance(const string& a, const string& b, const map[EditopName, double]& cost_map) except +


class PyEditopName(Enum):
    """Python-side names of the edit operations; values follow the C++ enumerator order."""

    INSERT = 0
    DELETE = 1
    REPLACE = 2
    TRANSPOSE = 3


cdef class PyEditop:
    """Read-only record describing one step of an edit path."""

    cdef readonly object name
    cdef readonly int src_idx
    cdef readonly int dst_idx
    cdef readonly double cost
    cdef readonly str output_string

    def __init__(self, name, src_idx, dst_idx, cost, output_string):
        self.name = name
        self.src_idx = src_idx
        self.dst_idx = dst_idx
        self.cost = cost
        self.output_string = output_string

    def __repr__(self):
        return f"Editop(name={self.name}, src_idx={self.src_idx}, dst_idx={self.dst_idx}, cost={self.cost}, output_string='{self.output_string}')"


cdef map[EditopName, double] _convert_cost_map(dict cost_map):
    """Copy the entries present in `cost_map` into a C++ map; absent operations stay absent."""
    cdef map[EditopName, double] cpp_cost_map
    if PyEditopName.INSERT in cost_map:
        cpp_cost_map[INSERT] = cost_map[PyEditopName.INSERT]
    if PyEditopName.DELETE in cost_map:
        cpp_cost_map[DELETE] = cost_map[PyEditopName.DELETE]
    if PyEditopName.REPLACE in cost_map:
        cpp_cost_map[REPLACE] = cost_map[PyEditopName.REPLACE]
    if PyEditopName.TRANSPOSE in cost_map:
        cpp_cost_map[TRANSPOSE] = cost_map[PyEditopName.TRANSPOSE]
    return cpp_cost_map


cdef object _py_name(EditopName name):
    """Translate a C++ enumerator into the matching PyEditopName (None if unknown)."""
    if name == INSERT:
        return PyEditopName.INSERT
    if name == DELETE:
        return PyEditopName.DELETE
    if name == REPLACE:
        return PyEditopName.REPLACE
    if name == TRANSPOSE:
        return PyEditopName.TRANSPOSE
    return None


def _encode_symbols(str a, str b):
    """Encode both strings so that every character occupies exactly one byte.

    The C++ core compares single bytes. UTF-8 would split a non-ASCII
    character into several bytes, which inflates distances and produces edit
    steps that cut a character in half. ASCII input is passed through as-is;
    otherwise each distinct character of `a + b` is assigned its own byte value.

    Returns (bytes_a, bytes_b, symbols). `symbols` is None for ASCII input,
    else a list mapping byte value -> original character.
    Raises ValueError when more than 256 distinct characters are involved.
    """
    if a.isascii() and b.isascii():
        return a.encode("ascii"), b.encode("ascii"), None

    codes = {}
    for ch in a + b:
        if ch not in codes:
            if len(codes) >= 256:
                raise ValueError(
                    "inputs contain more than 256 distinct characters, "
                    "which the byte-oriented OSA core cannot represent"
                )
            codes[ch] = len(codes)

    symbols = [None] * len(codes)
    for ch, code in codes.items():
        symbols[code] = ch

    return bytes([codes[ch] for ch in a]), bytes([codes[ch] for ch in b]), symbols


def _decode_symbols(raw, symbols):
    """Inverse of _encode_symbols for a byte string returned by the C++ core."""
    if symbols is None:
        return raw.decode("ascii")
    chars = []
    for code in raw:
        chars.append(symbols[code])
    return "".join(chars)


def compute_with_all_paths(str a, str b, dict cost_map):
    """Return every minimum-cost edit path from `a` to `b` as lists of PyEditop.

    Indices in the returned steps are character positions.
    """
    encoded_a, encoded_b, symbols = _encode_symbols(a, b)
    cdef string cpp_a = encoded_a
    cdef string cpp_b = encoded_b
    cdef map[EditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
    cdef vector[vector[Editop]] cpp_paths = cpp_compute_all_paths(cpp_a, cpp_b, cpp_cost_map)
    cdef vector[Editop] cpp_path
    cdef Editop cpp_op
    python_paths = []
    for cpp_path in cpp_paths:
        python_path = []
        for cpp_op in cpp_path:
            python_path.append(PyEditop(
                _py_name(cpp_op.name),
                cpp_op.src_idx,
                cpp_op.dst_idx,
                cpp_op.cost,
                _decode_symbols(cpp_op.output_string, symbols)
            ))
        python_paths.append(python_path)
    return python_paths


def print_all_paths(str a, str b, dict cost_map):
    """Print the distance and every optimal edit path from `a` to `b`."""
    cdef string cpp_a
    cdef string cpp_b
    cdef map[EditopName, double] cpp_cost_map
    if a.isascii() and b.isascii():
        cpp_a = a.encode("ascii")
        cpp_b = b.encode("ascii")
        cpp_cost_map = _convert_cost_map(cost_map)
        cpp_print_all_paths(cpp_a, cpp_b, cpp_cost_map)
        return

    # Non-ASCII input is remapped to one byte per character for the C++ core,
    # so the C++ printer would show the remapped bytes. Render from Python
    # instead, mirroring the C++ layout (%g matches the default ostream
    # formatting of doubles).
    paths = compute_with_all_paths(a, b, cost_map)
    distance = compute_distance(a, b, cost_map)
    print("OSA Distance from '%s' to '%s': %g" % (a, b, distance))
    print("Number of optimal edit sequences: %d" % len(paths))
    print()
    for number, path in enumerate(paths, 1):
        print("Path %d:" % number)
        for op in path:
            print(" Editop(name=%s, src_idx=%d, dst_idx=%d, cost=%g, output_string='%s')" % (
                op.name.name, op.src_idx, op.dst_idx, op.cost, op.output_string
            ))
        print()


def compute_distance(str a, str b, dict cost_map):
    """Return the weighted OSA distance between `a` and `b`, counted in characters."""
    encoded_a, encoded_b, _symbols = _encode_symbols(a, b)
    cdef string cpp_a = encoded_a
    cdef string cpp_b = encoded_b
    cdef map[EditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
    return cpp_compute_distance(cpp_a, cpp_b, cpp_cost_map)
+First compile with: python setup.py build_ext --inplace +""" + +try: + from editdistance.osa import ( + PyEditopName, + compute_distance, + compute_with_all_paths, + print_all_paths, + ) + + def main(): + # Define cost map + cost_map = { + PyEditopName.DELETE: 1.0, + PyEditopName.INSERT: 1.0, + PyEditopName.REPLACE: 1.0, + PyEditopName.TRANSPOSE: 1.0, + } + + # Test case from original Python code + print("Testing OSA distance with all paths:") + print_all_paths("aaaaaaaaaa", "abaabababa", cost_map) + + # Additional test case + print("\nAdditional test case:") + paths = compute_with_all_paths("CA", "AX", cost_map) + distance = compute_distance("CA", "AX", cost_map) + + print(f"OSA Distance from 'CA' to 'AX': {distance}") + print(f"Number of optimal edit sequences: {len(paths)}") + print() + + for i, path in enumerate(paths, 1): + print(f"Path {i}:") + for op in path: + print(f" {op}") + print() + + if __name__ == "__main__": + main() + +except ImportError as e: + print("Error importing the compiled module:") + print(e) + print("\nTo compile the module, run:") + print("python setup.py build_ext --inplace") + print("\nMake sure you have Cython installed:") + print("pip install cython") diff --git a/linters.sh b/linters.sh new file mode 100755 index 0000000..9232bfa --- /dev/null +++ b/linters.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +echo "===> black" +black --check . + +echo "===> isort" +isort --check . + +echo "===> flake8" +flake8 . diff --git a/linters__fix.sh b/linters__fix.sh new file mode 100755 index 0000000..039bd62 --- /dev/null +++ b/linters__fix.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo "===> black" +black . + +echo "===> isort" +isort . + +echo "===> flake8" +flake8 . 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..24e3b1b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,71 @@ +[build-system] +requires = ["setuptools>=77.0.3", "Cython>=3.0.8"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "black>=24.4.2", + "isort>=5.13.2", + "flake8>=7.1.1", +] + +[project] +name = "editdistance" +version = "0.1.6" +requires-python = ">=3.9" +authors = [ + {name = "QED"}, +] +description = "Fast Damerau optimal string alignment algorithm." +readme = "README.md" +license = "MIT" +license-files = ["LICENSE"] +keywords = ["edit distance", "levenshtein", "damerau"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Natural Language :: English", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: C++", + "Programming Language :: Cython", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", +] + +[project.urls] +Repository = "https://github.com/qedsoftware/edit-distance" + +[tool.black] +line-length = 100 + +[tool.isort] +profile = "black" +line_length = 100 + +[tool.basedpyright] +reportUnusedCallResult = "none" + +[tool.cibuildwheel] +skip = "cp36-* cp37-* pp* *_i686 *_ppc64le *_s390x" +environment = {LDFLAGS="-Wl,--strip-all"} +build-verbosity = "3" + +test-command = "python -m unittest discover -v" + +[[tool.cibuildwheel.overrides]] +select = "*-macosx*" +environment = {CFLAGS="-g0"} + +[tool.setuptools.packages.find] +where = ["."] +include = ["editdistance"] diff --git a/setup.py
# ===== setup.py =====
"""Build script for the ``editdistance.osa`` Cython/C++ extension."""
import os
import platform
import sys
from sysconfig import get_config_vars

from Cython.Build import cythonize

# NOTE(review): `packaging` is imported at build time but is not listed under
# [build-system].requires in pyproject.toml; confirm it is always importable there.
from packaging.version import parse as parse_version
from setuptools import Extension, setup


def is_platform_mac():
    """Return True when building on macOS."""
    return sys.platform == "darwin"


# Flags handed to the Extension below. Every platform compiles as C++11;
# macOS additionally links against libc++.
extra_compile_args = ["-std=c++11"]
extra_link_args = []
if is_platform_mac():
    if "MACOSX_DEPLOYMENT_TARGET" not in os.environ:
        # Raise the deployment target to 10.9 when the interpreter was built for
        # an older target and the running system is at least 10.9.
        current_system = platform.mac_ver()[0]
        python_target = get_config_vars().get("MACOSX_DEPLOYMENT_TARGET", current_system)
        target_macos_version = "10.9"
        parsed_macos_version = parse_version(target_macos_version)
        if (
            parse_version(str(python_target))
            < parsed_macos_version
            <= parse_version(current_system)
        ):
            os.environ["MACOSX_DEPLOYMENT_TARGET"] = target_macos_version

    extra_link_args = ["-stdlib=libc++"]

extensions = [
    Extension(
        "editdistance.osa",
        sources=[
            "./editdistance/edit_distance_osa.pyx",
            "./editdistance/_edit_distance_osa.cpp",
        ],
        language="c++",
        include_dirs=["./editdistance"],
        # Use the platform-dependent flags computed above instead of a
        # hard-coded list, so the macOS link flag is actually applied.
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )
]

setup(
    name="editdistance",
    ext_modules=cythonize(extensions),
    zip_safe=False,
)

# ===== tests/tests_osa.py =====
import unittest

from editdistance.osa import (
    PyEditopName,
    compute_distance,
    compute_with_all_paths,
)

# (description, source, target, expected distance) with all costs set to 1.0
COMPUTE_DISTANCE_TEST_CASES = [
    ("single character", "a", "b", 1.0),
    ("identical strings", "abc", "abc", 0.0),
    ("single deletion", "abc", "ab", 1.0),
    ("single insertion", "ab", "abc", 1.0),
    ("entirely different", "abc", "def", 3.0),
    ("single transposition", "ab", "ba", 1.0),
]

# (description, source, target, expected number of optimal paths)
COMPUTE_ALL_PATHS_TEST_CASES = [
    ("single character", "a", "b", 1),
    ("identical strings", "abc", "abc", 1),
    ("single deletion", "abc", "ab", 1),
    ("single insertion", "ab", "abc", 1),
    ("entirely different", "abc", "def", 1),
    ("two optimal paths", "cab", "axb", 2),
    ("single transposition", "ab", "ba", 1),
]


class TestOsaDistance(unittest.TestCase):
    """Table-driven checks of the OSA distance and of the number of optimal paths."""

    def setUp(self):
        # Unit cost for every operation.
        self.cost_map = {
            PyEditopName.DELETE: 1.0,
            PyEditopName.INSERT: 1.0,
            PyEditopName.REPLACE: 1.0,
            PyEditopName.TRANSPOSE: 1.0,
        }

    def test_compute_distance(self):
        for (
            description,
            source,
            target,
            expected_distance,
        ) in COMPUTE_DISTANCE_TEST_CASES:
            with self.subTest(
                description,
            ):
                distance = compute_distance(source, target, self.cost_map)
                self.assertEqual(distance, expected_distance)

    def test_compute_with_all_paths(self):
        for (
            description,
            source,
            target,
            expected_num_paths,
        ) in COMPUTE_ALL_PATHS_TEST_CASES:
            with self.subTest(
                description,
            ):
                paths = compute_with_all_paths(source, target, self.cost_map)
                self.assertEqual(len(paths), expected_num_paths)