Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[flake8]
# see https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8
extend-ignore = E203, E501, W503, E701, E704
exclude = \
.venv,
*/.ipynb_checkpoints

max-line-length=100

per-file-ignores =
# imported but unused
__init__.py: F401

39 changes: 39 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: CI

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
build:
runs-on: ubuntu-latest

strategy:
matrix:
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Upgrade pip
run: python -m pip install --upgrade pip

- name: Install development dependencies
run: pip install .[dev]

- name: Ensure linters.sh is executable
run: chmod +x ./linters.sh

- name: Run linters script
run: ./linters.sh

- name: Run tests
run: python -m unittest discover -s tests -p "*.py"
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -191,4 +191,7 @@ cython_debug/
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
.cursorindexingignore

# Edit Distance OSA file built by Cython
editdistance/edit_distance_osa.cpp
58 changes: 57 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,57 @@
# edit-distance
# edit-distance

edit-distance is a Python package that provides an implementation of the Optimal String Alignment (OSA) algorithm for calculating edit distances. The package leverages C++ extensions via Cython for improved performance.

## Features

- Calculate edit distances using the OSA algorithm.
- Enables custom weights for each edit operation.
- Find all edit paths resulting in the minimal OSA distance between strings.
- High-performance implementation using C++ and Cython.
- Easy integration into Python projects.

## Installation

Ensure you have a C++ compiler installed. Then, clone the repository and install the package using:

```sh
pip install .
```

Alternatively, call `setup.py` directly:

```sh
python setup.py build_ext --inplace
python setup.py install
```

For more details on the setup, see [setup.py](setup.py).

## Usage

After installation, you can import and use the module in your Python code:

```python
import editdistance.osa

# Example usage:
str1 = "kitten"
str2 = "sitting"
distance = editdistance.osa.calculate_distance(str1, str2)
print(f"The edit distance between '{str1}' and '{str2}' is {distance}")
```

See the examples in the [examples](examples/osa_example.py) directory.

## Running Tests

The test suite is located in the [tests](tests/tests_osa.py) directory. To run the tests, execute:

```sh
python -m unittest discover -v
```

## License

This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
165 changes: 165 additions & 0 deletions editdistance/_edit_distance_osa.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#include "_edit_distance_osa.hpp"
#include <algorithm>
#include <cmath>

/// Fill the Optimal String Alignment (OSA) cost table for editing `a` into `b`.
///
/// The table has a.length() + 1 rows and b.length() + 1 columns. Cell [r][c]
/// is the cheapest cost of turning the first r characters of `a` into the
/// first c characters of `b` using the weights in `cost_map`.
///
/// @param a        Source string.
/// @param b        Target string.
/// @param cost_map Weight of each edit operation; read with at(), so a key
///                 that is needed but absent raises std::out_of_range.
/// @return The completed cost table.
std::vector<std::vector<double>> compute_dp_table(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
) {
    const int rows = a.length();
    const int cols = b.length();
    std::vector<std::vector<double>> table(rows + 1, std::vector<double>(cols + 1, 0.0));

    // Both weights are needed for the borders no matter what the inputs are,
    // so they are fetched once (DELETE first, then INSERT).
    const double delete_cost = cost_map.at(DELETE);
    const double insert_cost = cost_map.at(INSERT);

    // Left border: drop r source characters. Top border: add c target characters.
    for (int r = 0; r <= rows; ++r) {
        table[r][0] = r * delete_cost;
    }
    for (int c = 0; c <= cols; ++c) {
        table[0][c] = c * insert_cost;
    }

    for (int r = 1; r <= rows; ++r) {
        const char src = a[r - 1];
        for (int c = 1; c <= cols; ++c) {
            const char dst = b[c - 1];

            const double via_delete = table[r - 1][c] + delete_cost;
            const double via_insert = table[r][c - 1] + insert_cost;
            // REPLACE is only looked up when the characters actually differ.
            const double via_diagonal =
                table[r - 1][c - 1] + ((src == dst) ? 0.0 : cost_map.at(REPLACE));

            double best = std::min({via_delete, via_insert, via_diagonal});

            // Adjacent swap: the last two characters of both prefixes are
            // each other's mirror image.
            const bool is_swap = r > 1 && c > 1 && src == b[c - 2] && a[r - 2] == dst;
            if (is_swap) {
                best = std::min(best, table[r - 2][c - 2] + cost_map.at(TRANSPOSE));
            }

            table[r][c] = best;
        }
    }

    return table;
}


/// Compute the weighted OSA distance between `a` and `b`.
///
/// @param a        Source string.
/// @param b        Target string.
/// @param cost_map Weight of each edit operation.
/// @return The bottom-right cell of the table built by compute_dp_table().
double cpp_compute_distance(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
) {
    const std::vector<std::vector<double>> table = compute_dp_table(a, b, cost_map);
    // The table always has a.length() + 1 rows of b.length() + 1 cells, so the
    // last cell of the last row is the entry for the two full strings.
    return table.back().back();
}

std::vector<std::vector<Editop>> backtrack_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map,
const std::vector<std::vector<double>>& dp,
int i,
int j,
std::vector<Editop>& current_path
) {
if (i == 0 && j == 0) {
std::vector<Editop> reversed_path = current_path;
std::reverse(reversed_path.begin(), reversed_path.end());
return {reversed_path};
}

std::vector<std::vector<Editop>> all_paths;
double current_cost = dp[i][j];
const double tol = 1e-6;


if (i > 0 && std::abs((dp[i-1][j] + cost_map.at(DELETE)) - current_cost) < tol) {
Editop op(DELETE, i-1, i-1, cost_map.at(DELETE), std::string(1, a[i-1]));
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j, current_path);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}

if (j > 0 && std::abs((dp[i][j-1] + cost_map.at(INSERT)) - current_cost) < tol) {
Editop op(INSERT, i, i, cost_map.at(INSERT), std::string(1, b[j-1]));
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i, j-1, current_path);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}


if (i > 0 && j > 0) {
double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE);
if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) {
std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]);
Editop op(REPLACE, i-1, j-1, sub_cost, out_char);
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j-1, current_path);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}
}


if (i > 1 && j > 1 &&
a[i-1] == b[j-2] && a[i-2] == b[j-1] &&
std::abs((dp[i-2][j-2] + cost_map.at(TRANSPOSE)) - current_cost) < tol) {
std::string transpose_str = std::string(1, b[j-2]) + std::string(1, b[j-1]);
Editop op(TRANSPOSE, i-2, j-2, cost_map.at(TRANSPOSE), transpose_str);
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-2, j-2, current_path);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}

return all_paths;
}


/// Return every minimal-cost OSA edit sequence that turns `a` into `b`.
///
/// Builds the cost table with compute_dp_table() and backtracks from its
/// bottom-right cell with backtrack_all_paths().
///
/// @param a        Source string.
/// @param b        Target string.
/// @param cost_map Weight of each edit operation.
/// @return One vector of Editop per optimal path.
std::vector<std::vector<Editop>> cpp_compute_all_paths(
    const std::string& a,
    const std::string& b,
    const std::map<EditopName, double>& cost_map
) {
    const std::vector<std::vector<double>> table = compute_dp_table(a, b, cost_map);

    // Backtracking starts at the cell that covers both complete strings.
    const int start_row = static_cast<int>(a.length());
    const int start_col = static_cast<int>(b.length());

    std::vector<Editop> scratch_path;
    return backtrack_all_paths(a, b, cost_map, table, start_row, start_col, scratch_path);
}


void cpp_print_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
) {
auto paths = cpp_compute_all_paths(a, b, cost_map);
double distance = cpp_compute_distance(a, b, cost_map);

std::cout << "OSA Distance from '" << a << "' to '" << b << "': " << distance << std::endl;
std::cout << "Number of optimal edit sequences: " << paths.size() << std::endl;
std::cout << std::endl;

for (size_t i = 0; i < paths.size(); ++i) {
std::cout << "Path " << (i + 1) << ":" << std::endl;
for (const auto& op : paths[i]) {
std::cout << " " << op << std::endl;
}
std::cout << std::endl;
}
}

/// Map an EditopName to its upper-case label.
///
/// @param name Operation kind.
/// @return "INSERT", "DELETE", "REPLACE" or "TRANSPOSE"; "UNKNOWN" for any
///         value that is not one of the four enumerators.
std::string editop_name_to_string(EditopName name) {
    if (name == INSERT) {
        return "INSERT";
    }
    if (name == DELETE) {
        return "DELETE";
    }
    if (name == REPLACE) {
        return "REPLACE";
    }
    if (name == TRANSPOSE) {
        return "TRANSPOSE";
    }
    return "UNKNOWN";
}

/// Stream an Editop in the form
/// Editop(name=..., src_idx=..., dst_idx=..., cost=..., output_string='...').
///
/// @param os Destination stream.
/// @param op Operation to print.
/// @return `os`, to allow chaining.
std::ostream& operator<<(std::ostream& os, const Editop& op) {
    // One statement per field; the sequence of insertions is unchanged.
    os << "Editop(name=" << editop_name_to_string(op.name);
    os << ", src_idx=" << op.src_idx;
    os << ", dst_idx=" << op.dst_idx;
    os << ", cost=" << op.cost;
    os << ", output_string='" << op.output_string << "')";
    return os;
}
72 changes: 72 additions & 0 deletions editdistance/_edit_distance_osa.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#ifndef EDIT_DISTANCE_OSA_HPP
#define EDIT_DISTANCE_OSA_HPP

#include <string>
#include <vector>
#include <map>
#include <iostream>


/// Kinds of edit operation used by the OSA routines; also the key type of the
/// cost maps they accept. The numeric values are spelled out and match the
/// implicit numbering of the declaration order.
enum EditopName {
    INSERT = 0,     ///< Add a character of the target string.
    DELETE = 1,     ///< Drop a character of the source string.
    REPLACE = 2,    ///< Substitute one character (also used for matches, at cost 0).
    TRANSPOSE = 3   ///< Swap two adjacent characters.
};

/// One step of an edit path.
struct Editop {
    EditopName name;            // kind of operation
    int src_idx;                // first index recorded for the step
    int dst_idx;                // second index recorded for the step
    double cost;                // weight charged for this step
    std::string output_string;  // character(s) attached to the step

    /// Default state: a zero-cost INSERT at index 0 with an empty string.
    Editop() : Editop(INSERT, 0, 0, 0.0, "") {}

    /// Build an operation from explicit field values.
    Editop(EditopName n, int si, int di, double c, const std::string& os)
        : name(n), src_idx(si), dst_idx(di), cost(c), output_string(os) {}
};


/// Build the OSA cost table for editing `a` into `b`.
///
/// The result has a.length() + 1 rows and b.length() + 1 columns; cell [i][j]
/// is the minimal cost of editing the first i characters of `a` into the
/// first j characters of `b`. Weights are read from `cost_map` with at().
std::vector<std::vector<double>> compute_dp_table(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
);


/// Return the weighted OSA distance between `a` and `b`, i.e. the
/// bottom-right cell of the table built by compute_dp_table().
double cpp_compute_distance(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
);


/// Recursively collect every minimal-cost edit path ending at cell (i, j) of `dp`.
///
/// `dp` is the table produced by compute_dp_table() for the same `a`, `b` and
/// `cost_map`. `current_path` holds the operations chosen so far (latest
/// first); the function pushes and pops on it while recursing. Each returned
/// path is ordered from the start of the strings to the end.
std::vector<std::vector<Editop>> backtrack_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map,
const std::vector<std::vector<double>>& dp,
int i,
int j,
std::vector<Editop>& current_path
);


/// Return every minimal-cost OSA edit path from `a` to `b`: builds the cost
/// table and backtracks from the cell covering both complete strings.
std::vector<std::vector<Editop>> cpp_compute_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
);


/// Print the OSA distance, the number of optimal edit sequences and each
/// sequence's operations to std::cout.
void cpp_print_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
);


/// Return the upper-case label of `name` ("INSERT", "DELETE", "REPLACE",
/// "TRANSPOSE"), or "UNKNOWN" for any other value.
std::string editop_name_to_string(EditopName name);
/// Write `op` to `os` as
/// Editop(name=..., src_idx=..., dst_idx=..., cost=..., output_string='...').
std::ostream& operator<<(std::ostream& os, const Editop& op);

#endif // EDIT_DISTANCE_OSA_HPP
Loading