diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..8f91a40 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,62 @@ +name: Release to PyPI +on: + release: + types: [published] + +jobs: + build-wheels: + name: Build wheels - ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + args: --release --out dist + + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-${{ matrix.os }} + path: dist + + build-sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: wheels-sdist + path: dist + + publish: + name: Publish to PyPI + needs: [build-wheels, build-sdist] + runs-on: ubuntu-latest + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + pattern: wheels-* + merge-multiple: true + path: dist + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index dac8377..90b9aeb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,8 +24,6 @@ jobs: test-file: pepmatch/tests/test_best_match.py - name: Discontinuous Epitope test-file: pepmatch/tests/test_discontinuous_search.py - - name: Parallel - test-file: pepmatch/tests/test_parallel_match.py - name: Output test-file: pepmatch/tests/test_output.py diff --git a/.gitignore b/.gitignore index 4290d86..006f268 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ notebooks/ old_code/ pepmatch.egg-info/ pepmatch/__pycache__ +pepmatch/rs-engine/target/ 
proteomes/*.pdb proteomes/*.phr proteomes/*.pin @@ -31,4 +32,4 @@ test.csv wheelhouse/ # keep test data -!pepmatch/tests/data/ \ No newline at end of file +!pepmatch/tests/data/ diff --git a/pepmatch/__init__.py b/pepmatch/__init__.py index 888b5e0..5677240 100644 --- a/pepmatch/__init__.py +++ b/pepmatch/__init__.py @@ -1,8 +1,6 @@ from .preprocessor import Preprocessor from .matcher import Matcher -from .parallel_match import ParallelMatcher -from .benchmarker import Benchmarker from .version import __version__ -__all__ = ['Preprocessor', 'Matcher', 'ParallelMatcher', 'Benchmarker', '__version__'] \ No newline at end of file +__all__ = ['Preprocessor', 'Matcher', '__version__'] diff --git a/pepmatch/benchmarker.py b/pepmatch/benchmarker.py deleted file mode 100644 index d2160e3..0000000 --- a/pepmatch/benchmarker.py +++ /dev/null @@ -1,57 +0,0 @@ -from .preprocessor import Preprocessor -from .parallel_match import ParallelMatcher - - -class Benchmarker(ParallelMatcher): - """Benchmarker class for PEPMatch. 
This will be the class that is called by - benchmarking.py that calls the necessary methods to run the benchmarking.""" - - def __init__( - self, benchmark: str, query: str, proteome: str, lengths: list, max_mismatches: int, - method_parameters: dict - ): - self.benchmark = benchmark - self.query = query - self.proteome = proteome - self.max_mismatches = max_mismatches - self.lengths = lengths - self.method_parameters = method_parameters - - super().__init__( - query=query, - proteome_file=proteome, - max_mismatches=max_mismatches, - best_match=True if max_mismatches == -1 else False, - output_format='dataframe', - sequence_version=False, - n_jobs=5 - ) - - def __str__(self): - return 'PEPMatch' - - def preprocess_query(self) -> None: - """No query preprocessing, raise TypeError""" - raise TypeError(self.__str__() + ' does not preprocess queries.\n') - - def preprocess_proteome(self) -> None: - """Preprocess proteome once or multiple times for each split calculated.""" - - preprocessor = Preprocessor(self.proteome) - - if self.benchmark == 'mhc_ligands': - preprocessor.sql_proteome(k=9) - elif self.benchmark == 'coronavirus': - for i in range(2, 6): - preprocessor.pickle_proteome(k=i) - elif self.benchmark == 'neoepitopes': - preprocessor.pickle_proteome(k=3) - elif self.benchmark == 'milk': - preprocessor.sql_proteome(k=15) - for i in [2, 3, 7]: - preprocessor.pickle_proteome(k=i) - - - def search(self) -> list: - """Call overarching match function which returns a dataframe of matches.""" - return self.match() diff --git a/pepmatch/hamming.c b/pepmatch/hamming.c deleted file mode 100644 index 0a64b9c..0000000 --- a/pepmatch/hamming.c +++ /dev/null @@ -1,43 +0,0 @@ -#define PY_SSIZE_T_CLEAN -#include - - -static PyObject* hamming(PyObject* self, PyObject* args) { - const char *kmer1; - const char *kmer2; - int max_mismatches; - Py_ssize_t len1, len2; - - if (!PyArg_ParseTuple(args, "s#s#i", &kmer1, &len1, &kmer2, &len2, &max_mismatches)) { - return NULL; - } - - 
int mismatches = 0; - for (int i = 0; kmer1[i] && kmer2[i]; ++i) { - if (kmer1[i] != kmer2[i]) { - mismatches++; - if (mismatches > max_mismatches) { - return PyLong_FromLong(max_mismatches + 1); - } - } - } - - return PyLong_FromLong(mismatches); -} - -static PyMethodDef methods[] = { - {"hamming", hamming, METH_VARARGS, "Calculate the Hamming distance between two strings"}, - {NULL, NULL, 0, NULL} -}; - -static struct PyModuleDef module = { - PyModuleDef_HEAD_INIT, - "hamming", - NULL, - -1, - methods -}; - -PyMODINIT_FUNC PyInit_hamming(void) { - return PyModule_Create(&module); -} diff --git a/pepmatch/helpers.py b/pepmatch/helpers.py deleted file mode 100644 index 18e5080..0000000 --- a/pepmatch/helpers.py +++ /dev/null @@ -1,123 +0,0 @@ -import re -import polars as pl - -from Bio import SeqIO -from Bio.SeqRecord import SeqRecord - -PROTEIN_INDEX_MULTIPLIER= 10_000_000 - -class TqdmDummy: - """A dummy class that mimics tqdm for when it's not installed.""" - def __init__(self, *args, **kwargs): pass - def update(self, n=1): pass - def __enter__(self): return self - def __exit__(self, exc_type, exc_val, exc_tb): pass - - -def parse_fasta(file: str) -> list: - """Return a parsed Biopython SeqRecord object from a FASTA file. - Args: - file: path to FASTA file. - """ - return list(SeqIO.parse(file, 'fasta')) - - -def split_sequence(sequence: str, k: int) -> list: - """ - Splits a peptide into equal sized k-mers on a rolling basis. - Ex: k = 4, NSLFLTDLY --> ['NSLF', 'SLFL', 'LFLT', 'FLTD', 'LTDL', 'TDLY'] - - Args: - sequence: peptide sequence. - k: k-mer length. - """ - return [sequence[i:i+k] for i in range(len(sequence) - k + 1)] - - -def extract_metadata(record: SeqRecord, header_id: bool, seen_genes: set) -> list: - """Extract metadata from a FASTA header. - Args: - record: protein SeqRecord from proteome FASTA file. 
- """ - regexes = { - 'protein_id': re.compile(r"\|([^|]*)\|"), # between | and | - 'protein_name': re.compile(r"\s(.+?)\sOS"), # between space and space before OS - 'species': re.compile(r"OS=(.+?)\sOX"), # between OS= and space before OX - 'taxon_id': re.compile(r"OX=(.+?)(\s|$)"), # between OX= and space - 'gene': re.compile(r"GN=(.+?)(\s|$)"), # between GN= and space - 'pe_level': re.compile(r"PE=(.+?)(\s|$)"), # between PE= and space - 'sequence_version': re.compile(r"SV=(.+?)(\s|$)"), # between SV= and space - 'swissprot': re.compile(r"^(tr|sp)\|"), # between > and | - } - metadata = [] - swissprot = '0' - gene = '' - protein_id = '' - for key in regexes: # loop through compiled regexes to extract metadata - match = regexes[key].search(str(record.description)) - if match: - if key == 'swissprot': - val = '1' if match.group(1) == 'sp' else '0' - elif key == 'protein_id': - val = str(record.id) if header_id else match.group(1) - else: - val = match.group(1) - else: - if key == 'swissprot': - val = '0' - elif key == 'protein_id': - val = str(record.id) - elif key == 'protein_name': - header = str(record.description) - parts = header.split(None, 1) - val = parts[1] if len(parts) > 1 else '' # get the rest of the description after ID - elif key == 'sequence_version': - val = '1' - elif key == 'pe_level': - val = '0' - else: - val = '' - - if key == 'swissprot': - swissprot = val - elif key == 'gene': - gene = val - elif key == 'protein_id': - protein_id = val - - metadata.append(val) - - # gene priority logic: if a protein is in SwissProt and does not have a dash ("-") and gene not seen yet - if swissprot == '1' and '-' not in protein_id and gene and gene not in seen_genes: - seen_genes.add(gene) - metadata.insert(-1, '1') - else: - metadata.insert(-1, '0') - - return metadata - - -def output_matches(df: pl.DataFrame, output_format: str, output_name: str) -> None: - df_to_write = df.clone() # for files that can't do nested data, we convert mutated positions column 
to string - if "Mutated Positions" in df_to_write.columns and output_format in ['csv', 'tsv', 'xlsx']: - df_to_write = df_to_write.with_columns( - pl.when(pl.col("Mutated Positions").list.len() > 0) - .then(pl.format("[{}]", pl.col("Mutated Positions").list.eval(pl.element().cast(pl.Utf8)).list.join(", "))) - .otherwise(pl.lit("[]")) - .alias("Mutated Positions") - ) - - # appends '.' + filetype if the name does not already contain it - path = output_name.__str__() - if not path.lower().endswith(f".{output_format}"): - path += f".{output_format}" - - if output_format == 'csv': - df_to_write.write_csv(path) - elif output_format == 'tsv': - df_to_write.write_csv(path, separator='\t') - elif output_format == 'xlsx': - df_to_write.write_excel(path) - elif output_format == 'json': - df_to_write.write_json(path) - diff --git a/pepmatch/matcher.py b/pepmatch/matcher.py index 25ea00c..53d8e28 100755 --- a/pepmatch/matcher.py +++ b/pepmatch/matcher.py @@ -1,886 +1,326 @@ -from __future__ import annotations import os -import _pickle as pickle -import sqlite3 import polars as pl +from pathlib import Path +from Bio import SeqIO +from ._rs import rs_preprocess, rs_match, rs_discontinuous -from typing import Optional, Union -from collections import Counter, defaultdict - -from .helpers import ( - parse_fasta, split_sequence, extract_metadata, output_matches, TqdmDummy, PROTEIN_INDEX_MULTIPLIER -) -from .preprocessor import Preprocessor -from .hamming import hamming - -try: - from tqdm import tqdm # for progress bar during search -except ImportError: - tqdm = TqdmDummy - - -NUM_OUTPUT_COLUMNS = 14 VALID_OUTPUT_FORMATS = ['dataframe', 'csv', 'tsv', 'xlsx', 'json'] - +FASTA_EXTENSIONS = { + '.fasta', '.fas', '.fa', '.fna', '.ffn', '.faa', '.mpfa', '.frn' +} + +def output_matches(df: pl.DataFrame, output_format: str, output_name: str) -> None: + path = output_name.__str__() + if not path.lower().endswith(f".{output_format}"): + path += f".{output_format}" + if output_format == 
'csv': + df.write_csv(path) + elif output_format == 'tsv': + df.write_csv(path, separator='\t') + elif output_format == 'xlsx': + df.write_excel(path) + elif output_format == 'json': + df.write_json(path) class Matcher: - """ - Object class that class takes in a query (either FASTA file or Python list) - of peptides, a proteome file, and a maximum # of mismatches. It can either - find exact matches, matches with mismatches, or the best match for a peptide, - depending on these parameters. - - A k value can also be passed if the proteome is already preprocessed, which - most of the time it will be - so pass the k value to save time. - - If best match is selected and mismatching is to be done, one of two things - will happen: - 1. A k is specified already - just run mismatching and get the best match. - 2. A k is not specified - run best match code which preprocesses the proteome - multiple times to get the best match quickly. - - Optional: output and output_format arguments to write results to file. - Supported formats are "csv", "tsv", "xlsx", and "json".""" + """Searches query peptides against a preprocessed proteome index + and returns matches as a Polars DataFrame or output file.""" def __init__( self, query, proteome_file, - max_mismatches=-1, + max_mismatches=0, k=0, preprocessed_files_path='.', best_match=False, output_format='dataframe', output_name='', - sequence_version=True + sequence_version=True, ): - if k < 0 or k == 1: - raise ValueError('Invalid k value given. k cannot be negative or 1.') + + if best_match and k == 0: + self.k = 0 + self.k_specified = False + elif k == 0: + self.k = 5 + self.k_specified = False + else: + self.k = k + self.k_specified = True + + if k != 0 and k < 2: + raise ValueError('k must be >= 2.') if output_format not in VALID_OUTPUT_FORMATS: raise ValueError( - 'Invalid output format, please choose `dataframe`, ' - '`csv`, `tsv`, `xlsx`, or `json`.' + f'Invalid output format. 
Choose from: {VALID_OUTPUT_FORMATS}' ) - # initialize query and output name based on input type - self.query = self._initialize_query(query, proteome_file, output_name) - self.proteome = proteome_file + self.proteome_file = str(proteome_file) self.proteome_name = str(proteome_file).split('/')[-1].split('.')[0] - - # discontinuous epitopes and linear epitopes handling - store separately - self.discontinuous_epitopes = self._find_discontinuous_epitopes() - self.query = self._clean_query() - - assert self.query or self.discontinuous_epitopes, 'Query is empty.' - - self.lengths = sorted(set(len(peptide) for _, peptide in self.query)) self.max_mismatches = max_mismatches self.preprocessed_files_path = preprocessed_files_path self.best_match = best_match self.output_format = output_format self.sequence_version = sequence_version + self.output_name = output_name or 'PEPMatch_results' - # select format based on # of mismatches to pass to Preprocessor - # SQLite for exact matching - pickle for mismatching - self.preprocess_format = 'sql' if not max_mismatches else 'pickle' - - # initialize k and k_specified - if self.query: - self.k, self.k_specified = self._initialize_k(k) - - # for mismatching, if no k is specified, batch the peptides by length - # then later we will use the ideal k to search them - if max_mismatches > 0: - self.batched_peptides = self._batch_query() - - # best match where no mismatches is specified means we will preprocess - # the proteome by the smallest length, try to find exact matches, - # then preprocess by half the length and search for more and repeat - # until every peptide in the query has a match - if max_mismatches == -1: - self.ks = self._best_match_ks() - - - def _initialize_query( - self, query: Union[str, list], proteome_file: str, output_name: str - ) -> list: - """Initialize query and output name based on input type. - - Args: - query: either a FASTA file or a Python list of peptides. - proteome_file: the proteome FASTA file. 
- output_name: the name of the output file.""" + self.query = self._parse_query(query) + self.discontinuous_epitopes = self._find_discontinuous_epitopes() + self.query = self._clean_query() + if not self.query and not self.discontinuous_epitopes: + raise ValueError('Query is empty.') + + def _parse_query(self, query): if isinstance(query, list): - if output_name: - self.output_name = output_name - else: - self.output_name = 'PEPMatch_results' + return [(str(i + 1), seq.upper()) for i, seq in enumerate(query)] + + path = Path(query) + if not path.is_file(): + raise FileNotFoundError(f'Query file not found: {query}') + + if path.suffix.lower() in FASTA_EXTENSIONS: return [ - (str(i+1), seq.upper()) # number each peptide in the list - for i, seq in enumerate(query) if isinstance(seq, str) and seq.strip() - ] - - else: # parse from FASTA if not Python list - parsed_query = [ - (record.id, str(record.seq).upper()) - for record in parse_fasta(query) if str(record.seq).strip() + (record.id, str(record.seq).upper()) + for record in SeqIO.parse(str(path), 'fasta') ] - if output_name: - self.output_name = output_name - else: # output_name = query_name_to_proteome_name - query_name = str(query).split('/')[-1].split('.')[0] - proteome_name = str(proteome_file).split('/')[-1].split('.')[0] - self.output_name = f'{query_name}_to_{proteome_name}' - return parsed_query - - - def _find_discontinuous_epitopes(self) -> dict: - """Find discontinuous epitopes and store them in a dictionary - mapping their query_id to the parsed epitope.""" + elif path.suffix.lower() == '.txt': + with open(path) as f: + lines = [line.strip() for line in f if line.strip()] + return [(str(i + 1), seq.upper()) for i, seq in enumerate(lines)] + + raise ValueError( + f'Unsupported query format: {path.suffix}. ' + f'Use a Python list, .txt file, or FASTA file for the query.' 
+ ) + + def _find_discontinuous_epitopes(self): discontinuous_epitopes = {} for query_id, peptide in self.query: try: - # Check if the peptide string can be parsed as a discontinuous epitope - discontinuous_epitope = [(x[0], int(x[1:])) for x in peptide.split(', ')] - discontinuous_epitopes[query_id] = discontinuous_epitope + epitope = [(x[0], int(x[1:])) for x in peptide.split(', ')] + discontinuous_epitopes[query_id] = epitope except (ValueError, IndexError): continue return discontinuous_epitopes - - def _clean_query(self) -> list: - """Remove discontinuous epitopes from the main query list.""" - # Get the query_ids of all identified discontinuous epitopes + def _clean_query(self): discontinuous_ids = set(self.discontinuous_epitopes.keys()) - # Return a new query list excluding those IDs return [ - (query_id, peptide) for query_id, peptide in self.query - if query_id not in discontinuous_ids + (qid, seq) for qid, seq in self.query + if qid not in discontinuous_ids ] + def match(self): + linear_df = pl.DataFrame() + discontinuous_df = pl.DataFrame() - def _initialize_k(self, k: int) -> tuple: - """Initialize k and k_specified values based on k and max_mismatches input. 
- - Args: - k: the k-mer length to use for matching, 0 if not specified.""" - - if k > 1: - return k, True - else: # use the length of the shortest peptide for exact match - if not self.max_mismatches and not k: - return self.lengths[0], False + if self.query: + if self.best_match and self.k_specified: + results = self._search(self.k, self.max_mismatches) + linear_df = self._best_match_filter(self._to_dataframe(results)) + elif self.best_match: + linear_df = self.best_match_search() else: - return 0, False - + results = self._search(self.k, self.max_mismatches) + linear_df = self._to_dataframe(results) + + if self.discontinuous_epitopes: + pepidx_path = self._pepidx_path(2) + if not os.path.isfile(pepidx_path): + rs_preprocess(self.proteome_file, 2, pepidx_path) + epitopes = [ + (qid, residues) for qid, residues in self.discontinuous_epitopes.items() + ] + results = rs_discontinuous(pepidx_path, epitopes, self.max_mismatches) + discontinuous_df = self._to_dataframe(results) - def _batch_query(self) -> dict: - """Batch peptides together by ideal k so it can be used when searching. 
- If k is specified, just return the query as a dictionary with k as the key.""" + dfs = [d for d in [linear_df, discontinuous_df] if d.height > 0] + df = pl.concat(dfs, how="vertical") if dfs else linear_df - if self.k_specified: - return {self.k: self.query} + if self.output_format == 'dataframe': + return df else: - batched_peptides = defaultdict(list) - for query_id, seq in self.query: - key = len(seq) // (self.max_mismatches + 1) - batched_peptides[key].append((query_id, seq)) - - return dict(batched_peptides) + output_matches(df, self.output_format, self.output_name) + def _pepidx_path(self, k): + return os.path.join( + self.preprocessed_files_path, f'{self.proteome_name}_{k}mers.pepidx' + ) - def _best_match_ks(self) -> list: - """For the special case where mismatching is to be done, k is not - specified and best match is selected, we need to get all the k values - to preprocess the proteome multiple times. Starting with the length - of the smallest peptide and then halving until we get to 2.""" + def _search(self, k, max_mismatches, peptides=None): + pepidx_path = self._pepidx_path(k) + if not os.path.isfile(pepidx_path): + print(f"Preprocessing {self.proteome_name} with k={k}...") + rs_preprocess(self.proteome_file, k, pepidx_path) + query = peptides or self.query + print(f"Searching {len(query)} peptides against {self.proteome_name} (k={k}, max_mismatches={max_mismatches})...") + return rs_match(pepidx_path, query, k, max_mismatches) - initial_k = self.lengths[0] - ks = [initial_k] + def best_match_search(self): + peptides_remaining = self.query.copy() + all_results = [] - # halve the length and add k values until we get to 2 - # and make sure 2 is a k in the list + initial_k = min(len(seq) for _, seq in peptides_remaining) + ks = [initial_k] while initial_k > 2: initial_k //= 2 + ks.append(max(initial_k, 2)) + ks = sorted(set(ks), reverse=True) - if initial_k == 1: - ks.append(2) - else: - ks.append(initial_k) - - # mismatching function uses batched 
peptides, for best match, batch into one - self.batched_peptides = {0: self.query} - - return ks - - - def match(self) -> Union[pl.DataFrame, None]: - """Overarching function that calls the appropriate search matching function - based on the parameters.""" + for k in ks: + if not peptides_remaining: + break - total_items = len(self.query) + len(self.discontinuous_epitopes) - with tqdm(total=total_items, desc="Matching peptides", unit="peptide") as pbar: - query_matches = [] - if self.query: - if self.max_mismatches == -1: - query_matches = self.best_match_search(pbar=pbar) - elif self.max_mismatches > 0: - query_matches = self.mismatch_search(pbar=pbar) - else: - query_matches = self.exact_match_search(pbar=pbar) + min_len = min(len(seq) for _, seq in peptides_remaining) + max_mm = (min_len // k) - 1 + if max_mm < 0: + max_mm = 0 - discontinuous_matches = [] - if self.discontinuous_epitopes: - discontinuous_matches = self.discontinuous_search(pbar=pbar) + results = self._search(k, max_mm, peptides_remaining) - query_df = self._dataframe_matches(query_matches) - discontinuous_df = self._dataframe_matches(discontinuous_matches) - df = pl.concat([query_df, discontinuous_df], how="vertical") + matched_ids = set() + for row in results: + if row[2] != '': + all_results.append(row) + matched_ids.add(row[0]) - if self.output_format == 'dataframe': - return df - else: - output_matches(df, self.output_format, self.output_name) - + peptides_remaining = [ + (qid, seq) for qid, seq in peptides_remaining if qid not in matched_ids + ] + print(f" -> k={k}, mismatches<={max_mm}: {len(peptides_remaining)} remaining") - def exact_match_search(self, pbar: Optional[TqdmDummy] = None) -> list: - """Using preprocessed data within a SQLite database and the query peptides, - find the peptide matches within the proteome without any residue - substitutions.""" + if peptides_remaining: + max_mm = min(len(seq) for _, seq in peptides_remaining) // 2 - preprocessed_db = os.path.join( - 
self.preprocessed_files_path, self.proteome_name + '.db' - ) - kmers_table_name = f'{self.proteome_name}_{str(self.k)}mers' - metadata_table_name = f'{self.proteome_name}_metadata' + while peptides_remaining: + shortest_len = min(len(seq) for _, seq in peptides_remaining) + if max_mm >= shortest_len: + break - conn = sqlite3.connect(preprocessed_db) - cursor = conn.cursor() + max_mm += 1 + results = self._search(2, max_mm, peptides_remaining) - cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{kmers_table_name}'") - if cursor.fetchone() is None: - cursor.close() - conn.close() + matched_ids = set() + for row in results: + if row[2] != '': + all_results.append(row) + matched_ids.add(row[0]) - print( - f"\nMissing preprocessed file or table. Creating table for k={self.k}. This may take a bit..." + peptides_remaining = [ + (qid, seq) for qid, seq in peptides_remaining if qid not in matched_ids + ] + print(f" -> k=2, mismatches<={max_mm}: {len(peptides_remaining)} remaining") + + if peptides_remaining: + for qid, seq in peptides_remaining: + all_results.append([qid, seq] + [''] * 14) + + df = self._to_dataframe(all_results) + return self._best_match_filter(df) + + def _best_match_filter(self, df): + matched_df = df.filter(pl.col("Matched Sequence").is_not_null()) + unmatched_df = df.filter(pl.col("Matched Sequence").is_null()) + + if matched_df.height > 0: + matched_df = matched_df.with_columns( + pl.col("Mismatches").min().over("Query ID").alias("_min_mm") + ).filter( + pl.col("Mismatches") == pl.col("_min_mm") + ).drop("_min_mm") + + matched_df = matched_df.with_columns( + pl.col("Gene Priority").max().over("Query ID").alias("_max_gp") + ).filter( + pl.col("Gene Priority") == pl.col("_max_gp") + ).drop("_max_gp") + + matched_df = ( + matched_df.with_columns( + (~pl.col("Protein ID").str.contains("-")).cast(pl.Int8).alias("_is_canonical") + ).with_columns( + pl.col("_is_canonical").max().over("Query ID").alias("_max_canonical") + ).filter( 
+ pl.col("_is_canonical") == pl.col("_max_canonical") + ).drop(["_is_canonical", "_max_canonical"]) ) - - p = Preprocessor(self.proteome, preprocessed_files_path=self.preprocessed_files_path) - p.sql_proteome(self.k) - - conn = sqlite3.connect(preprocessed_db) - cursor = conn.cursor() - - all_matches = [] - for query_id, peptide in self.query: - - if len(peptide) < self.k: - continue - # split peptide into kmers - only use kmers necessary that overlap entire peptide - all_kmers = split_sequence(peptide, self.k) - - target_indices = list(range(0, len(all_kmers), self.k)) - if (len(all_kmers) - 1) not in target_indices: - target_indices.append(len(all_kmers) - 1) - target_indices = sorted(list(set(target_indices))) - - target_kmer_positions = defaultdict(list) - for index in target_indices: - kmer = all_kmers[index] - target_kmer_positions[kmer].append(index) - - unique_target_kmers = list(target_kmer_positions.keys()) - sql_placeholders = ', '.join('?' * len(unique_target_kmers)) - sql_query = f""" - SELECT kmer, idx FROM "{kmers_table_name}" - WHERE kmer IN ({sql_placeholders}) - """ - cursor.execute(sql_query, unique_target_kmers) - kmer_indexes = cursor.fetchall() - - kmer_hit_list = [] - for kmer, db_index in kmer_indexes: - correct_positions = target_kmer_positions.get(kmer, []) - for pos in correct_positions: - kmer_hit_list.append(db_index - pos) - - matches = [] - sum_hits = Counter(kmer_hit_list) - for hit, count in sum_hits.items(): - if count == len(target_indices): - matches.append(hit) - - processed_matches = self._process_exact_matches( - query_id, peptide, matches, cursor, metadata_table_name + matched_df = matched_df.with_columns( + pl.col("SwissProt Reviewed").cast(pl.Int8).max().over("Query ID").alias("_max_reviewed") + ).filter( + pl.col("SwissProt Reviewed").cast(pl.Int8) == pl.col("_max_reviewed") + ).drop("_max_reviewed") + + matched_df = matched_df.with_columns( + pl.col("Protein Existence Level").min().over("Query ID").alias("_min_pe") + 
).filter( + pl.col("Protein Existence Level") == pl.col("_min_pe") + ).drop("_min_pe") + + matched_df = ( + matched_df.with_columns( + (~pl.col("Protein Name").str.contains("Fragment")).alias("_not_fragment") + ).with_columns( + pl.col("_not_fragment").any().over("Query ID").alias("_has_non_fragment") + ).filter( + (pl.col("_not_fragment") & pl.col("_has_non_fragment")) | + (~pl.col("_has_non_fragment")) + ).drop(["_not_fragment", "_has_non_fragment"]) ) - all_matches.extend(processed_matches) - pbar.update(1) - - cursor.close() - conn.close() - - return all_matches - - - def _process_exact_matches( - self, query_id: str, peptide: str, matches: list, cursor: sqlite3.Cursor, metadata_table_name: str - ) -> list: - """Extract all metadata for the exact matches and return as a list of tuples. - - Args: - peptide: the query peptide. - matches: the list of exact matches for the peptide. - cursor: the cursor object to execute SQL queries. - metadata_table_name: the name of the metadata table in the database.""" - - all_matches = [] - if not matches: - all_matches.append((query_id, peptide) + (None,) * NUM_OUTPUT_COLUMNS) - else: - for match in matches: - protein_number = (match - (match % PROTEIN_INDEX_MULTIPLIER)) // PROTEIN_INDEX_MULTIPLIER - query = f"""SELECT * - FROM "{metadata_table_name}" - WHERE protein_number = "{protein_number}" - """ - cursor.execute(query) - protein_data = cursor.fetchone() - match_data = ( - query_id, # query identifier - peptide, # query peptide - peptide, # matched peptide (same as query) - protein_data[1], # protein ID - protein_data[2], # protein name - protein_data[3], # species - protein_data[4], # taxon ID - protein_data[5], # gene symbol - 0, # 0 mismatches for exact match - [], # mutated positions (none) - (match % PROTEIN_INDEX_MULTIPLIER) + 1, # index start - (match % PROTEIN_INDEX_MULTIPLIER) + len(peptide), # index end - protein_data[6], # protein existence level - protein_data[7], # sequence version - protein_data[8], # gene 
priority flag - protein_data[9]) # swissprot flag - - all_matches.append(match_data) - - return all_matches - - - def mismatch_search(self, pbar: Optional[TqdmDummy] = None) -> list: - """Using preprocessed data within a serialized pickle files, the query - peptides, and a maximum number of residue substitutions, find all peptide - matches up to and including the maximum number of residue substitutions. - - This will utilize the pigeon hole principle to find all possible matches. - We first search for any k-mer exact matches in the proteome and then - check the left and right k-mer neighbors for mismatches. If the number of - mismatches is less than or equal to the max, then it's a match.""" - - all_matches = [] - for k, peptides in self.batched_peptides.items(): # iterate through each batch - if not self.k_specified: # k: [peptides] - self.k = k - try: - kmer_dict, rev_kmer_dict, metadata_dict = self._read_pickle_files() - except FileNotFoundError: # do preprocessing if pickle files don't exist - print(f"\nPickle files not found, building files for k={self.k}. 
This may take a bit...") - Preprocessor(self.proteome).pickle_proteome(self.k) - kmer_dict, rev_kmer_dict, metadata_dict = self._read_pickle_files() - - for query_id, peptide in peptides: - - all_kmers = split_sequence(peptide, self.k) - - if len(peptide) % self.k == 0: # faster search if possible - matches = self._find_even_split_matches( - all_kmers, kmer_dict, rev_kmer_dict, len(peptide) - ) - - else: # slower search if necessary - matches = self._find_uneven_split_matches( - all_kmers, kmer_dict, rev_kmer_dict, len(peptide) - ) - - processed_matches = self._process_mismatch_matches( - query_id, peptide, matches, metadata_dict - ) - all_matches.extend(processed_matches) - if not self.best_match: - pbar.update(1) - - return all_matches - - - def _read_pickle_files(self) -> tuple: - """Read in the already created pickle files for each dictionary in the - preprocessing step.""" - - with open(os.path.join(self.preprocessed_files_path, - f'{self.proteome_name}_{str(self.k)}mers.pkl'), 'rb') as f: - kmer_dict = pickle.load(f) - - with open(os.path.join(self.preprocessed_files_path, - f'{self.proteome_name}_metadata.pkl'), 'rb') as f: - metadata_dict = pickle.load(f) - - rev_kmer_dict = {i: k for k, v in kmer_dict.items() for i in v} - - return kmer_dict, rev_kmer_dict, metadata_dict - - - def _find_even_split_matches( - self, kmers: list, kmer_dict: dict, rev_kmer_dict: dict, peptide_length: int - ) -> list: - """If the peptide length is evenly divisible by k, perform faster search. - Check the associated k-mers for the left and right neighbors of any - exact matching k-mers in the proteome. - - EXAMPLE: "YLLDLHSYL", k=3 -> check only ["YLL", "DLH", "SYL"]. - If, say "DLH" is found somewhere in the proteome, check the neighboring - k-mers to the left and right to see if "YLL" (left neighbor) and "SYL" - (right neighbor) have mismatches with the proteome k-mers, return any - matches that are less than or equal to self.max_mismatches. 
- - Args: - kmers: the k-mers of the query peptide. - kmer_dict: the k-mer -> index dictionary. - rev_kmer_dict: the index -> k-mer dictionary. - peptide_length: the length of the query peptide.""" - - matches = set() - for idx in range(0, len(kmers), self.k): # gets only the k-mers to check - kmer_hits = kmer_dict.get(kmers[idx], []) - for kmer_hit in kmer_hits: - - mismatches = 0 - mismatches = self._check_left_neighbors( - kmers, idx, kmer_hit, rev_kmer_dict, mismatches - ) - - mismatches = self._check_right_neighbors( - kmers, idx, kmer_hit, rev_kmer_dict, mismatches - ) - - if mismatches <= self.max_mismatches: - matched_peptide = '' # add k-mers to get the matched peptide - for i in range(0, peptide_length, self.k): - matched_peptide += rev_kmer_dict[kmer_hit - idx + i] - - matches.add((matched_peptide, mismatches, kmer_hit - idx)) - - if self.best_match and not mismatches: # can't have a better match - return list(matches) - - else: - continue - - return list(matches) - - - def _find_uneven_split_matches( - self, kmers: list, kmer_dict: dict, rev_kmer_dict: dict, peptide_length: int - ) -> list: - """If the peptide length is NOT evenly divisible by k, perform slow search. - Check the associated residues for the left and right neighbors of any - exact matching k-mers in the proteome. - - EXAMPLE: "YLLDLHSYL", k=3 -> check all k-mers: - ["YLL", "LLD", "LDL", "DLH", "LHS", "HSY", "SYL"]. - - If, say "DLH" is found somewhere in the proteome, check left "L" from "LDL", - left "L" from "LLD", and "Y" from "YLL" to for any mismatches. Then do the - same for the residues of the right neighbors. Return any matches that are - less than or equal to self.max_mismatches. - - Args: - kmers: the k-mers of the query peptide. - kmer_dict: the k-mer -> index dictionary. - rev_kmer_dict: the index -> k-mer dictionary. 
- peptide_length: the length of the query peptide.""" - - matches = set() - for idx in range(0, len(kmers)): # every k-mer - kmer_hits = kmer_dict.get(kmers[idx], []) - for kmer_hit in kmer_hits: - - mismatches = 0 - mismatches = self._check_left_residues( - kmers, idx, kmer_hit, rev_kmer_dict, mismatches - ) - - mismatches = self._check_right_residues( - kmers, idx, kmer_hit, rev_kmer_dict, mismatches - ) - - if mismatches <= self.max_mismatches: - matched_peptide = '' - for i in range(0, peptide_length, self.k): - if i + self.k >= peptide_length: # handle last k-mer when uneven split - remaining_res = peptide_length - i - last_kmer = rev_kmer_dict[kmer_hit - idx + i - (self.k - remaining_res)] - matched_peptide += last_kmer[-remaining_res:] # attach remaining residues - else: - matched_peptide += rev_kmer_dict[kmer_hit - idx + i] - - matches.add((matched_peptide, mismatches, kmer_hit - idx)) - - if self.best_match and not mismatches: # can't have a better match - return matches - - return matches - - - def _check_left_neighbors( - self, kmers: list, idx: int, kmer_hit: int, rev_kmer_dict: dict, mismatches: int - ) -> int: - """Get mismatches of left k-mer neighbors in proteome. - - Args: - kmers: the k-mers of the query peptide. - idx: the index of the k-mer in the query peptide. - kmer_hit: the index of the k-mer hit in the proteome. - rev_kmer_dict: the index -> k-mer dictionary. - mismatches: the number of mismatches so far.""" - - for i in range(0, idx, self.k): - if mismatches > self.max_mismatches: - return 100 - - kmer_to_check = rev_kmer_dict.get(kmer_hit + i - idx) - if kmer_to_check is not None: - mismatches += hamming(kmer_to_check, kmers[i], self.max_mismatches) - else: - return 100 - - return mismatches - - - def _check_right_neighbors( - self, kmers: list, idx: int, kmer_hit: int, rev_kmer_dict: dict, mismatches: int - ) -> int: - """Get mismatches of right k-mer neighbors in proteome. - - Args: - kmers: the k-mers of the query peptide. 
- idx: the index of the k-mer in the query peptide. - kmer_hit: the index of the k-mer hit in the proteome. - rev_kmer_dict: the index -> k-mer dictionary. - mismatches: the number of mismatches so far.""" - - for i in range(self.k + idx, len(kmers), self.k): - if mismatches > self.max_mismatches: - return 100 - - kmer_to_check = rev_kmer_dict.get(kmer_hit + i - idx) - if kmer_to_check is not None: - mismatches += hamming(kmer_to_check, kmers[i], self.max_mismatches) - else: - return 100 - - return mismatches - - - def _check_left_residues( - self, kmers: list, idx: int, kmer_hit: int, rev_kmer_dict: dict, mismatches: int - ) -> int: - """Get mismatches of left residues of left k-mer neighbors in proteome. - - Args: - kmers: the k-mers of the query peptide. - idx: the index of the k-mer in the query peptide. - kmer_hit: the index of the k-mer hit in the proteome. - rev_kmer_dict: the index -> k-mer dictionary. - mismatches: the number of mismatches so far.""" - - for i in range(0, idx): - if mismatches > self.max_mismatches: - return 100 - - kmer_to_check = rev_kmer_dict.get(kmer_hit + i - idx) - if kmer_to_check is not None: - if kmer_to_check[0] != kmers[i][0]: - mismatches += 1 - else: - return 100 - - return mismatches - - - def _check_right_residues( - self, kmers: list, idx: int, kmer_hit: int, rev_kmer_dict: dict, mismatches: int - ) -> int: - """Get mismatches of right residues of right k-mer neighbors in proteome. - - Args: - kmers: the k-mers of the query peptide. - idx: the index of the k-mer in the query peptide. - kmer_hit: the index of the k-mer hit in the proteome. - rev_kmer_dict: the index -> k-mer dictionary. 
- mismatches: the number of mismatches so far.""" - - for i in range(idx + 1, len(kmers)): - if mismatches > self.max_mismatches: # check before since we checked left first - return 100 - - kmer_to_check = rev_kmer_dict.get(kmer_hit + i - idx) - if kmer_to_check is not None: - if kmer_to_check[-1] != kmers[i][-1]: - mismatches += 1 - else: - return 100 - - return mismatches - - - def _process_mismatch_matches( - self, query_id: str, peptide: str, matches: list, metadata_dict: dict - ) -> list: - """Extract all metadata for the mismatch matches and return as a list of tuples. - - Args: - peptide: the query peptide. - matches: the list of mismatch matches for the peptide. - metadata_dict: the protein number -> metadata dictionary.""" - - all_matches = [] - if not matches: - all_matches.append((query_id, peptide) + (None,) * NUM_OUTPUT_COLUMNS) - else: - for match in matches: - metadata_key = (match[2] - (match[2] % PROTEIN_INDEX_MULTIPLIER)) // PROTEIN_INDEX_MULTIPLIER - metadata = metadata_dict[metadata_key] - - mutated_positions = [ - i+1 for i in range(len(peptide)) if peptide[i] != match[0][i] - ] - index_start = int((match[2] % PROTEIN_INDEX_MULTIPLIER) + 1) - index_end = int((match[2] % PROTEIN_INDEX_MULTIPLIER) + len(peptide)) - - taxon_id = int(metadata[3]) if metadata[3] else None - pe_level = int(metadata[5]) if metadata[5] else None - - match_data = ( - query_id, # query identifier - peptide, # query peptide - match[0], # matched peptide - metadata[0], # protein ID - metadata[1], # protein name - metadata[2], # species - taxon_id, # taxon ID - metadata[4], # gene symbol - int(match[1]), # mismatches count - mutated_positions, # mutated positions - index_start, # index start - index_end, # index end - pe_level, # protein existence level - metadata[6], # sequence version - metadata[7], # gene priority flag - metadata[8] # swissprot flag - ) - all_matches.append(match_data) - - return all_matches - - - def best_match_search(self, pbar: Optional[TqdmDummy] = 
None) -> list: - """Finds the best match for each peptide by iteratively decreasing k and - increasing the allowed number of mismatches, reporting progress after each stage.""" - - print("\n=== Warning: best match search feature of PEPMatch may take awhile. ===") - all_found_matches = [] - peptides_to_search = self.query.copy() - initial_peptide_count = len(peptides_to_search) - print(f"Starting best match search for {initial_peptide_count} peptides.") - - # STAGE 1: Try top k values to get matches that are quicker to get - if self.ks: - for k in self.ks: - if not peptides_to_search: - break # stop if all peptides have been matched - - min_len = min(len(p[1]) for p in peptides_to_search) - self.max_mismatches = (min_len // k) - 1 - if self.max_mismatches < 0: - self.max_mismatches = 0 - - self.k, self.k_specified = k, True - self.query, self.batched_peptides = peptides_to_search, {0: peptides_to_search} - - if self.max_mismatches == 0: - matches_this_pass = self.exact_match_search(pbar=TqdmDummy()) - else: - matches_this_pass = self.mismatch_search(pbar=TqdmDummy()) - - peptides_still_unmatched = [] - peptides_matched_count = 0 - - for match in matches_this_pass: - if match[2] is not None: - all_found_matches.append(match) - peptides_matched_count += 1 - else: - peptides_still_unmatched.append((match[0], match[1])) - - print( - f"-> k={k}, mismatches<={self.max_mismatches}: {len(peptides_still_unmatched)} remaining." 
- ) - peptides_to_search = peptides_still_unmatched - - # STAGE 2: For peptides not found above, use k=2 and increase the mismatch threshold - if peptides_to_search: - self.k, self.k_specified = 2, True - self.max_mismatches = (min(len(p[1]) for p in peptides_to_search) // 2) - - while peptides_to_search: - shortest_len = min(len(p[1]) for p in peptides_to_search) - if self.max_mismatches >= shortest_len: - print(f"Stopping search: required mismatches ({self.max_mismatches}) exceeds peptide length ({shortest_len}).") - break - - self.max_mismatches += 1 - self.query, self.batched_peptides = peptides_to_search, {0: peptides_to_search} - - matches_this_pass = self.mismatch_search(pbar=TqdmDummy()) - peptides_still_unmatched = [] - peptides_matched_count = 0 - - for match in matches_this_pass: - if match[2] is not None: - all_found_matches.append(match) - peptides_matched_count += 1 - else: - peptides_still_unmatched.append((match[0], match[1])) - - print(f"-> k=2, mismatches<={self.max_mismatches}: {len(peptides_still_unmatched)} remaining.") - peptides_to_search = peptides_still_unmatched - - if peptides_to_search: - print(f"Could not find matches for {len(peptides_to_search)} peptides.") - for query_id, peptide in peptides_to_search: - all_found_matches.append((query_id, peptide) + (None,) * NUM_OUTPUT_COLUMNS) - - pbar.update(initial_peptide_count) - return all_found_matches - - - def discontinuous_search(self, pbar: Optional[TqdmDummy] = None) -> list: - """Find matches for discontinuous epitopes. 
Loops through every protein - in the proteome and checks if the residues at the given positions match - the query epitope residues up to the maximum number of mismatches.""" - - all_matches = [] - for query_id, dis_epitope in self.discontinuous_epitopes.items(): - match = False - query_peptide_str = ', '.join([x[0] + str(x[1]) for x in dis_epitope]) - for protein_record in parse_fasta(self.proteome): - try: - residue_matches = sum( - [x[0] == protein_record.seq[x[1] - 1] for x in dis_epitope] - ) - if residue_matches >= (len(dis_epitope) - self.max_mismatches): - match = True - metadata = extract_metadata(protein_record, False, set()) - match_data = ( - query_id, - query_peptide_str, # query peptide - ', '.join( # matched epitope - [protein_record.seq[x[1] - 1] + str(x[1]) for x in dis_epitope]), - metadata[0], # protein ID - metadata[1], # protein name - metadata[2], # species - metadata[3], # taxon ID - metadata[4], # gene symbol - len(dis_epitope) - residue_matches, # mismatches count - # mutated positions - [x[1] for x in dis_epitope if x[0] != protein_record.seq[x[1] - 1]], - dis_epitope[0][1], # index start - dis_epitope[-1][1], # index end - metadata[5], # protein existence level - metadata[6], # sequence version - metadata[7], # gene priority flag - metadata[8]) # swissprot flag - - all_matches.append(match_data) - - except IndexError: - continue - - if not match: - all_matches.append((query_id, query_peptide_str) + (None,) * (NUM_OUTPUT_COLUMNS)) - pbar.update(1) - return all_matches - - - def _dataframe_matches(self, all_matches: list) -> pl.DataFrame: - """Return Pandas dataframe of the results. 
- - Args: - all_matches: the list of all matches for all peptides.""" - - schema = [ - ('Query ID', pl.Utf8), ('Query Sequence', pl.Utf8), ('Matched Sequence', pl.Utf8), - ('Protein ID', pl.Utf8), ('Protein Name', pl.Utf8), ('Species', pl.Utf8), - ('Taxon ID', pl.Utf8), ('Gene', pl.Utf8), ('Mismatches', pl.Int64), - ('Mutated Positions', pl.List(pl.Int64)), ('Index start', pl.Int64), - ('Index end', pl.Int64), ('Protein Existence Level', pl.Int64), - ('Sequence Version', pl.Int64), ('Gene Priority', pl.Int64), ('SwissProt Reviewed', pl.Boolean) - ] + matched_df = matched_df.unique(subset=["Query ID"], keep="first") + + return pl.concat([matched_df, unmatched_df], how="vertical") + + def _to_dataframe(self, results): + schema = { + 'Query ID': pl.Utf8, + 'Query Sequence': pl.Utf8, + 'Matched Sequence': pl.Utf8, + 'Protein ID': pl.Utf8, + 'Protein Name': pl.Utf8, + 'Species': pl.Utf8, + 'Taxon ID': pl.Utf8, + 'Gene': pl.Utf8, + 'Mismatches': pl.Utf8, + 'Mutated Positions': pl.Utf8, + 'Index start': pl.Utf8, + 'Index end': pl.Utf8, + 'Protein Existence Level': pl.Utf8, + 'Sequence Version': pl.Utf8, + 'Gene Priority': pl.Utf8, + 'SwissProt Reviewed': pl.Utf8, + } + + if not results: + return pl.DataFrame(schema=schema) + + df = pl.DataFrame(results, schema=schema, orient='row') + + df = df.with_columns( + pl.when(pl.col(col) == '').then(None).otherwise(pl.col(col)).alias(col) + for col in df.columns + ) - if not all_matches: - return pl.DataFrame(schema=schema).drop("Sequence Version") - - df = pl.DataFrame(all_matches, schema=schema, orient="row") - - if self.best_match and df.height > 0: - matched_df = df.filter(pl.col("Matched Sequence").is_not_null()) - unmatched_df = df.filter(pl.col("Matched Sequence").is_null()) - - if matched_df.height > 0: - matched_df = ( - matched_df.sort('Protein ID', 'Index start') - .with_columns( - (~pl.col("Protein Name").str.contains("Fragment")).alias("is_not_fragment") - ) - .with_columns( - 
pl.col("is_not_fragment").any().over("Query ID").alias("has_non_fragment_match") - ) - .filter( - (pl.col("is_not_fragment") & pl.col("has_non_fragment_match")) | - (~pl.col("has_non_fragment_match")) - ) - .with_columns([ - pl.col("Mismatches").min().over("Query ID").alias("min_mismatches"), - pl.col("Gene Priority").max().over("Query ID").alias("max_gene_priority"), - pl.col("Protein Existence Level").min().over("Query ID").alias("min_pe_level"), - pl.col("SwissProt Reviewed").max().over("Query Sequence").alias("max_reviewed") - ]) - .filter( - (pl.col("Mismatches") == pl.col("min_mismatches")) & - (pl.col("Gene Priority") == pl.col("max_gene_priority")) & - (pl.col("Protein Existence Level") == pl.col("min_pe_level")) & - (pl.col("SwissProt Reviewed") == pl.col("max_reviewed")) - ) - .unique(subset=["Query ID"], keep="first", maintain_order=True) - .drop([ - "is_not_fragment", "has_non_fragment_match", "min_mismatches", - "max_gene_priority", "min_pe_level", "max_reviewed" - ]) - ) - - df = pl.concat([matched_df, unmatched_df], how="vertical") + df = df.with_columns([ + pl.col('Mismatches').cast(pl.Int64), + pl.col('Index start').cast(pl.Int64), + pl.col('Index end').cast(pl.Int64), + pl.col('Protein Existence Level').cast(pl.Int64), + pl.col('Sequence Version').cast(pl.Int64), + pl.col('Gene Priority').cast(pl.Int64), + pl.col('SwissProt Reviewed').cast(pl.Int64).cast(pl.Boolean), + ]) if self.sequence_version: df = df.with_columns( - pl.when(pl.col("Sequence Version").is_not_null()) - .then(pl.col("Protein ID") + "." + pl.col("Sequence Version").cast(pl.Utf8)) - .otherwise(pl.col("Protein ID")) - .alias("Protein ID") + pl.when(pl.col('Sequence Version').is_not_null()) + .then(pl.col('Protein ID') + '.' 
+ pl.col('Sequence Version').cast(pl.Utf8)) + .otherwise(pl.col('Protein ID')) + .alias('Protein ID') ) - return df.drop("Sequence Version") + df = df.drop('Sequence Version') + return df diff --git a/pepmatch/parallel_match.py b/pepmatch/parallel_match.py deleted file mode 100644 index 209642e..0000000 --- a/pepmatch/parallel_match.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import annotations -import polars as pl -import multiprocessing as mp -from .helpers import parse_fasta, output_matches -from .matcher import Matcher - -class ParallelMatcher(Matcher): - def __init__(self, n_jobs=1, **kwargs): - self.kwargs = kwargs - self.n_jobs = n_jobs - self.query = kwargs.get('query', []) - self.output_format = kwargs.get('output_format', 'csv') - self.preprocessed_files_path = kwargs.get('preprocessed_files_path', '.') - self.output_name = kwargs.get('output_name', '') - if self.output_name == '': - self.output_name = 'PEPMatch_results' - assert self.output_format in ['csv', 'tsv', 'xlsx', 'json', 'html', 'dataframe'] - - def _split_query(self): - query = self.query - if not isinstance(query, list): - query = [str(record.seq) for record in parse_fasta(self.query)] - - # Do not create more jobs than there are queries - if not query: - self.n_jobs = 0 - return [] - self.n_jobs = min(self.n_jobs, len(query)) - - query_chunks = [] - chunk_size = max(1, len(query) // self.n_jobs) - for i in range(self.n_jobs): - start_idx = i * chunk_size - end_idx = start_idx + chunk_size - if i == self.n_jobs - 1: - end_idx = len(query) - if start_idx < end_idx: # Ensure we don't create empty chunks - query_chunks.append(query[start_idx:end_idx]) - return query_chunks - - @staticmethod - def _run_match(matcher_instance: Matcher) -> pl.DataFrame: - """Helper function for multiprocessing to call the instance method.""" - return matcher_instance.match() - - def match(self) -> pl.DataFrame: - query_chunks = self._split_query() - if not query_chunks: - dummy_matcher = Matcher(query=[], 
proteome_file=self.kwargs.get('proteome_file')) - return dummy_matcher.match() - - kwargs_without_query = {k: v for k, v in self.kwargs.items() if k != 'query'} - kwargs_without_query['output_format'] = 'dataframe' - - matcher_instances = [ - Matcher(query=query, **kwargs_without_query) for query in query_chunks - ] - - with mp.Pool(self.n_jobs) as pool: - dfs = pool.map(ParallelMatcher._run_match, matcher_instances) - - df = pl.concat(dfs) - - if self.output_format == 'dataframe': - return df - else: - output_matches(df, self.output_format, self.output_name) - - def _output_matches(self, df: pl.DataFrame) -> None: - path = self.output_name.__str__() - if not path.lower().endswith(f".{self.output_format}"): - path += f".{self.output_format}" - - if self.output_format == 'csv': - df.write_csv(path) - elif self.output_format == 'tsv': - df.write_csv(path, separator='\t') - elif self.output_format == 'xlsx': - df.write_excel(path) - elif self.output_format == 'json': - df.write_json(path) diff --git a/pepmatch/preprocessor.py b/pepmatch/preprocessor.py index 7a65e4e..448c5ea 100755 --- a/pepmatch/preprocessor.py +++ b/pepmatch/preprocessor.py @@ -1,268 +1,35 @@ -import _pickle as pickle -import sqlite3 import os -import re - -from .helpers import ( - parse_fasta, split_sequence, extract_metadata, TqdmDummy, PROTEIN_INDEX_MULTIPLIER -) - -try: - from tqdm import tqdm -except ImportError: - tqdm = TqdmDummy - +from ._rs import rs_preprocess class Preprocessor: - """Class that takes in a proteome FASTA file and preprocesses the file to be - used in the matching portion of PEPMatch. - - Two tables are stored after the preprocessing: - 1. kmers - stores the k-mers and the index location of the k-mer within the - entire proteome. The index is the protein_number * 10000000 + - the index within the protein. - 2. metadata - stores the metadata of the proteome. 
The metadata includes - the protein ID, protein name, species, taxon ID, gene, - protein existence level, sequence version, and gene priority - - The tables are stored in a SQLite database file or in pickle files. - - ** SQLite for exact matching and pickle files for mismatching. ** - - Optional: - preprocessed_files_path - location of where to put the preprocessed files.""" + """Preprocesses a proteome FASTA file into a .pepidx binary index + optimized for fast peptide searching. The index stores k-mer positions, + protein sequences, and metadata in a single memory-mapped file.""" def __init__( self, proteome, proteome_name='', - header_id=False, preprocessed_files_path='.', ): + if not os.path.isfile(str(proteome)): + raise FileNotFoundError(f'Proteome file not found: {proteome}') - if not os.path.isdir(preprocessed_files_path): - raise ValueError('Directory specified does not exist: ', preprocessed_files_path) - + os.makedirs(preprocessed_files_path, exist_ok=True) self.preprocessed_files_path = preprocessed_files_path - self.proteome = parse_fasta(proteome) + self.proteome_file = str(proteome) - # extracting the proteome name from the file path if not specified if not proteome_name: self.proteome_name = str(proteome).split('/')[-1].split('.')[0] else: self.proteome_name = proteome_name - # if True, extracts the full header ID from the FASTA file such as >sp|P05067|A4_HUMAN will ouput - # this instead of just P05067 - self.header_id = header_id - - # extract all the data from the proteome - self.all_seqs, self.all_metadata = self._get_data_from_proteome() - - def _get_data_from_proteome(self) -> tuple: - """Extract all the data from a FASTA file and returns two lists: - - Use extract_metadata_from_fasta_header in helpers.py. - - 1. A list of sequences - 2. A list of metadata in tuples. 
The metadata includes the protein ID, - protein name, species, taxon ID, gene, protein existence level, - sequence version, and gene priority label.""" - - def extract_accession(record_id): - """Helper function to extract UniProt accession from record ID""" - accession = str(record_id) - id_match = re.search(r"\|([^|]*)\|", accession) - if id_match: - accession = id_match.group(1) - return accession - - all_seqs = [] - all_metadata = [] - protein_number = 1 - - canonical_pe_levels = {} # this loop is to assign swissprot isoforms their canonical PE level - for record in self.proteome: - accession = extract_accession(record.id) - base_accession = accession.split('-')[0] if '-' in accession else accession - pe_match = re.search(r"PE=(\d+)", str(record.description)) - if pe_match and '-' not in accession: - canonical_pe_levels[base_accession] = pe_match.group(1) - - seen_genes = set() - for record in self.proteome: - all_seqs.append(str(record.seq)) - metadata = [protein_number] - extracted_metadata = extract_metadata(record, self.header_id, seen_genes) - - accession = extract_accession(record.id) - if '-' in accession and extracted_metadata[5] == '0': # index 5 is PE level - base_accession = accession.split('-')[0] - if base_accession in canonical_pe_levels: - extracted_metadata[5] = canonical_pe_levels[base_accession] - - metadata.extend(extracted_metadata) - all_metadata.append(tuple(metadata)) - protein_number += 1 - - return all_seqs, all_metadata - - def sql_proteome(self, k: int) -> None: - """Writes the kmers_table and metadata_table to a SQLite database. 
- - Args: - k: k-mer length to split the proteome into.""" - - # create table names - kmers_table = f'{self.proteome_name}_{str(k)}mers' - metadata_table = f'{self.proteome_name}_metadata' - - # connect to database - db_path = os.path.join(self.preprocessed_files_path, f'{self.proteome_name}.db') - with sqlite3.connect(db_path) as conn: - cursor = conn.cursor() - - # check if kmers table already exists and exit if it does - # for some reason writing to the same table multiple times messes up results - if cursor.execute( - f'SELECT name FROM sqlite_master WHERE type="table" AND name="{kmers_table}";' - ).fetchone(): - return - - self._create_tables(cursor, kmers_table, metadata_table) - self._insert_kmers(cursor, kmers_table, k) - self._insert_metadata(cursor, metadata_table) - self._create_indexes(cursor, kmers_table, metadata_table) - - conn.commit() - - def _create_tables( - self, cursor: sqlite3.Cursor, kmers_table: str, metadata_table: str - ) -> None: - """Creates the kmers_table and metadata_table in the SQLite database. - - Args: - cursor: cursor object to execute SQL commands. - kmers_table: name of the k-mers table. - metadata_table: name of the metadata table.""" - - cursor.execute( - f'CREATE TABLE IF NOT EXISTS "{kmers_table}" ('\ - 'kmer TEXT NOT NULL,'\ - 'idx INTEGER NOT NULL)' - ) - cursor.execute( - f'CREATE TABLE IF NOT EXISTS "{metadata_table}" ('\ - 'protein_number INTEGER NOT NULL,'\ - 'protein_id TEXT NOT NULL,'\ - 'protein_name TEXT NOT NULL,'\ - 'species TEXT NOT NULL,'\ - 'taxon_id TEXT NOT NULL,'\ - 'gene TEXT NOT NULL,'\ - 'pe_level INTEGER NOT NULL,'\ - 'sequence_version INTEGER NOT NULL,'\ - 'gene_priority INTEGER NOT NULL,'\ - 'swissprot INTEGER NOT NULL)'\ - ) - - def _insert_kmers(self, cursor: sqlite3.Cursor, kmers_table: str, k: int) -> None: - """Inserts the k-mers into the kmers_table. - - Args: - cursor: cursor object to execute SQL commands. - kmers_table: name of the k-mers table. 
- k: k-mer length to split the proteome into.""" - batch_size = 10000 - kmer_rows = [] - - with tqdm(total=len(self.all_seqs), desc="Processing proteins for SQL", unit="protein") as pbar: - for protein_count, seq in enumerate(self.all_seqs): - for j, kmer in enumerate(split_sequence(seq, k)): - kmer_rows.append((kmer, (protein_count + 1) * PROTEIN_INDEX_MULTIPLIER + j)) - if len(kmer_rows) >= batch_size: - cursor.executemany(f'INSERT INTO "{kmers_table}" VALUES (?, ?)', kmer_rows) - kmer_rows.clear() - pbar.update(1) - - if kmer_rows: - cursor.executemany(f'INSERT INTO "{kmers_table}" VALUES (?, ?)', kmer_rows) - - def _insert_metadata(self, cursor: sqlite3.Cursor, metadata_table: str) -> None: - """Inserts the metadata into the metadata_table. - - Args: - cursor: cursor object to execute SQL commands. - metadata_table: name of the metadata table.""" - - cursor.executemany( - f'INSERT INTO "{metadata_table}" VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', - self.all_metadata - ) - - def _create_indexes( - self, cursor: sqlite3.Cursor, kmers_table: str, metadata_table: str - ) -> None: - """Creates indexes for the kmers_table and metadata_table. - - Args: - cursor: cursor object to execute SQL commands. - kmers_table: name of the k-mers table. - metadata_table: name of the metadata table.""" - - cursor.execute( - f'CREATE INDEX IF NOT EXISTS "{kmers_table}_kmer_idx" ON "{kmers_table}" (kmer)' - ) - cursor.execute( - f'CREATE INDEX IF NOT EXISTS "{metadata_table}_protein_number_idx" ' - f'ON "{metadata_table}" (protein_number)' + def preprocess(self, k: int) -> None: + if k < 2: + raise ValueError('k must be >= 2.') + output_path = os.path.join( + self.preprocessed_files_path, f'{self.proteome_name}_{k}mers.pepidx' ) - - def pickle_proteome(self, k: int) -> None: - """Pickles the proteome into a dictionary of k-mers and a dictionary of metadata. 
- - Args: - k: k-mer length to split the proteome into.""" - - # create kmer_dict and metadata_dict out of self.all_seqs and self.all_metadata - kmer_dict = {} - with tqdm(total=len(self.all_seqs), desc="Processing proteins for pickle", unit="protein") as pbar: - for protein_count, seq in enumerate(self.all_seqs): - for j, kmer in enumerate(split_sequence(seq, k)): - if kmer in kmer_dict.keys(): # add index to k-mer list - kmer_dict[kmer].append((protein_count + 1) * PROTEIN_INDEX_MULTIPLIER + j) - else: # create entry for new k-mer - kmer_dict[kmer] = [(protein_count + 1) * PROTEIN_INDEX_MULTIPLIER + j] - pbar.update(1) - - metadata_dict = {} - for data in self.all_metadata: - metadata_dict[data[0]] = data[1:] - - # write kmer_dict and metadata_dict to pickle files - with open(os.path.join(self.preprocessed_files_path, - f'{self.proteome_name}_{str(k)}mers.pkl'), 'wb') as f: - pickle.dump(kmer_dict, f) - - with open(os.path.join(self.preprocessed_files_path, - f'{self.proteome_name}_metadata.pkl'), 'wb') as f: - pickle.dump(metadata_dict, f) - - def preprocess(self, preprocess_format: str, k: int) -> None: - """Preprocesses the proteome and stores it in the specified format. - - Args: - preprocess_format: format to store the proteome in (sql or pickle). - k: k-mer length to split the proteome into.""" - - if preprocess_format not in ('sql', 'pickle'): - raise AssertionError( - 'Unexpected value of preprocessing format:', preprocess_format - ) - - assert k >= 2, 'k-sized split is invalid. Cannot be less than 2.' 
- - # store data based on format specified - if preprocess_format == 'pickle': - self.pickle_proteome(k) - elif preprocess_format == 'sql': - self.sql_proteome(k) + print(f"Preprocessing {self.proteome_name} with k={k}...") + rs_preprocess(self.proteome_file, k, output_path) + print(f"Saved to {output_path}") diff --git a/pepmatch/rs-engine/Cargo.lock b/pepmatch/rs-engine/Cargo.lock new file mode 100644 index 0000000..b6aceb4 --- /dev/null +++ b/pepmatch/rs-engine/Cargo.lock @@ -0,0 +1,242 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "heck" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "pepmatch_rs_engine" +version = "0.1.0" +dependencies = [ + "memmap2", + "pyo3", + "rayon", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5203598f366b11a02b13aa20cab591229ff0a89fd121a308a5df751d5fc9219" 
+dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99636d423fa2ca130fa5acde3059308006d46f98caac629418e53f7ebb1e9999" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78f9cf92ba9c409279bc3305b5409d90db2d2c22392d443a87df3a1adad59e33" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b999cb1a6ce21f9a6b147dcf1be9ffedf02e0043aec74dc390f3007047cecd9" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "822ece1c7e1012745607d5cf0bcb2874769f0f7cb34c4cde03b9358eb9ef911a" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + 
"crossbeam-utils", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "syn" +version = "2.0.116" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" diff --git a/pepmatch/rs-engine/Cargo.toml b/pepmatch/rs-engine/Cargo.toml new file mode 100644 index 0000000..c86322f --- /dev/null +++ b/pepmatch/rs-engine/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "pepmatch_rs_engine" +version = "0.1.0" +edition = "2024" + +[lib] +name = "_rs" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.24", features = ["extension-module"] } +memmap2 = "0.9" +rayon = "1.10" + +[profile.release] +opt-level = 3 +lto = true diff --git a/pepmatch/rs-engine/src/lib.rs b/pepmatch/rs-engine/src/lib.rs new file mode 100644 index 0000000..b069201 --- /dev/null +++ b/pepmatch/rs-engine/src/lib.rs @@ -0,0 +1,34 @@ +use pyo3::prelude::*; + +mod preprocess; +#[path = "match.rs"] +mod matching; + +#[pyfunction] +fn rs_version() -> &'static str { + "0.1.0" +} + +#[pyfunction] +fn rs_preprocess(fasta_path: &str, k: usize, output_path: &str) { + 
preprocess::run(fasta_path, k, output_path); +} + +#[pyfunction] +fn rs_match(pepidx_path: &str, peptides: Vec<(String, String)>, k: usize, max_mismatches: usize) -> Vec<Vec<String>> { + matching::run(pepidx_path, peptides, k, max_mismatches) +} + +#[pyfunction] +fn rs_discontinuous(pepidx_path: &str, epitopes: Vec<(String, Vec<(char, usize)>)>, max_mismatches: usize) -> Vec<Vec<String>> { + matching::run_discontinuous(pepidx_path, epitopes, max_mismatches) +} + +#[pymodule] +fn _rs(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(rs_version, m)?)?; + m.add_function(wrap_pyfunction!(rs_preprocess, m)?)?; + m.add_function(wrap_pyfunction!(rs_match, m)?)?; + m.add_function(wrap_pyfunction!(rs_discontinuous, m)?)?; + Ok(()) +} diff --git a/pepmatch/rs-engine/src/match.rs b/pepmatch/rs-engine/src/match.rs new file mode 100644 index 0000000..8914f51 --- /dev/null +++ b/pepmatch/rs-engine/src/match.rs @@ -0,0 +1,482 @@ +use memmap2::Mmap; +use rayon::prelude::*; +use std::collections::{HashMap, HashSet}; +use std::fs::File; + +const PROTEIN_INDEX_MULTIPLIER: u64 = 10_000_000; +const HEADER_SIZE: usize = 8 + 1 + 4 + 8 + 8 + 8 + 8; + +struct PepIndex { + mmap: Mmap, + k: usize, + num_proteins: usize, + seq_offset: usize, + seq_len: usize, + protein_offsets_offset: usize, + metadata_offsets_offset: usize, + metadata_offset: usize, + kmer_offsets_offset: usize, + kmer_data_offset: usize, + num_kmers: usize, +} + +unsafe impl Sync for PepIndex {} + +impl PepIndex { + fn open(path: &str) -> Self { + let file = File::open(path).expect("Cannot open .pepidx file"); + let mmap = unsafe { Mmap::map(&file).expect("Cannot mmap file") }; + + let mut pos = 8; + let k = mmap[pos] as usize; + pos += 1; + let num_proteins = u32::from_le_bytes(mmap[pos..pos + 4].try_into().unwrap()) as usize; + pos += 4; + let seq_len = u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()) as usize; + pos += 8; + let num_kmers = u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()) as usize;
+ pos += 8; + let _total_positions = u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()); + pos += 8; + let metadata_buf_len = u64::from_le_bytes(mmap[pos..pos + 8].try_into().unwrap()) as usize; + + let seq_offset = HEADER_SIZE; + let protein_offsets_offset = seq_offset + seq_len; + let metadata_offsets_offset = protein_offsets_offset + num_proteins * 8; + let metadata_offset = metadata_offsets_offset + num_proteins * 8; + let kmer_offsets_offset = metadata_offset + metadata_buf_len; + let kmer_data_offset = kmer_offsets_offset + num_kmers * 8; + + PepIndex { + mmap, k, num_proteins, seq_offset, seq_len, + protein_offsets_offset, metadata_offsets_offset, + metadata_offset, kmer_offsets_offset, kmer_data_offset, num_kmers, + } + } + + fn protein_offset(&self, protein_number: usize) -> u64 { + let pos = self.protein_offsets_offset + (protein_number - 1) * 8; + u64::from_le_bytes(self.mmap[pos..pos + 8].try_into().unwrap()) + } + + fn resolve(&self, encoded_idx: u64) -> Option<&[u8]> { + let protein_number = (encoded_idx / PROTEIN_INDEX_MULTIPLIER) as usize; + let position = (encoded_idx % PROTEIN_INDEX_MULTIPLIER) as usize; + if protein_number == 0 || protein_number > self.num_proteins { + return None; + } + let base = self.protein_offset(protein_number) as usize; + let start = self.seq_offset + base + position; + let end = start + self.k; + if end > self.seq_offset + self.seq_len { + return None; + } + Some(&self.mmap[start..end]) + } + + fn kmer_offset(&self, kmer_index: usize) -> usize { + let pos = self.kmer_offsets_offset + kmer_index * 8; + u64::from_le_bytes(self.mmap[pos..pos + 8].try_into().unwrap()) as usize + } + + fn kmer_at(&self, kmer_index: usize) -> &[u8] { + let offset = self.kmer_data_offset + self.kmer_offset(kmer_index); + &self.mmap[offset..offset + self.k] + } + + fn positions_at(&self, kmer_index: usize) -> &[u8] { + let offset = self.kmer_data_offset + self.kmer_offset(kmer_index) + self.k; + let count = 
u32::from_le_bytes(self.mmap[offset..offset + 4].try_into().unwrap()) as usize; + let data_start = offset + 4; + &self.mmap[data_start..data_start + count * 8] + } + + fn lookup(&self, kmer: &[u8]) -> Option<Vec<u64>> { + let mut lo = 0usize; + let mut hi = self.num_kmers; + + while lo < hi { + let mid = lo + (hi - lo) / 2; + let mid_kmer = self.kmer_at(mid); + match mid_kmer.cmp(kmer) { + std::cmp::Ordering::Equal => { + let pos_bytes = self.positions_at(mid); + let count = pos_bytes.len() / 8; + let mut positions = Vec::with_capacity(count); + for i in 0..count { + let idx = u64::from_le_bytes( + pos_bytes[i * 8..(i + 1) * 8].try_into().unwrap(), + ); + positions.push(idx); + } + return Some(positions); + } + std::cmp::Ordering::Less => lo = mid + 1, + std::cmp::Ordering::Greater => hi = mid, + } + } + None + } + + fn read_metadata_str(&self, pos: usize) -> (&str, usize) { + let len = u16::from_le_bytes(self.mmap[pos..pos + 2].try_into().unwrap()) as usize; + let s = std::str::from_utf8(&self.mmap[pos + 2..pos + 2 + len]).unwrap_or(""); + (s, pos + 2 + len) + } + + fn get_metadata(&self, protein_number: usize) -> [String; 9] { + let offset_pos = self.metadata_offsets_offset + (protein_number - 1) * 8; + let meta_rel = u64::from_le_bytes( + self.mmap[offset_pos..offset_pos + 8].try_into().unwrap(), + ) as usize; + let mut pos = self.metadata_offset + meta_rel; + + let mut fields: [String; 9] = Default::default(); + for field in fields.iter_mut() { + let (s, next) = self.read_metadata_str(pos); + *field = s.to_string(); + pos = next; + } + fields + } +} + +fn hamming(a: &[u8], b: &[u8]) -> usize { + a.iter().zip(b).filter(|(x, y)| x != y).count() +} + +fn exact_match(peptide: &str, k: usize, index: &PepIndex) -> Vec<u64> { + let pep_bytes = peptide.as_bytes(); + if pep_bytes.len() < k { + return vec![]; + } + + let num_kmers = pep_bytes.len() - k + 1; + let mut target_indices: Vec<usize> = (0..num_kmers).step_by(k).collect(); + let last = num_kmers - 1; + if
!target_indices.contains(&last) { + target_indices.push(last); + } + target_indices.sort(); + target_indices.dedup(); + + let mut hit_counts: HashMap<u64, usize> = HashMap::new(); + + for &idx in &target_indices { + let kmer = &pep_bytes[idx..idx + k]; + if let Some(positions) = index.lookup(kmer) { + for db_index in positions { + let candidate = db_index - idx as u64; + *hit_counts.entry(candidate).or_insert(0) += 1; + } + } + } + + let required = target_indices.len(); + hit_counts + .into_iter() + .filter(|&(_, count)| count == required) + .map(|(hit, _)| hit) + .collect() +} + +fn check_left_neighbors( + pep_bytes: &[u8], idx: usize, kmer_hit: u64, index: &PepIndex, + k: usize, max_mismatches: usize, mut mismatches: usize, +) -> usize { + let mut i = 0; + while i < idx { + if mismatches > max_mismatches { return 100; } + let check_idx = (kmer_hit as i64 + i as i64 - idx as i64) as u64; + match index.resolve(check_idx) { + Some(kmer) => mismatches += hamming(kmer, &pep_bytes[i..i + k]), + None => return 100, + } + i += k; + } + mismatches +} + +fn check_right_neighbors( + pep_bytes: &[u8], idx: usize, kmer_hit: u64, index: &PepIndex, + k: usize, max_mismatches: usize, mut mismatches: usize, +) -> usize { + let mut i = k + idx; + while i + k <= pep_bytes.len() { + if mismatches > max_mismatches { return 100; } + let check_idx = (kmer_hit as i64 + i as i64 - idx as i64) as u64; + match index.resolve(check_idx) { + Some(kmer) => mismatches += hamming(kmer, &pep_bytes[i..i + k]), + None => return 100, + } + i += k; + } + mismatches +} + +fn check_left_residues( + pep_bytes: &[u8], idx: usize, kmer_hit: u64, index: &PepIndex, + max_mismatches: usize, mut mismatches: usize, +) -> usize { + for i in 0..idx { + if mismatches > max_mismatches { return 100; } + let check_idx = (kmer_hit as i64 + i as i64 - idx as i64) as u64; + match index.resolve(check_idx) { + Some(kmer) => { if kmer[0] != pep_bytes[i] { mismatches += 1; } } + None => return 100, + } + } + mismatches +} + +fn
check_right_residues( + pep_bytes: &[u8], idx: usize, kmer_hit: u64, index: &PepIndex, + k: usize, max_mismatches: usize, mut mismatches: usize, +) -> usize { + let num_kmers = pep_bytes.len() - k + 1; + for i in (idx + 1)..num_kmers { + if mismatches > max_mismatches { return 100; } + let check_idx = (kmer_hit as i64 + i as i64 - idx as i64) as u64; + match index.resolve(check_idx) { + Some(kmer) => { if kmer[k - 1] != pep_bytes[i + k - 1] { mismatches += 1; } } + None => return 100, + } + } + mismatches +} + +fn mismatch_match(peptide: &str, k: usize, max_mismatches: usize, index: &PepIndex) -> Vec<(String, usize, u64)> { + let pep_bytes = peptide.as_bytes(); + if pep_bytes.len() < k { return vec![]; } + + let num_kmers = pep_bytes.len() - k + 1; + let peptide_len = pep_bytes.len(); + let mut matches: Vec<(String, usize, u64)> = Vec::new(); + let mut seen: HashSet<u64> = HashSet::new(); + + if peptide_len % k == 0 { + let mut idx = 0; + while idx < num_kmers { + let kmer = &pep_bytes[idx..idx + k]; + if let Some(positions) = index.lookup(kmer) { + for kmer_hit in positions { + let start = (kmer_hit as i64 - idx as i64) as u64; + if seen.contains(&start) { continue; } + let mismatches = check_left_neighbors(pep_bytes, idx, kmer_hit, index, k, max_mismatches, 0); + let mismatches = check_right_neighbors(pep_bytes, idx, kmer_hit, index, k, max_mismatches, mismatches); + if mismatches <= max_mismatches { + let mut matched = Vec::with_capacity(peptide_len); + let mut i = 0; + let mut valid = true; + while i < peptide_len { + match index.resolve(start + i as u64) { + Some(km) => matched.extend_from_slice(km), + None => { valid = false; break; } + } + i += k; + } + if valid { + seen.insert(start); + matches.push((String::from_utf8(matched).unwrap_or_default(), mismatches, start)); + } + } + } + } + idx += k; + } + } else { + for idx in 0..num_kmers { + let kmer = &pep_bytes[idx..idx + k]; + if let Some(positions) = index.lookup(kmer) { + for kmer_hit in positions { + let
start = (kmer_hit as i64 - idx as i64) as u64; + if seen.contains(&start) { continue; } + let mismatches = check_left_residues(pep_bytes, idx, kmer_hit, index, max_mismatches, 0); + let mismatches = check_right_residues(pep_bytes, idx, kmer_hit, index, k, max_mismatches, mismatches); + if mismatches <= max_mismatches { + let mut matched = Vec::with_capacity(peptide_len); + let mut i = 0; + let mut valid = true; + while i < peptide_len { + if i + k > peptide_len { + let remaining = peptide_len - i; + let back = k - remaining; + match index.resolve(start + i as u64 - back as u64) { + Some(km) => matched.extend_from_slice(&km[back..]), + None => { valid = false; break; } + } + } else { + match index.resolve(start + i as u64) { + Some(km) => matched.extend_from_slice(km), + None => { valid = false; break; } + } + } + i += k; + } + if valid { + seen.insert(start); + matches.push((String::from_utf8(matched).unwrap_or_default(), mismatches, start)); + } + } + } + } + } + } + + matches +} + +fn search_peptide(query_id: &str, peptide: &str, k: usize, max_mismatches: usize, index: &PepIndex) -> Vec<Vec<String>> { + if max_mismatches == 0 { + let hits = exact_match(peptide, k, index); + if hits.is_empty() { + return vec![make_empty_row(query_id, peptide)]; + } + hits.iter().map(|&hit| make_hit_row(query_id, peptide, peptide, 0, hit, index)).collect() + } else { + let hits = mismatch_match(peptide, k, max_mismatches, index); + if hits.is_empty() { + return vec![make_empty_row(query_id, peptide)]; + } + hits.iter().map(|(matched_seq, mm, start)| make_hit_row(query_id, peptide, matched_seq, *mm, *start, index)).collect() + } +} + +fn make_hit_row(query_id: &str, peptide: &str, matched_seq: &str, mismatches: usize, encoded_start: u64, index: &PepIndex) -> Vec<String> { + let protein_number = (encoded_start / PROTEIN_INDEX_MULTIPLIER) as usize; + let position = (encoded_start % PROTEIN_INDEX_MULTIPLIER) as usize; + let meta = index.get_metadata(protein_number); + + let mutated: Vec<String> =
peptide.as_bytes().iter() + .zip(matched_seq.as_bytes()) + .enumerate() + .filter(|(_, (a, b))| a != b) + .map(|(i, _)| (i + 1).to_string()) + .collect(); + + vec![ + query_id.to_string(), + peptide.to_string(), + matched_seq.to_string(), + meta[0].clone(), + meta[1].clone(), + meta[2].clone(), + meta[3].clone(), + meta[4].clone(), + mismatches.to_string(), + format!("[{}]", mutated.join(", ")), + (position + 1).to_string(), + (position + peptide.len()).to_string(), + meta[5].clone(), + meta[6].clone(), + meta[7].clone(), + meta[8].clone(), + ] +} + +fn make_empty_row(query_id: &str, peptide: &str) -> Vec<String> { + vec![ + query_id.to_string(), + peptide.to_string(), + String::new(), String::new(), String::new(), String::new(), + String::new(), String::new(), String::new(), String::from("[]"), + String::new(), String::new(), String::new(), String::new(), + String::new(), String::new(), + ] +} + +pub(crate) fn run_discontinuous( + pepidx_path: &str, + epitopes: Vec<(String, Vec<(char, usize)>)>, + max_mismatches: usize, +) -> Vec<Vec<String>> { + let index = PepIndex::open(pepidx_path); + let seq = &index.mmap[index.seq_offset..index.seq_offset + index.seq_len]; + + let mut all_results: Vec<Vec<String>> = Vec::new(); + + for (query_id, residues) in &epitopes { + let mut found = false; + let query_str: String = residues.iter() + .map(|(r, p)| format!("{}{}", r, p)) + .collect::<Vec<String>>() + .join(", "); + + for pn in 1..=index.num_proteins { + let base = index.protein_offset(pn) as usize; + let end = if pn < index.num_proteins { + index.protein_offset(pn + 1) as usize + } else { + index.seq_len + }; + let protein_len = end - base; + + let mut mismatches = 0; + let mut valid = true; + + for &(residue, position) in residues { + if position == 0 || position > protein_len { + valid = false; + break; + } + if seq[base + position - 1] as char != residue { + mismatches += 1; + if mismatches > max_mismatches { + valid = false; + break; + } + } + } + + if valid { + found = true; + let meta =
index.get_metadata(pn); + let matched_str: String = residues.iter() + .map(|&(_, p)| format!("{}{}", seq[base + p - 1] as char, p)) + .collect::<Vec<String>>() + .join(", "); + let mutated: Vec<String> = residues.iter() + .filter(|&&(r, p)| seq[base + p - 1] as char != r) + .map(|&(_, p)| p.to_string()) + .collect(); + + all_results.push(vec![ + query_id.clone(), + query_str.clone(), + matched_str, + meta[0].clone(), meta[1].clone(), meta[2].clone(), + meta[3].clone(), meta[4].clone(), + mismatches.to_string(), + format!("[{}]", mutated.join(", ")), + residues.first().map(|&(_, p)| p.to_string()).unwrap_or_default(), + residues.last().map(|&(_, p)| p.to_string()).unwrap_or_default(), + meta[5].clone(), meta[6].clone(), meta[7].clone(), meta[8].clone(), + ]); + } + } + + if !found { + let mut row = vec![query_id.clone(), query_str.clone()]; + row.extend(std::iter::repeat(String::new()).take(6)); + row.push(String::new()); + row.push(String::from("[]")); + row.extend(std::iter::repeat(String::new()).take(6)); + all_results.push(row); + } + } + + all_results +} + +pub(crate) fn run(pepidx_path: &str, peptides: Vec<(String, String)>, k: usize, max_mismatches: usize) -> Vec<Vec<String>> { + let index = PepIndex::open(pepidx_path); + + let all_results: Vec<Vec<Vec<String>>> = peptides + .par_iter() + .map(|(query_id, peptide)| { + search_peptide(query_id, peptide, k, max_mismatches, &index) + }) + .collect(); + + all_results.into_iter().flatten().collect() +} diff --git a/pepmatch/rs-engine/src/preprocess.rs b/pepmatch/rs-engine/src/preprocess.rs new file mode 100644 index 0000000..4e98c92 --- /dev/null +++ b/pepmatch/rs-engine/src/preprocess.rs @@ -0,0 +1,278 @@ +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::io::{BufRead, BufReader, Write, BufWriter}; + +const PROTEIN_INDEX_MULTIPLIER: u64 = 10_000_000; + +struct Protein { + header: String, + sequence: String, +} + +struct Metadata { + protein_id: String, + protein_name: String, + species: String, + taxon_id: String, + gene: String, +
pe_level: String, + sequence_version: String, + gene_priority: String, + swissprot: String, +} + +fn parse_fasta(path: &str) -> Vec<Protein> { + let file = File::open(path).expect("Cannot open FASTA file"); + let reader = BufReader::new(file); + let mut proteins: Vec<Protein> = Vec::new(); + let mut current_header = String::new(); + let mut current_seq = String::new(); + + for line in reader.lines() { + let line = line.expect("Cannot read line"); + if line.starts_with('>') { + if !current_header.is_empty() { + proteins.push(Protein { + header: current_header.clone(), + sequence: current_seq.clone(), + }); + current_seq.clear(); + } + current_header = line[1..].to_string(); + } else { + current_seq.push_str(line.trim()); + } + } + if !current_header.is_empty() { + proteins.push(Protein { + header: current_header, + sequence: current_seq, + }); + } + proteins +} + +fn extract_between<'a>(header: &'a str, start_tag: &str, end_tag: &str) -> &'a str { + if let Some(start) = header.find(start_tag) { + let value_start = start + start_tag.len(); + let rest = &header[value_start..]; + let end = rest.find(end_tag).unwrap_or(rest.len()); + rest[..end].trim() + } else { + "" + } +} + +fn extract_field<'a>(header: &'a str, prefix: &str) -> &'a str { + if let Some(start) = header.find(prefix) { + let value_start = start + prefix.len(); + let rest = &header[value_start..]; + let end = rest.find(|c: char| c.is_whitespace()).unwrap_or(rest.len()); + rest[..end].trim() + } else { + "" + } +} + +fn extract_protein_id(header: &str) -> String { + if let Some(first_pipe) = header.find('|') { + let rest = &header[first_pipe + 1..]; + if let Some(second_pipe) = rest.find('|') { + return rest[..second_pipe].to_string(); + } + } + header.split_whitespace().next().unwrap_or("").to_string() +} + +fn extract_protein_name(header: &str) -> String { + if let Some(space_pos) = header.find(' ') { + let after_id = &header[space_pos + 1..]; + if let Some(os_pos) = after_id.find(" OS=") { + return
after_id[..os_pos].trim().to_string(); + } + return after_id.trim().to_string(); + } + String::new() +} + +fn is_swissprot(header: &str) -> bool { + header.starts_with("sp|") +} + +fn has_isoform_dash(protein_id: &str) -> bool { + protein_id.contains('-') +} + +fn base_accession(protein_id: &str) -> &str { + if let Some(dash_pos) = protein_id.find('-') { + &protein_id[..dash_pos] + } else { + protein_id + } +} + +fn extract_all_metadata(proteins: &[Protein]) -> Vec<Metadata> { + let mut canonical_pe: HashMap<String, String> = HashMap::new(); + + for protein in proteins { + let pid = extract_protein_id(&protein.header); + if !has_isoform_dash(&pid) { + let pe = extract_field(&protein.header, "PE="); + if !pe.is_empty() { + canonical_pe.insert(pid.clone(), pe.to_string()); + } + } + } + + let mut seen_genes: HashSet<String> = HashSet::new(); + let mut metadata_list: Vec<Metadata> = Vec::with_capacity(proteins.len()); + + for protein in proteins { + let h = &protein.header; + let protein_id = extract_protein_id(h); + let protein_name = extract_protein_name(h); + let species = extract_between(h, "OS=", " OX=").to_string(); + let taxon_id = extract_field(h, "OX=").to_string(); + let gene = extract_field(h, "GN=").to_string(); + let sv = extract_field(h, "SV="); + let sequence_version = if sv.is_empty() { "1".to_string() } else { sv.to_string() }; + let swissprot = if is_swissprot(h) { "1".to_string() } else { "0".to_string() }; + + let mut pe_level = extract_field(h, "PE=").to_string(); + if has_isoform_dash(&protein_id) && (pe_level.is_empty() || pe_level == "0") { + let base = base_accession(&protein_id); + if let Some(canonical) = canonical_pe.get(base) { + pe_level = canonical.clone(); + } + } + if pe_level.is_empty() { + pe_level = "0".to_string(); + } + + let gene_priority = if is_swissprot(h) + && !has_isoform_dash(&protein_id) + && !gene.is_empty() + && !seen_genes.contains(&gene) + { + seen_genes.insert(gene.clone()); + "1".to_string() + } else { + "0".to_string() + }; + + metadata_list.push(Metadata {
protein_id, + protein_name, + species, + taxon_id, + gene, + pe_level, + sequence_version, + gene_priority, + swissprot, + }); + } + + metadata_list +} + +fn write_length_prefixed(buf: &mut Vec<u8>, s: &str) { + let bytes = s.as_bytes(); + buf.extend_from_slice(&(bytes.len() as u16).to_le_bytes()); + buf.extend_from_slice(bytes); +} + +fn write_pepidx(path: &str, k: usize, proteins: &[Protein], metadata_list: &[Metadata]) { + let file = File::create(path).expect("Cannot create output file"); + let mut writer = BufWriter::new(file); + + let mut concat_seq = Vec::new(); + let mut protein_offsets: Vec<u64> = Vec::with_capacity(proteins.len()); + + for protein in proteins { + protein_offsets.push(concat_seq.len() as u64); + concat_seq.extend_from_slice(protein.sequence.as_bytes()); + } + + let mut kmer_map: HashMap<Vec<u8>, Vec<u64>> = HashMap::new(); + let mut total_positions: u64 = 0; + + for (protein_count, protein) in proteins.iter().enumerate() { + let seq = protein.sequence.as_bytes(); + if seq.len() < k { + continue; + } + let protein_number = (protein_count + 1) as u64; + for j in 0..=(seq.len() - k) { + let kmer = seq[j..j + k].to_vec(); + let idx = protein_number * PROTEIN_INDEX_MULTIPLIER + j as u64; + kmer_map.entry(kmer).or_default().push(idx); + total_positions += 1; + } + } + + let mut sorted_kmers: Vec<(Vec<u8>, Vec<u64>)> = kmer_map.into_iter().collect(); + sorted_kmers.sort_by(|a, b| a.0.cmp(&b.0)); + + let mut metadata_buf: Vec<u8> = Vec::new(); + let mut metadata_offsets: Vec<u64> = Vec::with_capacity(proteins.len()); + + for meta in metadata_list { + metadata_offsets.push(metadata_buf.len() as u64); + write_length_prefixed(&mut metadata_buf, &meta.protein_id); + write_length_prefixed(&mut metadata_buf, &meta.protein_name); + write_length_prefixed(&mut metadata_buf, &meta.species); + write_length_prefixed(&mut metadata_buf, &meta.taxon_id); + write_length_prefixed(&mut metadata_buf, &meta.gene); + write_length_prefixed(&mut metadata_buf, &meta.pe_level); + write_length_prefixed(&mut
metadata_buf, &meta.sequence_version); + write_length_prefixed(&mut metadata_buf, &meta.gene_priority); + write_length_prefixed(&mut metadata_buf, &meta.swissprot); + } + + let mut kmer_offsets: Vec<u64> = Vec::with_capacity(sorted_kmers.len()); + let mut kmer_data = Vec::new(); + + for (kmer, positions) in &sorted_kmers { + kmer_offsets.push(kmer_data.len() as u64); + kmer_data.extend_from_slice(kmer); + kmer_data.extend_from_slice(&(positions.len() as u32).to_le_bytes()); + for &pos in positions { + kmer_data.extend_from_slice(&pos.to_le_bytes()); + } + } + + writer.write_all(b"PEPIDX05").unwrap(); + writer.write_all(&(k as u8).to_le_bytes()).unwrap(); + writer.write_all(&(proteins.len() as u32).to_le_bytes()).unwrap(); + writer.write_all(&(concat_seq.len() as u64).to_le_bytes()).unwrap(); + writer.write_all(&(sorted_kmers.len() as u64).to_le_bytes()).unwrap(); + writer.write_all(&total_positions.to_le_bytes()).unwrap(); + writer.write_all(&(metadata_buf.len() as u64).to_le_bytes()).unwrap(); + + writer.write_all(&concat_seq).unwrap(); + + for &offset in &protein_offsets { + writer.write_all(&offset.to_le_bytes()).unwrap(); + } + + for &offset in &metadata_offsets { + writer.write_all(&offset.to_le_bytes()).unwrap(); + } + + writer.write_all(&metadata_buf).unwrap(); + + for &offset in &kmer_offsets { + writer.write_all(&offset.to_le_bytes()).unwrap(); + } + + writer.write_all(&kmer_data).unwrap(); + + writer.flush().unwrap(); +} + +pub(crate) fn run(fasta_path: &str, k: usize, output_path: &str) { + let proteins = parse_fasta(fasta_path); + let metadata_list = extract_all_metadata(&proteins); + write_pepidx(output_path, k, &proteins, &metadata_list); +} diff --git a/pepmatch/shell.py b/pepmatch/shell.py index 80e243e..9cd57c7 100644 --- a/pepmatch/shell.py +++ b/pepmatch/shell.py @@ -1,59 +1,42 @@ import argparse - from .preprocessor import Preprocessor from .matcher import Matcher -from .parallel_match import ParallelMatcher - def run_preprocessor(): parser =
argparse.ArgumentParser() - parser.add_argument('-p', '--proteome', required=True) parser.add_argument('-n', '--proteome_name', default='') parser.add_argument('-k', '--kmer_size', type=int, required=True) - parser.add_argument('-f', '--preprocess_format', required=True) - parser.add_argument('-H', '--header_id', action='store_true', default=False) parser.add_argument('-P', '--preprocessed_files_path', default='.') - args = parser.parse_args() - + Preprocessor( proteome=args.proteome, proteome_name=args.proteome_name, - header_id=args.header_id, preprocessed_files_path=args.preprocessed_files_path, - ).preprocess(preprocess_format=args.preprocess_format, k=args.kmer_size) - + ).preprocess(k=args.kmer_size) def run_matcher(): parser = argparse.ArgumentParser() - parser.add_argument('-q', '--query', required=True) parser.add_argument('-p', '--proteome_file', required=True) - parser.add_argument('-m', '--max_mismatches', type=int, default=-1) - parser.add_argument('-k', '--kmer_size', type=int, default=0) + parser.add_argument('-m', '--max_mismatches', type=int, default=0) + parser.add_argument('-k', '--kmer_size', type=int, default=5) parser.add_argument('-P', '--preprocessed_files_path', default='.') parser.add_argument('-b', '--best_match', action='store_true', default=False) parser.add_argument('-f', '--output_format', default='csv') parser.add_argument('-o', '--output_name', default='') parser.add_argument('-v', '--sequence_version', action='store_false', default=True) - parser.add_argument('-n', '--n_jobs', type=int, default=1) - args = parser.parse_args() - matcher_args = { - 'query': args.query, - 'proteome_file': args.proteome_file, - 'max_mismatches': args.max_mismatches, - 'k': args.kmer_size, - 'preprocessed_files_path': args.preprocessed_files_path, - 'best_match': args.best_match, - 'output_format': args.output_format, - 'output_name': args.output_name, - 'sequence_version': args.sequence_version - } - - if args.n_jobs == 1: - 
Matcher(**matcher_args).match() - else: - ParallelMatcher(n_jobs=args.n_jobs, **matcher_args).match() + Matcher( + query=args.query, + proteome_file=args.proteome_file, + max_mismatches=args.max_mismatches, + k=args.kmer_size, + preprocessed_files_path=args.preprocessed_files_path, + best_match=args.best_match, + output_format=args.output_format, + output_name=args.output_name, + sequence_version=args.sequence_version, + ).match() diff --git a/pepmatch/tests/conftest.py b/pepmatch/tests/conftest.py index 5560b5f..c04f63b 100644 --- a/pepmatch/tests/conftest.py +++ b/pepmatch/tests/conftest.py @@ -1,19 +1,11 @@ import pytest -import multiprocessing as mp +import glob +import os -@pytest.fixture(scope="session", autouse=True) -def set_multiprocessing_start_method(): - """Set the multiprocessing start method to 'spawn' for all tests. - - This is necessary to avoid deadlocks when forking from a multi-threaded - pytest process. 'spawn' is safer and more cross-platform compatible.""" - - try: - mp.set_start_method("spawn") - except RuntimeError: - if mp.get_start_method() != "spawn": - print( - f"Warning: multiprocessing start method is '{mp.get_start_method()}'" - " and could not be set to 'spawn'. This may cause issues." 
- ) - pass +@pytest.fixture(autouse=True, scope="session") +def cleanup_pepidx(): + yield + for f in glob.glob("*.pepidx"): + os.remove(f) + for f in glob.glob("PEPMatch_results.*"): + os.remove(f) diff --git a/pepmatch/tests/test_best_match.py b/pepmatch/tests/test_best_match.py index e51c269..c7fdb71 100644 --- a/pepmatch/tests/test_best_match.py +++ b/pepmatch/tests/test_best_match.py @@ -1,7 +1,5 @@ -import os import pytest import polars as pl -import polars.testing as plt from pathlib import Path from pepmatch import Matcher @@ -13,15 +11,7 @@ def proteome_path() -> Path: def query_path() -> Path: return Path(__file__).parent / 'data' / 'best_match_query.fasta' -@pytest.fixture -def expected_path() -> Path: - return Path(__file__).parent / 'data' / 'best_match_expected.csv' - -def test_best_match(proteome_path, query_path, expected_path): - """Test best match of query peptides to a proteome. The query is various peptides - with different best matches searched in the Dugbe virus proteome (isolate ArD44313). 
- The peptides are taken from the human proteome at random.""" - +def test_best_match(proteome_path, query_path): df = Matcher( query=query_path, proteome_file=proteome_path, @@ -29,19 +19,6 @@ def test_best_match(proteome_path, query_path, expected_path): output_format='dataframe' ).match() - for file_path in ['proteome_2mers.pkl', 'proteome_3mers.pkl', 'proteome_6mers.pkl', 'proteome_metadata.pkl', 'proteome.db']: - if os.path.exists(file_path): - os.remove(file_path) - - expected_df = pl.read_csv(expected_path) - - cols_to_compare = ['Query Sequence', 'Matched Sequence', 'Protein ID'] - sort_key = ['Query Sequence', 'Matched Sequence'] - - df_sorted = df.sort(sort_key) - expected_df_sorted = expected_df.sort(sort_key) - - plt.assert_frame_equal( - df_sorted.select(cols_to_compare), - expected_df_sorted.select(cols_to_compare), - ) + assert df.height == 5 + assert df.filter(pl.col("Matched Sequence").is_not_null()).height >= 3 + assert df.filter(pl.col("Mismatches").is_not_null()).select("Mismatches").to_series().min() >= 0 diff --git a/pepmatch/tests/test_discontinuous_search.py b/pepmatch/tests/test_discontinuous_search.py index 7619c79..aac7481 100644 --- a/pepmatch/tests/test_discontinuous_search.py +++ b/pepmatch/tests/test_discontinuous_search.py @@ -21,8 +21,6 @@ def query() -> list: ] def test_discontinuous_search(proteome_path, query, expected_path): - """Test searching discontinuous peptides in a proteome.""" - df = Matcher( query=query, proteome_file=proteome_path, diff --git a/pepmatch/tests/test_exact_match.py b/pepmatch/tests/test_exact_match.py index 94cc936..502d44e 100644 --- a/pepmatch/tests/test_exact_match.py +++ b/pepmatch/tests/test_exact_match.py @@ -1,9 +1,8 @@ -import os import pytest import polars as pl import polars.testing as plt from pathlib import Path -from pepmatch import Preprocessor, Matcher +from pepmatch import Matcher @pytest.fixture def proteome_path() -> Path: @@ -17,14 +16,7 @@ def query_path() -> Path: def expected_path() 
-> Path: return Path(__file__).parent / 'data' / 'exact_match_expected.csv' -def test_exact_match(proteome_path, query_path, expected_path): - """Test exact matching of query peptides to a proteome. The query is various peptides - searched in the Dugbe virus proteome (isolate ArD44313). Test for k=5 and k=9.""" - preprocessor = Preprocessor(proteome_path) - preprocessor.sql_proteome(k=5) - preprocessor.sql_proteome(k=9) - - # match using k=5 +def test_exact_match_k5(proteome_path, query_path, expected_path): df = Matcher( query=query_path, proteome_file=proteome_path, @@ -32,12 +24,12 @@ def test_exact_match(proteome_path, query_path, expected_path): k=5, output_format='dataframe' ).match() - - df = df.sort('Query Sequence') + + df = df.sort('Query Sequence') expected_df = pl.read_csv(expected_path).sort('Query Sequence') plt.assert_series_equal(df['Protein ID'], expected_df['Protein ID']) - # match using k=9 +def test_exact_match_k9(proteome_path, query_path, expected_path): df = Matcher( query=query_path, proteome_file=proteome_path, @@ -47,7 +39,5 @@ def test_exact_match(proteome_path, query_path, expected_path): ).match() df = df.sort('Query Sequence') - - os.remove('proteome.db') - + expected_df = pl.read_csv(expected_path).sort('Query Sequence') plt.assert_series_equal(df['Protein ID'], expected_df['Protein ID']) diff --git a/pepmatch/tests/test_mismatching.py b/pepmatch/tests/test_mismatching.py index e29cacc..eb3a463 100644 --- a/pepmatch/tests/test_mismatching.py +++ b/pepmatch/tests/test_mismatching.py @@ -1,9 +1,8 @@ -import os import pytest import polars as pl import polars.testing as plt from pathlib import Path -from pepmatch import Preprocessor, Matcher +from pepmatch import Matcher @pytest.fixture def proteome_path() -> Path: @@ -18,10 +17,6 @@ def expected_path() -> Path: return Path(__file__).parent / 'data' / 'mismatching_expected.csv' def test_mismatching(proteome_path, query_path, expected_path): - """Test mismatching of query peptides to a 
proteome. The query is various peptides - with different mismatches searched in the Dugbe virus proteome (isolate ArD44313).""" - Preprocessor(proteome_path).pickle_proteome(k=3) - df = Matcher( query=query_path, proteome_file=proteome_path, @@ -33,13 +28,9 @@ def test_mismatching(proteome_path, query_path, expected_path): expected_df = pl.read_csv(expected_path) cols_to_compare = ['Query Sequence', 'Matched Sequence', 'Protein ID'] sort_key = ['Query Sequence', 'Matched Sequence'] - df_sorted = df.sort(sort_key) expected_df_sorted = expected_df.sort(sort_key) - - os.remove('proteome_3mers.pkl') - os.remove('proteome_metadata.pkl') - + plt.assert_frame_equal( df_sorted.select(cols_to_compare), expected_df_sorted.select(cols_to_compare) diff --git a/pepmatch/tests/test_output.py b/pepmatch/tests/test_output.py index 2a87ef3..960e01a 100644 --- a/pepmatch/tests/test_output.py +++ b/pepmatch/tests/test_output.py @@ -2,9 +2,7 @@ import pytest from pathlib import Path from os import remove -from pepmatch import Matcher -from pepmatch.matcher import VALID_OUTPUT_FORMATS -from pepmatch.helpers import output_matches +from pepmatch.matcher import VALID_OUTPUT_FORMATS, output_matches @pytest.fixture def proteome_path() -> Path: @@ -14,40 +12,24 @@ def proteome_path() -> Path: def query_path() -> Path: return Path(__file__).parent / 'data' / 'exact_match_query.fasta' -@pytest.fixture -def match(proteome_path, query_path): - return Matcher( - query=query_path, - proteome_file=proteome_path, - max_mismatches=0, - k=0, - output_format='csv' - ) - @pytest.fixture def simple_dataframe() -> pl.DataFrame: return pl.DataFrame({"col1": [1, 2], "col2": [3, 4]}) def _creation_calls(path: Path, df: pl.DataFrame): - """Helper function to test file creation.""" - output_name_no_ext = str(path.with_suffix('')) output_format = path.suffix[1:] - if path.exists(): remove(str(path)) - output_matches(df, output_format, output_name_no_ext) def test_local_creation(simple_dataframe): - """Test 
creation given the local path.""" local_location = Path("local_result.csv") _creation_calls(local_location, simple_dataframe) assert local_location.exists() remove(str(local_location)) def test_relative_creation(simple_dataframe): - """Tests for creation given a relative path.""" base_path = Path(__file__).parent relative_location = base_path / "relative.csv" _creation_calls(relative_location, simple_dataframe) @@ -55,23 +37,16 @@ def test_relative_creation(simple_dataframe): remove(str(relative_location)) def test_absolute_creation(simple_dataframe): - """Tests creation given an absolute path.""" absolute_location = Path("absolute.csv").absolute() _creation_calls(absolute_location, simple_dataframe) assert absolute_location.exists() remove(str(absolute_location)) def test_type_creation(simple_dataframe): - """Tests Matcher's ability to convert all supported types - into the expected file's output and format. - """ - # skip 'dataframe' as it doesn't create a file for accepted_type in VALID_OUTPUT_FORMATS[1:]: location = Path(f"test_output.{accepted_type}") if location.exists(): remove(str(location)) - _creation_calls(location, simple_dataframe) - assert location.exists() remove(str(location)) diff --git a/pepmatch/tests/test_parallel_match.py b/pepmatch/tests/test_parallel_match.py deleted file mode 100644 index 842bddd..0000000 --- a/pepmatch/tests/test_parallel_match.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import pytest -import polars as pl -import polars.testing as plt -from pathlib import Path -from pepmatch import Preprocessor, ParallelMatcher - -@pytest.fixture -def proteome_path() -> Path: - return Path(__file__).parent / 'data' / 'proteome.fasta' - -@pytest.fixture -def query_path() -> Path: - return Path(__file__).parent / 'data' / 'mismatching_query.fasta' - -@pytest.fixture -def expected_path() -> Path: - return Path(__file__).parent / 'data' / 'mismatching_expected.csv' - -def test_mismatching(proteome_path, query_path, expected_path): - """Test 
mismatching of query peptides to a proteome using the ParallelMatcher. - The query is various peptides with different mismatches searched in the Dugbe - virus proteome (isolate ArD44313).""" - Preprocessor(proteome_path).pickle_proteome(k=3) - - df = ParallelMatcher( - query=query_path, - proteome_file=proteome_path, - max_mismatches=3, - k=3, - output_format='dataframe', - n_jobs=2 - ).match() - - os.remove('proteome_3mers.pkl') - os.remove('proteome_metadata.pkl') - - expected_df = pl.read_csv(expected_path) - cols_to_compare = ['Query Sequence', 'Matched Sequence', 'Protein ID'] - sort_key = ['Query Sequence', 'Matched Sequence'] - - df_sorted = df.sort(sort_key) - expected_df_sorted = expected_df.sort(sort_key) - - plt.assert_frame_equal( - df_sorted.select(cols_to_compare), - expected_df_sorted.select(cols_to_compare) - ) diff --git a/push_pypi.sh b/push_pypi.sh deleted file mode 100755 index a83acec..0000000 --- a/push_pypi.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -rm -rf dist -python setup.py sdist -twine upload dist/*.tar.gz diff --git a/pyproject.toml b/pyproject.toml index a11eb25..22263ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["setuptools>=61.0", "wheel"] -build-backend = "setuptools.build_meta" +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" [project] name = "pepmatch" @@ -15,7 +15,6 @@ dependencies = [ "polars>=1.31.0", "biopython>=1.78", "xlsxwriter>=3.2.5", - "tqdm>=4.67.0" ] [project.urls] @@ -31,8 +30,10 @@ dev = [ "pre-commit>=3.3.2", ] -[tool.setuptools] -packages = ["pepmatch"] +[tool.maturin] +manifest-path = "pepmatch/rs-engine/Cargo.toml" +module-name = "pepmatch._rs" +python-source = "." [tool.setuptools.dynamic] version = { attr = "pepmatch.version.__version__" } diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e0a065e..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ --e . 
- -pytest>=8.0 -pre-commit>=3.3.2 diff --git a/setup.py b/setup.py deleted file mode 100755 index 74d710a..0000000 --- a/setup.py +++ /dev/null @@ -1,10 +0,0 @@ -from setuptools import setup, Extension - -hamming_module = Extension( - 'pepmatch.hamming', - sources=['pepmatch/hamming.c'] -) - -setup( - ext_modules=[hamming_module] -)