From 9abda83b3c369fb39af26ea0b42d298b6ddb1920 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 15 Feb 2026 20:51:01 +0100 Subject: [PATCH 1/5] Implement BondList in Rust --- Cargo.toml | 1 + src/biotite/structure/bonds.py | 664 ++++++++++ src/biotite/structure/bonds.pyx | 2036 ------------------------------ src/rust/structure/bonds.rs | 1412 +++++++++++++++++++++ src/rust/structure/io/mod.rs | 2 +- src/rust/structure/io/pdb/mod.rs | 11 +- src/rust/structure/mod.rs | 11 +- tests/structure/test_bonds.py | 35 +- 8 files changed, 2114 insertions(+), 2058 deletions(-) create mode 100644 src/biotite/structure/bonds.py delete mode 100644 src/biotite/structure/bonds.pyx create mode 100644 src/rust/structure/bonds.rs diff --git a/Cargo.toml b/Cargo.toml index c82070384..76e0711c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2018" numpy = "0.27" ndarray = "0.17" num-traits = "0.2" +smallvec = "1" [dependencies.pyo3] version = "0.27" diff --git a/src/biotite/structure/bonds.py b/src/biotite/structure/bonds.py new file mode 100644 index 000000000..0b894ee3f --- /dev/null +++ b/src/biotite/structure/bonds.py @@ -0,0 +1,664 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +This module allows efficient search of atoms in a defined radius around +a location. +""" + +__name__ = "biotite.structure" +__author__ = "Patrick Kunzmann" +__all__ = [ + "BondList", + "BondType", + "connect_via_distances", + "connect_via_residue_names", + "find_connected", + "find_rotatable_bonds", +] + +import itertools +from enum import IntEnum +import networkx as nx +import numpy as np +from biotite.rust.structure import BondList, bond_type_members +from biotite.structure.error import BadStructureError + + +def _without_aromaticity(self): + """ + Get the non-aromatic counterpart of this bond type. + + If this bond type is already non-aromatic, it is returned unchanged. + + Returns + ------- + BondType + The non-aromatic counterpart of this bond type. + + Examples + -------- + >>> BondType.AROMATIC_DOUBLE.without_aromaticity() + + >>> BondType.SINGLE.without_aromaticity() + + """ + match self: + case BondType.AROMATIC_SINGLE: + return BondType.SINGLE + case BondType.AROMATIC_DOUBLE: + return BondType.DOUBLE + case BondType.AROMATIC_TRIPLE: + return BondType.TRIPLE + case BondType.AROMATIC: + return BondType.ANY + case _: + return self + + +# Create BondType IntEnum dynamically from Rust enum members +BondType = IntEnum( + "BondType", + {name: value for name, value in bond_type_members().items()}, + module=__name__, +) +BondType.__doc__ = """ +This enum type represents the type of a chemical bond. + +- ``ANY`` - Used if the actual type is unknown +- ``SINGLE`` - Single bond +- ``DOUBLE`` - Double bond +- ``TRIPLE`` - Triple bond +- ``QUADRUPLE`` - A quadruple bond +- ``AROMATIC_SINGLE`` - Aromatic bond with a single formal bond +- ``AROMATIC_DOUBLE`` - Aromatic bond with a double formal bond +- ``AROMATIC_TRIPLE`` - Aromatic bond with a triple formal bond +- ``COORDINATION`` - Coordination complex involving a metal atom +- ``AROMATIC`` - Aromatic bond without specification of the formal bond +""" +BondType.without_aromaticity = _without_aromaticity + + +# fmt: off +_DEFAULT_DISTANCE_RANGE = { + # Taken from Allen et al. + # min - 2*std max + 2*std + ("B", "C" ) : (1.556 - 2*0.015, 1.556 + 2*0.015), + ("BR", "C" ) : (1.875 - 2*0.029, 1.966 + 2*0.029), + ("BR", "O" ) : (1.581 - 2*0.007, 1.581 + 2*0.007), + ("C", "C" ) : (1.174 - 2*0.011, 1.588 + 2*0.025), + ("C", "CL") : (1.713 - 2*0.011, 1.849 + 2*0.011), + ("C", "F" ) : (1.320 - 2*0.009, 1.428 + 2*0.009), + ("C", "H" ) : (1.059 - 2*0.030, 1.099 + 2*0.007), + ("C", "I" ) : (2.095 - 2*0.015, 2.162 + 2*0.015), + ("C", "N" ) : (1.325 - 2*0.009, 1.552 + 2*0.023), + ("C", "O" ) : (1.187 - 2*0.011, 1.477 + 2*0.008), + ("C", "P" ) : (1.791 - 2*0.006, 1.855 + 2*0.019), + ("C", "S" ) : (1.630 - 2*0.014, 1.863 + 2*0.015), + ("C", "SE") : (1.893 - 2*0.013, 1.970 + 2*0.032), + ("C", "SI") : (1.837 - 2*0.012, 1.888 + 2*0.023), + ("CL", "O" ) : (1.414 - 2*0.026, 1.414 + 2*0.026), + ("CL", "P" ) : (1.997 - 2*0.035, 2.008 + 2*0.035), + ("CL", "S" ) : (2.072 - 2*0.023, 2.072 + 2*0.023), + ("CL", "SI") : (2.072 - 2*0.009, 2.072 + 2*0.009), + ("F", "N" ) : (1.406 - 2*0.016, 1.406 + 2*0.016), + ("F", "P" ) : (1.495 - 2*0.016, 1.579 + 2*0.025), + ("F", "S" ) : (1.640 - 2*0.011, 1.640 + 2*0.011), + ("F", "SI") : (1.588 - 2*0.014, 1.694 + 2*0.013), + ("H", "N" ) : (1.009 - 2*0.022, 1.033 + 2*0.022), + ("H", "O" ) : (0.967 - 2*0.010, 1.015 + 2*0.017), + ("I", "O" ) : (2.144 - 2*0.028, 2.144 + 2*0.028), + ("N", "N" ) : (1.124 - 2*0.015, 1.454 + 2*0.021), + ("N", "O" ) : (1.210 - 2*0.011, 1.463 + 2*0.012), + ("N", "P" ) : (1.571 - 2*0.013, 1.697 + 2*0.015), + ("N", "S" ) : (1.541 - 2*0.022, 1.710 + 2*0.019), + ("N", "SI") : (1.711 - 2*0.019, 1.748 + 2*0.022), + ("O", "P" ) : (1.449 - 2*0.007, 1.689 + 2*0.024), + ("O", "S" ) : (1.423 - 2*0.008, 1.580 + 2*0.015), + ("O", "SI") : (1.622 - 2*0.014, 1.680 + 2*0.008), + ("P", "P" ) : (2.214 - 2*0.022, 2.214 + 2*0.022), + ("P", "S" ) : (1.913 - 2*0.014, 1.954 + 2*0.005), + ("P", "SE") : (2.093 - 2*0.019, 2.093 + 2*0.019), + ("P", "SI") : (2.264 - 2*0.019, 2.264 + 2*0.019), + ("S", "S" ) : (1.897 - 2*0.012, 2.070 + 2*0.022), + ("S", "SE") : (2.193 - 2*0.015, 2.193 + 2*0.015), + ("S", "SI") : (2.145 - 2*0.020, 2.145 + 2*0.020), + ("SE", "SE") : (2.340 - 2*0.024, 2.340 + 2*0.024), + ("SI", "SE") : (2.359 - 2*0.012, 2.359 + 2*0.012), +} +# fmt: on + + +def connect_via_distances( + atoms, + distance_range=None, + inter_residue=True, + default_bond_type=BondType.ANY, + periodic=False, +): + """ + connect_via_distances(atoms, distance_range=None, inter_residue=True, + default_bond_type=BondType.ANY, periodic=False) + + Create a :class:`BondList` for a given atom array, based on + pairwise atom distances. + + A :attr:`BondType.ANY`, bond is created for two atoms within the + same residue, if the distance between them is within the expected + bond distance range. + Bonds between two adjacent residues are created for the atoms + expected to connect these residues, i.e. ``'C'`` and ``'N'`` for + peptides and ``"O3'"`` and ``'P'`` for nucleotides. + + Parameters + ---------- + atoms : AtomArray + The structure to create the :class:`BondList` for. + distance_range : dict of tuple(str, str) -> tuple(float, float), optional + Custom minimum and maximum bond distances. + The dictionary keys are tuples of chemical elements representing + the atoms to be potentially bonded. + The order of elements within each tuple does not matter. + The dictionary values are the minimum and maximum bond distance, + respectively, for the given combination of elements. + This parameter updates the default dictionary. + Hence, the default bond distances for missing element pairs are + still taken from the default dictionary. + The default bond distances are taken from :footcite:`Allen1987`. + inter_residue : bool, optional + If true, connections between consecutive amino acids and + nucleotides are also added. + default_bond_type : BondType or int, optional + By default, all created bonds have :attr:`BondType.ANY`. + An alternative :class:`BondType` can be given in this parameter. + periodic : bool, optional + If set to true, bonds can also be detected in periodic + boundary conditions. + The `box` attribute of `atoms` is required in this case. + + Returns + ------- + BondList + The created bond list. + + See Also + -------- + connect_via_residue_names + + Notes + ----- + This method might miss bonds, if the bond distance is unexpectedly + high or low, or it might create false bonds, if two atoms within a + residue are accidentally in the right distance. + A more accurate method for determining bonds is + :func:`connect_via_residue_names()`. + + References + ---------- + + .. footbibliography:: + """ + from biotite.structure.atoms import AtomArray + from biotite.structure.geometry import distance + from biotite.structure.residues import get_residue_starts + + if not isinstance(atoms, AtomArray): + raise TypeError(f"Expected 'AtomArray', not '{type(atoms).__name__}'") + if periodic: + if atoms.box is None: + raise BadStructureError("Atom array has no box") + box = atoms.box + else: + box = None + + # Prepare distance dictionary... + if distance_range is None: + distance_range = {} + # Merge default and custom entries + dist_ranges = {} + for key, val in itertools.chain( + _DEFAULT_DISTANCE_RANGE.items(), distance_range.items() + ): + element1, element2 = key + # Add entries for both element orders + dist_ranges[(element1.upper(), element2.upper())] = val + dist_ranges[(element2.upper(), element1.upper())] = val + + bonds = [] + coord = atoms.coord + elements = atoms.element + + residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) + # Omit exclusive stop in 'residue_starts' + for i in range(len(residue_starts) - 1): + curr_start_i = residue_starts[i] + next_start_i = residue_starts[i + 1] + + elements_in_res = elements[curr_start_i:next_start_i] + coord_in_res = coord[curr_start_i:next_start_i] + # Matrix containing all pairwise atom distances in the residue + distances = distance( + coord_in_res[:, np.newaxis, :], coord_in_res[np.newaxis, :, :], box + ) + for atom_index1 in range(len(elements_in_res)): + for atom_index2 in range(atom_index1): + dist_range = dist_ranges.get( + (elements_in_res[atom_index1], elements_in_res[atom_index2]) + ) + if dist_range is None: + # No bond distance entry for this element + # combination -> skip + continue + else: + min_dist, max_dist = dist_range + dist = distances[atom_index1, atom_index2] + if dist >= min_dist and dist <= max_dist: + # Convert BondType to int if necessary + bt_int = ( + int(default_bond_type) + if hasattr(default_bond_type, "__int__") + else default_bond_type + ) + bonds.append( + ( + curr_start_i + atom_index1, + curr_start_i + atom_index2, + bt_int, + ) + ) + + if bonds: + bond_list = BondList(atoms.array_length(), np.array(bonds, dtype=np.int64)) + else: + bond_list = BondList(atoms.array_length()) + + if inter_residue: + inter_bonds = _connect_inter_residue(atoms, residue_starts) + if default_bond_type == BondType.ANY: + # As all bonds should be of type ANY, convert also + # inter-residue bonds to ANY + inter_bonds.remove_bond_order() + return bond_list.merge(inter_bonds) + else: + return bond_list + + +def connect_via_residue_names(atoms, inter_residue=True, custom_bond_dict=None): + """ + connect_via_residue_names(atoms, inter_residue=True, custom_bond_dict=None) + + Create a :class:`BondList` for a given atom array (stack), based on + the deposited bonds for each residue in the RCSB ``components.cif`` + dataset. + + Bonds between two adjacent residues are created for the atoms + expected to connect these residues, i.e. ``'C'`` and ``'N'`` for + peptides and ``"O3'"`` and ``'P'`` for nucleotides. + + Parameters + ---------- + atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) + The structure to create the :class:`BondList` for. + inter_residue : bool, optional + If true, connections between consecutive amino acids and + nucleotides are also added. + custom_bond_dict : dict (str -> dict ((str, str) -> int)), optional + A dictionary of dictionaries: + The outer dictionary maps residue names to inner dictionaries. + The inner dictionary maps tuples of two atom names to their + respective :class:`BondType` (represented as integer). + If given, these bonds are used instead of the bonds read from + ``components.cif``. + + Returns + ------- + BondList + The created bond list. + No bonds are added for residues that are not found in + ``components.cif``. + + See Also + -------- + connect_via_distances + + Notes + ----- + This method can only find bonds for residues in the RCSB + *Chemical Component Dictionary*, unless `custom_bond_dict` is set. + Although this includes most molecules one encounters, this will fail + for exotic molecules, e.g. specialized inhibitors. + + .. currentmodule:: biotite.structure.info + + To supplement `custom_bond_dict` with bonds for residues from the + *Chemical Component Dictionary* you can use + :meth:`bonds_in_residue()`. + + >>> import pprint + >>> custom_bond_dict = { + ... "XYZ": { + ... ("A", "B"): BondType.SINGLE, + ... ("B", "C"): BondType.SINGLE + ... } + ... } + >>> # Supplement with bonds for common residues + >>> custom_bond_dict["ALA"] = bonds_in_residue("ALA") + >>> pp = pprint.PrettyPrinter(width=40) + >>> pp.pprint(custom_bond_dict) + {'ALA': {('C', 'O'): , + ('C', 'OXT'): , + ('CA', 'C'): , + ('CA', 'CB'): , + ('CA', 'HA'): , + ('CB', 'HB1'): , + ('CB', 'HB2'): , + ('CB', 'HB3'): , + ('N', 'CA'): , + ('N', 'H'): , + ('N', 'H2'): , + ('OXT', 'HXT'): }, + 'XYZ': {('A', 'B'): , + ('B', 'C'): }} + """ + from biotite.structure.info.bonds import bonds_in_residue + from biotite.structure.residues import get_residue_starts + + bonds = [] + atom_names = atoms.atom_name + res_names = atoms.res_name + + residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) + # Omit exclusive stop in 'residue_starts' + for res_i in range(len(residue_starts) - 1): + curr_start_i = residue_starts[res_i] + next_start_i = residue_starts[res_i + 1] + + if custom_bond_dict is None: + bond_dict_for_res = bonds_in_residue(res_names[curr_start_i]) + else: + bond_dict_for_res = custom_bond_dict.get(res_names[curr_start_i], {}) + + atom_names_in_res = atom_names[curr_start_i:next_start_i] + for (atom_name1, atom_name2), bond_type in bond_dict_for_res.items(): + atom_indices1 = np.where(atom_names_in_res == atom_name1)[0].astype( + np.int64, copy=False + ) + atom_indices2 = np.where(atom_names_in_res == atom_name2)[0].astype( + np.int64, copy=False + ) + # In rare cases the same atom name may appear multiple times + # (e.g. in altlocs) + # -> create all possible bond combinations + # Convert BondType to int if necessary + bt_int = int(bond_type) if hasattr(bond_type, "__int__") else bond_type + for i in range(len(atom_indices1)): + for j in range(len(atom_indices2)): + bonds.append( + ( + curr_start_i + atom_indices1[i], + curr_start_i + atom_indices2[j], + bt_int, + ) + ) + + if bonds: + bond_list = BondList(atoms.array_length(), np.array(bonds, dtype=np.int64)) + else: + bond_list = BondList(atoms.array_length()) + + if inter_residue: + inter_bonds = _connect_inter_residue(atoms, residue_starts) + return bond_list.merge(inter_bonds) + else: + return bond_list + + +_PEPTIDE_LINKS = ["PEPTIDE LINKING", "L-PEPTIDE LINKING", "D-PEPTIDE LINKING"] +_NUCLEIC_LINKS = ["RNA LINKING", "DNA LINKING"] + + +def _connect_inter_residue(atoms, residue_starts): + """ + Create a :class:`BondList` containing the bonds between adjacent + amino acid or nucleotide residues. + + Parameters + ---------- + atoms : AtomArray or AtomArrayStack + The structure to create the :class:`BondList` for. + residue_starts : ndarray, dtype=int + Return value of + ``get_residue_starts(atoms, add_exclusive_stop=True)``. + + Returns + ------- + BondList + A bond list containing all inter residue bonds. + """ + from biotite.structure.info.misc import link_type + + bonds = [] + atom_names = atoms.atom_name + res_names = atoms.res_name + res_ids = atoms.res_id + chain_ids = atoms.chain_id + + # Iterate over all starts excluding: + # - the last residue and + # - exclusive end index of 'atoms' + for i in range(len(residue_starts) - 2): + curr_start_i = residue_starts[i] + next_start_i = residue_starts[i + 1] + after_next_start_i = residue_starts[i + 2] + + # Check if the current and next residue is in the same chain + if chain_ids[next_start_i] != chain_ids[curr_start_i]: + continue + # Check if the current and next residue + # have consecutive residue IDs + # (Same residue ID is also possible if insertion code is used) + if res_ids[next_start_i] - res_ids[curr_start_i] > 1: + continue + + # Get link type for this residue from RCSB components.cif + curr_link = link_type(res_names[curr_start_i]) + next_link = link_type(res_names[next_start_i]) + + if curr_link in _PEPTIDE_LINKS and next_link in _PEPTIDE_LINKS: + curr_connect_atom_name = "C" + next_connect_atom_name = "N" + elif curr_link in _NUCLEIC_LINKS and next_link in _NUCLEIC_LINKS: + curr_connect_atom_name = "O3'" + next_connect_atom_name = "P" + else: + # Create no bond if the connection types of consecutive + # residues are not compatible + continue + + # Index in atom array for atom name in current residue + # Addition of 'curr_start_i' is necessary, as only a slice of + # 'atom_names' is taken, beginning at 'curr_start_i' + curr_connect_indices = ( + curr_start_i + + np.where(atom_names[curr_start_i:next_start_i] == curr_connect_atom_name)[ + 0 + ] + ) + # Index in atom array for atom name in next residue + next_connect_indices = ( + next_start_i + + np.where( + atom_names[next_start_i:after_next_start_i] == next_connect_atom_name + )[0] + ) + if len(curr_connect_indices) == 0 or len(next_connect_indices) == 0: + # The connector atoms are not found in the adjacent residues + # -> skip this bond + continue + + bonds.append( + (curr_connect_indices[0], next_connect_indices[0], int(BondType.SINGLE)) + ) + + if bonds: + return BondList(atoms.array_length(), np.array(bonds, dtype=np.int64)) + else: + return BondList(atoms.array_length()) + + +def find_connected(bond_list, root, as_mask=False): + """ + find_connected(bond_list, root, as_mask=False) + + Get indices to all atoms that are directly or indirectly connected + to the root atom indicated by the given index. + + An atom is *connected* to the `root` atom, if that atom is reachable + by traversing an arbitrary number of bonds, starting from the + `root`. + Effectively, this means that all atoms are *connected* to `root`, + that are in the same molecule as `root`. + Per definition `root` is also *connected* to itself. + + Parameters + ---------- + bond_list : BondList + The reference bond list. + root : int + The index of the root atom. + as_mask : bool, optional + If true, the connected atom indices are returned as boolean + mask. + By default, the connected atom indices are returned as integer + array. + + Returns + ------- + connected : ndarray, dtype=int or ndarray, dtype=bool + Either a boolean mask or an integer array, representing the + connected atoms. + In case of a boolean mask: ``connected[i] == True``, if the atom + with index ``i`` is connected. + + Examples + -------- + Consider a system with 4 atoms, where only the last atom is not + bonded with the other ones (``0-1-2 3``): + + >>> bonds = BondList(4) + >>> bonds.add_bond(0, 1) + >>> bonds.add_bond(1, 2) + >>> print(find_connected(bonds, 0)) + [0 1 2] + >>> print(find_connected(bonds, 1)) + [0 1 2] + >>> print(find_connected(bonds, 2)) + [0 1 2] + >>> print(find_connected(bonds, 3)) + [3] + """ + all_bonds, _ = bond_list.get_all_bonds() + + if root >= bond_list.get_atom_count(): + raise ValueError( + f"Root atom index {root} is out of bounds for bond list " + f"representing {bond_list.get_atom_count()} atoms" + ) + + is_connected_mask = np.zeros(bond_list.get_atom_count(), dtype=np.uint8) + # Find connections in a recursive way, + # by visiting all atoms that are reachable by a bond + _find_connected_recursive(root, is_connected_mask, all_bonds) + if as_mask: + return is_connected_mask.astype(bool) + else: + return np.where(is_connected_mask)[0] + + +def _find_connected_recursive(index, is_connected_mask, all_bonds): + """Recursive helper for find_connected.""" + if is_connected_mask[index]: + # This atom has already been visited + # -> exit condition + return + is_connected_mask[index] = 1 + + for j in range(all_bonds.shape[1]): + connected_index = all_bonds[index, j] + if connected_index == -1: + # Ignore padding values + continue + _find_connected_recursive(connected_index, is_connected_mask, all_bonds) + + +def find_rotatable_bonds(bonds): + """ + find_rotatable_bonds(bonds) + + Find all rotatable bonds in a given :class:`BondList`. + + The following conditions must be true for a bond to be counted as + rotatable: + + 1. The bond must be a single bond (``BondType.SINGLE``) + 2. The connected atoms must not be within the same cycle/ring + 3. Both connected atoms must not be terminal, e.g. not a *C-H* + bond, as rotation about such bonds would not change any + coordinates + + Parameters + ---------- + bonds : BondList + The bonds to find the rotatable bonds in. + + Returns + ------- + rotatable_bonds : BondList + The subset of the input `bonds` that contains only rotatable + bonds. + + Examples + -------- + + >>> molecule = residue("TYR") + >>> for i, j, _ in find_rotatable_bonds(molecule.bonds).as_array(): + ... print(molecule.atom_name[i], molecule.atom_name[j]) + N CA + CA C + CA CB + C OXT + CB CG + CZ OH + """ + bond_graph = bonds.as_graph() + cycles = nx.algorithms.cycles.cycle_basis(bond_graph) + + number_of_partners = np.count_nonzero(bonds.get_all_bonds()[0] != -1, axis=1) + + rotatable_bonds = [] + bonds_array = bonds.as_array() + for i, j, bond_type in bonds_array: + # Can only rotate about single bonds + # Furthermore, it makes no sense to rotate about a bond, + # that leads to a single atom + if ( + bond_type == BondType.SINGLE + and number_of_partners[i] > 1 + and number_of_partners[j] > 1 + ): + # Cannot rotate about a bond, if the two connected atoms + # are in a cycle + in_same_cycle = False + for cycle in cycles: + if i in cycle and j in cycle: + in_same_cycle = True + break + if not in_same_cycle: + rotatable_bonds.append((i, j, int(bond_type))) + if rotatable_bonds: + return BondList( + bonds.get_atom_count(), np.array(rotatable_bonds, dtype=np.int64) + ) + else: + return BondList(bonds.get_atom_count()) diff --git a/src/biotite/structure/bonds.pyx b/src/biotite/structure/bonds.pyx deleted file mode 100644 index 02b04c797..000000000 --- a/src/biotite/structure/bonds.pyx +++ /dev/null @@ -1,2036 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -""" -This module allows efficient search of atoms in a defined radius around -a location. -""" - -__name__ = "biotite.structure" -__author__ = "Patrick Kunzmann" -__all__ = ["BondList", "BondType", - "connect_via_distances", "connect_via_residue_names", - "find_connected", "find_rotatable_bonds"] - -cimport cython -cimport numpy as np -from libc.stdlib cimport free, realloc - -from collections.abc import Sequence -import itertools -import numbers -from enum import IntEnum -import networkx as nx -import numpy as np -from .error import BadStructureError -from ..copyable import Copyable - -ctypedef np.uint64_t ptr -ctypedef np.uint8_t uint8 -ctypedef np.uint16_t uint16 -ctypedef np.uint32_t uint32 -ctypedef np.uint64_t uint64 -ctypedef np.int8_t int8 -ctypedef np.int16_t int16 -ctypedef np.int32_t int32 -ctypedef np.int64_t int64 - - -ctypedef fused IndexType: - uint8 - uint16 - uint32 - uint64 - int8 - int16 - int32 - int64 - - -class BondType(IntEnum): - """ - This enum type represents the type of a chemical bond. - - - ``ANY`` - Used if the actual type is unknown - - ``SINGLE`` - Single bond - - ``DOUBLE`` - Double bond - - ``TRIPLE`` - Triple bond - - ``QUADRUPLE`` - A quadruple bond - - ``AROMATIC_SINGLE`` - Aromatic bond with a single formal bond - - ``AROMATIC_DOUBLE`` - Aromatic bond with a double formal bond - - ``AROMATIC_TRIPLE`` - Aromatic bond with a triple formal bond - - ``AROMATIC`` - Aromatic bond without specification of the formal bond - - ``COORDINATION`` - Coordination complex involving a metal atom - """ - ANY = 0 - SINGLE = 1 - DOUBLE = 2 - TRIPLE = 3 - QUADRUPLE = 4 - AROMATIC_SINGLE = 5 - AROMATIC_DOUBLE = 6 - AROMATIC_TRIPLE = 7 - COORDINATION = 8 - AROMATIC = 9 - - - def without_aromaticity(self): - """ - Remove aromaticity from the bond type. - - :attr:`BondType.AROMATIC_{ORDER}` is converted into - :attr:`BondType.{ORDER}`. - - Returns - ------- - new_bond_type : BondType - The :class:`BondType` without aromaticity. - - Examples - -------- - - >>> print(BondType.AROMATIC_DOUBLE.without_aromaticity().name) - DOUBLE - """ - if self == BondType.AROMATIC_SINGLE: - return BondType.SINGLE - elif self == BondType.AROMATIC_DOUBLE: - return BondType.DOUBLE - elif self == BondType.AROMATIC_TRIPLE: - return BondType.TRIPLE - elif self == BondType.AROMATIC: - return BondType.ANY - else: - return self - - -@cython.boundscheck(False) -@cython.wraparound(False) -class BondList(Copyable): - """ - __init__(atom_count, bonds=None) - - A bond list stores indices of atoms - (usually of an :class:`AtomArray` or :class:`AtomArrayStack`) - that form chemical bonds together with the type (or order) of the - bond. - - Internally the bonds are stored as *n x 3* :class:`ndarray`. - For each row, the first column specifies the index of the first - atom, the second column the index of the second atom involved in the - bond. - The third column stores an integer that is interpreted as member - of the the :class:`BondType` enum, that specifies the order of the - bond. - - When indexing a :class:`BondList`, the index is not forwarded to the - internal :class:`ndarray`. Instead the indexing behavior is - consistent with indexing an :class:`AtomArray` or - :class:`AtomArrayStack`: - Bonds with at least one atom index that is not covered by the index - are removed, atom indices that occur after an uncovered atom index - move up. - Effectively, this means that after indexing an :class:`AtomArray` - and a :class:`BondList` with the same index, the atom indices in the - :class:`BondList` will still point to the same atoms in the - :class:`AtomArray`. - Indexing a :class:`BondList` with a single integer is equivalent - to calling :func:`get_bonds()`. - - The same consistency applies to adding :class:`BondList` instances - via the '+' operator: - The atom indices of the second :class:`BondList` are increased by - the atom count of the first :class:`BondList` and then both - :class:`BondList` objects are merged. - - Parameters - ---------- - atom_count : int - A positive integer, that specifies the number of atoms the - :class:`BondList` refers to - (usually the length of an atom array (stack)). - Effectively, this value is the exclusive maximum for the indices - stored in the :class:`BondList`. - bonds : ndarray, shape=(n,2) or shape=(n,3), dtype=int, optional - This array contains the indices of atoms which are bonded: - For each row, the first column specifies the first atom, - the second row the second atom involved in a chemical bond. - If an *n x 3* array is provided, the additional column - specifies a :class:`BondType` instead of :attr:`BondType.ANY`. - By default, the created :class:`BondList` is empty. - - Notes - ----- - When initially providing the bonds as :class:`ndarray`, the input is - sanitized: Redundant bonds are removed, and each bond entry is - sorted so that the lower one of the two atom indices is in the first - column. - If a bond appears multiple times with different bond types, the - first bond takes precedence. - - Examples - -------- - - Construct a :class:`BondList`, where a central atom (index 1) is - connected to three other atoms (index 0, 3 and 4): - - >>> bond_list = BondList(5, np.array([(1,0),(1,3),(1,4)])) - >>> print(bond_list) - [[0 1 0] - [1 3 0] - [1 4 0]] - - Remove the first atom (index 0) via indexing: - The bond containing index 0 is removed, since the corresponding atom - does not exist anymore. Since all other atoms move up in their - position, the indices in the bond list are decreased by one: - - >>> bond_list = bond_list[1:] - >>> print(bond_list) - [[0 2 0] - [0 3 0]] - - :class:`BondList` objects can be associated to an :class:`AtomArray` - or :class:`AtomArrayStack`. - The following snippet shows this for a benzene molecule: - - >>> benzene = AtomArray(12) - >>> # Omit filling most required annotation categories for brevity - >>> benzene.atom_name = np.array( - ... ["C1", "C2", "C3", "C4", "C5", "C6", "H1", "H2", "H3", "H4", "H5", "H6"] - ... ) - >>> benzene.bonds = BondList( - ... benzene.array_length(), - ... np.array([ - ... # Bonds between carbon atoms in the ring - ... (0, 1, BondType.AROMATIC_SINGLE), - ... (1, 2, BondType.AROMATIC_DOUBLE), - ... (2, 3, BondType.AROMATIC_SINGLE), - ... (3, 4, BondType.AROMATIC_DOUBLE), - ... (4, 5, BondType.AROMATIC_SINGLE), - ... (5, 0, BondType.AROMATIC_DOUBLE), - ... # Bonds between carbon and hydrogen - ... (0, 6, BondType.SINGLE), - ... (1, 7, BondType.SINGLE), - ... (2, 8, BondType.SINGLE), - ... (3, 9, BondType.SINGLE), - ... (4, 10, BondType.SINGLE), - ... (5, 11, BondType.SINGLE), - ... ]) - ... ) - >>> for i, j, bond_type in benzene.bonds.as_array(): - ... print( - ... f"{BondType(bond_type).name} bond between " - ... f"{benzene.atom_name[i]} and {benzene.atom_name[j]}" - ... ) - AROMATIC_SINGLE bond between C1 and C2 - AROMATIC_DOUBLE bond between C2 and C3 - AROMATIC_SINGLE bond between C3 and C4 - AROMATIC_DOUBLE bond between C4 and C5 - AROMATIC_SINGLE bond between C5 and C6 - AROMATIC_DOUBLE bond between C1 and C6 - SINGLE bond between C1 and H1 - SINGLE bond between C2 and H2 - SINGLE bond between C3 and H3 - SINGLE bond between C4 and H4 - SINGLE bond between C5 and H5 - SINGLE bond between C6 and H6 - - Obtain the bonded atoms for the :math:`C_1`: - - >>> bonds, types = benzene.bonds.get_bonds(0) - >>> print(bonds) - [1 5 6] - >>> print(types) - [5 6 1] - >>> print(f"C1 is bonded to {', '.join(benzene.atom_name[bonds])}") - C1 is bonded to C2, C6, H1 - - Cut the benzene molecule in half. - Although the first half of the atoms are missing the indices of - the cropped :class:`BondList` still represents the bonds of the - remaining atoms: - - >>> half_benzene = benzene[ - ... np.isin(benzene.atom_name, ["C4", "C5", "C6", "H4", "H5", "H6"]) - ... ] - >>> for i, j, bond_type in half_benzene.bonds.as_array(): - ... print( - ... f"{BondType(bond_type).name} bond between " - ... f"{half_benzene.atom_name[i]} and {half_benzene.atom_name[j]}" - ... ) - AROMATIC_DOUBLE bond between C4 and C5 - AROMATIC_SINGLE bond between C5 and C6 - SINGLE bond between C4 and H4 - SINGLE bond between C5 and H5 - SINGLE bond between C6 and H6 - """ - - def __init__(self, uint32 atom_count, np.ndarray bonds=None): - self._atom_count = atom_count - - if bonds is not None and len(bonds) > 0: - if bonds.ndim != 2: - raise ValueError("Expected a 2D-ndarray for input bonds") - - self._bonds = np.zeros((bonds.shape[0], 3), dtype=np.uint32) - if bonds.shape[1] == 3: - # Input contains bonds (index 0 and 1) - # including the bond type value (index 2) - # Bond indices: - self._bonds[:,:2] = np.sort( - # Indices are sorted per bond - # so that the lower index is at the first position - _to_positive_index_array(bonds[:,:2], atom_count), axis=1 - ) - # Bond type: - if (bonds[:, 2] >= len(BondType)).any(): - raise ValueError( - f"BondType {np.max(bonds[:, 2])} is invalid" - ) - self._bonds[:,2] = bonds[:, 2] - - # Indices are sorted per bond - # so that the lower index is at the first position - elif bonds.shape[1] == 2: - # Input contains the bonds without bond type - # -> Default: Set bond type ANY (0) - self._bonds[:,:2] = np.sort( - # Indices are sorted per bond - # so that the lower index is at the first position - _to_positive_index_array(bonds[:,:2], atom_count), axis=1 - ) - else: - raise ValueError( - "Input array containing bonds must be either of shape " - "(n,2) or (n,3)" - ) - self._remove_redundant_bonds() - self._max_bonds_per_atom = self._get_max_bonds_per_atom() - - else: - # Create empty bond list - self._bonds = np.zeros((0, 3), dtype=np.uint32) - self._max_bonds_per_atom = 0 - - @staticmethod - def concatenate(bonds_lists): - """ - Concatenate multiple :class:`BondList` objects into a single - :class:`BondList`, respectively. - - Parameters - ---------- - bonds_lists : iterable object of BondList - The bond lists to be concatenated. - - Returns - ------- - concatenated_bonds : BondList - The concatenated bond lists. - - Examples - -------- - - >>> bonds1 = BondList(2, np.array([(0, 1)])) - >>> bonds2 = BondList(3, np.array([(0, 1), (0, 2)])) - >>> merged_bonds = BondList.concatenate([bonds1, bonds2]) - >>> print(merged_bonds.get_atom_count()) - 5 - >>> print(merged_bonds.as_array()[:, :2]) - [[0 1] - [2 3] - [2 4]] - """ - # Ensure that the bonds_lists can be iterated over multiple times - if not isinstance(bonds_lists, Sequence): - bonds_lists = list(bonds_lists) - - cdef np.ndarray merged_bonds = np.concatenate( - [bond_list._bonds for bond_list in bonds_lists] - ) - # Offset the indices of appended bonds list - # (consistent with addition of AtomArray) - cdef int start = 0, stop = 0 - cdef int cum_atom_count = 0 - for bond_list in bonds_lists: - stop = start + bond_list._bonds.shape[0] - merged_bonds[start : stop, :2] += cum_atom_count - cum_atom_count += bond_list._atom_count - start = stop - - cdef merged_bond_list = BondList(cum_atom_count) - # Array is not used in constructor to prevent unnecessary - # maximum and redundant bond calculation - merged_bond_list._bonds = merged_bonds - merged_bond_list._max_bonds_per_atom = max( - [bond_list._max_bonds_per_atom for bond_list in bonds_lists] - ) - return merged_bond_list - - def __copy_create__(self): - # Create empty bond list to prevent - # unnecessary removal of redundant atoms - # and calculation of maximum bonds per atom - return BondList(self._atom_count) - - def __copy_fill__(self, clone): - # The bonds are added here - clone._bonds = self._bonds.copy() - clone._max_bonds_per_atom = self._max_bonds_per_atom - - def offset_indices(self, int offset): - """ - offset_indices(offset) - - Increase all atom indices in the :class:`BondList` by the given - offset. - - Implicitly this increases the atom count. - - Parameters - ---------- - offset : int - The atom indices are increased by this value. - Must be positive. - - Examples - -------- - - >>> bond_list = BondList(5, np.array([(1,0),(1,3),(1,4)])) - >>> print(bond_list) - [[0 1 0] - [1 3 0] - [1 4 0]] - >>> bond_list.offset_indices(2) - >>> print(bond_list) - [[2 3 0] - [3 5 0] - [3 6 0]] - """ - if offset < 0: - raise ValueError("Offest must be positive") - self._bonds[:,:2] += offset - self._atom_count += offset - - def as_array(self): - """ - as_array() - - Obtain a copy of the internal :class:`ndarray`. - - Returns - ------- - array : ndarray, shape=(n,3), dtype=np.uint32 - Copy of the internal :class:`ndarray`. - For each row, the first column specifies the index of the - first atom, the second column the index of the second atom - involved in the bond. - The third column stores the :class:`BondType`. - """ - return self._bonds.copy() - - def as_set(self): - """ - as_set() - - Obtain a set representation of the :class:`BondList`. - - Returns - ------- - bond_set : set of tuple(int, int, int) - A set of tuples. - Each tuple represents one bond: - The first integer represents the first atom, - the second integer represents the second atom, - the third integer represents the :class:`BondType`. - """ - cdef uint32[:,:] all_bonds_v = self._bonds - cdef int i - cdef set bond_set = set() - for i in range(all_bonds_v.shape[0]): - bond_set.add( - (all_bonds_v[i,0], all_bonds_v[i,1], all_bonds_v[i,2]) - ) - return bond_set - - def as_graph(self): - """ - as_graph() - - Obtain a graph representation of the :class:`BondList`. - - Returns - ------- - bond_set : Graph - A *NetworkX* :class:`Graph`. - The atom indices are nodes, the bonds are edges. - Each edge has a ``"bond_type"`` attribute containing the - :class:`BondType`. - - Examples - -------- - - >>> bond_list = BondList(5, np.array([(1,0,2), (1,3,1), (1,4,1)])) - >>> graph = bond_list.as_graph() - >>> print(graph.nodes) - [0, 1, 3, 4] - >>> print(graph.edges) - [(0, 1), (1, 3), (1, 4)] - >>> for i, j in graph.edges: - ... print(i, j, graph.get_edge_data(i, j)) - 0 1 {'bond_type': } - 1 3 {'bond_type': } - 1 4 {'bond_type': } - """ - cdef int i - - cdef uint32[:,:] all_bonds_v = self._bonds - - g = nx.Graph() - cdef list edges = [None] * all_bonds_v.shape[0] - for i in range(all_bonds_v.shape[0]): - edges[i] = ( - all_bonds_v[i,0], all_bonds_v[i,1], - {"bond_type": BondType(all_bonds_v[i,2])} - ) - g.add_edges_from(edges) - return g - - def remove_aromaticity(self): - """ - Remove aromaticity from the bond types. - - :attr:`BondType.AROMATIC_{ORDER}` is converted into - :attr:`BondType.{ORDER}`. - - Examples - -------- - - >>> bond_list = BondList(3) - >>> bond_list.add_bond(0, 1, BondType.AROMATIC_SINGLE) - >>> bond_list.add_bond(1, 2, BondType.AROMATIC_DOUBLE) - >>> bond_list.remove_aromaticity() - >>> for i, j, bond_type in bond_list.as_array(): - ... print(i, j, BondType(bond_type).name) - 0 1 SINGLE - 1 2 DOUBLE - """ - for aromatic_type, non_aromatic_type in [ - (BondType.AROMATIC_SINGLE, BondType.SINGLE), - (BondType.AROMATIC_DOUBLE, BondType.DOUBLE), - (BondType.AROMATIC_TRIPLE, BondType.TRIPLE), - (BondType.AROMATIC, BondType.ANY), - ]: - mask = self._bonds[:, 2] == aromatic_type - self._bonds[mask, 2] = non_aromatic_type - - def remove_kekulization(self): - """ - Remove the bond order information from aromatic bonds, i.e. convert all - aromatic bonds to :attr:`BondType.ANY`. - - Examples - -------- - - >>> bond_list = BondList(3) - >>> bond_list.add_bond(0, 1, BondType.AROMATIC_SINGLE) - >>> bond_list.add_bond(1, 2, BondType.AROMATIC_DOUBLE) - >>> bond_list.remove_kekulization() - >>> for i, j, bond_type in bond_list.as_array(): - ... print(i, j, BondType(bond_type).name) - 0 1 AROMATIC - 1 2 AROMATIC - """ - kekulized_mask = np.isin( - self._bonds[:, 2], - ( - BondType.AROMATIC_SINGLE, - BondType.AROMATIC_DOUBLE, - BondType.AROMATIC_TRIPLE, - ), - ) - self._bonds[kekulized_mask, 2] = BondType.AROMATIC - - def remove_bond_order(self): - """ - Convert all bonds to :attr:`BondType.ANY`. - """ - self._bonds[:,2] = BondType.ANY - - def convert_bond_type(self, original_bond_type, new_bond_type): - """ - convert_bond_type(original_bond_type, new_bond_type) - - Convert all occurences of a given bond type into another bond type. - - Parameters - ---------- - original_bond_type : BondType or int - The bond type to convert. - new_bond_type : BondType or int - The new bond type. - - Examples - -------- - - >>> bond_list = BondList(4) - >>> bond_list.add_bond(0, 1, BondType.DOUBLE) - >>> bond_list.add_bond(1, 2, BondType.COORDINATION) - >>> bond_list.add_bond(2, 3, BondType.COORDINATION) - >>> for i, j, bond_type in bond_list.as_array(): - ... print(i, j, BondType(bond_type).name) - 0 1 DOUBLE - 1 2 COORDINATION - 2 3 COORDINATION - >>> bond_list.convert_bond_type(BondType.COORDINATION, BondType.SINGLE) - >>> for i, j, bond_type in bond_list.as_array(): - ... print(i, j, BondType(bond_type).name) - 0 1 DOUBLE - 1 2 SINGLE - 2 3 SINGLE - """ - mask = self._bonds[:, 2] == original_bond_type - self._bonds[mask, 2] = new_bond_type - - def get_atom_count(self): - """ - get_atom_count() - - Get the atom count. - - Returns - ------- - atom_count : int - The atom count. - """ - return self._atom_count - - def get_bond_count(self): - """ - get_bond_count() - - Get the amount of bonds. - - Returns - ------- - bond_count : int - The amount of bonds. This is equal to the length of the - internal :class:`ndarray` containing the bonds. - """ - return len(self._bonds) - - def get_bonds(self, int32 atom_index): - """ - get_bonds(atom_index) - - Obtain the indices of the atoms bonded to the atom with the - given index as well as the corresponding bond types. - - Parameters - ---------- - atom_index : int - The index of the atom to get the bonds for. - - Returns - ------- - bonds : np.ndarray, dtype=np.uint32, shape=(k,) - The indices of connected atoms. - bond_types : np.ndarray, dtype=np.uint8, shape=(k,) - Array of integers, interpreted as :class:`BondType` - instances. - This array specifies the type (or order) of the bonds to - the connected atoms. - - Examples - -------- - - >>> bond_list = BondList(5, np.array([(1,0),(1,3),(1,4)])) - >>> bonds, types = bond_list.get_bonds(1) - >>> print(bonds) - [0 3 4] - """ - cdef int i=0, j=0 - - cdef uint32 index = _to_positive_index(atom_index, self._atom_count) - - cdef uint32[:,:] all_bonds_v = self._bonds - # Pessimistic array allocation: - # assume size is equal to the atom with most bonds - cdef np.ndarray bonds = np.zeros(self._max_bonds_per_atom, - dtype=np.uint32) - cdef uint32[:] bonds_v = bonds - cdef np.ndarray bond_types = np.zeros(self._max_bonds_per_atom, - dtype=np.uint8) - cdef uint8[:] bond_types_v = bond_types - - for i in range(all_bonds_v.shape[0]): - # If a bond is found for the desired atom index - # at the first or second position of the bond, - # then append the index of the respective other position - if all_bonds_v[i,0] == index: - bonds_v[j] = all_bonds_v[i,1] - bond_types_v[j] = all_bonds_v[i,2] - j += 1 - elif all_bonds_v[i,1] == index: - bonds_v[j] = all_bonds_v[i,0] - bond_types_v[j] = all_bonds_v[i,2] - j += 1 - - # Trim to correct size - bonds = bonds[:j] - bond_types = bond_types[:j] - - return bonds, bond_types - - - def get_all_bonds(self): - """ - get_all_bonds() - - For each atom index, give the indices of the atoms bonded to - this atom as well as the corresponding bond types. - - Returns - ------- - bonds : np.ndarray, dtype=np.uint32, shape=(n,k) - The indices of connected atoms. - The first dimension represents the atoms, - the second dimension represents the indices of atoms bonded - to the respective atom. - Atoms can have have different numbers of atoms bonded to - them. - Therefore, the length of the second dimension *k* is equal - to the maximum number of bonds for an atom in this - :class:`BondList`. - For atoms with less bonds, the corresponding entry in the - array is padded with ``-1`` values. - bond_types : np.ndarray, dtype=np.uint32, shape=(n,k) - Array of integers, interpreted as :class:`BondType` - instances. - This array specifies the bond type (or order) corresponding - to the returned `bonds`. - It uses the same ``-1``-padding. - - Examples - -------- - - >>> # BondList for benzene - >>> bond_list = BondList( - ... 12, - ... np.array([ - ... # Bonds between the carbon atoms in the ring - ... (0, 1, BondType.AROMATIC_SINGLE), - ... (1, 2, BondType.AROMATIC_DOUBLE), - ... (2, 3, BondType.AROMATIC_SINGLE), - ... (3, 4, BondType.AROMATIC_DOUBLE), - ... (4, 5, BondType.AROMATIC_SINGLE), - ... (5, 0, BondType.AROMATIC_DOUBLE), - ... # Bonds between carbon and hydrogen - ... (0, 6, BondType.SINGLE), - ... (1, 7, BondType.SINGLE), - ... (2, 8, BondType.SINGLE), - ... (3, 9, BondType.SINGLE), - ... (4, 10, BondType.SINGLE), - ... (5, 11, BondType.SINGLE), - ... ]) - ... ) - >>> bonds, types = bond_list.get_all_bonds() - >>> print(bonds) - [[ 1 5 6] - [ 0 2 7] - [ 1 3 8] - [ 2 4 9] - [ 3 5 10] - [ 4 0 11] - [ 0 -1 -1] - [ 1 -1 -1] - [ 2 -1 -1] - [ 3 -1 -1] - [ 4 -1 -1] - [ 5 -1 -1]] - >>> print(types) - [[ 5 6 1] - [ 5 6 1] - [ 6 5 1] - [ 5 6 1] - [ 6 5 1] - [ 5 6 1] - [ 1 -1 -1] - [ 1 -1 -1] - [ 1 -1 -1] - [ 1 -1 -1] - [ 1 -1 -1] - [ 1 -1 -1]] - >>> for i in range(bond_list.get_atom_count()): - ... bonds_for_atom = bonds[i] - ... # Remove trailing '-1' values - ... bonds_for_atom = bonds_for_atom[bonds_for_atom != -1] - ... print(f"{i}: {bonds_for_atom}") - 0: [1 5 6] - 1: [0 2 7] - 2: [1 3 8] - 3: [2 4 9] - 4: [ 3 5 10] - 5: [ 4 0 11] - 6: [0] - 7: [1] - 8: [2] - 9: [3] - 10: [4] - 11: [5] - """ - cdef int i=0 - cdef uint32 atom_index_i, atom_index_j, bond_type - - cdef uint32[:,:] all_bonds_v = self._bonds - # The size of 2nd dimension is equal to the atom with most bonds - # Since each atom can have an individual number of bonded atoms, - # The arrays are padded with '-1' - cdef np.ndarray bonds = np.full( - (self._atom_count, self._max_bonds_per_atom), -1, dtype=np.int32 - ) - cdef int32[:,:] bonds_v = bonds - cdef np.ndarray bond_types = np.full( - (self._atom_count, self._max_bonds_per_atom), -1, dtype=np.int8 - ) - cdef int8[:,:] bond_types_v = bond_types - # Track the number of already found bonds for each given index - cdef np.ndarray lengths = np.zeros(self._atom_count, dtype=np.uint32) - cdef uint32[:] lengths_v = lengths - - for i in range(all_bonds_v.shape[0]): - atom_index_i = all_bonds_v[i,0] - atom_index_j = all_bonds_v[i,1] - bond_type = all_bonds_v[i,2] - # Add second bonded atom for the first bonded atom - # and vice versa - # Use 'lengths' variable to append the value - bonds_v[atom_index_i, lengths_v[atom_index_i]] = atom_index_j - bonds_v[atom_index_j, lengths_v[atom_index_j]] = atom_index_i - bond_types_v[atom_index_i, lengths_v[atom_index_i]] = bond_type - bond_types_v[atom_index_j, lengths_v[atom_index_j]] = bond_type - # Increment lengths - lengths_v[atom_index_i] += 1 - lengths_v[atom_index_j] += 1 - - return bonds, bond_types - - - def adjacency_matrix(self): - r""" - adjacency_matrix(bond_list) - - Represent this :class:`BondList` as adjacency matrix. - - The adjacency matrix is a quadratic matrix with boolean values - according to - - .. math:: - - M_{i,j} = - \begin{cases} - \text{True}, & \text{if } \text{Atom}_i \text{ and } \text{Atom}_j \text{ form a bond} \\ - \text{False}, & \text{otherwise} - \end{cases}. - - Returns - ------- - matrix : ndarray, dtype=bool, shape=(n,n) - The created adjacency matrix. - - Examples - -------- - - >>> # BondList for formaldehyde - >>> bond_list = BondList( - ... 4, - ... np.array([ - ... # Bond between carbon and oxygen - ... (0, 1, BondType.DOUBLE), - ... # Bonds between carbon and hydrogen - ... (0, 2, BondType.SINGLE), - ... (0, 3, BondType.SINGLE), - ... ]) - ... ) - >>> print(bond_list.adjacency_matrix()) - [[False True True True] - [ True False False False] - [ True False False False] - [ True False False False]] - """ - matrix = np.zeros( - (self._atom_count, self._atom_count), dtype=bool - ) - matrix[self._bonds[:,0], self._bonds[:,1]] = True - matrix[self._bonds[:,1], self._bonds[:,0]] = True - return matrix - - - def bond_type_matrix(self): - r""" - adjacency_matrix(bond_list) - - Represent this :class:`BondList` as a matrix depicting the bond - type. - - The matrix is a quadratic matrix: - - .. math:: - - M_{i,j} = - \begin{cases} - \text{BondType}_{ij}, & \text{if } \text{Atom}_i \text{ and } \text{Atom}_j \text{ form a bond} \\ - -1, & \text{otherwise} - \end{cases}. - - Returns - ------- - matrix : ndarray, dtype=bool, shape=(n,n) - The created bond type matrix. - - Examples - -------- - - >>> # BondList for formaldehyde - >>> bond_list = BondList( - ... 4, - ... np.array([ - ... # Bond between carbon and oxygen - ... (0, 1, BondType.DOUBLE), - ... # Bonds between carbon and hydrogen - ... (0, 2, BondType.SINGLE), - ... (0, 3, BondType.SINGLE), - ... ]) - ... ) - >>> print(bond_list.bond_type_matrix()) - [[-1 2 1 1] - [ 2 -1 -1 -1] - [ 1 -1 -1 -1] - [ 1 -1 -1 -1]] - """ - matrix = np.full( - (self._atom_count, self._atom_count), -1, dtype=np.int8 - ) - matrix[self._bonds[:,0], self._bonds[:,1]] = self._bonds[:,2] - matrix[self._bonds[:,1], self._bonds[:,0]] = self._bonds[:,2] - return matrix - - - def add_bond(self, int32 atom_index1, int32 atom_index2, - bond_type=BondType.ANY): - """ - add_bond(atom_index1, atom_index2, bond_type=BondType.ANY) - - Add a bond to the :class:`BondList`. - - If the bond is already existent, only the bond type is updated. - - Parameters - ---------- - atom_index1, atom_index2 : int - The indices of the atoms to create a bond for. - bond_type : BondType or int, optional - The type of the bond. Default is :attr:`BondType.ANY`. - """ - if bond_type >= len(BondType): - raise ValueError(f"BondType {bond_type} is invalid") - - cdef uint32 index1 = _to_positive_index(atom_index1, self._atom_count) - cdef uint32 index2 = _to_positive_index(atom_index2, self._atom_count) - _sort(&index1, &index2) - - cdef int i - cdef uint32[:,:] all_bonds_v = self._bonds - # Check if bond is already existent in list - cdef bint in_list = False - for i in range(all_bonds_v.shape[0]): - # Since the bonds have the atom indices sorted - # the reverse check is omitted - if (all_bonds_v[i,0] == index1 and all_bonds_v[i,1] == index2): - in_list = True - # If in list, update bond type - all_bonds_v[i,2] = int(bond_type) - break - if not in_list: - self._bonds = np.append( - self._bonds, - np.array( - [(index1, index2, int(bond_type))], dtype=np.uint32 - ), - axis=0 - ) - self._max_bonds_per_atom = self._get_max_bonds_per_atom() - - def remove_bond(self, int32 atom_index1, int32 atom_index2): - """ - remove_bond(atom_index1, atom_index2) - - Remove a bond from the :class:`BondList`. - - If the bond is not existent in the :class:`BondList`, nothing happens. - - Parameters - ---------- - atom_index1, atom_index2 : int - The indices of the atoms whose bond should be removed. - """ - cdef uint32 index1 = _to_positive_index(atom_index1, self._atom_count) - cdef uint32 index2 = _to_positive_index(atom_index2, self._atom_count) - _sort(&index1, &index2) - - # Find the bond in bond list - cdef int i - cdef uint32[:,:] all_bonds_v = self._bonds - for i in range(all_bonds_v.shape[0]): - # Since the bonds have the atom indices sorted - # the reverse check is omitted - if (all_bonds_v[i,0] == index1 and all_bonds_v[i,1] == index2): - self._bonds = np.delete(self._bonds, i, axis=0) - # The maximum bonds per atom is not recalculated, - # as the value can only be decreased on bond removal - # Since this value is only used for pessimistic array allocation - # in 'get_bonds()', the slightly larger memory usage is a better - # option than the repetitive call of _get_max_bonds_per_atom() - - def remove_bonds_to(self, int32 atom_index): - """ - remove_bonds_to(self, atom_index) - - Remove all bonds from the :class:`BondList` where the given atom - is involved. - - Parameters - ---------- - atom_index : int - The index of the atom whose bonds should be removed. - """ - cdef uint32 index = _to_positive_index(atom_index, self._atom_count) - - cdef np.ndarray mask = np.ones(len(self._bonds), dtype=np.uint8) - cdef uint8[:] mask_v = mask - - # Find the bond in bond list - cdef int i - cdef uint32[:,:] all_bonds_v = self._bonds - for i in range(all_bonds_v.shape[0]): - if (all_bonds_v[i,0] == index or all_bonds_v[i,1] == index): - mask_v[i] = False - # Remove the bonds - self._bonds = self._bonds[mask.astype(bool, copy=False)] - # The maximum bonds per atom is not recalculated - # (see 'remove_bond()') - - def remove_bonds(self, bond_list): - """ - remove_bonds(bond_list) - - Remove multiple bonds from the :class:`BondList`. - - All bonds present in `bond_list` are removed from this instance. - If a bond is not existent in this instance, nothing happens. - Only the bond indices, not the bond types, are relevant for - this. - - Parameters - ---------- - bond_list : BondList - The bonds in `bond_list` are removed from this instance. - """ - cdef int i=0, j=0 - - # All bonds in the own BondList - cdef uint32[:,:] all_bonds_v = self._bonds - # The bonds that should be removed - cdef uint32[:,:] rem_bonds_v = bond_list._bonds - cdef np.ndarray mask = np.ones(all_bonds_v.shape[0], dtype=np.uint8) - cdef uint8[:] mask_v = mask - for i in range(all_bonds_v.shape[0]): - for j in range(rem_bonds_v.shape[0]): - if all_bonds_v[i,0] == rem_bonds_v[j,0] \ - and all_bonds_v[i,1] == rem_bonds_v[j,1]: - mask_v[i] = False - - # Remove the bonds - self._bonds = self._bonds[mask.astype(bool, copy=False)] - # The maximum bonds per atom is not recalculated - # (see 'remove_bond()') - - def merge(self, bond_list): - """ - merge(bond_list) - - Merge another :class:`BondList` with this instance into a new - object. - If a bond appears in both :class:`BondList`'s, the - :class:`BondType` from the given `bond_list` takes precedence. - - The internal :class:`ndarray` instances containing the bonds are - simply concatenated and the new atom count is the maximum of - both bond lists. - - Parameters - ---------- - bond_list : BondList - This bond list is merged with this instance. - - Returns - ------- - bond_list : BondList - The merged :class:`BondList`. - - Notes - ----- - This is not equal to using the `+` operator. - - Examples - -------- - - >>> bond_list1 = BondList(3, np.array([(0,1),(1,2)])) - >>> bond_list2 = BondList(5, np.array([(2,3),(3,4)])) - >>> merged_list = bond_list2.merge(bond_list1) - >>> print(merged_list.get_atom_count()) - 5 - >>> print(merged_list) - [[0 1 0] - [1 2 0] - [2 3 0] - [3 4 0]] - - The BondList given as parameter takes precedence: - - >>> # Specifiy bond type to see where a bond is taken from - >>> bond_list1 = BondList(4, np.array([ - ... (0, 1, BondType.SINGLE), - ... (1, 2, BondType.SINGLE) - ... ])) - >>> bond_list2 = BondList(4, np.array([ - ... (1, 2, BondType.DOUBLE), # This one is a duplicate - ... (2, 3, BondType.DOUBLE) - ... ])) - >>> merged_list = bond_list2.merge(bond_list1) - >>> print(merged_list) - [[0 1 1] - [1 2 1] - [2 3 2]] - """ - return BondList( - max(self._atom_count, bond_list._atom_count), - np.concatenate( - [bond_list.as_array(), self.as_array()], - axis=0 - ) - ) - - def __add__(self, bond_list): - return BondList.concatenate([self, bond_list]) - - def __getitem__(self, index): - ## Variables for both, integer and boolean index arrays - cdef uint32[:,:] all_bonds_v - cdef int i - cdef uint32* index1_ptr - cdef uint32* index2_ptr - cdef np.ndarray removal_filter - cdef uint8[:] removal_filter_v - - ## Variables for integer arrays - cdef int32[:] inverse_index_v - cdef int32 new_index1, new_index2 - - ## Variables for boolean mask - # Boolean mask representation of the index - cdef np.ndarray mask - cdef uint8[:] mask_v - # Boolean mask for removal of bonds - cdef np.ndarray offsets - cdef uint32[:] offsets_v - - if isinstance(index, numbers.Integral): - ## Handle single index - return self.get_bonds(index) - - elif isinstance(index, np.ndarray) and index.dtype == bool: - ## Handle boolean masks - copy = self.copy() - all_bonds_v = copy._bonds - # Use 'uint8' instead of 'bool' for memory view - mask = np.frombuffer(index, dtype=np.uint8) - - # Each time an atom is missing in the mask, - # the offset is increased by one - offsets = np.cumsum( - ~mask.astype(bool, copy=False), dtype=np.uint32 - ) - removal_filter = np.ones(all_bonds_v.shape[0], dtype=np.uint8) - removal_filter_v = removal_filter - mask_v = mask - offsets_v = offsets - # If an atom in a bond is not masked, - # the bond is removed from the list - # If an atom is masked, - # its index value is decreased by the respective offset - # The offset is necessary, removing atoms in an AtomArray - # decreases the index of the following atoms - for i in range(all_bonds_v.shape[0]): - # Usage of pointer to increase performance - # as redundant indexing is avoided - index1_ptr = &all_bonds_v[i,0] - index2_ptr = &all_bonds_v[i,1] - if mask_v[index1_ptr[0]] and mask_v[index2_ptr[0]]: - # Both atoms involved in bond are masked - # -> decrease atom index by offset - index1_ptr[0] -= offsets_v[index1_ptr[0]] - index2_ptr[0] -= offsets_v[index2_ptr[0]] - else: - # At least one atom involved in bond is not masked - # -> remove bond - removal_filter_v[i] = False - # Apply the bond removal filter - copy._bonds = copy._bonds[removal_filter.astype(bool, copy=False)] - copy._atom_count = len(np.nonzero(mask)[0]) - copy._max_bonds_per_atom = copy._get_max_bonds_per_atom() - return copy - - else: - ## Convert any other type of index into index array, as it preserves order - copy = self.copy() - all_bonds_v = copy._bonds - index = _to_index_array(index, self._atom_count) - index = _to_positive_index_array(index, self._atom_count) - - # The inverse index is required to efficiently obtain - # the new index of an atom in case of an unsorted index - # array - inverse_index_v = _invert_index(index, self._atom_count) - removal_filter = np.ones(all_bonds_v.shape[0], dtype=np.uint8) - removal_filter_v = removal_filter - for i in range(all_bonds_v.shape[0]): - # Usage of pointer to increase performance - # as redundant indexing is avoided - index1_ptr = &all_bonds_v[i,0] - index2_ptr = &all_bonds_v[i,1] - new_index1 = inverse_index_v[index1_ptr[0]] - new_index2 = inverse_index_v[index2_ptr[0]] - if new_index1 != -1 and new_index2 != -1: - # Both atoms involved in bond are included - # by index array - # -> assign new atom indices - index1_ptr[0] = new_index1 - index2_ptr[0] = new_index2 - else: - # At least one atom in bond is not included - # -> remove bond - removal_filter_v[i] = False - - copy._bonds = copy._bonds[ - removal_filter.astype(bool, copy=False) - ] - # Again, sort indices per bond - # as the correct order is not guaranteed anymore - # for unsorted index arrays - copy._bonds[:,:2] = np.sort(copy._bonds[:,:2], axis=1) - copy._atom_count = len(index) - copy._max_bonds_per_atom = copy._get_max_bonds_per_atom() - return copy - - def __iter__(self): - raise TypeError("'BondList' object is not iterable") - - def __str__(self): - return str(self.as_array()) - - def __eq__(self, item): - if not isinstance(item, BondList): - return False - return (self._atom_count == item._atom_count and - self.as_set() == item.as_set()) - - def __contains__(self, item): - if not isinstance(item, tuple) and len(tuple) != 2: - raise TypeError("Expected a tuple of atom indices") - - cdef int i=0 - - cdef uint32 match_index1, match_index2 - # Sort indices for faster search in loop - cdef uint32 atom_index1 = min(item) - cdef uint32 atom_index2 = max(item) - - cdef uint32[:,:] all_bonds_v = self._bonds - for i in range(all_bonds_v.shape[0]): - match_index1 = all_bonds_v[i,0] - match_index2 = all_bonds_v[i,1] - if atom_index1 == match_index1 and atom_index2 == match_index2: - return True - - return False - - - def _get_max_bonds_per_atom(self): - if self._atom_count == 0: - return 0 - - cdef int i - cdef uint32[:,:] all_bonds_v = self._bonds - # Create an array that counts number of occurences of each index - cdef np.ndarray index_count = np.zeros(self._atom_count, - dtype=np.uint32) - cdef uint32[:] index_count_v = index_count - for i in range(all_bonds_v.shape[0]): - # Increment count of both indices found in bond list at i - index_count_v[all_bonds_v[i,0]] += 1 - index_count_v[all_bonds_v[i,1]] += 1 - return np.max(index_count_v) - - def _remove_redundant_bonds(self): - cdef int j - cdef uint32[:,:] all_bonds_v = self._bonds - # Boolean mask for final removal of redundant atoms - # Unfortunately views of boolean ndarrays are not supported - # -> use uint8 array - cdef np.ndarray redundancy_filter = np.ones(all_bonds_v.shape[0], - dtype=np.uint8) - cdef uint8[:] redundancy_filter_v = redundancy_filter - # Array of pointers to C-arrays - # The array is indexed with the atom indices in the bond list - # The respective C-array contains the indices of bonded atoms - cdef ptr[:] ptrs_v = np.zeros(self._atom_count, dtype=np.uint64) - # Stores the length of the C-arrays - cdef int[:] array_len_v = np.zeros(self._atom_count, dtype=np.int32) - # Iterate over bond list: - # If bond is already listed in the array of pointers, - # set filter to false at that position - # Else add bond to array of pointers - cdef uint32 i1, i2 - cdef uint32* array_ptr - cdef int length - - try: - for j in range(all_bonds_v.shape[0]): - i1 = all_bonds_v[j,0] - i2 = all_bonds_v[j,1] - # Since the bonds have the atom indices sorted - # the reverse check is omitted - if _in_array(ptrs_v[i1], i2, array_len_v[i1]): - redundancy_filter_v[j] = False - else: - # Append bond in respective C-array - # and update C-array length - length = array_len_v[i1] +1 - array_ptr = ptrs_v[i1] - array_ptr = realloc( - array_ptr, length * sizeof(uint32) - ) - if not array_ptr: - raise MemoryError() - array_ptr[length-1] = i2 - ptrs_v[i1] = array_ptr - array_len_v[i1] = length - - finally: - # Free pointers - for i in range(ptrs_v.shape[0]): - free(ptrs_v[i]) - - # Eventually remove redundant bonds - self._bonds = self._bonds[redundancy_filter.astype(bool, copy=False)] - - -cdef uint32 _to_positive_index(int32 index, uint32 array_length) except -1: - """ - Convert a potentially negative index into a positive index. - """ - cdef uint32 pos_index - if index < 0: - pos_index = (array_length + index) - if pos_index < 0: - raise IndexError( - f"Index {index} is out of range " - f"for an atom count of {array_length}" - ) - return pos_index - else: - if index >= array_length: - raise IndexError( - f"Index {index} is out of range " - f"for an atom count of {array_length}" - ) - return index - - -def _to_positive_index_array(index_array, length): - """ - Convert potentially negative values in an array into positive - values and check for out-of-bounds values. - """ - index_array = index_array.copy() - orig_shape = index_array.shape - index_array = index_array.flatten() - negatives = index_array < 0 - index_array[negatives] = length + index_array[negatives] - if (index_array < 0).any(): - raise IndexError( - f"Index {np.min(index_array)} is out of range " - f"for an atom count of {length}" - ) - if (index_array >= length).any(): - raise IndexError( - f"Index {np.max(index_array)} is out of range " - f"for an atom count of {length}" - ) - return index_array.reshape(orig_shape) - - -def _to_index_array(object index, uint32 length): - """ - Convert an index of arbitrary type into an index array. - """ - if isinstance(index, np.ndarray) and np.issubdtype(index.dtype, np.integer): - return index - else: - # Convert into index array - all_indices = np.arange(length, dtype=np.uint32) - return all_indices[index] - - -cdef inline bint _in_array(uint32* array, uint32 atom_index, int array_length): - """ - Test whether a value (`atom_index`) is in a C-array `array`. - """ - cdef int i = 0 - if array == NULL: - return False - for i in range(array_length): - if array[i] == atom_index: - return True - return False - - -cdef inline void _sort(uint32* index1_ptr, uint32* index2_ptr): - cdef uint32 swap - if index1_ptr[0] > index2_ptr[0]: - # Swap indices - swap = index1_ptr[0] - index1_ptr[0] = index2_ptr[0] - index2_ptr[0] = swap - - -@cython.wraparound(False) -# Do bounds check, as the input indices may be out of bounds -def _invert_index(IndexType[:] index_v, uint32 length): - """ - Invert an input index array, so that - if *input[i] = j*, *output[j] = i*. - For all elements *j*, that are not in *input*, *output[j]* = -1. - """ - cdef int32 i - cdef IndexType index_val - inverse_index = np.full(length, -1, dtype=np.int32) - cdef int32[:] inverse_index_v = inverse_index - - for i in range(index_v.shape[0]): - index_val = index_v[i] - if inverse_index_v[index_val] != -1: - # One index can theoretically appear multiple times - # This is currently not supported - raise NotImplementedError( - f"Duplicate indices are not supported, " - f"but index {index_val} appeared multiple times" - ) - inverse_index_v[index_val] = i - - - return inverse_index - - - - -# fmt: off -_DEFAULT_DISTANCE_RANGE = { - # Taken from Allen et al. - # min - 2*std max + 2*std - ("B", "C" ) : (1.556 - 2*0.015, 1.556 + 2*0.015), - ("BR", "C" ) : (1.875 - 2*0.029, 1.966 + 2*0.029), - ("BR", "O" ) : (1.581 - 2*0.007, 1.581 + 2*0.007), - ("C", "C" ) : (1.174 - 2*0.011, 1.588 + 2*0.025), - ("C", "CL") : (1.713 - 2*0.011, 1.849 + 2*0.011), - ("C", "F" ) : (1.320 - 2*0.009, 1.428 + 2*0.009), - ("C", "H" ) : (1.059 - 2*0.030, 1.099 + 2*0.007), - ("C", "I" ) : (2.095 - 2*0.015, 2.162 + 2*0.015), - ("C", "N" ) : (1.325 - 2*0.009, 1.552 + 2*0.023), - ("C", "O" ) : (1.187 - 2*0.011, 1.477 + 2*0.008), - ("C", "P" ) : (1.791 - 2*0.006, 1.855 + 2*0.019), - ("C", "S" ) : (1.630 - 2*0.014, 1.863 + 2*0.015), - ("C", "SE") : (1.893 - 2*0.013, 1.970 + 2*0.032), - ("C", "SI") : (1.837 - 2*0.012, 1.888 + 2*0.023), - ("CL", "O" ) : (1.414 - 2*0.026, 1.414 + 2*0.026), - ("CL", "P" ) : (1.997 - 2*0.035, 2.008 + 2*0.035), - ("CL", "S" ) : (2.072 - 2*0.023, 2.072 + 2*0.023), - ("CL", "SI") : (2.072 - 2*0.009, 2.072 + 2*0.009), - ("F", "N" ) : (1.406 - 2*0.016, 1.406 + 2*0.016), - ("F", "P" ) : (1.495 - 2*0.016, 1.579 + 2*0.025), - ("F", "S" ) : (1.640 - 2*0.011, 1.640 + 2*0.011), - ("F", "SI") : (1.588 - 2*0.014, 1.694 + 2*0.013), - ("H", "N" ) : (1.009 - 2*0.022, 1.033 + 2*0.022), - ("H", "O" ) : (0.967 - 2*0.010, 1.015 + 2*0.017), - ("I", "O" ) : (2.144 - 2*0.028, 2.144 + 2*0.028), - ("N", "N" ) : (1.124 - 2*0.015, 1.454 + 2*0.021), - ("N", "O" ) : (1.210 - 2*0.011, 1.463 + 2*0.012), - ("N", "P" ) : (1.571 - 2*0.013, 1.697 + 2*0.015), - ("N", "S" ) : (1.541 - 2*0.022, 1.710 + 2*0.019), - ("N", "SI") : (1.711 - 2*0.019, 1.748 + 2*0.022), - ("O", "P" ) : (1.449 - 2*0.007, 1.689 + 2*0.024), - ("O", "S" ) : (1.423 - 2*0.008, 1.580 + 2*0.015), - ("O", "SI") : (1.622 - 2*0.014, 1.680 + 2*0.008), - ("P", "P" ) : (2.214 - 2*0.022, 2.214 + 2*0.022), - ("P", "S" ) : (1.913 - 2*0.014, 1.954 + 2*0.005), - ("P", "SE") : (2.093 - 2*0.019, 2.093 + 2*0.019), - ("P", "SI") : (2.264 - 2*0.019, 2.264 + 2*0.019), - ("S", "S" ) : (1.897 - 2*0.012, 2.070 + 2*0.022), - ("S", "SE") : (2.193 - 2*0.015, 2.193 + 2*0.015), - ("S", "SI") : (2.145 - 2*0.020, 2.145 + 2*0.020), - ("SE", "SE") : (2.340 - 2*0.024, 2.340 + 2*0.024), - ("SI", "SE") : (2.359 - 2*0.012, 2.359 + 2*0.012), -} -# fmt: on - -def connect_via_distances(atoms, dict distance_range=None, bint inter_residue=True, - default_bond_type=BondType.ANY, bint periodic=False): - """ - connect_via_distances(atoms, distance_range=None, inter_residue=True, - default_bond_type=BondType.ANY, periodic=False) - - Create a :class:`BondList` for a given atom array, based on - pairwise atom distances. - - A :attr:`BondType.ANY`, bond is created for two atoms within the - same residue, if the distance between them is within the expected - bond distance range. - Bonds between two adjacent residues are created for the atoms - expected to connect these residues, i.e. ``'C'`` and ``'N'`` for - peptides and ``"O3'"`` and ``'P'`` for nucleotides. - - Parameters - ---------- - atoms : AtomArray - The structure to create the :class:`BondList` for. - distance_range : dict of tuple(str, str) -> tuple(float, float), optional - Custom minimum and maximum bond distances. - The dictionary keys are tuples of chemical elements representing - the atoms to be potentially bonded. - The order of elements within each tuple does not matter. - The dictionary values are the minimum and maximum bond distance, - respectively, for the given combination of elements. - This parameter updates the default dictionary. - Hence, the default bond distances for missing element pairs are - still taken from the default dictionary. - The default bond distances are taken from :footcite:`Allen1987`. - inter_residue : bool, optional - If true, connections between consecutive amino acids and - nucleotides are also added. - default_bond_type : BondType or int, optional - By default, all created bonds have :attr:`BondType.ANY`. - An alternative :class:`BondType` can be given in this parameter. - periodic : bool, optional - If set to true, bonds can also be detected in periodic - boundary conditions. - The `box` attribute of `atoms` is required in this case. - - Returns - ------- - BondList - The created bond list. - - See Also - -------- - connect_via_residue_names - - Notes - ----- - This method might miss bonds, if the bond distance is unexpectedly - high or low, or it might create false bonds, if two atoms within a - residue are accidentally in the right distance. - A more accurate method for determining bonds is - :func:`connect_via_residue_names()`. - - References - ---------- - - .. footbibliography:: - """ - from .atoms import AtomArray - from .geometry import distance - from .residues import get_residue_starts - - cdef list bonds = [] - cdef int i - cdef int curr_start_i, next_start_i - cdef np.ndarray coord = atoms.coord - cdef np.ndarray coord_in_res - cdef np.ndarray distances - cdef float dist - cdef np.ndarray elements = atoms.element - cdef np.ndarray elements_in_res - cdef int atom_index1, atom_index2 - cdef dict dist_ranges = {} - cdef tuple dist_range - cdef float min_dist, max_dist - - if not isinstance(atoms, AtomArray): - raise TypeError(f"Expected 'AtomArray', not '{type(atoms).__name__}'") - if periodic: - if atoms.box is None: - raise BadStructureError("Atom array has no box") - box = atoms.box - else: - box = None - - # Prepare distance dictionary... - if distance_range is None: - distance_range = {} - # Merge default and custom entries - for key, val in itertools.chain( - _DEFAULT_DISTANCE_RANGE.items(), distance_range.items() - ): - element1, element2 = key - # Add entries for both element orders - dist_ranges[(element1.upper(), element2.upper())] = val - dist_ranges[(element2.upper(), element1.upper())] = val - - residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) - # Omit exclsive stop in 'residue_starts' - for i in range(len(residue_starts)-1): - curr_start_i = residue_starts[i] - next_start_i = residue_starts[i+1] - - elements_in_res = elements[curr_start_i : next_start_i] - coord_in_res = coord[curr_start_i : next_start_i] - # Matrix containing all pairwise atom distances in the residue - distances = distance( - coord_in_res[:, np.newaxis, :], - coord_in_res[np.newaxis, :, :], - box - ) - for atom_index1 in range(len(elements_in_res)): - for atom_index2 in range(atom_index1): - dist_range = dist_ranges.get(( - elements_in_res[atom_index1], - elements_in_res[atom_index2] - )) - if dist_range is None: - # No bond distance entry for this element - # combination -> skip - continue - else: - min_dist, max_dist = dist_range - dist = distances[atom_index1, atom_index2] - if dist >= min_dist and dist <= max_dist: - bonds.append(( - curr_start_i + atom_index1, - curr_start_i + atom_index2, - default_bond_type - )) - - bond_list = BondList(atoms.array_length(), np.array(bonds)) - - if inter_residue: - inter_bonds = _connect_inter_residue(atoms, residue_starts) - if default_bond_type == BondType.ANY: - # As all bonds should be of type ANY, convert also - # inter-residue bonds to ANY - inter_bonds.remove_bond_order() - return bond_list.merge(inter_bonds) - else: - return bond_list - - - -def connect_via_residue_names(atoms, bint inter_residue=True, - dict custom_bond_dict=None): - """ - connect_via_residue_names(atoms, inter_residue=True, custom_bond_dict=None) - - Create a :class:`BondList` for a given atom array (stack), based on - the deposited bonds for each residue in the RCSB ``components.cif`` - dataset. - - Bonds between two adjacent residues are created for the atoms - expected to connect these residues, i.e. ``'C'`` and ``'N'`` for - peptides and ``"O3'"`` and ``'P'`` for nucleotides. - - Parameters - ---------- - atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) - The structure to create the :class:`BondList` for. - inter_residue : bool, optional - If true, connections between consecutive amino acids and - nucleotides are also added. - custom_bond_dict : dict (str -> dict ((str, str) -> int)), optional - A dictionary of dictionaries: - The outer dictionary maps residue names to inner dictionaries. - The inner dictionary maps tuples of two atom names to their - respective :class:`BondType` (represented as integer). - If given, these bonds are used instead of the bonds read from - ``components.cif``. - - Returns - ------- - BondList - The created bond list. - No bonds are added for residues that are not found in - ``components.cif``. - - See Also - -------- - connect_via_distances - - Notes - ----- - This method can only find bonds for residues in the RCSB - *Chemical Component Dictionary*, unless `custom_bond_dict` is set. - Although this includes most molecules one encounters, this will fail - for exotic molecules, e.g. specialized inhibitors. - - .. currentmodule:: biotite.structure.info - - To supplement `custom_bond_dict` with bonds for residues from the - *Chemical Component Dictionary* you can use - :meth:`bonds_in_residue()`. - - >>> import pprint - >>> custom_bond_dict = { - ... "XYZ": { - ... ("A", "B"): BondType.SINGLE, - ... ("B", "C"): BondType.SINGLE - ... } - ... } - >>> # Supplement with bonds for common residues - >>> custom_bond_dict["ALA"] = bonds_in_residue("ALA") - >>> pp = pprint.PrettyPrinter(width=40) - >>> pp.pprint(custom_bond_dict) - {'ALA': {('C', 'O'): , - ('C', 'OXT'): , - ('CA', 'C'): , - ('CA', 'CB'): , - ('CA', 'HA'): , - ('CB', 'HB1'): , - ('CB', 'HB2'): , - ('CB', 'HB3'): , - ('N', 'CA'): , - ('N', 'H'): , - ('N', 'H2'): , - ('OXT', 'HXT'): }, - 'XYZ': {('A', 'B'): , - ('B', 'C'): }} - """ - from .info.bonds import bonds_in_residue - from .residues import get_residue_starts - - cdef list bonds = [] - cdef int res_i - cdef int i, j - cdef int curr_start_i, next_start_i - cdef np.ndarray atom_names = atoms.atom_name - cdef np.ndarray atom_names_in_res - cdef np.ndarray res_names = atoms.res_name - cdef str atom_name1, atom_name2 - cdef int64[:] atom_indices1, atom_indices2 - cdef dict bond_dict_for_res - - residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) - # Omit exclsive stop in 'residue_starts' - for res_i in range(len(residue_starts)-1): - curr_start_i = residue_starts[res_i] - next_start_i = residue_starts[res_i+1] - - if custom_bond_dict is None: - bond_dict_for_res = bonds_in_residue(res_names[curr_start_i]) - else: - bond_dict_for_res = custom_bond_dict.get( - res_names[curr_start_i], {} - ) - - atom_names_in_res = atom_names[curr_start_i : next_start_i] - for (atom_name1, atom_name2), bond_type in bond_dict_for_res.items(): - atom_indices1 = np.where(atom_names_in_res == atom_name1)[0] \ - .astype(np.int64, copy=False) - atom_indices2 = np.where(atom_names_in_res == atom_name2)[0] \ - .astype(np.int64, copy=False) - # In rare cases the same atom name may appear multiple times - # (e.g. in altlocs) - # -> create all possible bond combinations - for i in range(atom_indices1.shape[0]): - for j in range(atom_indices2.shape[0]): - bonds.append(( - curr_start_i + atom_indices1[i], - curr_start_i + atom_indices2[j], - bond_type - )) - - bond_list = BondList(atoms.array_length(), np.array(bonds)) - - if inter_residue: - inter_bonds = _connect_inter_residue(atoms, residue_starts) - return bond_list.merge(inter_bonds) - else: - return bond_list - - - -_PEPTIDE_LINKS = ["PEPTIDE LINKING", "L-PEPTIDE LINKING", "D-PEPTIDE LINKING"] -_NUCLEIC_LINKS = ["RNA LINKING", "DNA LINKING"] - -def _connect_inter_residue(atoms, residue_starts): - """ - Create a :class:`BondList` containing the bonds between adjacent - amino acid or nucleotide residues. - - Parameters - ---------- - atoms : AtomArray or AtomArrayStack - The structure to create the :class:`BondList` for. - residue_starts : ndarray, dtype=int - Return value of - ``get_residue_starts(atoms, add_exclusive_stop=True)``. - - Returns - ------- - BondList - A bond list containing all inter residue bonds. - """ - from .info.misc import link_type - - cdef list bonds = [] - cdef int i - cdef np.ndarray atom_names = atoms.atom_name - cdef np.ndarray res_names = atoms.res_name - cdef np.ndarray res_ids = atoms.res_id - cdef np.ndarray chain_ids = atoms.chain_id - cdef int curr_start_i, next_start_i, after_next_start_i - cdef str curr_connect_atom_name, next_connect_atom_name - cdef np.ndarray curr_connect_indices, next_connect_indices - - # Iterate over all starts excluding: - # - the last residue and - # - exclusive end index of 'atoms' - for i in range(len(residue_starts)-2): - curr_start_i = residue_starts[i] - next_start_i = residue_starts[i+1] - after_next_start_i = residue_starts[i+2] - - # Check if the current and next residue is in the same chain - if chain_ids[next_start_i] != chain_ids[curr_start_i]: - continue - # Check if the current and next residue - # have consecutive residue IDs - # (Same residue ID is also possible if insertion code is used) - if res_ids[next_start_i] - res_ids[curr_start_i] > 1: - continue - - # Get link type for this residue from RCSB components.cif - curr_link = link_type(res_names[curr_start_i]) - next_link = link_type(res_names[next_start_i]) - - if curr_link in _PEPTIDE_LINKS and next_link in _PEPTIDE_LINKS: - curr_connect_atom_name = "C" - next_connect_atom_name = "N" - elif curr_link in _NUCLEIC_LINKS and next_link in _NUCLEIC_LINKS: - curr_connect_atom_name = "O3'" - next_connect_atom_name = "P" - else: - # Create no bond if the connection types of consecutive - # residues are not compatible - continue - - # Index in atom array for atom name in current residue - # Addition of 'curr_start_i' is necessary, as only a slice of - # 'atom_names' is taken, beginning at 'curr_start_i' - curr_connect_indices = curr_start_i + np.where( - atom_names[curr_start_i : next_start_i] - == curr_connect_atom_name - )[0] - # Index in atom array for atom name in next residue - next_connect_indices = next_start_i + np.where( - atom_names[next_start_i : after_next_start_i] - == next_connect_atom_name - )[0] - if len(curr_connect_indices) == 0 or len(next_connect_indices) == 0: - # The connector atoms are not found in the adjacent residues - # -> skip this bond - continue - - bonds.append(( - curr_connect_indices[0], - next_connect_indices[0], - BondType.SINGLE - )) - - return BondList(atoms.array_length(), np.array(bonds, dtype=np.uint32)) - - - -def find_connected(bond_list, uint32 root, bint as_mask=False): - """ - find_connected(bond_list, root, as_mask=False) - - Get indices to all atoms that are directly or indirectly connected - to the root atom indicated by the given index. - - An atom is *connected* to the `root` atom, if that atom is reachable - by traversing an arbitrary number of bonds, starting from the - `root`. - Effectively, this means that all atoms are *connected* to `root`, - that are in the same molecule as `root`. - Per definition `root` is also *connected* to itself. - - Parameters - ---------- - bond_list : BondList - The reference bond list. - root : int - The index of the root atom. - as_mask : bool, optional - If true, the connected atom indices are returned as boolean - mask. - By default, the connected atom indices are returned as integer - array. - - Returns - ------- - connected : ndarray, dtype=int or ndarray, dtype=bool - Either a boolean mask or an integer array, representing the - connected atoms. - In case of a boolean mask: ``connected[i] == True``, if the atom - with index ``i`` is connected. - - Examples - -------- - Consider a system with 4 atoms, where only the last atom is not - bonded with the other ones (``0-1-2 3``): - - >>> bonds = BondList(4) - >>> bonds.add_bond(0, 1) - >>> bonds.add_bond(1, 2) - >>> print(find_connected(bonds, 0)) - [0 1 2] - >>> print(find_connected(bonds, 1)) - [0 1 2] - >>> print(find_connected(bonds, 2)) - [0 1 2] - >>> print(find_connected(bonds, 3)) - [3] - """ - all_bonds, _ = bond_list.get_all_bonds() - - if root >= bond_list.get_atom_count(): - raise ValueError( - f"Root atom index {root} is out of bounds for bond list " - f"representing {bond_list.get_atom_count()} atoms" - ) - - cdef uint8[:] is_connected_mask = np.zeros( - bond_list.get_atom_count(), dtype=np.uint8 - ) - # Find connections in a recursive way, - # by visiting all atoms that are reachable by a bond - _find_connected(bond_list, root, is_connected_mask, all_bonds) - if as_mask: - return is_connected_mask - else: - return np.where(np.asarray(is_connected_mask))[0] - - -cdef _find_connected(bond_list, - int32 index, - uint8[:] is_connected_mask, - int32[:,:] all_bonds): - if is_connected_mask[index]: - # This atom has already been visited - # -> exit condition - return - is_connected_mask[index] = True - - cdef int32 j - cdef int32 connected_index - for j in range(all_bonds.shape[1]): - connected_index = all_bonds[index, j] - if connected_index == -1: - # Ignore padding values - continue - _find_connected( - bond_list, connected_index, is_connected_mask, all_bonds - ) - - -def find_rotatable_bonds(bonds): - """ - find_rotatable_bonds(bonds) - - Find all rotatable bonds in a given :class:`BondList`. - - The following conditions must be true for a bond to be counted as - rotatable: - - 1. The bond must be a single bond (``BondType.SINGLE``) - 2. The connected atoms must not be within the same cycle/ring - 3. Both connected atoms must not be terminal, e.g. not a *C-H* - bond, as rotation about such bonds would not change any - coordinates - - Parameters - ---------- - bonds : BondList - The bonds to find the rotatable bonds in. - - Returns - ------- - rotatable_bonds : BondList - The subset of the input `bonds` that contains only rotatable - bonds. - - Examples - -------- - - >>> molecule = residue("TYR") - >>> for i, j, _ in find_rotatable_bonds(molecule.bonds).as_array(): - ... print(molecule.atom_name[i], molecule.atom_name[j]) - N CA - CA C - CA CB - C OXT - CB CG - CZ OH - """ - cdef uint32 i, j - cdef uint32 bond_type - cdef uint32 SINGLE = int(BondType.SINGLE) - cdef bint in_same_cycle - - bond_graph = bonds.as_graph() - cycles = nx.algorithms.cycles.cycle_basis(bond_graph) - - cdef int64[:] number_of_partners_v = np.count_nonzero( - bonds.get_all_bonds()[0] != -1, - axis=1 - ).astype(np.int64, copy=False) - - rotatable_bonds = [] - cdef uint32[:,:] bonds_v = bonds.as_array() - for i, j, bond_type in bonds_v: - # Can only rotate about single bonds - # Furthermore, it makes no sense to rotate about a bond, - # that leads to a single atom - if bond_type == BondType.SINGLE \ - and number_of_partners_v[i] > 1 \ - and number_of_partners_v[j] > 1: - # Cannot rotate about a bond, if the two connected atoms - # are in a cycle - in_same_cycle = False - for cycle in cycles: - if i in cycle and j in cycle: - in_same_cycle = True - if not in_same_cycle: - rotatable_bonds.append((i,j, bond_type)) - return BondList(bonds.get_atom_count(), np.array(rotatable_bonds)) diff --git a/src/rust/structure/bonds.rs b/src/rust/structure/bonds.rs new file mode 100644 index 000000000..73df5465b --- /dev/null +++ b/src/rust/structure/bonds.rs @@ -0,0 +1,1412 @@ +use numpy::ndarray::{Array1, Array2}; +use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2}; +use pyo3::exceptions; +use pyo3::prelude::*; +use pyo3::types::{PyModule, PySet, PyTuple}; +use smallvec::SmallVec; +use std::collections::{HashMap, HashSet}; +use std::convert::TryFrom; + +/// Rust reflection of the :class:`BondType` enum. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +#[repr(u8)] +pub enum BondType { + Any = 0, + Single = 1, + Double = 2, + Triple = 3, + Quadruple = 4, + AromaticSingle = 5, + AromaticDouble = 6, + AromaticTriple = 7, + Coordination = 8, + Aromatic = 9, +} + +impl BondType { + fn without_aromaticity(&self) -> Self { + match self { + BondType::AromaticSingle => BondType::Single, + BondType::AromaticDouble => BondType::Double, + BondType::AromaticTriple => BondType::Triple, + BondType::Aromatic => BondType::Any, + _ => *self, + } + } +} + +#[pyfunction] +pub fn bond_type_members() -> HashMap { + let mut map = HashMap::new(); + map.insert("ANY".to_string(), BondType::Any as u8); + map.insert("SINGLE".to_string(), BondType::Single as u8); + map.insert("DOUBLE".to_string(), BondType::Double as u8); + map.insert("TRIPLE".to_string(), BondType::Triple as u8); + map.insert("QUADRUPLE".to_string(), BondType::Quadruple as u8); + map.insert( + "AROMATIC_SINGLE".to_string(), + BondType::AromaticSingle as u8, + ); + map.insert( + "AROMATIC_DOUBLE".to_string(), + BondType::AromaticDouble as u8, + ); + map.insert( + "AROMATIC_TRIPLE".to_string(), + BondType::AromaticTriple as u8, + ); + map.insert("COORDINATION".to_string(), BondType::Coordination as u8); + map.insert("AROMATIC".to_string(), BondType::Aromatic as u8); + map +} + +impl TryFrom for BondType { + type Error = PyErr; + + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(BondType::Any), + 1 => Ok(BondType::Single), + 2 => Ok(BondType::Double), + 3 => Ok(BondType::Triple), + 4 => Ok(BondType::Quadruple), + 5 => Ok(BondType::AromaticSingle), + 6 => Ok(BondType::AromaticDouble), + 7 => Ok(BondType::AromaticTriple), + 8 => Ok(BondType::Coordination), + 9 => Ok(BondType::Aromatic), + _ => Err(exceptions::PyValueError::new_err(format!( + "BondType {} is invalid", + value + ))), + } + } +} + +impl<'a, 'py> FromPyObject<'a, 'py> for BondType { + type Error = PyErr; + + fn extract(ob: pyo3::Borrowed<'a, 'py, PyAny>) -> Result { + let value: u8 = ob.extract()?; + BondType::try_from(value) + } +} + +impl<'py> IntoPyObject<'py> for BondType { + type Target = PyAny; + type Output = Bound<'py, PyAny>; + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + let bond_type_class = py.import("biotite.structure")?.getattr("BondType")?; + bond_type_class.call1((self as u8,)) + } +} + +/// Internal representation of a single bond. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub struct Bond { + pub atom1: usize, + pub atom2: usize, + pub bond_type: BondType, +} + +impl Bond { + /// Create a new bond with normalized atom indices (smaller index first). + /// Negative indices are resolved relative to `atom_count`. + fn new(atom1: isize, atom2: isize, bond_type: BondType, atom_count: usize) -> PyResult { + let a1 = to_positive_index(atom1, atom_count)?; + let a2 = to_positive_index(atom2, atom_count)?; + let (a1, a2) = if a1 <= a2 { (a1, a2) } else { (a2, a1) }; + Ok(Bond { + atom1: a1, + atom2: a2, + bond_type, + }) + } + + /// Check whether two bonds connect the same atoms, ignoring the bond type. + fn equal_atoms(&self, other: &Bond) -> bool { + self.atom1 == other.atom1 && self.atom2 == other.atom2 + } +} + +/// __init__(atom_count, bonds=None) +/// +/// A bond list stores indices of atoms +/// (usually of an :class:`AtomArray` or :class:`AtomArrayStack`) +/// that form chemical bonds together with the type (or order) of the +/// bond. +/// +/// Internally the bonds are stored as *n x 3* :class:`ndarray`. +/// For each row, the first column specifies the index of the first +/// atom, the second column the index of the second atom involved in the +/// bond. +/// The third column stores an integer that is interpreted as member +/// of the the :class:`BondType` enum, that specifies the order of the +/// bond. +/// +/// When indexing a :class:`BondList`, the index is not forwarded to the +/// internal :class:`ndarray`. Instead the indexing behavior is +/// consistent with indexing an :class:`AtomArray` or +/// :class:`AtomArrayStack`: +/// Bonds with at least one atom index that is not covered by the index +/// are removed, atom indices that occur after an uncovered atom index +/// move up. +/// Effectively, this means that after indexing an :class:`AtomArray` +/// and a :class:`BondList` with the same index, the atom indices in the +/// :class:`BondList` will still point to the same atoms in the +/// :class:`AtomArray`. +/// Indexing a :class:`BondList` with a single integer is equivalent +/// to calling :func:`get_bonds()`. +/// +/// The same consistency applies to adding :class:`BondList` instances +/// via the '+' operator: +/// The atom indices of the second :class:`BondList` are increased by +/// the atom count of the first :class:`BondList` and then both +/// :class:`BondList` objects are merged. +/// +/// Parameters +/// ---------- +/// atom_count : int +/// A positive integer, that specifies the number of atoms the +/// :class:`BondList` refers to +/// (usually the length of an atom array (stack)). +/// Effectively, this value is the exclusive maximum for the indices +/// stored in the :class:`BondList`. +/// bonds : ndarray, shape=(n,2) or shape=(n,3), dtype=int, optional +/// This array contains the indices of atoms which are bonded: +/// For each row, the first column specifies the first atom, +/// the second row the second atom involved in a chemical bond. +/// If an *n x 3* array is provided, the additional column +/// specifies a :class:`BondType` instead of :attr:`BondType.ANY`. +/// By default, the created :class:`BondList` is empty. +/// +/// Notes +/// ----- +/// When initially providing the bonds as :class:`ndarray`, the input is +/// sanitized: Redundant bonds are removed, and each bond entry is +/// sorted so that the lower one of the two atom indices is in the first +/// column. +/// If a bond appears multiple times with different bond types, the +/// first bond takes precedence. +/// +/// Examples +/// -------- +/// +/// Construct a :class:`BondList`, where a central atom (index 1) is +/// connected to three other atoms (index 0, 3 and 4): +/// +/// >>> bond_list = BondList(5, np.array([(1,0),(1,3),(1,4)])) +/// >>> print(bond_list) +/// [[0 1 0] +/// [1 3 0] +/// [1 4 0]] +/// +/// Remove the first atom (index 0) via indexing: +/// The bond containing index 0 is removed, since the corresponding atom +/// does not exist anymore. Since all other atoms move up in their +/// position, the indices in the bond list are decreased by one: +/// +/// >>> bond_list = bond_list[1:] +/// >>> print(bond_list) +/// [[0 2 0] +/// [0 3 0]] +/// +/// :class:`BondList` objects can be associated to an :class:`AtomArray` +/// or :class:`AtomArrayStack`. +/// The following snippet shows this for a benzene molecule: +/// +/// >>> benzene = AtomArray(12) +/// >>> # Omit filling most required annotation categories for brevity +/// >>> benzene.atom_name = np.array( +/// ... ["C1", "C2", "C3", "C4", "C5", "C6", "H1", "H2", "H3", "H4", "H5", "H6"] +/// ... ) +/// >>> benzene.bonds = BondList( +/// ... benzene.array_length(), +/// ... np.array([ +/// ... # Bonds between carbon atoms in the ring +/// ... (0, 1, BondType.AROMATIC_SINGLE), +/// ... (1, 2, BondType.AROMATIC_DOUBLE), +/// ... (2, 3, BondType.AROMATIC_SINGLE), +/// ... (3, 4, BondType.AROMATIC_DOUBLE), +/// ... (4, 5, BondType.AROMATIC_SINGLE), +/// ... (5, 0, BondType.AROMATIC_DOUBLE), +/// ... # Bonds between carbon and hydrogen +/// ... (0, 6, BondType.SINGLE), +/// ... (1, 7, BondType.SINGLE), +/// ... (2, 8, BondType.SINGLE), +/// ... (3, 9, BondType.SINGLE), +/// ... (4, 10, BondType.SINGLE), +/// ... (5, 11, BondType.SINGLE), +/// ... ]) +/// ... ) +/// >>> for i, j, bond_type in benzene.bonds.as_array(): +/// ... print( +/// ... f"{BondType(bond_type).name} bond between " +/// ... f"{benzene.atom_name[i]} and {benzene.atom_name[j]}" +/// ... ) +/// AROMATIC_SINGLE bond between C1 and C2 +/// AROMATIC_DOUBLE bond between C2 and C3 +/// AROMATIC_SINGLE bond between C3 and C4 +/// AROMATIC_DOUBLE bond between C4 and C5 +/// AROMATIC_SINGLE bond between C5 and C6 +/// AROMATIC_DOUBLE bond between C1 and C6 +/// SINGLE bond between C1 and H1 +/// SINGLE bond between C2 and H2 +/// SINGLE bond between C3 and H3 +/// SINGLE bond between C4 and H4 +/// SINGLE bond between C5 and H5 +/// SINGLE bond between C6 and H6 +/// +/// Obtain the bonded atoms for the :math:`C_1`: +/// +/// >>> bonds, types = benzene.bonds.get_bonds(0) +/// >>> print(bonds) +/// [1 5 6] +/// >>> print(types) +/// [5 6 1] +/// >>> print(f"C1 is bonded to {', '.join(benzene.atom_name[bonds])}") +/// C1 is bonded to C2, C6, H1 +/// +/// Cut the benzene molecule in half. +/// Although the first half of the atoms are missing the indices of +/// the cropped :class:`BondList` still represents the bonds of the +/// remaining atoms: +/// +/// >>> half_benzene = benzene[ +/// ... np.isin(benzene.atom_name, ["C4", "C5", "C6", "H4", "H5", "H6"]) +/// ... ] +/// >>> for i, j, bond_type in half_benzene.bonds.as_array(): +/// ... print( +/// ... f"{BondType(bond_type).name} bond between " +/// ... f"{half_benzene.atom_name[i]} and {half_benzene.atom_name[j]}" +/// ... ) +/// AROMATIC_DOUBLE bond between C4 and C5 +/// AROMATIC_SINGLE bond between C5 and C6 +/// SINGLE bond between C4 and H4 +/// SINGLE bond between C5 and H5 +/// SINGLE bond between C6 and H6 +#[pyclass(module = "biotite.structure", subclass)] +#[derive(Clone)] +pub struct BondList { + atom_count: usize, + bonds: Vec, +} + +#[pymethods] +impl BondList { + #[new] + #[pyo3(signature = (atom_count, bonds=None))] + fn new<'py>( + py: Python<'py>, + atom_count: usize, + bonds: Option<&Bound<'py, PyAny>>, + ) -> PyResult { + match bonds { + Some(bonds_obj) => { + // Import numpy and convert to array with int64 dtype + let np = PyModule::import(py, "numpy")?; + let array_obj = np.call_method1("asarray", (bonds_obj,))?; + + // Get the shape + let shape_attr = array_obj.getattr("shape")?; + let shape: Vec = shape_attr.extract()?; + + // Validate the shape + if shape.len() != 2 { + return Err(exceptions::PyValueError::new_err( + "Expected a 2D-ndarray for input bonds", + )); + } + + let n_bonds = shape[0]; + let n_cols = shape[1]; + + if n_cols != 2 && n_cols != 3 { + return Err(exceptions::PyValueError::new_err( + "Input array containing bonds must be either of shape (n,2) or (n,3)", + )); + } + + let bonds_array: PyReadonlyArray2 = np + .call_method1("asarray", (array_obj, np.getattr("intp")?))? + .extract()?; + let array = bonds_array.as_array(); + + let mut bond_vec: Vec = Vec::with_capacity(n_bonds); + for i in 0..n_bonds { + let bond_type = if n_cols == 3 { + let bt_val = array[[i, 2]]; + if bt_val < 0 { + return Err(exceptions::PyValueError::new_err(format!( + "BondType {} is invalid", + bt_val + ))); + } + BondType::try_from(bt_val as u8)? + } else { + BondType::Any + }; + bond_vec.push(Bond::new( + array[[i, 0]], + array[[i, 1]], + bond_type, + atom_count, + )?); + } + + let mut bond_list = BondList { + atom_count, + bonds: bond_vec, + }; + bond_list.remove_redundant_bonds(); + Ok(bond_list) + } + None => Ok(BondList { + atom_count, + bonds: Vec::new(), + }), + } + } + + /// Concatenate multiple :class:`BondList` objects into a single + /// :class:`BondList`, respectively. + /// + /// Parameters + /// ---------- + /// bonds_lists : iterable object of BondList + /// The bond lists to be concatenated. + /// + /// Returns + /// ------- + /// concatenated_bonds : BondList + /// The concatenated bond lists. + /// + /// Examples + /// -------- + /// + /// >>> bonds1 = BondList(2, np.array([(0, 1)])) + /// >>> bonds2 = BondList(3, np.array([(0, 1), (0, 2)])) + /// >>> merged_bonds = BondList.concatenate([bonds1, bonds2]) + /// >>> print(merged_bonds.get_atom_count()) + /// 5 + /// >>> print(merged_bonds.as_array()[:, :2]) + /// [[0 1] + /// [2 3] + /// [2 4]] + #[staticmethod] + fn concatenate(bond_lists: Vec) -> PyResult { + if bond_lists.is_empty() { + return Ok(BondList { + atom_count: 0, + bonds: Vec::new(), + }); + } + + let total_bonds: usize = bond_lists.iter().map(|bl| bl.bonds.len()).sum(); + let mut merged_bonds: Vec = Vec::with_capacity(total_bonds); + let mut cum_atom_count: usize = 0; + + for bond_list in &bond_lists { + for bond in &bond_list.bonds { + merged_bonds.push(Bond { + atom1: bond.atom1 + cum_atom_count, + atom2: bond.atom2 + cum_atom_count, + bond_type: bond.bond_type, + }); + } + cum_atom_count += bond_list.atom_count; + } + + Ok(BondList { + atom_count: cum_atom_count, + bonds: merged_bonds, + }) + } + + /// offset_indices(offset) + /// + /// Increase all atom indices in the :class:`BondList` by the given + /// offset. + /// + /// Implicitly this increases the atom count. + /// + /// Parameters + /// ---------- + /// offset : int + /// The atom indices are increased by this value. + /// Must be positive. + /// + /// Examples + /// -------- + /// + /// >>> bond_list = BondList(5, np.array([(1,0),(1,3),(1,4)])) + /// >>> print(bond_list) + /// [[0 1 0] + /// [1 3 0] + /// [1 4 0]] + /// >>> bond_list.offset_indices(2) + /// >>> print(bond_list) + /// [[2 3 0] + /// [3 5 0] + /// [3 6 0]] + fn offset_indices(&mut self, offset: isize) -> PyResult<()> { + if offset < 0 { + return Err(exceptions::PyValueError::new_err("Offset must be positive")); + } + let offset = offset as usize; + for bond in &mut self.bonds { + bond.atom1 += offset; + bond.atom2 += offset; + } + self.atom_count += offset; + Ok(()) + } + + /// as_array() + /// + /// Obtain a copy of the internal :class:`ndarray`. + /// + /// Returns + /// ------- + /// array : ndarray, shape=(n,3), dtype=np.uint32 + /// Copy of the internal :class:`ndarray`. + /// For each row, the first column specifies the index of the + /// first atom, the second column the index of the second atom + /// involved in the bond. + /// The third column stores the :class:`BondType`. + fn as_array<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { + let n_bonds = self.bonds.len(); + let mut data: Vec = Vec::with_capacity(n_bonds * 3); + for bond in &self.bonds { + data.push(bond.atom1 as u32); + data.push(bond.atom2 as u32); + data.push(bond.bond_type as u32); + } + Array2::from_shape_vec((n_bonds, 3), data) + .expect("Shape mismatch") + .into_pyarray(py) + } + + /// Obtain a set representation of the :class:`BondList`. + /// + /// Returns + /// ------- + /// bond_set : set of tuple(int, int, BondType) + /// A set of tuples. + /// Each tuple represents one bond: + /// The first integer represents the first atom, + /// the second integer represents the second atom, + /// the third value represents the :class:`BondType`. + fn as_set<'py>(&self, py: Python<'py>) -> PyResult> { + let set = PySet::empty(py)?; + for bond in &self.bonds { + let tuple = PyTuple::new( + py, + [ + bond.atom1.into_pyobject(py)?.into_any(), + bond.atom2.into_pyobject(py)?.into_any(), + bond.bond_type.into_pyobject(py)?.into_any(), + ], + )?; + set.add(tuple)?; + } + Ok(set) + } + + /// as_graph() + /// + /// Obtain a graph representation of the :class:`BondList`. + /// + /// Returns + /// ------- + /// bond_set : Graph + /// A *NetworkX* :class:`Graph`. + /// The atom indices are nodes, the bonds are edges. + /// Each edge has a ``"bond_type"`` attribute containing the + /// :class:`BondType`. + /// + /// Examples + /// -------- + /// + /// >>> bond_list = BondList(5, np.array([(1,0,2), (1,3,1), (1,4,1)])) + /// >>> graph = bond_list.as_graph() + /// >>> print(graph.nodes) + /// [0, 1, 3, 4] + /// >>> print(graph.edges) + /// [(0, 1), (1, 3), (1, 4)] + /// >>> for i, j in graph.edges: + /// ... print(i, j, graph.get_edge_data(i, j)) + /// 0 1 {'bond_type': } + /// 1 3 {'bond_type': } + /// 1 4 {'bond_type': } + fn as_graph<'py>(&self, py: Python<'py>) -> PyResult> { + let nx = PyModule::import(py, "networkx")?; + let graph = nx.call_method0("Graph")?; + + let edges = pyo3::types::PyList::empty(py); + for bond in &self.bonds { + let edge_data = pyo3::types::PyDict::new(py); + edge_data.set_item("bond_type", bond.bond_type.into_pyobject(py)?)?; + let edge_tuple = PyTuple::new( + py, + [ + bond.atom1.into_pyobject(py)?.into_any(), + bond.atom2.into_pyobject(py)?.into_any(), + edge_data.into_any(), + ], + )?; + edges.append(edge_tuple)?; + } + + graph.call_method1("add_edges_from", (edges,))?; + Ok(graph) + } + + /// Remove aromaticity from the bond types. + /// + /// :attr:`BondType.AROMATIC_{ORDER}` is converted into + /// :attr:`BondType.{ORDER}`. + /// + /// Examples + /// -------- + /// + /// >>> bond_list = BondList(3) + /// >>> bond_list.add_bond(0, 1, BondType.AROMATIC_SINGLE) + /// >>> bond_list.add_bond(1, 2, BondType.AROMATIC_DOUBLE) + /// >>> bond_list.remove_aromaticity() + /// >>> for i, j, bond_type in bond_list.as_array(): + /// ... print(i, j, BondType(bond_type).name) + /// 0 1 SINGLE + /// 1 2 DOUBLE + fn remove_aromaticity(&mut self) { + for bond in &mut self.bonds { + bond.bond_type = bond.bond_type.without_aromaticity(); + } + } + + /// Remove the bond order information from aromatic bonds, i.e. convert all + /// aromatic bonds to :attr:`BondType.ANY`. + /// + /// Examples + /// -------- + /// + /// >>> bond_list = BondList(3) + /// >>> bond_list.add_bond(0, 1, BondType.AROMATIC_SINGLE) + /// >>> bond_list.add_bond(1, 2, BondType.AROMATIC_DOUBLE) + /// >>> bond_list.remove_kekulization() + /// >>> for i, j, bond_type in bond_list.as_array(): + /// ... print(i, j, BondType(bond_type).name) + /// 0 1 AROMATIC + /// 1 2 AROMATIC + fn remove_kekulization(&mut self) { + for bond in &mut self.bonds { + if matches!( + bond.bond_type, + BondType::AromaticSingle | BondType::AromaticDouble | BondType::AromaticTriple + ) { + bond.bond_type = BondType::Aromatic; + } + } + } + + /// Convert all bonds to :attr:`BondType.ANY`. + fn remove_bond_order(&mut self) { + for bond in &mut self.bonds { + bond.bond_type = BondType::Any; + } + } + + /// convert_bond_type(original_bond_type, new_bond_type) + /// + /// Convert all occurences of a given bond type into another bond type. + /// + /// Parameters + /// ---------- + /// original_bond_type : BondType + /// The bond type to convert. + /// new_bond_type : BondType + /// The new bond type. + /// + /// Examples + /// -------- + /// + /// >>> bond_list = BondList(4) + /// >>> bond_list.add_bond(0, 1, BondType.DOUBLE) + /// >>> bond_list.add_bond(1, 2, BondType.COORDINATION) + /// >>> bond_list.add_bond(2, 3, BondType.COORDINATION) + /// >>> for i, j, bond_type in bond_list.as_array(): + /// ... print(i, j, BondType(bond_type).name) + /// 0 1 DOUBLE + /// 1 2 COORDINATION + /// 2 3 COORDINATION + /// >>> bond_list.convert_bond_type(BondType.COORDINATION, BondType.SINGLE) + /// >>> for i, j, bond_type in bond_list.as_array(): + /// ... print(i, j, BondType(bond_type).name) + /// 0 1 DOUBLE + /// 1 2 SINGLE + /// 2 3 SINGLE + fn convert_bond_type(&mut self, original_bond_type: BondType, new_bond_type: BondType) { + for bond in &mut self.bonds { + if bond.bond_type == original_bond_type { + bond.bond_type = new_bond_type; + } + } + } + + /// Get the number atoms represent by this :class:`BondList`. + /// + /// Returns + /// ------- + /// atom_count : int + /// The atom count. + fn get_atom_count(&self) -> usize { + self.atom_count + } + + /// Get the total number of bonds. + /// + /// Returns + /// ------- + /// bond_count : int + /// The number of bonds. This is equal to the length of the + /// internal :class:`ndarray` containing the bonds. + fn get_bond_count(&self) -> usize { + self.bonds.len() + } + + /// get_bonds(atom_index) + /// + /// Obtain the indices of the atoms bonded to the atom with the + /// given index as well as the corresponding bond types. + /// + /// Parameters + /// ---------- + /// atom_index : int + /// The index of the atom to get the bonds for. + /// + /// Returns + /// ------- + /// bonds : np.ndarray, dtype=np.uint32, shape=(k,) + /// The indices of connected atoms. + /// bond_types : np.ndarray, dtype=np.uint8, shape=(k,) + /// Array of integers, interpreted as :class:`BondType` + /// instances. + /// This array specifies the type (or order) of the bonds to + /// the connected atoms. + /// + /// Examples + /// -------- + /// + /// >>> bond_list = BondList(5, np.array([(1,0),(1,3),(1,4)])) + /// >>> bonds, types = bond_list.get_bonds(1) + /// >>> print(bonds) + /// [0 3 4] + #[allow(clippy::type_complexity)] + fn get_bonds<'py>( + &self, + py: Python<'py>, + atom_index: isize, + ) -> PyResult<(Bound<'py, PyArray1>, Bound<'py, PyArray1>)> { + let index = to_positive_index(atom_index, self.atom_count)?; + + let mut bonded_atoms: Vec = Vec::new(); + let mut bond_types: Vec = Vec::new(); + + for bond in &self.bonds { + if bond.atom1 == index { + bonded_atoms.push(bond.atom2 as u32); + bond_types.push(bond.bond_type as u8); + } else if bond.atom2 == index { + bonded_atoms.push(bond.atom1 as u32); + bond_types.push(bond.bond_type as u8); + } + } + + Ok(( + Array1::from_vec(bonded_atoms).into_pyarray(py), + Array1::from_vec(bond_types).into_pyarray(py), + )) + } + + /// For each atom index, give the indices of the atoms bonded to + /// this atom as well as the corresponding bond types. + /// + /// Returns + /// ------- + /// bonds : np.ndarray, dtype=np.int32, shape=(n,k) + /// The indices of connected atoms. + /// The first dimension represents the atoms, + /// the second dimension represents the indices of atoms bonded + /// to the respective atom. + /// Atoms can have have different numbers of atoms bonded to + /// them. + /// Therefore, the length of the second dimension *k* is equal + /// to the maximum number of bonds for an atom in this + /// :class:`BondList`. + /// For atoms with less bonds, the corresponding entry in the + /// array is padded with ``-1`` values. + /// bond_types : np.ndarray, dtype=np.int8, shape=(n,k) + /// Array of integers, interpreted as :class:`BondType` + /// instances. + /// This array specifies the bond type (or order) corresponding + /// to the returned `bonds`. + /// It uses the same ``-1``-padding. + /// + /// Examples + /// -------- + /// + /// >>> # BondList for benzene + /// >>> bond_list = BondList( + /// ... 12, + /// ... np.array([ + /// ... # Bonds between the carbon atoms in the ring + /// ... (0, 1, BondType.AROMATIC_SINGLE), + /// ... (1, 2, BondType.AROMATIC_DOUBLE), + /// ... (2, 3, BondType.AROMATIC_SINGLE), + /// ... (3, 4, BondType.AROMATIC_DOUBLE), + /// ... (4, 5, BondType.AROMATIC_SINGLE), + /// ... (5, 0, BondType.AROMATIC_DOUBLE), + /// ... # Bonds between carbon and hydrogen + /// ... (0, 6, BondType.SINGLE), + /// ... (1, 7, BondType.SINGLE), + /// ... (2, 8, BondType.SINGLE), + /// ... (3, 9, BondType.SINGLE), + /// ... (4, 10, BondType.SINGLE), + /// ... (5, 11, BondType.SINGLE), + /// ... ]) + /// ... ) + /// >>> bonds, types = bond_list.get_all_bonds() + /// >>> print(bonds) + /// [[ 1 5 6 -1] + /// [ 0 2 7 -1] + /// [ 1 3 8 -1] + /// [ 2 4 9 -1] + /// [ 3 5 10 -1] + /// [ 4 0 11 -1] + /// [ 0 -1 -1 -1] + /// [ 1 -1 -1 -1] + /// [ 2 -1 -1 -1] + /// [ 3 -1 -1 -1] + /// [ 4 -1 -1 -1] + /// [ 5 -1 -1 -1]] + /// >>> print(types) + /// [[ 5 6 1 -1] + /// [ 5 6 1 -1] + /// [ 6 5 1 -1] + /// [ 5 6 1 -1] + /// [ 6 5 1 -1] + /// [ 5 6 1 -1] + /// [ 1 -1 -1 -1] + /// [ 1 -1 -1 -1] + /// [ 1 -1 -1 -1] + /// [ 1 -1 -1 -1] + /// [ 1 -1 -1 -1] + /// [ 1 -1 -1 -1]] + /// >>> for i in range(bond_list.get_atom_count()): + /// ... bonds_for_atom = bonds[i] + /// ... # Remove trailing '-1' values + /// ... bonds_for_atom = bonds_for_atom[bonds_for_atom != -1] + /// ... print(f"{i}: {bonds_for_atom}") + /// 0: [1 5 6] + /// 1: [0 2 7] + /// 2: [1 3 8] + /// 3: [2 4 9] + /// 4: [ 3 5 10] + /// 5: [ 4 0 11] + /// 6: [0] + /// 7: [1] + /// 8: [2] + /// 9: [3] + /// 10: [4] + /// 11: [5] + fn get_all_bonds<'py>( + &self, + py: Python<'py>, + ) -> (Bound<'py, PyArray2>, Bound<'py, PyArray2>) { + let n_atoms = self.atom_count; + + // Collect bonds per atom using SmallVec to avoid heap allocation, + // as usually in chemistry an atom has a maximum of 4 bonds + let mut per_atom: Vec> = vec![SmallVec::new(); n_atoms]; + + for bond in &self.bonds { + let bt = bond.bond_type as i8; + per_atom[bond.atom1].push((bond.atom2 as i32, bt)); + per_atom[bond.atom2].push((bond.atom1 as i32, bt)); + } + + let max_bonds_per_atom = per_atom.iter().map(|v| v.len()).max().unwrap_or(0); + let mut bonds_matrix = Array2::from_elem((n_atoms, max_bonds_per_atom), -1i32); + let mut types_matrix = Array2::from_elem((n_atoms, max_bonds_per_atom), -1i8); + + for (i, entries) in per_atom.iter().enumerate() { + for (j, &(atom_idx, bt)) in entries.iter().enumerate() { + bonds_matrix[[i, j]] = atom_idx; + types_matrix[[i, j]] = bt; + } + } + + (bonds_matrix.into_pyarray(py), types_matrix.into_pyarray(py)) + } + + /// Represent this :class:`BondList` as adjacency matrix. + /// + /// The adjacency matrix is a quadratic matrix with boolean values + /// according to + /// + /// .. math:: + /// + /// M_{i,j} = + /// \begin{cases} + /// \text{True}, & \text{if } \text{Atom}_i \text{ and } \text{Atom}_j \text{ form a bond} \\ + /// \text{False}, & \text{otherwise} + /// \end{cases}. + /// + /// Returns + /// ------- + /// matrix : ndarray, dtype=bool, shape=(n,n) + /// The created adjacency matrix. + /// + /// Examples + /// -------- + /// + /// >>> # BondList for formaldehyde + /// >>> bond_list = BondList( + /// ... 4, + /// ... np.array([ + /// ... # Bond between carbon and oxygen + /// ... (0, 1, BondType.DOUBLE), + /// ... # Bonds between carbon and hydrogen + /// ... (0, 2, BondType.SINGLE), + /// ... (0, 3, BondType.SINGLE), + /// ... ]) + /// ... ) + /// >>> print(bond_list.adjacency_matrix()) + /// [[False True True True] + /// [ True False False False] + /// [ True False False False] + /// [ True False False False]] + fn adjacency_matrix<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { + let n = self.atom_count; + let mut matrix = Array2::from_elem((n, n), false); + for bond in &self.bonds { + matrix[[bond.atom1, bond.atom2]] = true; + matrix[[bond.atom2, bond.atom1]] = true; + } + matrix.into_pyarray(py) + } + + /// Represent this :class:`BondList` as a matrix depicting the bond + /// type. + /// + /// The matrix is a quadratic matrix: + /// + /// .. math:: + /// + /// M_{i,j} = + /// \begin{cases} + /// \text{BondType}_{ij}, & \text{if } \text{Atom}_i \text{ and } \text{Atom}_j \text{ form a bond} \\ + /// -1, & \text{otherwise} + /// \end{cases}. + /// + /// Returns + /// ------- + /// matrix : ndarray, dtype=np.int8, shape=(n,n) + /// The created bond type matrix. + /// + /// Examples + /// -------- + /// + /// >>> # BondList for formaldehyde + /// >>> bond_list = BondList( + /// ... 4, + /// ... np.array([ + /// ... # Bond between carbon and oxygen + /// ... (0, 1, BondType.DOUBLE), + /// ... # Bonds between carbon and hydrogen + /// ... (0, 2, BondType.SINGLE), + /// ... (0, 3, BondType.SINGLE), + /// ... ]) + /// ... ) + /// >>> print(bond_list.bond_type_matrix()) + /// [[-1 2 1 1] + /// [ 2 -1 -1 -1] + /// [ 1 -1 -1 -1] + /// [ 1 -1 -1 -1]] + fn bond_type_matrix<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { + let n = self.atom_count; + let mut matrix = Array2::from_elem((n, n), -1i8); + for bond in &self.bonds { + let bt = bond.bond_type as i8; + matrix[[bond.atom1, bond.atom2]] = bt; + matrix[[bond.atom2, bond.atom1]] = bt; + } + matrix.into_pyarray(py) + } + + /// add_bond(atom_index1, atom_index2, bond_type=BondType.ANY) + /// + /// Add a bond to the :class:`BondList`. + /// + /// If the bond is already existent, only the bond type is updated. + /// + /// Parameters + /// ---------- + /// atom_index1, atom_index2 : int + /// The indices of the atoms to create a bond for. + /// bond_type : BondType + /// The type of the bond. Default is :attr:`BondType.ANY`. + #[pyo3(signature = (atom_index1, atom_index2, bond_type=BondType::Any))] + fn add_bond( + &mut self, + atom_index1: isize, + atom_index2: isize, + bond_type: BondType, + ) -> PyResult<()> { + let new_bond = Bond::new(atom_index1, atom_index2, bond_type, self.atom_count)?; + + // Check if bond already exists + for bond in &mut self.bonds { + if bond.equal_atoms(&new_bond) { + bond.bond_type = new_bond.bond_type; + return Ok(()); + } + } + + // Bond doesn't exist, add it + self.bonds.push(new_bond); + Ok(()) + } + + /// remove_bond(atom_index1, atom_index2) + /// + /// Remove a bond from the :class:`BondList`. + /// + /// If the bond is not existent in the :class:`BondList`, nothing happens. + /// + /// Parameters + /// ---------- + /// atom_index1, atom_index2 : int + /// The indices of the atoms whose bond should be removed. + fn remove_bond(&mut self, atom_index1: isize, atom_index2: isize) -> PyResult<()> { + let target = Bond::new(atom_index1, atom_index2, BondType::Any, self.atom_count)?; + self.bonds.retain(|bond| !bond.equal_atoms(&target)); + Ok(()) + } + + /// remove_bonds_to(self, atom_index) + /// + /// Remove all bonds from the :class:`BondList` where the given atom + /// is involved. + /// + /// Parameters + /// ---------- + /// atom_index : int + /// The index of the atom whose bonds should be removed. + fn remove_bonds_to(&mut self, atom_index: isize) -> PyResult<()> { + let index = to_positive_index(atom_index, self.atom_count)?; + self.bonds + .retain(|bond| bond.atom1 != index && bond.atom2 != index); + Ok(()) + } + + /// remove_bonds(bond_list) + /// + /// Remove multiple bonds from the :class:`BondList`. + /// + /// All bonds present in `bond_list` are removed from this instance. + /// If a bond is not existent in this instance, nothing happens. + /// Only the bond indices, not the bond types, are relevant for + /// this. + /// + /// Parameters + /// ---------- + /// bond_list : BondList + /// The bonds in `bond_list` are removed from this instance. + fn remove_bonds(&mut self, bond_list: &BondList) { + let bonds_to_remove: HashSet<(usize, usize)> = + bond_list.bonds.iter().map(|b| (b.atom1, b.atom2)).collect(); + self.bonds + .retain(|bond| !bonds_to_remove.contains(&(bond.atom1, bond.atom2))); + } + + /// merge(bond_list) + /// + /// Merge another :class:`BondList` with this instance into a new + /// object. + /// If a bond appears in both :class:`BondList`'s, the + /// :class:`BondType` from the given `bond_list` takes precedence. + /// + /// The internal :class:`ndarray` instances containing the bonds are + /// simply concatenated and the new atom count is the maximum of + /// both bond lists. + /// + /// Parameters + /// ---------- + /// bond_list : BondList + /// This bond list is merged with this instance. + /// + /// Returns + /// ------- + /// bond_list : BondList + /// The merged :class:`BondList`. + /// + /// Notes + /// ----- + /// This is not equal to using the `+` operator. + /// + /// Examples + /// -------- + /// + /// >>> bond_list1 = BondList(3, np.array([(0,1),(1,2)])) + /// >>> bond_list2 = BondList(5, np.array([(2,3),(3,4)])) + /// >>> merged_list = bond_list2.merge(bond_list1) + /// >>> print(merged_list.get_atom_count()) + /// 5 + /// >>> print(merged_list) + /// [[0 1 0] + /// [1 2 0] + /// [2 3 0] + /// [3 4 0]] + /// + /// The BondList given as parameter takes precedence: + /// + /// >>> # Specifiy bond type to see where a bond is taken from + /// >>> bond_list1 = BondList(4, np.array([ + /// ... (0, 1, BondType.SINGLE), + /// ... (1, 2, BondType.SINGLE) + /// ... ])) + /// >>> bond_list2 = BondList(4, np.array([ + /// ... (1, 2, BondType.DOUBLE), # This one is a duplicate + /// ... (2, 3, BondType.DOUBLE) + /// ... ])) + /// >>> merged_list = bond_list2.merge(bond_list1) + /// >>> print(merged_list) + /// [[0 1 1] + /// [1 2 1] + /// [2 3 2]] + fn merge(&self, bond_list: &BondList) -> BondList { + let new_atom_count = self.atom_count.max(bond_list.atom_count); + + // Concatenate bonds: other bond_list takes precedence (put first) + let total_bonds = self.bonds.len() + bond_list.bonds.len(); + let mut merged_bonds: Vec = Vec::with_capacity(total_bonds); + merged_bonds.extend(bond_list.bonds.iter().cloned()); + merged_bonds.extend(self.bonds.iter().cloned()); + + let mut result = BondList { + atom_count: new_atom_count, + bonds: merged_bonds, + }; + result.remove_redundant_bonds(); + result + } + + fn __add__(&self, other: &BondList) -> BondList { + BondList::concatenate(vec![self.clone(), other.clone()]).unwrap() + } + + fn __getitem__<'py>( + &self, + py: Python<'py>, + index: &Bound<'py, PyAny>, + ) -> PyResult> { + // Handle single integer index + if let Ok(int_index) = index.extract::() { + let (bonds, types) = self.get_bonds(py, int_index)?; + let tuple = PyTuple::new(py, [bonds.into_any(), types.into_any()])?; + return Ok(tuple.into_any()); + } + + // Handle slices + if let Ok(slice) = index.cast::() { + let slice = slice.indices(self.atom_count as isize)?; + let index_array: Vec = (0..slice.slicelength as isize) + .map(|i| slice.start + i * slice.step) + .collect(); + return self.index_with_int_array(py, &index_array); + } + + // Convert to numpy array for uniform handling + let np = PyModule::import(py, "numpy")?; + let array = np.call_method1("asarray", (index,))?; + let kind: String = array.getattr("dtype")?.getattr("kind")?.extract()?; + + if kind == "b" { + // Boolean mask + let bool_array: PyReadonlyArray1 = array.extract()?; + let ndarray_view = bool_array.as_array(); + if ndarray_view.len() != self.atom_count { + return Err(exceptions::PyIndexError::new_err(format!( + "Boolean index has length {}, expected {}", + ndarray_view.len(), + self.atom_count + ))); + } + if let Some(array_view) = ndarray_view.as_slice() { + let result = self.index_with_bool_mask(array_view)?; + Ok(result.into_pyobject(py)?.into_any()) + } else { + // Underlying buffer is not contiguous -> copy into new vector + let vec: Vec = ndarray_view.iter().copied().collect(); + let result = self.index_with_bool_mask(&vec)?; + Ok(result.into_pyobject(py)?.into_any()) + } + } else { + // Integer array index + let int_array: PyReadonlyArray1 = np + .call_method1("asarray", (&array, np.getattr("intp")?))? + .extract()?; + let ndarray_view = int_array.as_array(); + if let Some(array_view) = ndarray_view.as_slice() { + self.index_with_int_array(py, array_view) + } else { + // Underlying buffer is not contiguous -> copy into new vector + let vec: Vec = ndarray_view.iter().copied().collect(); + self.index_with_int_array(py, &vec) + } + } + } + + fn __contains__(&self, item: (isize, isize)) -> PyResult { + let target = match Bond::new(item.0, item.1, BondType::Any, self.atom_count) { + Ok(bond) => bond, + // If the atom indices are out of range, the bond is not part of the bond list + Err(e) => return Ok(false), + }; + Ok(self.bonds.iter().any(|bond| bond.equal_atoms(&target))) + } + + fn __eq__(&self, other: &BondList) -> bool { + if self.atom_count != other.atom_count { + return false; + } + let self_set: HashSet<&Bond> = self.bonds.iter().collect(); + let other_set: HashSet<&Bond> = other.bonds.iter().collect(); + self_set == other_set + } + + fn __str__(&self, py: Python<'_>) -> PyResult { + let array = self.as_array(py); + array.call_method0("__str__")?.extract() + } + + fn __iter__(&self) -> PyResult<()> { + Err(exceptions::PyTypeError::new_err( + "'BondList' object is not iterable", + )) + } + + // Pickling support + fn __reduce__<'py>(&self, py: Python<'py>) -> PyResult> { + let struc = PyModule::import(py, "biotite.structure")?; + let cls = struc.getattr("BondList")?; + let from_state = cls.getattr("_from_state")?; + + // Serialize bonds as flat vector of [atom1, atom2, type] arrays + let bonds_data: Vec<[usize; 3]> = self + .bonds + .iter() + .map(|b| [b.atom1, b.atom2, b.bond_type as usize]) + .collect(); + + let args = PyTuple::new( + py, + [ + self.atom_count.into_pyobject(py)?.into_any().unbind(), + bonds_data.into_pyobject(py)?.into_any().unbind(), + ], + )?; + + PyTuple::new(py, [from_state.unbind(), args.into_any().unbind()]) + } + + #[staticmethod] + #[pyo3(name = "_from_state")] + fn from_state(atom_count: usize, bonds_data: Vec<[usize; 3]>) -> PyResult { + let bonds: Vec = bonds_data + .into_iter() + .map(|[a1, a2, bt]| { + Ok(Bond { + atom1: a1, + atom2: a2, + bond_type: BondType::try_from(bt as u8)?, + }) + }) + .collect::>>()?; + + Ok(BondList { atom_count, bonds }) + } + + // Copy support for biotite.Copyable interface + fn copy(&self) -> BondList { + self.clone() + } + + fn __copy__(&self) -> BondList { + self.clone() + } + + fn __deepcopy__(&self, _memo: &Bound<'_, PyAny>) -> BondList { + self.clone() + } +} + +impl BondList { + pub fn get_bonds_ref(&self) -> &[Bond] { + &self.bonds + } + + /// Create an empty bond list for internal use from other Rust modules. + pub fn empty(atom_count: usize) -> Self { + BondList { + atom_count, + bonds: Vec::new(), + } + } + + /// Add a bond to the bond list. + /// + /// This function is unsafe because it does not check for duplicate bonds. + /// It is the responsibility of the caller to ensure that the bond is not already in the list + /// or that ``remove_redundant_bonds()`` is called after adding all bonds. + pub unsafe fn add(&mut self, bond: Bond) -> PyResult<()> { + self.bonds.push(bond); + Ok(()) + } + + /// Remove redundant bonds, keeping the first occurrence. + /// Uses per-atom neighbor lists instead of HashSet for better performance + /// with typical chemistry data (max 4 bonds per atom). + pub fn remove_redundant_bonds(&mut self) { + let mut seen_per_atom: Vec> = + vec![SmallVec::new(); self.atom_count]; + self.bonds.retain(|bond| { + let key = (bond.atom1, bond.atom2); + if seen.contains(&key) { + false + } else { + seen.insert(key); + true + } + }); + } + + /// Index the bond list with a boolean mask. + /// + /// If an atom in a bond is not masked, the bond is removed. + /// If an atom is masked, its index is decreased by the number of + /// preceding unmasked atoms (the offset). + /// The offset is necessary because removing atoms from an AtomArray + /// decreases the indices of the following atoms. + fn index_with_bool_mask(&self, mask: &[bool]) -> PyResult { + // Each time an atom is not in the mask, the offset increases by one + let mut cumulative_offset: usize = 0; + let offsets: Vec = mask + .iter() + .map(|&is_selected| { + if !is_selected { + cumulative_offset += 1; + } + cumulative_offset + }) + .collect(); + + let new_atom_count = mask.iter().filter(|&&x| x).count(); + + let mut new_bonds: Vec = Vec::new(); + for bond in &self.bonds { + if mask[bond.atom1] && mask[bond.atom2] { + // Both atoms involved in bond are masked + // -> decrease atom index by offset + new_bonds.push(Bond { + atom1: bond.atom1 - offsets[bond.atom1], + atom2: bond.atom2 - offsets[bond.atom2], + bond_type: bond.bond_type, + }); + } + // Otherwise at least one atom is not masked -> bond is dropped + } + + Ok(BondList { + atom_count: new_atom_count, + bonds: new_bonds, + }) + } + + /// Index the bond list with an integer array. + /// + /// Uses an inverse index to efficiently obtain the new index of an + /// atom, which is especially important for unsorted index arrays. + /// Bond indices are re-sorted (via `Bond::new`) since the correct + /// order is not guaranteed for unsorted index arrays. + fn index_with_int_array<'py>( + &self, + py: Python<'py>, + indices: &[isize], + ) -> PyResult> { + // Build inverse index: if indices[i] = j, then inverse[j] = i. + // For all atoms j not in the index array, inverse[j] = -1. + let mut inverse_index: Vec = vec![-1; self.atom_count]; + for (new_idx, &orig_idx) in indices.iter().enumerate() { + let pos_idx = to_positive_index(orig_idx, self.atom_count)?; + if inverse_index[pos_idx] != -1 { + return Err(exceptions::PyNotImplementedError::new_err(format!( + "Duplicate indices are not supported, but index {} appeared multiple times", + pos_idx + ))); + } + inverse_index[pos_idx] = new_idx as isize; + } + + let new_atom_count = indices.len(); + let mut new_bonds: Vec = Vec::new(); + + for bond in &self.bonds { + let new_idx1 = inverse_index[bond.atom1]; + let new_idx2 = inverse_index[bond.atom2]; + if new_idx1 != -1 && new_idx2 != -1 { + // Both atoms involved in bond are included by the index array + // -> assign new atom indices + new_bonds.push(Bond::new( + new_idx1, + new_idx2, + bond.bond_type, + new_atom_count, + )?); + } + // Otherwise at least one atom is not included -> bond is dropped + } + + let result = BondList { + atom_count: new_atom_count, + bonds: new_bonds, + }; + Ok(result.into_pyobject(py)?.into_any()) + } +} + +/// Convert a potentially negative index to a positive index. +#[inline(always)] +fn to_positive_index(index: isize, array_length: usize) -> PyResult { + if index < 0 { + let pos_index = array_length as isize + index; + if pos_index < 0 { + return Err(exceptions::PyIndexError::new_err(format!( + "Index {} is out of range for an atom count of {}", + index, array_length + ))); + } + Ok(pos_index as usize) + } else { + if index >= array_length as isize { + return Err(exceptions::PyIndexError::new_err(format!( + "Index {} is out of range for an atom count of {}", + index, array_length + ))); + } + Ok(index as usize) + } +} diff --git a/src/rust/structure/io/mod.rs b/src/rust/structure/io/mod.rs index de64902a1..91c7372df 100644 --- a/src/rust/structure/io/mod.rs +++ b/src/rust/structure/io/mod.rs @@ -1,7 +1,7 @@ use crate::add_subpackage; use pyo3::prelude::*; -mod pdb; +pub mod pdb; pub fn module<'py>(parent_module: &Bound<'py, PyModule>) -> PyResult> { let module = PyModule::new(parent_module.py(), "io")?; diff --git a/src/rust/structure/io/pdb/mod.rs b/src/rust/structure/io/pdb/mod.rs index add89faa8..3f4b87d90 100644 --- a/src/rust/structure/io/pdb/mod.rs +++ b/src/rust/structure/io/pdb/mod.rs @@ -3,14 +3,11 @@ use pyo3::prelude::*; pub mod file; pub mod hybrid36; -use file::*; -use hybrid36::*; - pub fn module<'py>(parent_module: &Bound<'py, PyModule>) -> PyResult> { let module = PyModule::new(parent_module.py(), "pdb")?; - module.add_class::()?; - module.add_function(wrap_pyfunction!(encode_hybrid36, &module)?)?; - module.add_function(wrap_pyfunction!(decode_hybrid36, &module)?)?; - module.add_function(wrap_pyfunction!(max_hybrid36_number, &module)?)?; + module.add_class::()?; + module.add_function(wrap_pyfunction!(hybrid36::encode_hybrid36, &module)?)?; + module.add_function(wrap_pyfunction!(hybrid36::decode_hybrid36, &module)?)?; + module.add_function(wrap_pyfunction!(hybrid36::max_hybrid36_number, &module)?)?; Ok(module) } diff --git a/src/rust/structure/mod.rs b/src/rust/structure/mod.rs index 548340cb6..4135ca82e 100644 --- a/src/rust/structure/mod.rs +++ b/src/rust/structure/mod.rs @@ -1,18 +1,19 @@ use crate::add_subpackage; use pyo3::prelude::*; +pub mod bonds; pub mod celllist; pub mod io; pub mod sasa; pub mod util; -use celllist::*; - pub fn module<'py>(parent_module: &Bound<'py, PyModule>) -> PyResult> { let module = PyModule::new(parent_module.py(), "structure")?; - module.add_class::()?; - module.add_class::()?; - module.add_function(pyo3::wrap_pyfunction!(sasa::sasa, &module)?)?; + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + module.add_function(wrap_pyfunction!(bonds::bond_type_members, &module)?)?; + module.add_function(wrap_pyfunction!(sasa::sasa, &module)?)?; add_subpackage(&module, &io::module(&module)?, "biotite.rust.structure.io")?; Ok(module) } diff --git a/tests/structure/test_bonds.py b/tests/structure/test_bonds.py index 0ac2608d9..fdcdb2127 100644 --- a/tests/structure/test_bonds.py +++ b/tests/structure/test_bonds.py @@ -3,13 +3,13 @@ # information. import itertools +import pickle from os.path import join import numpy as np import pytest import biotite.structure as struc import biotite.structure.info as info import biotite.structure.io as strucio -import biotite.structure.io.pdbx as pdbx from tests.util import data_dir @@ -59,8 +59,19 @@ def test_creation(bond_list): [0, 4, 0], [4, 6, 0], ] - assert bond_list._max_bonds_per_atom == 3 - assert bond_list._atom_count == 7 + assert bond_list.get_atom_count() == 7 + + +@pytest.mark.parametrize("dim", [2, 3]) +def test_empty_creation(dim): + """ + Test creating a :class:`BondList` with empty bonds, which should be equal to a + :class:`BondList` with no provided bonds. + """ + N_ATOMS = 10 + assert struc.BondList( + N_ATOMS, np.zeros((0, dim), dtype=np.int64) + ) == struc.BondList(N_ATOMS) def test_invalid_creation(): @@ -175,7 +186,6 @@ def test_add_two_bond_list(): bond_list1 = struc.BondList(2, np.array([(0, 1)])) # max_bond_per_atom=1 bond_list2 = struc.BondList(3, np.array([(0, 1), (0, 2)])) # max_bond_per_atom=2 added_list = bond_list1 + bond_list2 - assert added_list._max_bonds_per_atom == 2 assert added_list.get_bonds(2)[0].tolist() == [3, 4] assert added_list.as_array().tolist() == [[0, 1, 0], [2, 3, 0], [2, 4, 0]] @@ -246,8 +256,7 @@ def test_concatenation(bond_list): [7, 8, 2], [8, 9, 2], ] - assert bond_list._max_bonds_per_atom == 3 - assert bond_list._atom_count == 10 + assert bond_list.get_atom_count() == 10 def test_indexing(bond_list): @@ -556,7 +565,7 @@ def test_find_rotatable_bonds(res_name, expected_bonds): @pytest.mark.parametrize( - "cif_path, expected_bond_idces", + "cif_path, expected_bond_indices", [ ( join(data_dir("structure"), "3o5r.cif"), @@ -564,7 +573,7 @@ def test_find_rotatable_bonds(res_name, expected_bonds): ) ], ) -def test_canonical_bonds_with_altloc_occupancy(cif_path, expected_bond_idces): +def test_canonical_bonds_with_altloc_occupancy(cif_path, expected_bond_indices): """ Test whether canonical inter-residue bonds are correctly computed when `altloc="occupancy"` and the higher-occupancy atom occurs second in the CIF file. @@ -575,7 +584,15 @@ def test_canonical_bonds_with_altloc_occupancy(cif_path, expected_bond_idces): cif_file.block, altloc="occupancy", include_bonds=True ) - atom1, atom2 = expected_bond_idces + atom1, atom2 = expected_bond_indices # Assert that the canonical inter-residue bond exists assert atom2 in atom_array.bonds.get_bonds(atom1)[0] + + +def test_pickle(bond_list): + """ + Check that a BondList survives a pickle round-trip unchanged. + """ + restored = pickle.loads(pickle.dumps(bond_list)) + assert restored == bond_list From dbef707639e825640763f54094c386b355972cec Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Tue, 17 Feb 2026 10:45:49 +0100 Subject: [PATCH 2/5] Fix enum values --- src/biotite/structure/basepairs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/biotite/structure/basepairs.py b/src/biotite/structure/basepairs.py index c2c8a687e..ac264bb99 100644 --- a/src/biotite/structure/basepairs.py +++ b/src/biotite/structure/basepairs.py @@ -292,9 +292,9 @@ class Edge(IntEnum): This enum type represents the interacting edge for a given base. """ - INVALID = (0,) - WATSON_CRICK = (1,) - HOOGSTEEN = (2,) + INVALID = 0 + WATSON_CRICK = 1 + HOOGSTEEN = 2 SUGAR = 3 @@ -305,8 +305,8 @@ class GlycosidicBond(IntEnum): """ INVALID = 0 - CIS = (1,) - TRANS = (2,) + CIS = 1 + TRANS = 2 def base_pairs_edge(atom_array, base_pairs): From 3bef560b6fac26ffa080e4871cb216a5cce59597 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 1 Mar 2026 10:34:43 +0100 Subject: [PATCH 3/5] Improve `BondList` performance --- src/rust/structure/bonds.rs | 154 ++++++++++++++++++++++++---------- tests/structure/test_bonds.py | 43 ++++++++-- 2 files changed, 144 insertions(+), 53 deletions(-) diff --git a/src/rust/structure/bonds.rs b/src/rust/structure/bonds.rs index 73df5465b..360984b39 100644 --- a/src/rust/structure/bonds.rs +++ b/src/rust/structure/bonds.rs @@ -7,6 +7,8 @@ use smallvec::SmallVec; use std::collections::{HashMap, HashSet}; use std::convert::TryFrom; +const TYPICAL_MAX_BONDS_PER_ATOM: usize = 4; + /// Rust reflection of the :class:`BondType` enum. #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] #[repr(u8)] @@ -478,36 +480,46 @@ impl BondList { /// The third column stores the :class:`BondType`. fn as_array<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { let n_bonds = self.bonds.len(); - let mut data: Vec = Vec::with_capacity(n_bonds * 3); - for bond in &self.bonds { - data.push(bond.atom1 as u32); - data.push(bond.atom2 as u32); - data.push(bond.bond_type as u32); + // Allocate uninitialized numpy array and write directly + let mut array = Array2::::uninit([n_bonds, 3]); + // SAFETY: The indices are created in this function and are within the bounds + unsafe { + let slice = array.as_slice_mut().expect("Array not contiguous"); + for (i, bond) in self.bonds.iter().enumerate() { + let slice_i = i * 3; + slice.get_unchecked_mut(slice_i).write(bond.atom1 as u32); + slice + .get_unchecked_mut(slice_i + 1) + .write(bond.atom2 as u32); + slice + .get_unchecked_mut(slice_i + 2) + .write(bond.bond_type as u32); + } } - Array2::from_shape_vec((n_bonds, 3), data) - .expect("Shape mismatch") - .into_pyarray(py) + // SAFETY: In the previous block, we have written all elements of the array + let array = unsafe { array.assume_init() }; + array.into_pyarray(py) } /// Obtain a set representation of the :class:`BondList`. /// /// Returns /// ------- - /// bond_set : set of tuple(int, int, BondType) + /// bond_set : set of tuple(int, int, int) /// A set of tuples. /// Each tuple represents one bond: /// The first integer represents the first atom, /// the second integer represents the second atom, - /// the third value represents the :class:`BondType`. + /// the third integer represents the :class:`BondType`. fn as_set<'py>(&self, py: Python<'py>) -> PyResult> { let set = PySet::empty(py)?; for bond in &self.bonds { let tuple = PyTuple::new( py, [ - bond.atom1.into_pyobject(py)?.into_any(), - bond.atom2.into_pyobject(py)?.into_any(), - bond.bond_type.into_pyobject(py)?.into_any(), + (bond.atom1).into_pyobject(py)?.into_any(), + (bond.atom2).into_pyobject(py)?.into_any(), + (bond.bond_type as u32).into_pyobject(py)?.into_any(), ], )?; set.add(tuple)?; @@ -742,11 +754,8 @@ impl BondList { /// to the respective atom. /// Atoms can have have different numbers of atoms bonded to /// them. - /// Therefore, the length of the second dimension *k* is equal - /// to the maximum number of bonds for an atom in this - /// :class:`BondList`. - /// For atoms with less bonds, the corresponding entry in the - /// array is padded with ``-1`` values. + /// Hence, each column in the + /// array may be padded with ``-1`` values. /// bond_types : np.ndarray, dtype=np.int8, shape=(n,k) /// Array of integers, interpreted as :class:`BondType` /// instances. @@ -825,30 +834,11 @@ impl BondList { &self, py: Python<'py>, ) -> (Bound<'py, PyArray2>, Bound<'py, PyArray2>) { - let n_atoms = self.atom_count; - - // Collect bonds per atom using SmallVec to avoid heap allocation, - // as usually in chemistry an atom has a maximum of 4 bonds - let mut per_atom: Vec> = vec![SmallVec::new(); n_atoms]; - - for bond in &self.bonds { - let bt = bond.bond_type as i8; - per_atom[bond.atom1].push((bond.atom2 as i32, bt)); - per_atom[bond.atom2].push((bond.atom1 as i32, bt)); - } - - let max_bonds_per_atom = per_atom.iter().map(|v| v.len()).max().unwrap_or(0); - let mut bonds_matrix = Array2::from_elem((n_atoms, max_bonds_per_atom), -1i32); - let mut types_matrix = Array2::from_elem((n_atoms, max_bonds_per_atom), -1i8); - - for (i, entries) in per_atom.iter().enumerate() { - for (j, &(atom_idx, bt)) in entries.iter().enumerate() { - bonds_matrix[[i, j]] = atom_idx; - types_matrix[[i, j]] = bt; - } - } - - (bonds_matrix.into_pyarray(py), types_matrix.into_pyarray(py)) + let (bonds, types) = match self.get_all_bonds_typical() { + Some(result) => result, + None => self.get_all_bonds_atypical(), + }; + (bonds.into_pyarray(py), types.into_pyarray(py)) } /// Represent this :class:`BondList` as adjacency matrix. @@ -1174,7 +1164,7 @@ impl BondList { let target = match Bond::new(item.0, item.1, BondType::Any, self.atom_count) { Ok(bond) => bond, // If the atom indices are out of range, the bond is not part of the bond list - Err(e) => return Ok(false), + Err(_) => return Ok(false), }; Ok(self.bonds.iter().any(|bond| bond.equal_atoms(&target))) } @@ -1284,16 +1274,88 @@ impl BondList { let mut seen_per_atom: Vec> = vec![SmallVec::new(); self.atom_count]; self.bonds.retain(|bond| { - let key = (bond.atom1, bond.atom2); - if seen.contains(&key) { + let neighbors = &mut seen_per_atom[bond.atom1]; + if neighbors.contains(&bond.atom2) { false } else { - seen.insert(key); + neighbors.push(bond.atom2); true } }); } + /// Optimistic one-pass method for `get_all_bonds`. + /// Pre-allocates arrays assuming at most `TYPICAL_MAX_BONDS_PER_ATOM` bonds per atom. + /// Returns `None` if any atom exceeds that limit. + fn get_all_bonds_typical(&self) -> Option<(Array2, Array2)> { + let k = TYPICAL_MAX_BONDS_PER_ATOM; + let total = self.atom_count * k; + let mut bonds_data: Vec = vec![-1i32; total]; + let mut types_data: Vec = vec![-1i8; total]; + let mut counts: Vec = vec![0; self.atom_count]; + + for bond in &self.bonds { + let bt = bond.bond_type as i8; + let a1 = bond.atom1; + let a2 = bond.atom2; + + if counts[a1] >= k || counts[a2] >= k { + return None; + } + + bonds_data[a1 * k + counts[a1]] = a2 as i32; + types_data[a1 * k + counts[a1]] = bt; + counts[a1] += 1; + + bonds_data[a2 * k + counts[a2]] = a1 as i32; + types_data[a2 * k + counts[a2]] = bt; + counts[a2] += 1; + } + + Some(( + Array2::from_shape_vec((self.atom_count, k), bonds_data).expect("Shape mismatch"), + Array2::from_shape_vec((self.atom_count, k), types_data).expect("Shape mismatch"), + )) + } + + /// Two-pass fallback for `get_all_bonds`. + /// First counts bonds per atom to determine the exact column count, + /// then fills the arrays. + fn get_all_bonds_atypical(&self) -> (Array2, Array2) { + let mut counts: Vec = vec![0; self.atom_count]; + for bond in &self.bonds { + counts[bond.atom1] += 1; + counts[bond.atom2] += 1; + } + let max_bonds = counts.iter().copied().max().unwrap_or(0); + + let total = self.atom_count * max_bonds; + let mut bonds_data: Vec = vec![-1i32; total]; + let mut types_data: Vec = vec![-1i8; total]; + + counts.fill(0); + for bond in &self.bonds { + let bt = bond.bond_type as i8; + let a1 = bond.atom1; + let a2 = bond.atom2; + + bonds_data[a1 * max_bonds + counts[a1]] = a2 as i32; + types_data[a1 * max_bonds + counts[a1]] = bt; + counts[a1] += 1; + + bonds_data[a2 * max_bonds + counts[a2]] = a1 as i32; + types_data[a2 * max_bonds + counts[a2]] = bt; + counts[a2] += 1; + } + + ( + Array2::from_shape_vec((self.atom_count, max_bonds), bonds_data) + .expect("Shape mismatch"), + Array2::from_shape_vec((self.atom_count, max_bonds), types_data) + .expect("Shape mismatch"), + ) + } + /// Index the bond list with a boolean mask. /// /// If an atom in a bond is not masked, the bond is removed. diff --git a/tests/structure/test_bonds.py b/tests/structure/test_bonds.py index fdcdb2127..4c5b48358 100644 --- a/tests/structure/test_bonds.py +++ b/tests/structure/test_bonds.py @@ -10,6 +10,7 @@ import biotite.structure as struc import biotite.structure.info as info import biotite.structure.io as strucio +import biotite.structure.io.pdbx as pdbx from tests.util import data_dir @@ -158,13 +159,9 @@ def test_concatenation_and_splitting(seed): split_bond_lists = [] starts = [0] for _ in range(N_BOND_LISTS): - n_atoms = rng.integers(1, MAX_ATOMS) - bonds = rng.integers(0, n_atoms, size=(MAX_BONDS, 2)) - bond_types = rng.integers(0, len(struc.BondType), size=MAX_BONDS) + n_atoms = rng.integers(2, MAX_ATOMS) split_bond_lists.append( - struc.BondList( - n_atoms, np.concatenate([bonds, bond_types[:, np.newaxis]], axis=1) - ) + generate_random_bond_list(n_atoms, MAX_BONDS, seed=seed) ) starts.append(starts[-1] + n_atoms) @@ -288,11 +285,43 @@ def test_indexing(bond_list): assert sub_list.as_array().tolist() == [[1, 2, 0], [0, 2, 0], [2, 3, 0]] -def test_get_all_bonds(): +def test_get_all_bonds_typical(): """ Test whether the individual rows returned from :func:`get_all_bonds()` are equal to corresponding calls of :func:`get_bonds()`. + + The bonds are taken from a real structure, so no atom exceeds 4 bonds per atom. + """ + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) + atoms = pdbx.get_structure(pdbx_file, model=1, include_bonds=True) + + bonds, bond_types = atoms.bonds.get_all_bonds() + + assert (bonds != -1).all(axis=1).any(axis=0) + assert (bond_types != -1).all(axis=1).any(axis=0) + + test_bonds = [ + (bonded_i[bonded_i != -1].tolist(), bond_type[bond_type != -1].tolist()) + for bonded_i, bond_type in zip(bonds, bond_types) + ] + + ref_bonds = [atoms.bonds.get_bonds(i) for i in range(atoms.array_length())] + ref_bonds = [ + (bonded_i.tolist(), bond_type.tolist()) for bonded_i, bond_type in ref_bonds + ] + + assert test_bonds == ref_bonds + + +def test_get_all_bonds_atypical(): + """ + Test whether the individual rows returned from + :func:`get_all_bonds()` are equal to corresponding calls of + :func:`get_bonds()`. + + The average number of bonds per atom is greater than the typical maximum of 4, + so the internal fallback method is used. """ ATOM_COUNT = 100 BOND_COUNT = 500 From 3d9626a74367ca64636954c32ef9cdc7f204420f Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Mon, 2 Mar 2026 10:01:49 +0100 Subject: [PATCH 4/5] Implement connection functionalities in Rust --- Cargo.toml | 1 + src/biotite/application/autodock/app.py | 2 +- src/biotite/structure/__init__.py | 1 + src/biotite/structure/bonds.py | 597 ----------------------- src/biotite/structure/connect.py | 454 +++++++++++++++++ src/biotite/structure/io/pdb/file.py | 3 +- src/biotite/structure/io/pdbqt/file.py | 8 +- src/biotite/structure/io/pdbx/convert.py | 3 +- src/biotite/structure/molecules.py | 3 +- src/rust/structure/bonds.rs | 64 +-- src/rust/structure/connect.rs | 387 +++++++++++++++ src/rust/structure/mod.rs | 8 + tests/structure/test_bonds.py | 113 ----- tests/structure/test_connect.py | 142 ++++++ 14 files changed, 1034 insertions(+), 752 deletions(-) create mode 100644 src/biotite/structure/connect.py create mode 100644 src/rust/structure/connect.rs create mode 100644 tests/structure/test_connect.py diff --git a/Cargo.toml b/Cargo.toml index 76e0711c8..7f93f8550 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ numpy = "0.27" ndarray = "0.17" num-traits = "0.2" smallvec = "1" +itertools = "0.14" [dependencies.pyo3] version = "0.27" diff --git a/src/biotite/application/autodock/app.py b/src/biotite/application/autodock/app.py index 73d19f106..91f404540 100644 --- a/src/biotite/application/autodock/app.py +++ b/src/biotite/application/autodock/app.py @@ -11,7 +11,7 @@ import numpy as np from biotite.application.application import AppState, requires_state from biotite.application.localapp import LocalApp, cleanup_tempfile -from biotite.structure.bonds import find_connected +from biotite.structure.connect import find_connected from biotite.structure.error import BadStructureError from biotite.structure.io.pdbqt import PDBQTFile from biotite.structure.residues import get_residue_masks, get_residue_starts_for diff --git a/src/biotite/structure/__init__.py b/src/biotite/structure/__init__.py index 2fce5fc40..c615000c6 100644 --- a/src/biotite/structure/__init__.py +++ b/src/biotite/structure/__init__.py @@ -116,6 +116,7 @@ from .chains import * from .charges import * from .compare import * +from .connect import * from .density import * from .dotbracket import * from .error import * diff --git a/src/biotite/structure/bonds.py b/src/biotite/structure/bonds.py index 0b894ee3f..5a05d4863 100644 --- a/src/biotite/structure/bonds.py +++ b/src/biotite/structure/bonds.py @@ -2,28 +2,15 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -""" -This module allows efficient search of atoms in a defined radius around -a location. -""" - __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" __all__ = [ "BondList", "BondType", - "connect_via_distances", - "connect_via_residue_names", - "find_connected", - "find_rotatable_bonds", ] -import itertools from enum import IntEnum -import networkx as nx -import numpy as np from biotite.rust.structure import BondList, bond_type_members -from biotite.structure.error import BadStructureError def _without_aromaticity(self): @@ -78,587 +65,3 @@ def _without_aromaticity(self): - ``AROMATIC`` - Aromatic bond without specification of the formal bond """ BondType.without_aromaticity = _without_aromaticity - - -# fmt: off -_DEFAULT_DISTANCE_RANGE = { - # Taken from Allen et al. - # min - 2*std max + 2*std - ("B", "C" ) : (1.556 - 2*0.015, 1.556 + 2*0.015), - ("BR", "C" ) : (1.875 - 2*0.029, 1.966 + 2*0.029), - ("BR", "O" ) : (1.581 - 2*0.007, 1.581 + 2*0.007), - ("C", "C" ) : (1.174 - 2*0.011, 1.588 + 2*0.025), - ("C", "CL") : (1.713 - 2*0.011, 1.849 + 2*0.011), - ("C", "F" ) : (1.320 - 2*0.009, 1.428 + 2*0.009), - ("C", "H" ) : (1.059 - 2*0.030, 1.099 + 2*0.007), - ("C", "I" ) : (2.095 - 2*0.015, 2.162 + 2*0.015), - ("C", "N" ) : (1.325 - 2*0.009, 1.552 + 2*0.023), - ("C", "O" ) : (1.187 - 2*0.011, 1.477 + 2*0.008), - ("C", "P" ) : (1.791 - 2*0.006, 1.855 + 2*0.019), - ("C", "S" ) : (1.630 - 2*0.014, 1.863 + 2*0.015), - ("C", "SE") : (1.893 - 2*0.013, 1.970 + 2*0.032), - ("C", "SI") : (1.837 - 2*0.012, 1.888 + 2*0.023), - ("CL", "O" ) : (1.414 - 2*0.026, 1.414 + 2*0.026), - ("CL", "P" ) : (1.997 - 2*0.035, 2.008 + 2*0.035), - ("CL", "S" ) : (2.072 - 2*0.023, 2.072 + 2*0.023), - ("CL", "SI") : (2.072 - 2*0.009, 2.072 + 2*0.009), - ("F", "N" ) : (1.406 - 2*0.016, 1.406 + 2*0.016), - ("F", "P" ) : (1.495 - 2*0.016, 1.579 + 2*0.025), - ("F", "S" ) : (1.640 - 2*0.011, 1.640 + 2*0.011), - ("F", "SI") : (1.588 - 2*0.014, 1.694 + 2*0.013), - ("H", "N" ) : (1.009 - 2*0.022, 1.033 + 2*0.022), - ("H", "O" ) : (0.967 - 2*0.010, 1.015 + 2*0.017), - ("I", "O" ) : (2.144 - 2*0.028, 2.144 + 2*0.028), - ("N", "N" ) : (1.124 - 2*0.015, 1.454 + 2*0.021), - ("N", "O" ) : (1.210 - 2*0.011, 1.463 + 2*0.012), - ("N", "P" ) : (1.571 - 2*0.013, 1.697 + 2*0.015), - ("N", "S" ) : (1.541 - 2*0.022, 1.710 + 2*0.019), - ("N", "SI") : (1.711 - 2*0.019, 1.748 + 2*0.022), - ("O", "P" ) : (1.449 - 2*0.007, 1.689 + 2*0.024), - ("O", "S" ) : (1.423 - 2*0.008, 1.580 + 2*0.015), - ("O", "SI") : (1.622 - 2*0.014, 1.680 + 2*0.008), - ("P", "P" ) : (2.214 - 2*0.022, 2.214 + 2*0.022), - ("P", "S" ) : (1.913 - 2*0.014, 1.954 + 2*0.005), - ("P", "SE") : (2.093 - 2*0.019, 2.093 + 2*0.019), - ("P", "SI") : (2.264 - 2*0.019, 2.264 + 2*0.019), - ("S", "S" ) : (1.897 - 2*0.012, 2.070 + 2*0.022), - ("S", "SE") : (2.193 - 2*0.015, 2.193 + 2*0.015), - ("S", "SI") : (2.145 - 2*0.020, 2.145 + 2*0.020), - ("SE", "SE") : (2.340 - 2*0.024, 2.340 + 2*0.024), - ("SI", "SE") : (2.359 - 2*0.012, 2.359 + 2*0.012), -} -# fmt: on - - -def connect_via_distances( - atoms, - distance_range=None, - inter_residue=True, - default_bond_type=BondType.ANY, - periodic=False, -): - """ - connect_via_distances(atoms, distance_range=None, inter_residue=True, - default_bond_type=BondType.ANY, periodic=False) - - Create a :class:`BondList` for a given atom array, based on - pairwise atom distances. - - A :attr:`BondType.ANY`, bond is created for two atoms within the - same residue, if the distance between them is within the expected - bond distance range. - Bonds between two adjacent residues are created for the atoms - expected to connect these residues, i.e. ``'C'`` and ``'N'`` for - peptides and ``"O3'"`` and ``'P'`` for nucleotides. - - Parameters - ---------- - atoms : AtomArray - The structure to create the :class:`BondList` for. - distance_range : dict of tuple(str, str) -> tuple(float, float), optional - Custom minimum and maximum bond distances. - The dictionary keys are tuples of chemical elements representing - the atoms to be potentially bonded. - The order of elements within each tuple does not matter. - The dictionary values are the minimum and maximum bond distance, - respectively, for the given combination of elements. - This parameter updates the default dictionary. - Hence, the default bond distances for missing element pairs are - still taken from the default dictionary. - The default bond distances are taken from :footcite:`Allen1987`. - inter_residue : bool, optional - If true, connections between consecutive amino acids and - nucleotides are also added. - default_bond_type : BondType or int, optional - By default, all created bonds have :attr:`BondType.ANY`. - An alternative :class:`BondType` can be given in this parameter. - periodic : bool, optional - If set to true, bonds can also be detected in periodic - boundary conditions. - The `box` attribute of `atoms` is required in this case. - - Returns - ------- - BondList - The created bond list. - - See Also - -------- - connect_via_residue_names - - Notes - ----- - This method might miss bonds, if the bond distance is unexpectedly - high or low, or it might create false bonds, if two atoms within a - residue are accidentally in the right distance. - A more accurate method for determining bonds is - :func:`connect_via_residue_names()`. - - References - ---------- - - .. footbibliography:: - """ - from biotite.structure.atoms import AtomArray - from biotite.structure.geometry import distance - from biotite.structure.residues import get_residue_starts - - if not isinstance(atoms, AtomArray): - raise TypeError(f"Expected 'AtomArray', not '{type(atoms).__name__}'") - if periodic: - if atoms.box is None: - raise BadStructureError("Atom array has no box") - box = atoms.box - else: - box = None - - # Prepare distance dictionary... - if distance_range is None: - distance_range = {} - # Merge default and custom entries - dist_ranges = {} - for key, val in itertools.chain( - _DEFAULT_DISTANCE_RANGE.items(), distance_range.items() - ): - element1, element2 = key - # Add entries for both element orders - dist_ranges[(element1.upper(), element2.upper())] = val - dist_ranges[(element2.upper(), element1.upper())] = val - - bonds = [] - coord = atoms.coord - elements = atoms.element - - residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) - # Omit exclusive stop in 'residue_starts' - for i in range(len(residue_starts) - 1): - curr_start_i = residue_starts[i] - next_start_i = residue_starts[i + 1] - - elements_in_res = elements[curr_start_i:next_start_i] - coord_in_res = coord[curr_start_i:next_start_i] - # Matrix containing all pairwise atom distances in the residue - distances = distance( - coord_in_res[:, np.newaxis, :], coord_in_res[np.newaxis, :, :], box - ) - for atom_index1 in range(len(elements_in_res)): - for atom_index2 in range(atom_index1): - dist_range = dist_ranges.get( - (elements_in_res[atom_index1], elements_in_res[atom_index2]) - ) - if dist_range is None: - # No bond distance entry for this element - # combination -> skip - continue - else: - min_dist, max_dist = dist_range - dist = distances[atom_index1, atom_index2] - if dist >= min_dist and dist <= max_dist: - # Convert BondType to int if necessary - bt_int = ( - int(default_bond_type) - if hasattr(default_bond_type, "__int__") - else default_bond_type - ) - bonds.append( - ( - curr_start_i + atom_index1, - curr_start_i + atom_index2, - bt_int, - ) - ) - - if bonds: - bond_list = BondList(atoms.array_length(), np.array(bonds, dtype=np.int64)) - else: - bond_list = BondList(atoms.array_length()) - - if inter_residue: - inter_bonds = _connect_inter_residue(atoms, residue_starts) - if default_bond_type == BondType.ANY: - # As all bonds should be of type ANY, convert also - # inter-residue bonds to ANY - inter_bonds.remove_bond_order() - return bond_list.merge(inter_bonds) - else: - return bond_list - - -def connect_via_residue_names(atoms, inter_residue=True, custom_bond_dict=None): - """ - connect_via_residue_names(atoms, inter_residue=True, custom_bond_dict=None) - - Create a :class:`BondList` for a given atom array (stack), based on - the deposited bonds for each residue in the RCSB ``components.cif`` - dataset. - - Bonds between two adjacent residues are created for the atoms - expected to connect these residues, i.e. ``'C'`` and ``'N'`` for - peptides and ``"O3'"`` and ``'P'`` for nucleotides. - - Parameters - ---------- - atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) - The structure to create the :class:`BondList` for. - inter_residue : bool, optional - If true, connections between consecutive amino acids and - nucleotides are also added. - custom_bond_dict : dict (str -> dict ((str, str) -> int)), optional - A dictionary of dictionaries: - The outer dictionary maps residue names to inner dictionaries. - The inner dictionary maps tuples of two atom names to their - respective :class:`BondType` (represented as integer). - If given, these bonds are used instead of the bonds read from - ``components.cif``. - - Returns - ------- - BondList - The created bond list. - No bonds are added for residues that are not found in - ``components.cif``. - - See Also - -------- - connect_via_distances - - Notes - ----- - This method can only find bonds for residues in the RCSB - *Chemical Component Dictionary*, unless `custom_bond_dict` is set. - Although this includes most molecules one encounters, this will fail - for exotic molecules, e.g. specialized inhibitors. - - .. currentmodule:: biotite.structure.info - - To supplement `custom_bond_dict` with bonds for residues from the - *Chemical Component Dictionary* you can use - :meth:`bonds_in_residue()`. - - >>> import pprint - >>> custom_bond_dict = { - ... "XYZ": { - ... ("A", "B"): BondType.SINGLE, - ... ("B", "C"): BondType.SINGLE - ... } - ... } - >>> # Supplement with bonds for common residues - >>> custom_bond_dict["ALA"] = bonds_in_residue("ALA") - >>> pp = pprint.PrettyPrinter(width=40) - >>> pp.pprint(custom_bond_dict) - {'ALA': {('C', 'O'): , - ('C', 'OXT'): , - ('CA', 'C'): , - ('CA', 'CB'): , - ('CA', 'HA'): , - ('CB', 'HB1'): , - ('CB', 'HB2'): , - ('CB', 'HB3'): , - ('N', 'CA'): , - ('N', 'H'): , - ('N', 'H2'): , - ('OXT', 'HXT'): }, - 'XYZ': {('A', 'B'): , - ('B', 'C'): }} - """ - from biotite.structure.info.bonds import bonds_in_residue - from biotite.structure.residues import get_residue_starts - - bonds = [] - atom_names = atoms.atom_name - res_names = atoms.res_name - - residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) - # Omit exclusive stop in 'residue_starts' - for res_i in range(len(residue_starts) - 1): - curr_start_i = residue_starts[res_i] - next_start_i = residue_starts[res_i + 1] - - if custom_bond_dict is None: - bond_dict_for_res = bonds_in_residue(res_names[curr_start_i]) - else: - bond_dict_for_res = custom_bond_dict.get(res_names[curr_start_i], {}) - - atom_names_in_res = atom_names[curr_start_i:next_start_i] - for (atom_name1, atom_name2), bond_type in bond_dict_for_res.items(): - atom_indices1 = np.where(atom_names_in_res == atom_name1)[0].astype( - np.int64, copy=False - ) - atom_indices2 = np.where(atom_names_in_res == atom_name2)[0].astype( - np.int64, copy=False - ) - # In rare cases the same atom name may appear multiple times - # (e.g. in altlocs) - # -> create all possible bond combinations - # Convert BondType to int if necessary - bt_int = int(bond_type) if hasattr(bond_type, "__int__") else bond_type - for i in range(len(atom_indices1)): - for j in range(len(atom_indices2)): - bonds.append( - ( - curr_start_i + atom_indices1[i], - curr_start_i + atom_indices2[j], - bt_int, - ) - ) - - if bonds: - bond_list = BondList(atoms.array_length(), np.array(bonds, dtype=np.int64)) - else: - bond_list = BondList(atoms.array_length()) - - if inter_residue: - inter_bonds = _connect_inter_residue(atoms, residue_starts) - return bond_list.merge(inter_bonds) - else: - return bond_list - - -_PEPTIDE_LINKS = ["PEPTIDE LINKING", "L-PEPTIDE LINKING", "D-PEPTIDE LINKING"] -_NUCLEIC_LINKS = ["RNA LINKING", "DNA LINKING"] - - -def _connect_inter_residue(atoms, residue_starts): - """ - Create a :class:`BondList` containing the bonds between adjacent - amino acid or nucleotide residues. - - Parameters - ---------- - atoms : AtomArray or AtomArrayStack - The structure to create the :class:`BondList` for. - residue_starts : ndarray, dtype=int - Return value of - ``get_residue_starts(atoms, add_exclusive_stop=True)``. - - Returns - ------- - BondList - A bond list containing all inter residue bonds. - """ - from biotite.structure.info.misc import link_type - - bonds = [] - atom_names = atoms.atom_name - res_names = atoms.res_name - res_ids = atoms.res_id - chain_ids = atoms.chain_id - - # Iterate over all starts excluding: - # - the last residue and - # - exclusive end index of 'atoms' - for i in range(len(residue_starts) - 2): - curr_start_i = residue_starts[i] - next_start_i = residue_starts[i + 1] - after_next_start_i = residue_starts[i + 2] - - # Check if the current and next residue is in the same chain - if chain_ids[next_start_i] != chain_ids[curr_start_i]: - continue - # Check if the current and next residue - # have consecutive residue IDs - # (Same residue ID is also possible if insertion code is used) - if res_ids[next_start_i] - res_ids[curr_start_i] > 1: - continue - - # Get link type for this residue from RCSB components.cif - curr_link = link_type(res_names[curr_start_i]) - next_link = link_type(res_names[next_start_i]) - - if curr_link in _PEPTIDE_LINKS and next_link in _PEPTIDE_LINKS: - curr_connect_atom_name = "C" - next_connect_atom_name = "N" - elif curr_link in _NUCLEIC_LINKS and next_link in _NUCLEIC_LINKS: - curr_connect_atom_name = "O3'" - next_connect_atom_name = "P" - else: - # Create no bond if the connection types of consecutive - # residues are not compatible - continue - - # Index in atom array for atom name in current residue - # Addition of 'curr_start_i' is necessary, as only a slice of - # 'atom_names' is taken, beginning at 'curr_start_i' - curr_connect_indices = ( - curr_start_i - + np.where(atom_names[curr_start_i:next_start_i] == curr_connect_atom_name)[ - 0 - ] - ) - # Index in atom array for atom name in next residue - next_connect_indices = ( - next_start_i - + np.where( - atom_names[next_start_i:after_next_start_i] == next_connect_atom_name - )[0] - ) - if len(curr_connect_indices) == 0 or len(next_connect_indices) == 0: - # The connector atoms are not found in the adjacent residues - # -> skip this bond - continue - - bonds.append( - (curr_connect_indices[0], next_connect_indices[0], int(BondType.SINGLE)) - ) - - if bonds: - return BondList(atoms.array_length(), np.array(bonds, dtype=np.int64)) - else: - return BondList(atoms.array_length()) - - -def find_connected(bond_list, root, as_mask=False): - """ - find_connected(bond_list, root, as_mask=False) - - Get indices to all atoms that are directly or indirectly connected - to the root atom indicated by the given index. - - An atom is *connected* to the `root` atom, if that atom is reachable - by traversing an arbitrary number of bonds, starting from the - `root`. - Effectively, this means that all atoms are *connected* to `root`, - that are in the same molecule as `root`. - Per definition `root` is also *connected* to itself. - - Parameters - ---------- - bond_list : BondList - The reference bond list. - root : int - The index of the root atom. - as_mask : bool, optional - If true, the connected atom indices are returned as boolean - mask. - By default, the connected atom indices are returned as integer - array. - - Returns - ------- - connected : ndarray, dtype=int or ndarray, dtype=bool - Either a boolean mask or an integer array, representing the - connected atoms. - In case of a boolean mask: ``connected[i] == True``, if the atom - with index ``i`` is connected. - - Examples - -------- - Consider a system with 4 atoms, where only the last atom is not - bonded with the other ones (``0-1-2 3``): - - >>> bonds = BondList(4) - >>> bonds.add_bond(0, 1) - >>> bonds.add_bond(1, 2) - >>> print(find_connected(bonds, 0)) - [0 1 2] - >>> print(find_connected(bonds, 1)) - [0 1 2] - >>> print(find_connected(bonds, 2)) - [0 1 2] - >>> print(find_connected(bonds, 3)) - [3] - """ - all_bonds, _ = bond_list.get_all_bonds() - - if root >= bond_list.get_atom_count(): - raise ValueError( - f"Root atom index {root} is out of bounds for bond list " - f"representing {bond_list.get_atom_count()} atoms" - ) - - is_connected_mask = np.zeros(bond_list.get_atom_count(), dtype=np.uint8) - # Find connections in a recursive way, - # by visiting all atoms that are reachable by a bond - _find_connected_recursive(root, is_connected_mask, all_bonds) - if as_mask: - return is_connected_mask.astype(bool) - else: - return np.where(is_connected_mask)[0] - - -def _find_connected_recursive(index, is_connected_mask, all_bonds): - """Recursive helper for find_connected.""" - if is_connected_mask[index]: - # This atom has already been visited - # -> exit condition - return - is_connected_mask[index] = 1 - - for j in range(all_bonds.shape[1]): - connected_index = all_bonds[index, j] - if connected_index == -1: - # Ignore padding values - continue - _find_connected_recursive(connected_index, is_connected_mask, all_bonds) - - -def find_rotatable_bonds(bonds): - """ - find_rotatable_bonds(bonds) - - Find all rotatable bonds in a given :class:`BondList`. - - The following conditions must be true for a bond to be counted as - rotatable: - - 1. The bond must be a single bond (``BondType.SINGLE``) - 2. The connected atoms must not be within the same cycle/ring - 3. Both connected atoms must not be terminal, e.g. not a *C-H* - bond, as rotation about such bonds would not change any - coordinates - - Parameters - ---------- - bonds : BondList - The bonds to find the rotatable bonds in. - - Returns - ------- - rotatable_bonds : BondList - The subset of the input `bonds` that contains only rotatable - bonds. - - Examples - -------- - - >>> molecule = residue("TYR") - >>> for i, j, _ in find_rotatable_bonds(molecule.bonds).as_array(): - ... print(molecule.atom_name[i], molecule.atom_name[j]) - N CA - CA C - CA CB - C OXT - CB CG - CZ OH - """ - bond_graph = bonds.as_graph() - cycles = nx.algorithms.cycles.cycle_basis(bond_graph) - - number_of_partners = np.count_nonzero(bonds.get_all_bonds()[0] != -1, axis=1) - - rotatable_bonds = [] - bonds_array = bonds.as_array() - for i, j, bond_type in bonds_array: - # Can only rotate about single bonds - # Furthermore, it makes no sense to rotate about a bond, - # that leads to a single atom - if ( - bond_type == BondType.SINGLE - and number_of_partners[i] > 1 - and number_of_partners[j] > 1 - ): - # Cannot rotate about a bond, if the two connected atoms - # are in a cycle - in_same_cycle = False - for cycle in cycles: - if i in cycle and j in cycle: - in_same_cycle = True - break - if not in_same_cycle: - rotatable_bonds.append((i, j, int(bond_type))) - if rotatable_bonds: - return BondList( - bonds.get_atom_count(), np.array(rotatable_bonds, dtype=np.int64) - ) - else: - return BondList(bonds.get_atom_count()) diff --git a/src/biotite/structure/connect.py b/src/biotite/structure/connect.py new file mode 100644 index 000000000..a34c2e9e8 --- /dev/null +++ b/src/biotite/structure/connect.py @@ -0,0 +1,454 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +__name__ = "biotite.structure" +__author__ = "Patrick Kunzmann" +__all__ = [ + "connect_via_distances", + "connect_via_residue_names", + "find_connected", + "find_rotatable_bonds", +] + +import itertools +import networkx as nx +import numpy as np +from biotite.rust.structure import ( + connect_inter_residue as rust_connect_inter_residue, +) +from biotite.rust.structure import ( + connect_via_distances as rust_connect_via_distances, +) +from biotite.rust.structure import ( + connect_via_residue_names as rust_connect_via_residue_names, +) +from biotite.rust.structure import ( + find_connected, +) +from biotite.structure.bonds import BondList, BondType +from biotite.structure.error import BadStructureError + +# fmt: off +_DEFAULT_DISTANCE_RANGE = { + # Taken from Allen et al. + # min - 2*std max + 2*std + ("B", "C" ) : (1.556 - 2*0.015, 1.556 + 2*0.015), + ("BR", "C" ) : (1.875 - 2*0.029, 1.966 + 2*0.029), + ("BR", "O" ) : (1.581 - 2*0.007, 1.581 + 2*0.007), + ("C", "C" ) : (1.174 - 2*0.011, 1.588 + 2*0.025), + ("C", "CL") : (1.713 - 2*0.011, 1.849 + 2*0.011), + ("C", "F" ) : (1.320 - 2*0.009, 1.428 + 2*0.009), + ("C", "H" ) : (1.059 - 2*0.030, 1.099 + 2*0.007), + ("C", "I" ) : (2.095 - 2*0.015, 2.162 + 2*0.015), + ("C", "N" ) : (1.325 - 2*0.009, 1.552 + 2*0.023), + ("C", "O" ) : (1.187 - 2*0.011, 1.477 + 2*0.008), + ("C", "P" ) : (1.791 - 2*0.006, 1.855 + 2*0.019), + ("C", "S" ) : (1.630 - 2*0.014, 1.863 + 2*0.015), + ("C", "SE") : (1.893 - 2*0.013, 1.970 + 2*0.032), + ("C", "SI") : (1.837 - 2*0.012, 1.888 + 2*0.023), + ("CL", "O" ) : (1.414 - 2*0.026, 1.414 + 2*0.026), + ("CL", "P" ) : (1.997 - 2*0.035, 2.008 + 2*0.035), + ("CL", "S" ) : (2.072 - 2*0.023, 2.072 + 2*0.023), + ("CL", "SI") : (2.072 - 2*0.009, 2.072 + 2*0.009), + ("F", "N" ) : (1.406 - 2*0.016, 1.406 + 2*0.016), + ("F", "P" ) : (1.495 - 2*0.016, 1.579 + 2*0.025), + ("F", "S" ) : (1.640 - 2*0.011, 1.640 + 2*0.011), + ("F", "SI") : (1.588 - 2*0.014, 1.694 + 2*0.013), + ("H", "N" ) : (1.009 - 2*0.022, 1.033 + 2*0.022), + ("H", "O" ) : (0.967 - 2*0.010, 1.015 + 2*0.017), + ("I", "O" ) : (2.144 - 2*0.028, 2.144 + 2*0.028), + ("N", "N" ) : (1.124 - 2*0.015, 1.454 + 2*0.021), + ("N", "O" ) : (1.210 - 2*0.011, 1.463 + 2*0.012), + ("N", "P" ) : (1.571 - 2*0.013, 1.697 + 2*0.015), + ("N", "S" ) : (1.541 - 2*0.022, 1.710 + 2*0.019), + ("N", "SI") : (1.711 - 2*0.019, 1.748 + 2*0.022), + ("O", "P" ) : (1.449 - 2*0.007, 1.689 + 2*0.024), + ("O", "S" ) : (1.423 - 2*0.008, 1.580 + 2*0.015), + ("O", "SI") : (1.622 - 2*0.014, 1.680 + 2*0.008), + ("P", "P" ) : (2.214 - 2*0.022, 2.214 + 2*0.022), + ("P", "S" ) : (1.913 - 2*0.014, 1.954 + 2*0.005), + ("P", "SE") : (2.093 - 2*0.019, 2.093 + 2*0.019), + ("P", "SI") : (2.264 - 2*0.019, 2.264 + 2*0.019), + ("S", "S" ) : (1.897 - 2*0.012, 2.070 + 2*0.022), + ("S", "SE") : (2.193 - 2*0.015, 2.193 + 2*0.015), + ("S", "SI") : (2.145 - 2*0.020, 2.145 + 2*0.020), + ("SE", "SE") : (2.340 - 2*0.024, 2.340 + 2*0.024), + ("SI", "SE") : (2.359 - 2*0.012, 2.359 + 2*0.012), +} +# fmt: on + + +def connect_via_distances( + atoms, + distance_range=None, + inter_residue=True, + default_bond_type=BondType.ANY, + periodic=False, +): + """ + connect_via_distances(atoms, distance_range=None, inter_residue=True, + default_bond_type=BondType.ANY, periodic=False) + + Create a :class:`BondList` for a given atom array, based on + pairwise atom distances. + + A :attr:`BondType.ANY`, bond is created for two atoms within the + same residue, if the distance between them is within the expected + bond distance range. + Bonds between two adjacent residues are created for the atoms + expected to connect these residues, i.e. ``'C'`` and ``'N'`` for + peptides and ``"O3'"`` and ``'P'`` for nucleotides. + + Parameters + ---------- + atoms : AtomArray + The structure to create the :class:`BondList` for. + distance_range : dict of tuple(str, str) -> tuple(float, float), optional + Custom minimum and maximum bond distances. + The dictionary keys are tuples of chemical elements representing + the atoms to be potentially bonded. + The order of elements within each tuple does not matter. + The dictionary values are the minimum and maximum bond distance, + respectively, for the given combination of elements. + This parameter updates the default dictionary. + Hence, the default bond distances for missing element pairs are + still taken from the default dictionary. + The default bond distances are taken from :footcite:`Allen1987`. + inter_residue : bool, optional + If true, connections between consecutive amino acids and + nucleotides are also added. + default_bond_type : BondType or int, optional + By default, all created bonds have :attr:`BondType.ANY`. + An alternative :class:`BondType` can be given in this parameter. + periodic : bool, optional + If set to true, bonds can also be detected in periodic + boundary conditions. + The `box` attribute of `atoms` is required in this case. + + Returns + ------- + BondList + The created bond list. + + See Also + -------- + connect_via_residue_names + + Notes + ----- + This method might miss bonds, if the bond distance is unexpectedly + high or low, or it might create false bonds, if two atoms within a + residue are accidentally in the right distance. + A more accurate method for determining bonds is + :func:`connect_via_residue_names()`. + + References + ---------- + + .. footbibliography:: + """ + from biotite.structure.atoms import AtomArray + from biotite.structure.residues import get_residue_starts + + if not isinstance(atoms, AtomArray): + raise TypeError(f"Expected 'AtomArray', not '{type(atoms).__name__}'") + if periodic: + if atoms.box is None: + raise BadStructureError("Atom array has no box") + + # Prepare distance dictionary... + if distance_range is None: + distance_range = {} + # Merge default and custom entries + dist_ranges = {} + for key, val in itertools.chain( + _DEFAULT_DISTANCE_RANGE.items(), distance_range.items() + ): + element1, element2 = key + # Add entries for both element orders + dist_ranges[(element1.upper(), element2.upper())] = val + dist_ranges[(element2.upper(), element1.upper())] = val + + residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) + + if periodic: + bond_list = _connect_via_distances_periodic( + atoms, residue_starts, dist_ranges, default_bond_type + ) + else: + bond_list = rust_connect_via_distances( + atoms.coord, + atoms.element.tolist(), + residue_starts, + dist_ranges, + default_bond_type, + ) + + if inter_residue: + inter_bonds = _connect_inter_residue(atoms, residue_starts) + if default_bond_type == BondType.ANY: + # As all bonds should be of type ANY, convert also + # inter-residue bonds to ANY + inter_bonds.remove_bond_order() + return bond_list.merge(inter_bonds) + else: + return bond_list + + +def _connect_via_distances_periodic( + atoms, residue_starts, dist_ranges, default_bond_type +): + """ + Fallback for :func:`connect_via_distances` with periodic boundary conditions. + Uses :func:`biotite.structure.geometry.distance` for periodic distance computation. + """ + from biotite.structure.geometry import distance + + box = atoms.box + coord = atoms.coord + elements = atoms.element + bt_int = int(default_bond_type) + + bonds = [] + for start, stop in itertools.pairwise(residue_starts): + elements_in_res = elements[start:stop] + coord_in_res = coord[start:stop] + distances = distance( + coord_in_res[:, np.newaxis, :], coord_in_res[np.newaxis, :, :], box + ) + for atom_index1 in range(len(elements_in_res)): + for atom_index2 in range(atom_index1): + dist_range = dist_ranges.get( + (elements_in_res[atom_index1], elements_in_res[atom_index2]) + ) + if dist_range is None: + continue + dist = distances[atom_index1, atom_index2] + if dist_range[0] <= dist <= dist_range[1]: + bonds.append((start + atom_index1, start + atom_index2, bt_int)) + + if bonds: + return BondList(atoms.array_length(), np.stack(bonds, axis=0, dtype=np.int64)) + + +def connect_via_residue_names(atoms, inter_residue=True, custom_bond_dict=None): + """ + connect_via_residue_names(atoms, inter_residue=True, custom_bond_dict=None) + + Create a :class:`BondList` for a given atom array (stack), based on + the deposited bonds for each residue in the *Chemical Component Dictionary* (CCD) + dataset. + + Bonds between two adjacent residues are created for the atoms + expected to connect these residues, i.e. ``'C'`` and ``'N'`` for + peptides and ``"O3'"`` and ``'P'`` for nucleotides. + + Parameters + ---------- + atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) + The structure to create the :class:`BondList` for. + inter_residue : bool, optional + If true, connections between consecutive amino acids and + nucleotides are also added. + custom_bond_dict : dict (str -> dict ((str, str) -> int)), optional + A dictionary of dictionaries: + The outer dictionary maps residue names to inner dictionaries. + The inner dictionary maps tuples of two atom names to their + respective :class:`BondType` (represented as integer). + If given, these bonds are used instead of the bonds read from the CCD. + + Returns + ------- + BondList + The created bond list. + No bonds are added for residues that are not found in the CCD. + + See Also + -------- + connect_via_distances + + Notes + ----- + This method can only find bonds for residues in the RCSB + *Chemical Component Dictionary*, unless `custom_bond_dict` is set. + Although this includes most molecules one encounters, this will fail + for exotic molecules, e.g. specialized inhibitors. + + .. currentmodule:: biotite.structure.info + + To supplement `custom_bond_dict` with bonds for residues from the + *Chemical Component Dictionary* you can use + :meth:`bonds_in_residue()`. + + >>> import pprint + >>> custom_bond_dict = { + ... "XYZ": { + ... ("A", "B"): BondType.SINGLE, + ... ("B", "C"): BondType.SINGLE + ... } + ... } + >>> # Supplement with bonds for common residues + >>> custom_bond_dict["ALA"] = bonds_in_residue("ALA") + >>> pp = pprint.PrettyPrinter(width=40) + >>> pp.pprint(custom_bond_dict) + {'ALA': {('C', 'O'): , + ('C', 'OXT'): , + ('CA', 'C'): , + ('CA', 'CB'): , + ('CA', 'HA'): , + ('CB', 'HB1'): , + ('CB', 'HB2'): , + ('CB', 'HB3'): , + ('N', 'CA'): , + ('N', 'H'): , + ('N', 'H2'): , + ('OXT', 'HXT'): }, + 'XYZ': {('A', 'B'): , + ('B', 'C'): }} + """ + # Avoid circular import + from biotite.structure.info.bonds import bonds_in_residue + from biotite.structure.residues import get_residue_starts + + bond_info = { + res_name: _bond_dict_to_list(bonds_in_residue(res_name)) or [] + for res_name in np.unique(atoms.res_name) + } + if custom_bond_dict is not None: + bond_info = { + res_name: _bond_dict_to_list(bond_info) + for res_name, bond_info in custom_bond_dict.items() + } + + residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) + bond_list = rust_connect_via_residue_names( + atoms.res_name.tolist(), atoms.atom_name.tolist(), residue_starts, bond_info + ) + + if inter_residue: + inter_bonds = _connect_inter_residue(atoms, residue_starts) + return bond_list.merge(inter_bonds) + else: + return bond_list + + +def find_rotatable_bonds(bonds): + """ + find_rotatable_bonds(bonds) + + Find all rotatable bonds in a given :class:`BondList`. + + The following conditions must be true for a bond to be counted as + rotatable: + + 1. The bond must be a single bond (``BondType.SINGLE``) + 2. The connected atoms must not be within the same cycle/ring + 3. Both connected atoms must not be terminal, e.g. not a *C-H* + bond, as rotation about such bonds would not change any + coordinates + + Parameters + ---------- + bonds : BondList + The bonds to find the rotatable bonds in. + + Returns + ------- + rotatable_bonds : BondList + The subset of the input `bonds` that contains only rotatable + bonds. + + Examples + -------- + + >>> molecule = residue("TYR") + >>> for i, j, _ in find_rotatable_bonds(molecule.bonds).as_array(): + ... print(molecule.atom_name[i], molecule.atom_name[j]) + N CA + CA C + CA CB + C OXT + CB CG + CZ OH + """ + bond_graph = bonds.as_graph() + cycles = nx.algorithms.cycles.cycle_basis(bond_graph) + + number_of_partners = np.count_nonzero(bonds.get_all_bonds()[0] != -1, axis=1) + + rotatable_bonds = [] + bonds_array = bonds.as_array() + for i, j, bond_type in bonds_array: + # Can only rotate about single bonds + # Furthermore, it makes no sense to rotate about a bond, + # that leads to a single atom + if ( + bond_type == BondType.SINGLE + and number_of_partners[i] > 1 + and number_of_partners[j] > 1 + ): + # Cannot rotate about a bond, if the two connected atoms + # are in a cycle + in_same_cycle = False + for cycle in cycles: + if i in cycle and j in cycle: + in_same_cycle = True + break + if not in_same_cycle: + rotatable_bonds.append((i, j, int(bond_type))) + if rotatable_bonds: + return BondList( + bonds.get_atom_count(), np.array(rotatable_bonds, dtype=np.int64) + ) + else: + return BondList(bonds.get_atom_count()) + + +def _connect_inter_residue(atoms, residue_starts): + """ + Create a :class:`BondList` containing the bonds between adjacent + amino acid or nucleotide residues. + + Parameters + ---------- + atoms : AtomArray or AtomArrayStack + The structure to create the :class:`BondList` for. + residue_starts : ndarray, dtype=int + Return value of + ``get_residue_starts(atoms, add_exclusive_stop=True)``. + + Returns + ------- + BondList + A bond list containing all inter residue bonds. + """ + # Avoid circular import + from biotite.structure.info.misc import link_type + + link_types = [ + link_type(res_name) for res_name in atoms.res_name[residue_starts[:-1]] + ] + is_disconnected = ( + # Residues are not inside the same chain + (atoms.chain_id[residue_starts[1:-1]] != atoms.chain_id[residue_starts[:-2]]) + # There is at least one residue missing in between + | (atoms.res_id[residue_starts[1:-1]] - atoms.res_id[residue_starts[:-2]] > 1) + ) + return rust_connect_inter_residue( + atoms.atom_name.tolist(), residue_starts, link_types, is_disconnected + ) + + +def _bond_dict_to_list(bond_dict): + """ + Convert the input bond dictionary in the form ``(name1, name2) -> bond_type`` into a + list of tuples ``(index1, index2, bond_type)``. + """ + if bond_dict is None: + return None + else: + return [ + (atom_name1, atom_name2, bond_type) + for (atom_name1, atom_name2), bond_type in bond_dict.items() + ] diff --git a/src/biotite/structure/io/pdb/file.py b/src/biotite/structure/io/pdb/file.py index 7aa9a31a5..4c0bcb463 100644 --- a/src/biotite/structure/io/pdb/file.py +++ b/src/biotite/structure/io/pdb/file.py @@ -19,7 +19,8 @@ max_hybrid36_number, ) from biotite.structure.atoms import repeat -from biotite.structure.bonds import BondList, connect_via_residue_names +from biotite.structure.bonds import BondList +from biotite.structure.connect import connect_via_residue_names from biotite.structure.error import BadStructureError from biotite.structure.filter import ( filter_solvent, diff --git a/src/biotite/structure/io/pdbqt/file.py b/src/biotite/structure/io/pdbqt/file.py index 60f629693..fd9c5a118 100644 --- a/src/biotite/structure/io/pdbqt/file.py +++ b/src/biotite/structure/io/pdbqt/file.py @@ -10,13 +10,9 @@ import numpy as np from biotite.file import InvalidFileError, TextFile from biotite.structure.atoms import AtomArray, AtomArrayStack -from biotite.structure.bonds import ( - BondList, - BondType, - find_connected, - find_rotatable_bonds, -) +from biotite.structure.bonds import BondList, BondType from biotite.structure.charges import partial_charges +from biotite.structure.connect import find_connected, find_rotatable_bonds from biotite.structure.error import BadStructureError PARAMETRIZED_ELEMENTS = [ diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 85dfc86c9..d8ff6a52d 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -29,7 +29,7 @@ concatenate, repeat, ) -from biotite.structure.bonds import BondList, BondType, connect_via_residue_names +from biotite.structure.bonds import BondList, BondType from biotite.structure.box import ( coord_to_fraction, fraction_to_coord, @@ -37,6 +37,7 @@ unitcell_from_vectors, vectors_from_unitcell, ) +from biotite.structure.connect import connect_via_residue_names from biotite.structure.error import BadStructureError from biotite.structure.filter import _canonical_aa_list as canonical_aa_list from biotite.structure.filter import ( diff --git a/src/biotite/structure/molecules.py b/src/biotite/structure/molecules.py index bc227348c..bdae017da 100644 --- a/src/biotite/structure/molecules.py +++ b/src/biotite/structure/molecules.py @@ -13,7 +13,8 @@ import numpy as np from biotite.structure.atoms import AtomArray, AtomArrayStack -from biotite.structure.bonds import BondList, find_connected +from biotite.structure.bonds import BondList +from biotite.structure.connect import find_connected def get_molecule_indices(array): diff --git a/src/rust/structure/bonds.rs b/src/rust/structure/bonds.rs index 360984b39..77038c917 100644 --- a/src/rust/structure/bonds.rs +++ b/src/rust/structure/bonds.rs @@ -300,7 +300,7 @@ pub struct BondList { impl BondList { #[new] #[pyo3(signature = (atom_count, bonds=None))] - fn new<'py>( + pub fn new<'py>( py: Python<'py>, atom_count: usize, bonds: Option<&Bound<'py, PyAny>>, @@ -398,7 +398,7 @@ impl BondList { /// [2 3] /// [2 4]] #[staticmethod] - fn concatenate(bond_lists: Vec) -> PyResult { + pub fn concatenate(bond_lists: Vec) -> PyResult { if bond_lists.is_empty() { return Ok(BondList { atom_count: 0, @@ -453,7 +453,7 @@ impl BondList { /// [[2 3 0] /// [3 5 0] /// [3 6 0]] - fn offset_indices(&mut self, offset: isize) -> PyResult<()> { + pub fn offset_indices(&mut self, offset: isize) -> PyResult<()> { if offset < 0 { return Err(exceptions::PyValueError::new_err("Offset must be positive")); } @@ -478,7 +478,7 @@ impl BondList { /// first atom, the second column the index of the second atom /// involved in the bond. /// The third column stores the :class:`BondType`. - fn as_array<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { + pub fn as_array<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { let n_bonds = self.bonds.len(); // Allocate uninitialized numpy array and write directly let mut array = Array2::::uninit([n_bonds, 3]); @@ -511,7 +511,7 @@ impl BondList { /// The first integer represents the first atom, /// the second integer represents the second atom, /// the third integer represents the :class:`BondType`. - fn as_set<'py>(&self, py: Python<'py>) -> PyResult> { + pub fn as_set<'py>(&self, py: Python<'py>) -> PyResult> { let set = PySet::empty(py)?; for bond in &self.bonds { let tuple = PyTuple::new( @@ -553,7 +553,7 @@ impl BondList { /// 0 1 {'bond_type': } /// 1 3 {'bond_type': } /// 1 4 {'bond_type': } - fn as_graph<'py>(&self, py: Python<'py>) -> PyResult> { + pub fn as_graph<'py>(&self, py: Python<'py>) -> PyResult> { let nx = PyModule::import(py, "networkx")?; let graph = nx.call_method0("Graph")?; @@ -592,7 +592,7 @@ impl BondList { /// ... print(i, j, BondType(bond_type).name) /// 0 1 SINGLE /// 1 2 DOUBLE - fn remove_aromaticity(&mut self) { + pub fn remove_aromaticity(&mut self) { for bond in &mut self.bonds { bond.bond_type = bond.bond_type.without_aromaticity(); } @@ -612,7 +612,7 @@ impl BondList { /// ... print(i, j, BondType(bond_type).name) /// 0 1 AROMATIC /// 1 2 AROMATIC - fn remove_kekulization(&mut self) { + pub fn remove_kekulization(&mut self) { for bond in &mut self.bonds { if matches!( bond.bond_type, @@ -624,7 +624,7 @@ impl BondList { } /// Convert all bonds to :attr:`BondType.ANY`. - fn remove_bond_order(&mut self) { + pub fn remove_bond_order(&mut self) { for bond in &mut self.bonds { bond.bond_type = BondType::Any; } @@ -659,7 +659,7 @@ impl BondList { /// 0 1 DOUBLE /// 1 2 SINGLE /// 2 3 SINGLE - fn convert_bond_type(&mut self, original_bond_type: BondType, new_bond_type: BondType) { + pub fn convert_bond_type(&mut self, original_bond_type: BondType, new_bond_type: BondType) { for bond in &mut self.bonds { if bond.bond_type == original_bond_type { bond.bond_type = new_bond_type; @@ -673,7 +673,7 @@ impl BondList { /// ------- /// atom_count : int /// The atom count. - fn get_atom_count(&self) -> usize { + pub fn get_atom_count(&self) -> usize { self.atom_count } @@ -684,7 +684,7 @@ impl BondList { /// bond_count : int /// The number of bonds. This is equal to the length of the /// internal :class:`ndarray` containing the bonds. - fn get_bond_count(&self) -> usize { + pub fn get_bond_count(&self) -> usize { self.bonds.len() } @@ -716,7 +716,7 @@ impl BondList { /// >>> print(bonds) /// [0 3 4] #[allow(clippy::type_complexity)] - fn get_bonds<'py>( + pub fn get_bonds<'py>( &self, py: Python<'py>, atom_index: isize, @@ -830,7 +830,7 @@ impl BondList { /// 9: [3] /// 10: [4] /// 11: [5] - fn get_all_bonds<'py>( + pub fn get_all_bonds<'py>( &self, py: Python<'py>, ) -> (Bound<'py, PyArray2>, Bound<'py, PyArray2>) { @@ -878,7 +878,7 @@ impl BondList { /// [ True False False False] /// [ True False False False] /// [ True False False False]] - fn adjacency_matrix<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { + pub fn adjacency_matrix<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { let n = self.atom_count; let mut matrix = Array2::from_elem((n, n), false); for bond in &self.bonds { @@ -925,7 +925,7 @@ impl BondList { /// [ 2 -1 -1 -1] /// [ 1 -1 -1 -1] /// [ 1 -1 -1 -1]] - fn bond_type_matrix<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { + pub fn bond_type_matrix<'py>(&self, py: Python<'py>) -> Bound<'py, PyArray2> { let n = self.atom_count; let mut matrix = Array2::from_elem((n, n), -1i8); for bond in &self.bonds { @@ -949,7 +949,7 @@ impl BondList { /// bond_type : BondType /// The type of the bond. Default is :attr:`BondType.ANY`. #[pyo3(signature = (atom_index1, atom_index2, bond_type=BondType::Any))] - fn add_bond( + pub fn add_bond( &mut self, atom_index1: isize, atom_index2: isize, @@ -980,7 +980,7 @@ impl BondList { /// ---------- /// atom_index1, atom_index2 : int /// The indices of the atoms whose bond should be removed. - fn remove_bond(&mut self, atom_index1: isize, atom_index2: isize) -> PyResult<()> { + pub fn remove_bond(&mut self, atom_index1: isize, atom_index2: isize) -> PyResult<()> { let target = Bond::new(atom_index1, atom_index2, BondType::Any, self.atom_count)?; self.bonds.retain(|bond| !bond.equal_atoms(&target)); Ok(()) @@ -995,7 +995,7 @@ impl BondList { /// ---------- /// atom_index : int /// The index of the atom whose bonds should be removed. - fn remove_bonds_to(&mut self, atom_index: isize) -> PyResult<()> { + pub fn remove_bonds_to(&mut self, atom_index: isize) -> PyResult<()> { let index = to_positive_index(atom_index, self.atom_count)?; self.bonds .retain(|bond| bond.atom1 != index && bond.atom2 != index); @@ -1015,7 +1015,7 @@ impl BondList { /// ---------- /// bond_list : BondList /// The bonds in `bond_list` are removed from this instance. - fn remove_bonds(&mut self, bond_list: &BondList) { + pub fn remove_bonds(&mut self, bond_list: &BondList) { let bonds_to_remove: HashSet<(usize, usize)> = bond_list.bonds.iter().map(|b| (b.atom1, b.atom2)).collect(); self.bonds @@ -1077,7 +1077,7 @@ impl BondList { /// [[0 1 1] /// [1 2 1] /// [2 3 2]] - fn merge(&self, bond_list: &BondList) -> BondList { + pub fn merge(&self, bond_list: &BondList) -> BondList { let new_atom_count = self.atom_count.max(bond_list.atom_count); // Concatenate bonds: other bond_list takes precedence (put first) @@ -1094,11 +1094,11 @@ impl BondList { result } - fn __add__(&self, other: &BondList) -> BondList { + pub fn __add__(&self, other: &BondList) -> BondList { BondList::concatenate(vec![self.clone(), other.clone()]).unwrap() } - fn __getitem__<'py>( + pub fn __getitem__<'py>( &self, py: Python<'py>, index: &Bound<'py, PyAny>, @@ -1160,7 +1160,7 @@ impl BondList { } } - fn __contains__(&self, item: (isize, isize)) -> PyResult { + pub fn __contains__(&self, item: (isize, isize)) -> PyResult { let target = match Bond::new(item.0, item.1, BondType::Any, self.atom_count) { Ok(bond) => bond, // If the atom indices are out of range, the bond is not part of the bond list @@ -1169,7 +1169,7 @@ impl BondList { Ok(self.bonds.iter().any(|bond| bond.equal_atoms(&target))) } - fn __eq__(&self, other: &BondList) -> bool { + pub fn __eq__(&self, other: &BondList) -> bool { if self.atom_count != other.atom_count { return false; } @@ -1178,19 +1178,19 @@ impl BondList { self_set == other_set } - fn __str__(&self, py: Python<'_>) -> PyResult { + pub fn __str__(&self, py: Python<'_>) -> PyResult { let array = self.as_array(py); array.call_method0("__str__")?.extract() } - fn __iter__(&self) -> PyResult<()> { + pub fn __iter__(&self) -> PyResult<()> { Err(exceptions::PyTypeError::new_err( "'BondList' object is not iterable", )) } // Pickling support - fn __reduce__<'py>(&self, py: Python<'py>) -> PyResult> { + pub fn __reduce__<'py>(&self, py: Python<'py>) -> PyResult> { let struc = PyModule::import(py, "biotite.structure")?; let cls = struc.getattr("BondList")?; let from_state = cls.getattr("_from_state")?; @@ -1215,7 +1215,7 @@ impl BondList { #[staticmethod] #[pyo3(name = "_from_state")] - fn from_state(atom_count: usize, bonds_data: Vec<[usize; 3]>) -> PyResult { + pub fn from_state(atom_count: usize, bonds_data: Vec<[usize; 3]>) -> PyResult { let bonds: Vec = bonds_data .into_iter() .map(|[a1, a2, bt]| { @@ -1231,15 +1231,15 @@ impl BondList { } // Copy support for biotite.Copyable interface - fn copy(&self) -> BondList { + pub fn copy(&self) -> BondList { self.clone() } - fn __copy__(&self) -> BondList { + pub fn __copy__(&self) -> BondList { self.clone() } - fn __deepcopy__(&self, _memo: &Bound<'_, PyAny>) -> BondList { + pub fn __deepcopy__(&self, _memo: &Bound<'_, PyAny>) -> BondList { self.clone() } } diff --git a/src/rust/structure/connect.rs b/src/rust/structure/connect.rs new file mode 100644 index 000000000..b8205243b --- /dev/null +++ b/src/rust/structure/connect.rs @@ -0,0 +1,387 @@ +use itertools::Itertools; +use numpy::ndarray::Array1; +use numpy::ndarray::ArrayView2; +use numpy::{IntoPyArray, PyReadonlyArray1, PyReadonlyArray2}; +use pyo3::exceptions; +use pyo3::prelude::*; +use std::collections::{HashMap, VecDeque}; +use std::convert::TryFrom; + +use crate::structure::bonds::{Bond, BondList, BondType}; + +/// Internal rust function for :func:`biotite.structure.connect.connect_via_residue_names`. +/// +/// Parameters +/// ---------- +/// res_names +/// The residue name of each atom. +/// atom_names +/// The atom name of each atom. +/// residue_starts +/// The start index of each residue (including exclusive stop index). +/// bond_dict +/// The bond dictionary mapping connected atom name pairs to their bond type if they form a +/// covalent bond. +/// +/// Returns +/// ------- +/// bonds +/// The bonds between the atoms. +#[pyfunction] +pub fn connect_via_residue_names<'py>( + _py: Python<'py>, + res_names: Vec, + atom_names: Vec, + residue_starts: PyReadonlyArray1<'py, i64>, + bond_dict: HashMap>, +) -> PyResult { + let mut bond_list = BondList::empty(atom_names.len()); + for (start_i, stop_i) in residue_starts.as_array().iter().tuple_windows() { + let start_i = *start_i as usize; + let stop_i = *stop_i as usize; + let bonds_res = match bond_dict.get(&res_names[start_i]) { + Some(bonds_res) => bonds_res, + None => continue, + }; + let atom_names_in_res = &atom_names[start_i..stop_i]; + for (atom_name1, atom_name2, bond_type) in bonds_res.iter() { + let atom_indices_1 = find(atom_names_in_res, atom_name1); + let atom_indices_2 = find(atom_names_in_res, atom_name2); + // In rare cases the same atom name may appear multiple times + // (e.g. in altlocs) + // -> create all possible bond combinations + for atom_index_i in atom_indices_1.iter() { + for atom_index_j in atom_indices_2.iter() { + unsafe { + bond_list.add(Bond { + atom1: start_i + atom_index_i, + atom2: start_i + atom_index_j, + bond_type: BondType::try_from(*bond_type as u8)?, + })?; + } + } + } + } + } + bond_list.remove_redundant_bonds(); + Ok(bond_list) +} + +/// Internal rust function for :func:`biotite.structure.connect.connect_via_distances`. +/// +/// Parameters +/// ---------- +/// coord +/// The coordinates of each atom (shape: Nx3, dtype: float32). +/// elements +/// The element name of each atom. +/// residue_starts +/// The start index of each residue (including exclusive stop index). +/// dist_ranges +/// The distance ranges mapping element pairs to (min_dist, max_dist). +/// Both element orders must be included. +/// default_bond_type +/// The bond type to assign to all created bonds. +/// +/// Returns +/// ------- +/// bonds +/// The bonds between the atoms. +#[pyfunction] +#[allow(clippy::needless_range_loop)] +pub fn connect_via_distances<'py>( + _py: Python<'py>, + coord: PyReadonlyArray2<'py, f32>, + elements: Vec, + residue_starts: PyReadonlyArray1<'py, i64>, + dist_ranges: HashMap<(String, String), (f32, f32)>, + default_bond_type: BondType, +) -> PyResult { + let coord = coord.as_array(); + let residue_starts = residue_starts.as_array(); + let n_atoms = elements.len(); + + // Create a mapping from element names to integers + // to allow a more efficient lookup in the hot loop + let mut element_to_index: HashMap<&str, usize> = HashMap::new(); + for (element1, element2) in dist_ranges.keys() { + let next = element_to_index.len(); + element_to_index.entry(element1.as_str()).or_insert(next); + let next = element_to_index.len(); + element_to_index.entry(element2.as_str()).or_insert(next); + } + + // Map each atom to its element index, None if not in dist_ranges + let atom_element_index: Vec> = elements + .iter() + .map(|element| element_to_index.get(element.as_str()).copied()) + .collect(); + + // Build a flat 2D lookup table for squared distance ranges indexed by element indices + // to avoid computing expensive square roots in the hot loop + let n_unique_elements = element_to_index.len(); + let mut distance_range_table: Vec> = + vec![None; n_unique_elements * n_unique_elements]; + for ((element1, element2), &(min_dist, max_dist)) in &dist_ranges { + if let (Some(&index1), Some(&index2)) = ( + element_to_index.get(element1.as_str()), + element_to_index.get(element2.as_str()), + ) { + distance_range_table[index1 * n_unique_elements + index2] = + Some((min_dist * min_dist, max_dist * max_dist)); + } + } + + let mut bond_list = BondList::empty(n_atoms); + + for (start, stop) in residue_starts.iter().tuple_windows() { + let start = *start as usize; + let stop = *stop as usize; + + for atom_i in start..stop { + let element_i = match atom_element_index[atom_i] { + Some(index) => index, + None => continue, + }; + for atom_j in start..atom_i { + let element_j = match atom_element_index[atom_j] { + Some(index) => index, + None => continue, + }; + let (min_distance, max_distance) = + match distance_range_table[element_i * n_unique_elements + element_j] { + Some(range) => range, + None => continue, + }; + let distance = squared_distance(&coord, atom_i, atom_j); + if distance >= min_distance && distance <= max_distance { + unsafe { + bond_list.add(Bond { + atom1: atom_j, + atom2: atom_i, + bond_type: default_bond_type, + })?; + } + } + } + } + } + + Ok(bond_list) +} + +/// Internal rust function for :func:`biotite.structure.connect._connect_inter_residue`. +/// +/// Parameters +/// ---------- +/// atom_names +/// The atom name of each atom. +/// residue_starts +/// The start index of each residue (including exclusive stop index). +/// link_types +/// The link type of each residue. None if the residue is not found in the CCD. +/// is_disconnected +/// Whether each residue is disconnected from the next residue due to chain or +/// residue discontinuity. +/// Length is ``len(residue_starts) - 2``. +#[pyfunction] +pub fn connect_inter_residue<'py>( + _py: Python<'py>, + atom_names: Vec, + residue_starts: PyReadonlyArray1<'py, i64>, + link_types: Vec>, + is_disconnected: PyReadonlyArray1<'py, bool>, +) -> PyResult { + let residue_starts = residue_starts.as_array(); + let is_disconnected = is_disconnected.as_array(); + let n_residues = residue_starts.len() - 1; + + let mut bond_list = BondList::empty(atom_names.len()); + + for i in 0..n_residues - 1 { + // Check if residues are disconnected + if is_disconnected[i] { + continue; + } + + let curr_start = residue_starts[i] as usize; + let curr_stop = residue_starts[i + 1] as usize; + let next_start = residue_starts[i + 1] as usize; + let next_stop = residue_starts[i + 2] as usize; + + // Get link type for both residues + let (curr_link, next_link) = match (&link_types[i], &link_types[i + 1]) { + (Some(curr), Some(next)) => (curr.as_str(), next.as_str()), + _ => continue, + }; + + let (curr_connect_name, next_connect_name) = + if is_peptide_link(curr_link) && is_peptide_link(next_link) { + ("C", "N") + } else if is_nucleic_link(curr_link) && is_nucleic_link(next_link) { + ("O3'", "P") + } else { + // Create no bond if the connection types of consecutive + // residues are not compatible + continue; + }; + + // Find first connector atom in each residue + let curr_connect_idx = atom_names[curr_start..curr_stop] + .iter() + .position(|name| name == curr_connect_name); + let next_connect_idx = atom_names[next_start..next_stop] + .iter() + .position(|name| name == next_connect_name); + + if let (Some(curr_idx), Some(next_idx)) = (curr_connect_idx, next_connect_idx) { + unsafe { + bond_list.add(Bond { + atom1: curr_start + curr_idx, + atom2: next_start + next_idx, + bond_type: BondType::Single, + })?; + } + } + } + + Ok(bond_list) +} + +/// find_connected(bond_list, root, as_mask=False) +/// +/// Get indices to all atoms that are directly or indirectly connected +/// to the root atom indicated by the given index. +/// +/// An atom is *connected* to the `root` atom, if that atom is reachable +/// by traversing an arbitrary number of bonds, starting from the +/// `root`. +/// Effectively, this means that all atoms are *connected* to `root`, +/// that are in the same molecule as `root`. +/// Per definition `root` is also *connected* to itself. +/// +/// Parameters +/// ---------- +/// bond_list : BondList +/// The reference bond list. +/// root : int +/// The index of the root atom. +/// as_mask : bool, optional +/// If true, the connected atom indices are returned as boolean +/// mask. +/// By default, the connected atom indices are returned as integer +/// array. +/// +/// Returns +/// ------- +/// connected : ndarray, dtype=int or ndarray, dtype=bool +/// Either a boolean mask or an integer array, representing the +/// connected atoms. +/// In case of a boolean mask: ``connected[i] == True``, if the atom +/// with index ``i`` is connected. +/// +/// Examples +/// -------- +/// Consider a system with 4 atoms, where only the last atom is not +/// bonded with the other ones (``0-1-2 3``): +/// +/// >>> bonds = BondList(4) +/// >>> bonds.add_bond(0, 1) +/// >>> bonds.add_bond(1, 2) +/// >>> print(find_connected(bonds, 0)) +/// [0 1 2] +/// >>> print(find_connected(bonds, 1)) +/// [0 1 2] +/// >>> print(find_connected(bonds, 2)) +/// [0 1 2] +/// >>> print(find_connected(bonds, 3)) +/// [3] +/// +/// The result can be output as a boolean mask: +/// +/// >>> print(find_connected(bonds, 0, as_mask=True)) +/// [ True True True False] +#[pyfunction] +#[pyo3(signature = (bond_list, root, as_mask=false))] +pub fn find_connected( + py: Python<'_>, + bond_list: &BondList, + root: usize, + as_mask: bool, +) -> PyResult> { + let atom_count = bond_list.get_atom_count(); + let bonds = bond_list.get_bonds_ref(); + + if root >= atom_count { + return Err(exceptions::PyValueError::new_err(format!( + "Root atom index {} is out of bounds for bond list representing {} atoms", + root, atom_count + ))); + } + + // Build adjacency list + let mut adj: Vec> = vec![Vec::new(); atom_count]; + for bond in bonds { + adj[bond.atom1].push(bond.atom2); + adj[bond.atom2].push(bond.atom1); + } + + // Iterative breadth-first search + let mut visited = vec![false; atom_count]; + let mut queue = VecDeque::new(); + queue.push_back(root); + visited[root] = true; + + while let Some(current) = queue.pop_front() { + for &neighbor in &adj[current] { + if !visited[neighbor] { + visited[neighbor] = true; + queue.push_back(neighbor); + } + } + } + + if as_mask { + let mask = Array1::from_vec(visited); + Ok(mask.into_pyarray(py).into_any().unbind()) + } else { + let indices: Vec = visited + .iter() + .enumerate() + .filter(|(_, &v)| v) + .map(|(i, _)| i as i64) + .collect(); + let arr = Array1::from_vec(indices); + Ok(arr.into_pyarray(py).into_any().unbind()) + } +} + +/// Squared Euclidean distance between two atoms in an Nx3 coordinate array. +#[inline(always)] +fn squared_distance(coord: &ArrayView2, a1: usize, a2: usize) -> f32 { + let dx = coord[[a1, 0]] - coord[[a2, 0]]; + let dy = coord[[a1, 1]] - coord[[a2, 1]]; + let dz = coord[[a1, 2]] - coord[[a2, 2]]; + dx * dx + dy * dy + dz * dz +} + +fn is_peptide_link(link_type: &str) -> bool { + matches!( + link_type, + "PEPTIDE LINKING" | "L-PEPTIDE LINKING" | "D-PEPTIDE LINKING" + ) +} + +fn is_nucleic_link(link_type: &str) -> bool { + matches!(link_type, "RNA LINKING" | "DNA LINKING") +} + +/// Find all indices where a value occurs in an array. +fn find(array: &[T], value: &T) -> Vec { + array + .iter() + .enumerate() + .filter(|(_, x)| *x == value) + .map(|(i, _)| i) + .collect() +} diff --git a/src/rust/structure/mod.rs b/src/rust/structure/mod.rs index 4135ca82e..7cbad86e2 100644 --- a/src/rust/structure/mod.rs +++ b/src/rust/structure/mod.rs @@ -3,6 +3,7 @@ use pyo3::prelude::*; pub mod bonds; pub mod celllist; +pub mod connect; pub mod io; pub mod sasa; pub mod util; @@ -13,6 +14,13 @@ pub fn module<'py>(parent_module: &Bound<'py, PyModule>) -> PyResult()?; module.add_class::()?; module.add_function(wrap_pyfunction!(bonds::bond_type_members, &module)?)?; + module.add_function(wrap_pyfunction!( + connect::connect_via_residue_names, + &module + )?)?; + module.add_function(wrap_pyfunction!(connect::connect_inter_residue, &module)?)?; + module.add_function(wrap_pyfunction!(connect::connect_via_distances, &module)?)?; + module.add_function(wrap_pyfunction!(connect::find_connected, &module)?)?; module.add_function(wrap_pyfunction!(sasa::sasa, &module)?)?; add_subpackage(&module, &io::module(&module)?, "biotite.rust.structure.io")?; Ok(module) diff --git a/tests/structure/test_bonds.py b/tests/structure/test_bonds.py index 4c5b48358..4b7a9a844 100644 --- a/tests/structure/test_bonds.py +++ b/tests/structure/test_bonds.py @@ -8,7 +8,6 @@ import numpy as np import pytest import biotite.structure as struc -import biotite.structure.info as info import biotite.structure.io as strucio import biotite.structure.io.pdbx as pdbx from tests.util import data_dir @@ -507,118 +506,6 @@ def test_atom_array_consistency(): assert test_ids.tolist() == ref_ids.tolist() -@pytest.mark.parametrize("periodic", [False, True]) -def test_method_consistency(periodic): - """ - Check if :func:`connect_via_distances()` and - :func:`connect_via_residue_names()` give the same bond list - """ - THRESHOLD_PERCENTAGE = 0.99 - - # Structure with peptide, nucleotide, small molecules and water - pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "5ugo.bcif")) - atoms = pdbx.get_structure(pdbx_file, model=1) - if periodic: - # Add large dummy box to test parameter - # No actual bonds over the periodic boundary are expected - atoms.box = np.identity(3) * 100 - - bonds_from_names = struc.connect_via_residue_names(atoms) - bonds_from_names.remove_bond_order() - - bonds_from_distances = struc.connect_via_distances(atoms, periodic=periodic) - - # The distance based method may not detect all bonds - assert bonds_from_distances.as_set().issubset(bonds_from_names.as_set()) - assert ( - len(bonds_from_distances.as_array()) - >= len(bonds_from_names.as_array()) * THRESHOLD_PERCENTAGE - ) - - -def test_find_connected(bond_list): - """ - Find all connected atoms to an atom in a known example. - """ - for index in (0, 1, 2, 3, 4, 6): - assert struc.find_connected(bond_list, index).tolist() == [0, 1, 2, 3, 4, 6] - assert struc.find_connected(bond_list, 5).tolist() == [5] - - -@pytest.mark.parametrize( - "res_name, expected_bonds", - [ - # Easy ligand visualization at: - # https://www.rcsb.org/ligand/ - ("TYR", [ - ("N", "CA" ), - ("CA", "C" ), - ("CA", "CB" ), - ("C", "OXT"), - ("CB", "CG" ), - ("CZ", "OH" ), - ]), - ("CEL", [ - ("C1", "C4" ), - ("C8", "C11"), - ("C15", "S1" ), - ("N3", "S1" ) - ]), - ("LEO", [ - ("C3", "C8" ), - ("C6", "C17"), - ("C17", "C22"), - ]), - ] -) # fmt: skip -def test_find_rotatable_bonds(res_name, expected_bonds): - """ - Check the :func:`find_rotatable_bonds()` function based on - known examples. - """ - molecule = info.residue(res_name) - - ref_bond_set = { - tuple(sorted((name_i, name_j))) for name_i, name_j in expected_bonds - } - - rotatable_bonds = struc.find_rotatable_bonds(molecule.bonds) - test_bond_set = set() - for i, j, _ in rotatable_bonds.as_array(): - test_bond_set.add(tuple(sorted((molecule.atom_name[i], molecule.atom_name[j])))) - - # Compare with reference bonded atom names - assert test_bond_set == ref_bond_set - # All rotatable bonds must be single bonds - assert np.all(rotatable_bonds.as_array()[:, 2] == struc.BondType.SINGLE) - - -@pytest.mark.parametrize( - "cif_path, expected_bond_indices", - [ - ( - join(data_dir("structure"), "3o5r.cif"), - [252, 257], # Carbonyl carbon and subsequent backbone nitrogen - ) - ], -) -def test_canonical_bonds_with_altloc_occupancy(cif_path, expected_bond_indices): - """ - Test whether canonical inter-residue bonds are correctly computed when - `altloc="occupancy"` and the higher-occupancy atom occurs second in the CIF file. - """ - - cif_file = pdbx.CIFFile.read(cif_path) - atom_array = pdbx.get_structure( - cif_file.block, altloc="occupancy", include_bonds=True - ) - - atom1, atom2 = expected_bond_indices - - # Assert that the canonical inter-residue bond exists - assert atom2 in atom_array.bonds.get_bonds(atom1)[0] - - def test_pickle(bond_list): """ Check that a BondList survives a pickle round-trip unchanged. diff --git a/tests/structure/test_connect.py b/tests/structure/test_connect.py new file mode 100644 index 000000000..36a689de7 --- /dev/null +++ b/tests/structure/test_connect.py @@ -0,0 +1,142 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +from os.path import join +import numpy as np +import pytest +import biotite.structure as struc +import biotite.structure.info as info +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +@pytest.mark.parametrize("seed", range(20)) +@pytest.mark.parametrize("as_mask", [False, True]) +def test_find_connected(seed, as_mask): + """ + The ``label_asym_id`` in PDBx files distinguishes different molecules. + Therefore, all connected atoms should hame the same chain ID. + An exception are water molecules, which have the same chain ID, albeit not being + bonded to each other. + """ + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1k6p.bcif")) + atoms = pdbx.get_structure( + pdbx_file, model=1, include_bonds=True, use_author_fields=False + ) + atoms = atoms[~struc.filter_solvent(atoms)] + + np.random.seed(seed) + rng = np.random.default_rng(seed) + root = rng.integers(atoms.array_length()) + chain_id = atoms.chain_id[root] + + connected_indices = struc.find_connected(atoms.bonds, root, as_mask=as_mask) + assert np.all(atoms.chain_id[connected_indices] == chain_id) + if as_mask: + # When we have a boolean mask, we can also check + # if all non-connected atoms have a different chain ID + assert np.all(atoms.chain_id[~connected_indices] != chain_id) + + +@pytest.mark.parametrize( + "res_name, expected_bonds", + [ + # Easy ligand visualization at: + # https://www.rcsb.org/ligand/ + ("TYR", [ + ("N", "CA" ), + ("CA", "C" ), + ("CA", "CB" ), + ("C", "OXT"), + ("CB", "CG" ), + ("CZ", "OH" ), + ]), + ("CEL", [ + ("C1", "C4" ), + ("C8", "C11"), + ("C15", "S1" ), + ("N3", "S1" ) + ]), + ("LEO", [ + ("C3", "C8" ), + ("C6", "C17"), + ("C17", "C22"), + ]), + ] +) # fmt: skip +def test_find_rotatable_bonds(res_name, expected_bonds): + """ + Check the :func:`find_rotatable_bonds()` function based on + known examples. + """ + molecule = info.residue(res_name) + + ref_bond_set = { + tuple(sorted((name_i, name_j))) for name_i, name_j in expected_bonds + } + + rotatable_bonds = struc.find_rotatable_bonds(molecule.bonds) + test_bond_set = set() + for i, j, _ in rotatable_bonds.as_array(): + test_bond_set.add(tuple(sorted((molecule.atom_name[i], molecule.atom_name[j])))) + + # Compare with reference bonded atom names + assert test_bond_set == ref_bond_set + # All rotatable bonds must be single bonds + assert np.all(rotatable_bonds.as_array()[:, 2] == struc.BondType.SINGLE) + + +@pytest.mark.parametrize( + "cif_path, expected_bond_indices", + [ + ( + join(data_dir("structure"), "3o5r.cif"), + [252, 257], # Carbonyl carbon and subsequent backbone nitrogen + ) + ], +) +def test_canonical_bonds_with_altloc_occupancy(cif_path, expected_bond_indices): + """ + Test whether canonical inter-residue bonds are correctly computed when + `altloc="occupancy"` and the higher-occupancy atom occurs second in the CIF file. + """ + + cif_file = pdbx.CIFFile.read(cif_path) + atom_array = pdbx.get_structure( + cif_file.block, altloc="occupancy", include_bonds=True + ) + + atom1, atom2 = expected_bond_indices + + # Assert that the canonical inter-residue bond exists + assert atom2 in atom_array.bonds.get_bonds(atom1)[0] + + +@pytest.mark.parametrize("periodic", [False, True]) +def test_method_consistency(periodic): + """ + Check if :func:`connect_via_distances()` and + :func:`connect_via_residue_names()` give the same bond list + """ + THRESHOLD_PERCENTAGE = 0.99 + + # Structure with peptide, nucleotide, small molecules and water + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "5ugo.bcif")) + atoms = pdbx.get_structure(pdbx_file, model=1) + if periodic: + # Add large dummy box to test parameter + # No actual bonds over the periodic boundary are expected + atoms.box = np.identity(3) * 100 + + bonds_from_names = struc.connect_via_residue_names(atoms) + bonds_from_names.remove_bond_order() + + bonds_from_distances = struc.connect_via_distances(atoms, periodic=periodic) + + # The distance based method may not detect all bonds + assert bonds_from_distances.as_set().issubset(bonds_from_names.as_set()) + assert ( + len(bonds_from_distances.as_array()) + >= len(bonds_from_names.as_array()) * THRESHOLD_PERCENTAGE + ) From f26d0b500bb66a60554d033b7f28528ed893a0fc Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Mon, 2 Mar 2026 13:42:33 +0100 Subject: [PATCH 5/5] Prevent self-bonds --- src/rust/structure/bonds.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/rust/structure/bonds.rs b/src/rust/structure/bonds.rs index 77038c917..4c4bbca89 100644 --- a/src/rust/structure/bonds.rs +++ b/src/rust/structure/bonds.rs @@ -119,6 +119,11 @@ impl Bond { fn new(atom1: isize, atom2: isize, bond_type: BondType, atom_count: usize) -> PyResult { let a1 = to_positive_index(atom1, atom_count)?; let a2 = to_positive_index(atom2, atom_count)?; + if a1 == a2 { + return Err(exceptions::PyValueError::new_err( + "An atom cannot be bonded to itself", + )); + } let (a1, a2) = if a1 <= a2 { (a1, a2) } else { (a2, a1) }; Ok(Bond { atom1: a1,