diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..80717df --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,58 @@ +name: CI + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + test: + name: Python ${{ matrix.python-version }} / ${{ matrix.os }} + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + python-version: ["3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + # 🔧 NEW: ensure modern packaging tools before installing anything + - name: Upgrade pip, setuptools, and wheel + run: | + python -m pip install --upgrade pip setuptools wheel + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install pytest pytest-cov flake8 + + - name: Install package + run: pip install -e . 
+ + - name: Lint with flake8 + run: | + # Fail on syntax errors or undefined names; treat everything else as a warning + flake8 pylipid --count --select=E9,F63,F7,F82 --show-source --statistics + flake8 pylipid --count --exit-zero --max-line-length=120 --statistics + + - name: Run tests + run: | + pytest tests/ -v --cov=pylipid --cov-report=term-missing + + - name: Upload coverage report + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11' + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: .coverage diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 352700b..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -language: python -cache: pip -python: - - "3.6" - - "3.7" - - "3.8" -before_install: - - pip install -U pip - - pip install codecov -install: - - pip install -r requirements.txt -script: - - python -m unittest discover -s tests -after_success: - - bash <(curl -s https://codecov.io/bash) diff --git a/README.rst b/README.rst index 1990885..9855bac 100644 --- a/README.rst +++ b/README.rst @@ -1,65 +1,135 @@ +PyLipID - Analysis of Protein–Lipid Interactions from Molecular Dynamics +=========================================================================== -========================================================== -PyLipID - A Python Package For Lipid Interactions Analysis -========================================================== - -.. image:: https://travis-ci.com/wlsong/PyLipID.svg?branch=master - :target: https://travis-ci.com/github/wlsong/PyLipID .. image:: https://img.shields.io/pypi/v/PyLipID :target: https://pypi.org/project/pylipid/ +.. image:: https://img.shields.io/pypi/pyversions/PyLipID + :target: https://pypi.org/project/pylipid/ .. 
image:: docs/static/pylipid_logo_smallsize.png - :align: center + :align: center +PyLipID is a Python package for analysing lipid–protein interactions from +Molecular Dynamics (MD) simulations. It supports coarse-grained and all-atom +simulations, handles formats supported by MDTraj, and works in scripts or +Jupyter notebooks. -PyLipID is a python package for analyzing lipid interactions with membrane proteins from -Molecular Dynamics Simulations. PyLipID has the following main features, please check out -the tutorials for examples and the documentations for the API functionalities: +Documentation: https://pylipid.readthedocs.io - * Detection of binding sites via calculating community structures in the interactions networks. - * Calculation of lipid koff and residence time for interaction with binding sites and residues. - * Analysis of lipid interactions with binding sites and residues using a couple of metrics. - * Generation of representative bound poses for binding sites. - * Analysis of bound poses for binding sites via automated clustering scheme. - * Adoption of a dual-cutoff scheme to overcome the 'rattling in cage' effect of coarse-grained simulations. - * Generation of manuscript-ready figures for analysis. +Features +======== -PyLipID can be used from Jupyter (former IPython, recommended), or by writing Python scripts. -The documentaion and tutorials can be found at `pylipid.readthedocs.io `_. +- Dual-cutoff contact detection to improve coarse-grained accuracy. +- Binding site detection using NetworkX Louvain community analysis. +- Kinetic parameters: koff and residence times per residue and site. +- Interaction metrics including occupancy, count, duration. +- Representative bound pose extraction and clustering. +- SASA calculations per residue and site. +- Publication-ready plots and PyMOL scripts. 
Installation ============ -We recommend installing PyLipID using the package installer `pip`: +From PyPI:: + + pip install pylipid + +From source:: + + git clone https://github.com/pstansfeld/PyLipID + cd PyLipID + pip install . + +Editable/development install:: + + pip install -e . + +Requires Python >= 3.9. -``pip install pylipid`` +Quick Start +=========== -Alternatively, PyLipID can be installed from the source code. The package is available for -download on Github via: +.. code-block:: python -``git clone https://github.com/wlsong/PyLipID`` + from pylipid.api import LipidInteraction -Once the source code is downloaded, enter the source code directory and install the package as follow: + trajfile_list = ["run1/protein_lipids.xtc", "run2/protein_lipids.xtc"] + topfile_list = ["run1/protein_lipids.gro", "run2/protein_lipids.gro"] -``python setup.py install`` + li = LipidInteraction( + trajfile_list, + topfile_list=topfile_list, + cutoffs=[0.55, 0.8], + lipid="CHOL", + nprot=1, + ) + li.collect_residue_contacts() + li.compute_residue_duration() + li.compute_residue_koff() + li.compute_binding_nodes(threshold=4) + li.compute_site_koff() -Citation |DOI for Citing PyEMMA| -================================ + li.plot("Residence Time") + li.save_data("Dataset") + li.save_pymol_script(pdb_file="receptor.pdb") -If you use PyLipID in scientific research, please cite the following paper: :: +What's New in v1.6.0 +==================== - @article{song_pylipid_2022, - author = {Song, Wanling. and Corey, Robin A. and Ansell, T. Bertie. and - Cassidy, C. Keith. and Horrell, Michael R. and Duncan, Anna L. - and Stansfeld, Phillip J. and Sansom, Mark S.P.}, - title = {PyLipID: A Python package for analysis of protein-lipid interactions from MD simulations}, - journal = {J. Chem. 
Theory Comput}, - year = {2022}, - url = {https://doi.org/10.1021/acs.jctc.1c00708}, - doi = {10.1021/acs.jctc.1c00708}, - urldate = {2022-02-18}, - } +Bug Fixes +--------- +- Corrected binding-site detection off-by-one error. +- Fixed calculate_scores returning None. +- Fixed sparse_corrcoef matrix type issues. +- Fixed DataFrame column/index mismatches in analyze_bound_poses. +- Fixed missing return in compute_residue_lipidcount. +- Fixed write_PDB errors for .gro files lacking chain IDs. +- Fixed figure overwriting in plot_koff. +- Updated deprecated Matplotlib calls. +- Fixed LogNorm crash on zero-containing matrices. +- Replaced np.float128 with np.longdouble. +- Corrected output directory typo. -.. |DOI for Citing PyEMMA| image:: https://img.shields.io/badge/DOI-10.1021/acs.jctc.1c00708-blue +Performance +----------- +- collect_residue_contacts now calls md.compute_distances once per residue. +- Nested progress bars implemented. + +Dependencies +------------ +- Removed python-louvain; using NetworkX built-in Louvain. +- Minimum versions added. +- Dropped Python < 3.9. + +Packaging & CI +-------------- +- Migrated to pyproject.toml. +- CI moved from Travis CI to GitHub Actions. + +Citation +======== + +.. image:: https://img.shields.io/badge/DOI-10.1021/acs.jctc.1c00708-blue :target: https://doi.org/10.1021/acs.jctc.1c00708 + +:: + + @article{song_pylipid_2022, + author = {Song, Wanling and Corey, Robin A. and Ansell, T. Bertie and + Cassidy, C. Keith and Horrell, Michael R. and Duncan, Anna L. + and Stansfeld, Phillip J. and Sansom, Mark S.P.}, + title = {PyLipID: A Python package for analysis of protein-lipid + interactions from MD simulations}, + journal = {J. Chem. Theory Comput.}, + year = {2022}, + doi = {10.1021/acs.jctc.1c00708}, + url = {https://doi.org/10.1021/acs.jctc.1c00708}, + } + +License +======= + +MIT License. See LICENSE.txt. 
diff --git a/pylipid/__init__.py b/pylipid/__init__.py index e23c929..72a0e4c 100644 --- a/pylipid/__init__.py +++ b/pylipid/__init__.py @@ -14,4 +14,26 @@ # copies or substantial portions of the Software. ############################################################################## +# --- BEGIN: macOS-safe bootstrap ------------------------------------------- +# Ensures PyLipID runs safely on macOS by: +# 1) forcing multiprocessing to use 'spawn' +# 2) preventing GUI backends from loading in worker processes +import multiprocessing as _mp + +try: + # NOTE(review): 'spawn' is the macOS/Windows default, but force=True also overrides Linux's default 'fork' — workers must be picklable and startup is slower; confirm this trade-off is intended + _mp.set_start_method("spawn", force=True) +except RuntimeError: + # Already set or protected — ignore + pass + +# Use a non-interactive backend so importing PyLipID never loads macOS AppKit +try: + import matplotlib + matplotlib.use("Agg", force=True) +except Exception: + pass +# --- END: macOS-safe bootstrap --------------------------------------------- + from ._version import __version__ +from .api import LipidInteraction diff --git a/pylipid/_version.py b/pylipid/_version.py index 8add312..66ea783 100644 --- a/pylipid/_version.py +++ b/pylipid/_version.py @@ -14,4 +14,4 @@ # copies or substantial portions of the Software.
############################################################################## -__version__ = '1.5.14' \ No newline at end of file +__version__ = '1.6.0' diff --git a/pylipid/api/api.py b/pylipid/api/api.py index faff1fd..693c55b 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -27,7 +27,8 @@ from scipy.sparse import coo_matrix import pandas as pd from tqdm import trange, tqdm -from p_tqdm import p_map +#from p_tqdm import p_map +from multiprocessing import get_context from ..func import cal_contact_residues from ..func import Duration from ..func import cal_lipidcount, cal_occupancy @@ -38,6 +39,20 @@ from ..plot import plot_residue_data, plot_corrcoef, plot_residue_data_logo from ..util import check_dir, write_PDB, write_pymol_script, sparse_corrcoef, get_traj_info +def _spawn_pmap(func, *iterables, num_cpus=None, desc=None): + """Parallel map using multiprocessing with 'spawn' (macOS-safe).""" + iterables = [list(it) for it in iterables] + if not iterables: + return [] + n = len(iterables[0]) + if any(len(it) != n for it in iterables): + raise ValueError("All iterables must have the same length") + if n == 0: + return [] + args_iterable = list(zip(*iterables)) + ctx = get_context("spawn") + with ctx.Pool(processes=num_cpus) as pool: + return list(tqdm(pool.starmap(func, args_iterable), total=n, desc=desc)) class LipidInteraction: def __init__(self, trajfile_list, cutoffs=[0.475, 0.7], lipid="CHOL", topfile_list=None, lipid_atoms=None, @@ -364,15 +379,31 @@ def collect_residue_contacts(self): "Trajectory {} contains {} residues whereas trajectory {} contains {} residues".format( traj_idx, len(traj_info["protein_residue_id"]), traj_idx - 1, len(self._protein_residue_id)) ncol_per_protein = len(traj_info["lipid_residue_atomid_list"]) * traj.n_frames + # Pre-flatten all lipid atom indices and build a boundary index so we can + # issue ONE md.compute_distances call per residue (instead of n_lipids calls). 
+ flat_lipid_atoms = np.concatenate(traj_info["lipid_residue_atomid_list"]) + lipid_sizes = np.array([len(a) for a in traj_info["lipid_residue_atomid_list"]]) + lipid_boundaries = np.concatenate([[0], np.cumsum(lipid_sizes)]) + n_lipids = len(traj_info["lipid_residue_atomid_list"]) for protein_idx in np.arange(self._nprot, dtype=int): - for residue_id, residue_atom_indices in enumerate( - traj_info["protein_residue_atomid_list"][protein_idx]): - # calculate interaction per residue - dist_matrix = np.array([np.min( - md.compute_distances(traj, np.array(list(product(residue_atom_indices, lipid_atom_indices))), - periodic=True, opt=True), - axis=1) for lipid_atom_indices in traj_info["lipid_residue_atomid_list"]]) + for residue_id, residue_atom_indices in enumerate( + tqdm(traj_info["protein_residue_atomid_list"][protein_idx], + desc=" Traj {:d} prot {:d} residues".format(traj_idx, protein_idx), + leave=False)): + # Single compute_distances call for all lipid atoms at once. + # Shape: (n_frames, n_residue_atoms * n_flat_lipid_atoms) + all_pairs = np.array(list(product(residue_atom_indices, flat_lipid_atoms))) + all_dists = md.compute_distances(traj, all_pairs, periodic=True, opt=True) + # all_dists shape: (n_frames, n_res_atoms * n_flat_lipid_atoms) + # Reduce to (n_frames, n_flat_lipid_atoms) by taking min over residue atoms + n_res_atoms = len(residue_atom_indices) + n_flat = len(flat_lipid_atoms) + # reshape: (n_frames, n_res_atoms, n_flat_lipid_atoms) → min over axis 1 + per_atom_dists = all_dists.reshape(traj.n_frames, n_res_atoms, n_flat).min(axis=1) + # (n_frames, n_flat_lipid_atoms) → per lipid min using boundaries + # np.minimum.reduceat operates along axis=1 (columns = lipid atoms) + dist_matrix = np.minimum.reduceat(per_atom_dists, lipid_boundaries[:-1], axis=1).T + # dist_matrix shape: (n_lipids, n_frames) — same as before contact_low, frame_id_set_low, lipid_id_set_low = cal_contact_residues(dist_matrix, self._cutoffs[0]) contact_high, _, _ = cal_contact_residues(dist_matrix, self._cutoffs[1])
self._contact_residues_high[residue_id].append(contact_high) @@ -561,7 +592,7 @@ def compute_residue_lipidcount(self, residue_id=None): if len(self._lipid_count[residue_id]) > 0 else 0 for residue_id in self._protein_residue_id] if len(selected_residue_id) == 1: - self._lipid_count[residue_id] + return self._lipid_count[residue_id] else: return [self._lipid_count[residue_id] for residue_id in selected_residue_id] @@ -664,7 +695,7 @@ def compute_residue_koff(self, residue_id=None, nbootstrap=10, initial_guess=[1. self._check_calculation("Duration", self.compute_residue_duration) if plot_data: - koff_dir = check_dir(save_dir, "Reisidue_koffs_{}".format(self._lipid)) if save_dir is not None \ + koff_dir = check_dir(save_dir, "Residue_koffs_{}".format(self._lipid)) if save_dir is not None \ else check_dir(self._save_dir, "Residue_koffs_{}".format(self._lipid)) if len(set(self._residue_list)) != len(self._residue_list): residue_name_set = ["{}_ResidueID{}".format(residue, residue_id) for residue, residue_id in @@ -695,7 +726,7 @@ def compute_residue_koff(self, residue_id=None, nbootstrap=10, initial_guess=[1. 
else: fn_set = [False for dummy in selected_residue_id] - returned_values = p_map(partial(calculate_koff_wrapper, t_total=t_total, timestep=timestep, nbootstrap=nbootstrap, + returned_values = _spawn_pmap(partial(calculate_koff_wrapper, t_total=t_total, timestep=timestep, nbootstrap=nbootstrap, initial_guess=initial_guess, plot_data=plot_data, timeunit=self._timeunit, fig_close=fig_close), [np.concatenate(self._duration[residue_id]) for residue_id in selected_residue_id], @@ -1139,7 +1170,7 @@ def compute_site_koff(self, binding_site_id=None, nbootstrap=10, initial_guess=[ fn_set = [os.path.join(BS_dir, f"BS_id{bs_id}.{fig_format}") for bs_id in selected_bs_id] else: fn_set = [False for dummy in selected_bs_id] - returned_values = p_map(partial(calculate_koff_wrapper, t_total=t_total, timestep=timestep, nbootstrap=nbootstrap, + returned_values = _spawn_pmap(partial(calculate_koff_wrapper, t_total=t_total, timestep=timestep, nbootstrap=nbootstrap, initial_guess=initial_guess, plot_data=plot_data, timeunit=self._timeunit, fig_close=fig_close), [np.concatenate(self._duration_BS[bs_id]) for bs_id in selected_bs_id], @@ -1167,214 +1198,134 @@ def compute_site_koff(self, binding_site_id=None, nbootstrap=10, initial_guess=[ [self._res_time_BS[bs_id] for bs_id in selected_bs_id] def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="auto", pose_format="gro", - score_weights=None, kde_bw=0.15, pca_component=0.90, plot_rmsd=True, save_dir=None, - eps=None, min_samples=None, metric="euclidean", - fig_close=False, fig_format="pdf", num_cpus=None): + score_weights=None, kde_bw=0.15, pca_component=0.90, plot_rmsd=True, save_dir=None, + eps=None, min_samples=None, metric="euclidean", + fig_close=False, fig_format="pdf", num_cpus=None): r"""Analyze bound poses for binding sites. - - This function can find representative bound poses, cluster the bound poses and calculate pose RMSD for - binding sites. 
- - If ``n_top_poses`` is an integer larger than 0, this method will find the representative bound poses for the specified - binding sites. To do so, it evaluates all the bound poses in a binding site using a density-based scoring function - and ranks the poses using based on the scores. The scoring function is defined as: - - .. math:: - \text { score }=\sum_{i} W_{i} \cdot \hat{f}_{i, H}(D) - - where :math:`W_{i}` is the weight given to atom i of the lipid molecule, H is the bandwidth and - :math:`\hat{f}_{i, H}(D)` is a multivariate kernel density etimation of the position of atom i in the specified - binding site. :math:`\hat{f}_{i, H}(D)` is calculated from all the bound lipid poses in that binding site. - The position of atom i is a `p`-variant vector, :math:`\left[D_{i 1}, D_{i 2}, \ldots, D_{i p}\right]` where - :math:`D_{i p}` is the minimum distance to the residue `p` of the binding site. The multivariant kernel density - is estimated by `KDEMultivariate - `_ - provided by Statsmodels. Higher weights can be given to e.g. the headgroup atoms of phospholipids, to generate - better defined binding poses, but all lipid atoms are weighted equally by default. The use of relative positions - of lipid atoms in their binding site makes the analysis independent of the conformational changes in the rest - part of the protein. - - If ``n_clusters`` is given an integer larger than 0, this method will cluster the lipid bound poses in the specified - binding site using `KMeans `_ - provided by scikit. The KMeans cluster separates the samples into `n` clusters of equal variances, via minimizing - the `inertia`, which is defined as: - - .. math:: - \sum_{i=0}^{n} \min _{u_{i} \in C}\left(\left\|x_{i}-u_{i}\right\|^{2}\right) - - where :math:`u_{i}` is the `centroid` of cluster i. KMeans scales well with large dataset but performs poorly - with clusters of varying sizes and density, which are often the case for lipid poses in a binding site. 
- - If ``n_clusters`` is set to `auto`, this method will cluster the bound poses using a density-based cluster - `DBSCAN `_ provided by scikit. - DBSCAN finds clusters of core samples of high density. A sample point is a core sample if at least `min_samples` - points are within distance :math:`\varepsilon` of it. A cluster is defined as a set of sample points that are - mutually density-connected and density-reachable, i.e. there is a path - :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` where each :math:`p_{i+1}` is within distance - :math:`\varepsilon` of :math:`p_{i}` for any two p in the two. The values of `min_samples` and :math:`\varepsilon` - determine the performance of this cluster. If None, `min_samples` takes the value of 2 * ndim. If - :math:`\varepsilon` is None, it is set as the value at the knee of the k-distance plot. - - For writing out the cluster poses, this method will randomly select one pose from each cluster in the case of - using KMeans or one from the core samples of each cluster when DBSCAN is used, and writes the selected lipid - pose with the protein conformation to which it binds using MDTraj, in the provided pose format. - - The root mean square deviation (RMSD) of a lipid bound pose in a binding site is calculated from the relative - position of the pose in the binding site compared to the average position of the bound poses. Thus, the pose - RMSD is defined as: - - .. math:: - RMSD=\sqrt{\frac{\sum_{i=0}^{N} \sum_{j=0}^{M}\left(D_{i j}-\bar{D}_{j}\right)^{2}}{N}} - - where :math:`D_{i j}` is the distance of atom `i` to the residue `j` in the binding site; :math:`\bar{D}_{j}` is the - average distance of atom `i` from all bound poses in the binding site to residue `j`; `N` is the number of atoms in - the lipid molecule and `M` is the number of residues in the binding site. - - Parameters - ---------- - binding_site_id : int or list of int, default=None - The binding site ID used in PyLipID. 
This ID is the index in the binding site node list that is - calculated by the method ``compute_binding_nodes``. The ID of the N-th binding site is (N-1). If None, - the contact duration of all binding sites are calculated. - - n_top_poses : int, default=3 - Number of representative bound poses written out for the selected binding site. - - n_clusters : int or 'auto' - Number of clusters to form for bound poses of the selected binding site. If ``n_clusters`` is set to 'auto'`, the - density-based clusterer DBSCAN will be used. If ``n_clusters`` is given a non-zero integer, KMeans is used. - - pose_format : str, default="gro" - The coordinate format the representative poses and clsutered poses are saved with. Support the formats - that are included in MDtraj.save(). - - score_weights : dict or None, default=None - The weights given to atoms in the scoring function for finding the representative bound poses. It should in - the format of a Python dictionary {atom name: weight}. The atom name should be consisten with the topology. - By default, all atoms in the lipid molecule are weighted equally. - - kde_bw : scalar, default=0.15 - The bandwidth for the Gaussian kernel. Used in the density estimation of the lipid atom coordinates in the binding - site. Used by the function - `KDEMultivariate `_ . - - pca_component : int, float or ‘mle’, default=0.90 - The PCA used to decrease the dimensions of lipid atom coordinates. The coordinate of a lipid atom in - the binding site is expressed as a distance vector of the minimum distances to the residues in that binding site, - i.e. :math:`[D_{i 1}, D_{i 2}, .., D_{i p}]`. This can be in high dimensions. Hence, PCA is used on this distance - vector prior to calculation of the density. This PCA is carried out by - `PCA `_ in sci-kit. - - plot_rmsd : bool, default=True - Plot the binding site RMSDs in a violinplot. 
- - eps : float or None, default=None - The minimum neighbour distance used by - `DBSCAN `_ - if None, the value will determined from as the elbow point of the sorted minimum neighbour distance - of all the data points. - - min_samples : int or None, default=None - The minimum number of samples to be considered as core samples used by - `DBSCAN `_ . - If None, the value will be automatically determined based on the size of data. - - metric : str, default='euclidean' - The metric used to calculated neighbour distance used by - `DBSCAN `_ . - Default is the Euclidean distance. - - fig_close : bool, default=False - This parameter control whether to close the plotted figures using plt.close(). It can save memory if - many figures are generated. - - fig_format : str, default="pdf" - Figure format. Support formats included in matplotlib.pyplot. - - num_cpus : int or None default=None - The number of CPUs used for the tasks of ranking the poses and clustering poses. Python multiprocessing deployed by - `p_tqdm `_ is used to speed up these calculations. - - save_dir : str or None, default=None - The root directory for saving the pose analysis results. If None, the root directory set at the initiation of - ``LipidInteraction`` will be used. The representative poses and clustered poses will be stored in the directory - of Bound_Poses_{lipid} under the root directory. - - Returns - ------- - pose_pool : dict - Coordinates of the bound poses for the selected binding sites stored in a python dictionary - {binding_site_id: pose coordinates}. The poses coordinates include lipid coordinates and those of the receptor - at the time the pose was bound. The pose coordinates are stored in a - `mdtraj.Trajectory `_ object. - - rmsd_data : pandas.DataFrame - Bound poses RMSDs are stored by columns with column name of binding site id. - - See Also - -------- - pylipid.func.collect_bound_poses - Collect bound pose coordinates from trajectories. 
- pylipid.func.vectorize_poses - Convert bound poses into distance vectors. - pylipid.func.calculate_scores - Score the bound poses based on the probability density function of the position of lipid atoms - pylipid.func.analyze_pose_wrapper - A wrapper function that ranks poses, clusters poses and calculates pose RMSD - + + (Docstring unchanged — omitted here for brevity) """ + # Ensure required upstream steps are done self._check_calculation("Binding Site ID", self.compute_binding_nodes, print_data=False) - + + # Where to save pose outputs pose_dir = check_dir(save_dir, "Bound_Poses_{}".format(self._lipid)) if save_dir is not None \ else check_dir(self._save_dir, "Bound_Poses_{}".format(self._lipid)) - if "Binding Site RMSD" in self.dataset.columns: - pose_rmsd_per_residue = self.dataset.columns["Binding Site Pose RMSD"] + + # Initialize/extend per-residue RMSD column + if "Binding Site Pose RMSD" in self.dataset.columns: + pose_rmsd_per_residue = self.dataset["Binding Site Pose RMSD"].values else: pose_rmsd_per_residue = np.zeros(self._nresi_per_protein) + + # Which binding sites to process if binding_site_id is not None: selected_bs_id = np.atleast_1d(binding_site_id) else: selected_bs_id = np.arange(len(self._node_list), dtype=int) - - # store bound lipid poses + + # Map of selected binding sites -> node lists selected_bs_map = {bs_id: self._node_list[bs_id] for bs_id in selected_bs_id} - pose_traj, pose_info = collect_bound_poses(selected_bs_map, self._contact_residues_low, self._trajfile_list, - self._topfile_list, self._lipid, self._protein_ref, self._lipid_ref, - stride=self._stride, nprot=self._nprot) + + # --- Heartbeat: announce pose collection -------------------------------- + print(f"[PyLipID] Gathering bound poses for analysis…") + print(f"[PyLipID] Binding sites to analyse: {len(selected_bs_id)}") + + # Collect bound poses and metadata + pose_traj, pose_info = collect_bound_poses( + selected_bs_map, + self._contact_residues_low, + 
self._trajfile_list, + self._topfile_list, + self._lipid, + self._protein_ref, + self._lipid_ref, + stride=self._stride, + nprot=self._nprot + ) + + # Quick per-site pose counts (helps reassure users on large jobs) + for bs_id in selected_bs_id: + try: + n_poses = len(pose_traj[bs_id]) + except Exception: + n_poses = 0 + print(f"[PyLipID] - Binding Site {bs_id}: {n_poses} poses") + + # Build atom index lists used by the analyzers protein_atom_indices = [[atom.index for atom in residue.atoms] for residue in self._protein_ref.top.residues] lipid_atom_indices = [self._protein_ref.n_atoms + atom_idx for atom_idx in np.arange(self._lipid_ref.n_atoms)] + + # Atom weights (default 1.0 each; allow user override by name) atom_weights = {atom_idx: 1 for atom_idx in np.arange(self._lipid_ref.n_atoms)} if score_weights is not None: translate = {atom_idx: score_weights[self._lipid_ref.top.atom(atom_idx).name] for atom_idx in np.arange(self._lipid_ref.n_atoms) if self._lipid_ref.top.atom(atom_idx).name in score_weights.keys()} atom_weights.update(translate) - + + RMSD_set = {} + + # If n_top_poses == 0, we skip the heavy scoring/clustering entirely if n_top_poses > 0: - # multiprocessing wrapped under p_tqdm - rmsd_set = p_map(partial(analyze_pose_wrapper, protein_atom_indices=protein_atom_indices, - lipid_atom_indices=lipid_atom_indices, n_top_poses=n_top_poses, - pose_dir=pose_dir, atom_weights=atom_weights, kde_bw=kde_bw, - pca_component=pca_component, pose_format=pose_format, n_clusters=n_clusters, - eps=eps, min_samples=min_samples, metric=metric, - trajfile_list=self._trajfile_list), - selected_bs_id, [pose_traj[bs_id] for bs_id in selected_bs_id], - [self._node_list[bs_id] for bs_id in selected_bs_id], - [pose_info[bs_id] for bs_id in selected_bs_id], num_cpus=num_cpus, desc="ANALYZE BOUND POSES") - RMSD_set = {} + # --- Heartbeat: announce the heavy parallel step -------------------- + # Number of workers actually used + import os + n_workers = num_cpus if 
(isinstance(num_cpus, int) and num_cpus > 0) else (os.cpu_count() or 1) + n_workers = min(n_workers, max(1, len(selected_bs_id))) + print(f"[PyLipID] Starting multiprocessing for bound pose scoring/clustering… " + f"(sites={len(selected_bs_id)}, workers={n_workers})") + #print(f"[PyLipID] Reducing num_cpus or set n_top_poses=0 / n_clusters=0 to speed up.") + + # Prepare the task closure + task = partial(analyze_pose_wrapper, + protein_atom_indices=protein_atom_indices, + lipid_atom_indices=lipid_atom_indices, + n_top_poses=n_top_poses, + pose_dir=pose_dir, + atom_weights=atom_weights, + kde_bw=kde_bw, + pca_component=pca_component, + pose_format=pose_format, + n_clusters=n_clusters, + eps=eps, + min_samples=min_samples, + metric=metric, + trajfile_list=self._trajfile_list) + + # Run in parallel with a visible tqdm progress bar + print("[PyLipID] Scoring and clustering bound poses… this may take some time.") + rmsd_set = _spawn_pmap(task, + selected_bs_id, + [pose_traj[bs_id] for bs_id in selected_bs_id], + [self._node_list[bs_id] for bs_id in selected_bs_id], + [pose_info[bs_id] for bs_id in selected_bs_id], + num_cpus=num_cpus, + desc="ANALYZE BOUND POSES") + print("[PyLipID] Finished clustering.") + # Collect RMSD output per site and update per-residue RMSD column for bs_id, rmsd in zip(selected_bs_id, rmsd_set): - RMSD_set["Binding Site {}".format(bs_id)] = rmsd - pose_rmsd_per_residue[self._node_list[bs_id]] = np.mean(RMSD_set["Binding Site {}".format(bs_id)]) - # update dataset + RMSD_set[f"Binding Site {bs_id}"] = rmsd + pose_rmsd_per_residue[self._node_list[bs_id]] = np.mean(RMSD_set[f"Binding Site {bs_id}"]) + + # Update dataset self.dataset["Binding Site Pose RMSD"] = pose_rmsd_per_residue - pose_rmsd_data = pd.DataFrame( - dict([(bs_label, pd.Series(rmsd_set)) for bs_label, rmsd_set in RMSD_set.items()])) - # plot RMSD - if plot_rmsd and n_top_poses > 0: - plot_binding_site_data(pose_rmsd_data, os.path.join(pose_dir, f"Pose_RMSD_violinplot.{fig_format}"), 
- title="{}".format(self._lipid), ylabel="RMSD (nm)", fig_close=fig_close) + pose_rmsd_data = pd.DataFrame(dict([(bs_label, pd.Series(rmsd_set)) + for bs_label, rmsd_set in RMSD_set.items()])) + + # Plot RMSD violin (lazy import to avoid GUI in workers) + if plot_rmsd and n_top_poses > 0 and not pose_rmsd_data.empty: + from ..plot import plot_binding_site_data # lazy import + plot_binding_site_data( + pose_rmsd_data, + os.path.join(pose_dir, f"Pose_RMSD_violinplot.{fig_format}"), + title=f"{self._lipid}", + ylabel="RMSD (nm)", + fig_close=fig_close + ) + return pose_traj, pose_rmsd_data def compute_surface_area(self, binding_site_id=None, radii=None, plot_data=True, save_dir=None, @@ -1463,7 +1414,7 @@ def compute_surface_area(self, binding_site_id=None, radii=None, plot_data=True, selected_bs_id = np.atleast_1d(np.array(binding_site_id, dtype=int)) if binding_site_id is not None \ else np.arange(len(self._node_list), dtype=int) selected_bs_id_map = {bs_id: self._node_list[bs_id] for bs_id in selected_bs_id} - returned_values = p_map(partial(calculate_surface_area_wrapper, binding_site_map=selected_bs_id_map, + returned_values = _spawn_pmap(partial(calculate_surface_area_wrapper, binding_site_map=selected_bs_id_map, nprot=self._nprot, timeunit=self._timeunit, stride=self._stride, dt_traj=self._dt_traj, radii=radii_book), self._trajfile_list, self._topfile_list, np.arange(len(self._trajfile_list), dtype=int), @@ -1481,7 +1432,8 @@ def compute_surface_area(self, binding_site_id=None, radii=None, plot_data=True, surface_area_per_residue[nodes] = surface_area_data["Binding Site {}".format(bs_id)].mean() self.dataset["Binding Site Surface Area"] = surface_area_per_residue # plot surface area - if plot_data: + _bs_cols = [c for c in surface_area_data.columns if c != "Time"] + if plot_data and _bs_cols: if save_dir is not None: surface_area_dir = check_dir(save_dir) else: @@ -1645,7 +1597,7 @@ def save_pymol_script(self, pdb_file, save_dir=None): """ script_dir = 
check_dir(save_dir) if save_dir is not None else \ check_dir(os.path.join(self._save_dir, "Dataset_{}".format(self._lipid))) - data_fname = os.path.join(script_dir, "Dataset.csv".format(self._lipid)) + data_fname = os.path.join(script_dir, "Dataset.csv") if not os.path.isfile(data_fname): self.dataset.to_csv(data_fname, index=False, header=True) write_pymol_script(os.path.join(script_dir, "show_binding_site_info.py"), pdb_file, data_fname, @@ -2008,4 +1960,3 @@ def _check_calculation(self, item, calculation, *args, **kwargs): calculation(*args, **kwargs) print("Finished the calculation of {}.".format(item)) print("#" * 60) - diff --git a/pylipid/func/binding_site.py b/pylipid/func/binding_site.py index a28f3c7..e08d34c 100644 --- a/pylipid/func/binding_site.py +++ b/pylipid/func/binding_site.py @@ -19,8 +19,9 @@ import os from collections import defaultdict from itertools import product -import community import networkx as nx +from networkx.algorithms.community import louvain_communities +from networkx.algorithms.community.quality import modularity as louvain_modularity import numpy as np import pandas as pd import mdtraj as md @@ -81,19 +82,14 @@ def get_node_list(corrcoef, threshold=4): .. [2] Newman, M. E. J., Analysis of weighted networks. Physical Review E 2004, 70 (5), 056131. """ - # TODO: check negative values in corrcoef_matrix. Come up with better solutions. corrcoef[corrcoef < 0.0] = 0.0 # network edge can't take negative values. Residues with # negative correlationsare are forced to separate to different binding sites. graph = nx.Graph(corrcoef) - partition = community.best_partition(graph, weight='weight') - values = [partition.get(node) for node in graph.nodes()] - node_list = [] - for value in range(max(values)): - nodes = [k for k, v in partition.items() if v == value] - if len(nodes) >= threshold: - node_list.append(nodes) + # Use networkx's built-in Louvain (networkx >= 2.6). seed fixes randomness for reproducibility. 
+ communities = louvain_communities(graph, weight='weight', seed=42) + node_list = [sorted(c) for c in communities if len(c) >= threshold] if len(node_list) > 0: - modularity = community.modularity(partition, graph) + modularity = louvain_modularity(graph, communities, weight='weight') else: modularity = None return node_list, modularity @@ -205,16 +201,28 @@ def collect_bound_poses(binding_site_map, contact_residue_index, trajfile_list, contact_BS = [np.unique(np.concatenate( [contact_residue_index[node][list_to_take][frame_idx] for node in nodes])) for frame_idx in range(traj.n_frames)] - for frame_id in range(len(contact_BS)): - if len(contact_BS[frame_id]) > 0: - for lipid_id in contact_BS[frame_id]: - lipid_residue_index = lipid_residue_index_set[int(lipid_id)] - lipid_atom_indices = np.sort( - [atom.index for atom in traj.top.residue(lipid_residue_index).atoms]) - pose_pool[bs_id].append( - [np.copy(traj.xyz[frame_id, np.hstack([protein_indices, lipid_atom_indices])]), - np.copy(traj.unitcell_angles[frame_id]), np.copy(traj.unitcell_lengths[frame_id])]) - pose_info[bs_id].append((traj_idx, protein_idx, lipid_residue_index, frame_id*traj.timestep)) + # Group (frame_id, lipid_res_idx) pairs by lipid molecule so we can + # batch-extract all frames for each lipid with a single numpy slice + # instead of copying coordinates one frame at a time. 
+ lip_frames = defaultdict(list) # {lipid_res_idx: [frame_ids]} + for frame_id, lipids_at_frame in enumerate(contact_BS): + for lipid_id in lipids_at_frame: + lip_res_idx = lipid_residue_index_set[int(lipid_id)] + lip_frames[lip_res_idx].append(frame_id) + for lip_res_idx, frame_ids in lip_frames.items(): + lip_atoms = np.sort( + [atom.index for atom in traj.top.residue(lip_res_idx).atoms]) + combined = np.hstack([protein_indices, lip_atoms]) + frame_arr = np.array(frame_ids) + # Single batch slice for all frames of this lipid + batch_xyz = traj.xyz[frame_arr][:, combined] + batch_angles = traj.unitcell_angles[frame_arr] + batch_lengths = traj.unitcell_lengths[frame_arr] + for i, frame_id in enumerate(frame_ids): + pose_pool[bs_id].append( + [batch_xyz[i], batch_angles[i], batch_lengths[i]]) + pose_info[bs_id].append( + (traj_idx, protein_idx, lip_res_idx, frame_id * traj.timestep)) pose_traj = {} for bs_id in pose_pool.keys(): pose_traj[bs_id] = md.Trajectory([frame[0] for frame in pose_pool[bs_id]], joined_top, @@ -254,12 +262,30 @@ def vectorize_poses(bound_poses, binding_nodes, protein_atom_indices, lipid_atom The distance matrix of the bound poses, in the shape of [n_lipid_atoms, n_poses, n_binding_site_residues] """ - # calculate distance to binding site residues for each lipid atom - dist_per_atom = np.array( - [np.array([md.compute_distances(bound_poses, - list(product([lipid_atom_indices[idx]], protein_atom_indices[node])), - periodic=True).min(axis=1) for node in binding_nodes]).T - for idx in np.arange(len(lipid_atom_indices))]) # shape: [n_lipid_atoms, n_poses, n_BS_residues] + n_lip = len(lipid_atom_indices) + n_nodes = len(binding_nodes) + + # Build one flat pair list covering every (lipid_atom × node_residue_atom) combination. + # block_starts[i] marks where the i-th (lipid_atom, node) block begins so that + # np.minimum.reduceat can collapse each block to its per-residue minimum in one pass. 
+ # This replaces n_lip * n_nodes separate md.compute_distances calls with a single call. + all_pairs = [] + block_starts = [] # length = n_lip * n_nodes + + for lip_atom in lipid_atom_indices: + for node in binding_nodes: + block_starts.append(len(all_pairs)) + for prot_atom in protein_atom_indices[node]: + all_pairs.append((lip_atom, prot_atom)) + + # One call → shape [n_frames, n_total_pairs] + all_dists = md.compute_distances(bound_poses, all_pairs, periodic=True) + + # Per-block minimum → shape [n_frames, n_lip * n_nodes] + min_dists = np.minimum.reduceat(all_dists, block_starts, axis=1) + + # Reshape to [n_frames, n_lip, n_nodes] then transpose → [n_lip, n_frames, n_nodes] + dist_per_atom = min_dists.reshape(bound_poses.n_frames, n_lip, n_nodes).transpose(1, 0, 2) return dist_per_atom @@ -334,6 +360,7 @@ def calculate_scores(dist_matrix, kde_bw=0.15, pca_component=0.90, score_weights except ValueError: print("Pose generation error -- possibly due to insufficient number of binding event.") + return np.array([]) def write_bound_poses(pose_traj, pose_indices, save_dir, pose_prefix="BoundPose", pose_format="pdb"): @@ -524,9 +551,10 @@ def analyze_pose_wrapper(bs_id, bound_poses, binding_nodes, pose_info, pose_dir= _write_pose_info([pose_info[int(pose_idx)] for pose_idx in selected_pose_indices], f"{pose_dir_clustered}/pose_info.txt", trajfile_list) ## calculate RMSD ## - dist_mean = np.mean(lipid_dist_per_pose, axis=0) - pose_rmsds = [rmsd(lipid_dist_per_pose[pose_id], dist_mean) - for pose_id in np.arange(len(lipid_dist_per_pose))] + # lipid_dist_per_pose is already computed above; vectorise over all poses at once + # instead of looping frame by frame. 
+ dist_mean = lipid_dist_per_pose.mean(axis=0) + pose_rmsds = np.sqrt(((lipid_dist_per_pose - dist_mean) ** 2).mean(axis=1)).tolist() return pose_rmsds @@ -647,4 +675,4 @@ def _write_pose_info(selected_pose_info, fn, trajfile_list): f.write("LIPID ID: {}\n".format(info[2])) f.write("TIME : {:8.3f} ps\n".format(info[3])) f.write("\n") - f.write("\n") \ No newline at end of file + f.write("\n") diff --git a/pylipid/func/interactions.py b/pylipid/func/interactions.py index 5c4237b..48d9d1a 100644 --- a/pylipid/func/interactions.py +++ b/pylipid/func/interactions.py @@ -100,7 +100,7 @@ def __init__(self, contact_low, contact_high, dt): self.contact_low = contact_low self.contact_high = contact_high self.dt = dt - self.pointer = [np.zeros_like(self.contact_high[idx], dtype=np.int) + self.pointer = [np.zeros_like(self.contact_high[idx], dtype=int) for idx in range(len(self.contact_high))] return diff --git a/pylipid/func/kinetics.py b/pylipid/func/kinetics.py index 370e6dd..d7da794 100644 --- a/pylipid/func/kinetics.py +++ b/pylipid/func/kinetics.py @@ -216,11 +216,11 @@ def cal_survival_func(durations, t_total, delta_t_list): def _curve_fitting(survival_func, delta_t_list, initial_guess): - """Fit the exponential curve :math:`y=Ae^{-k_1\Delta t}+Be^{-k_2\Delta t}`""" + r"""Fit the exponential curve :math:`y=Ae^{-k_1\Delta t}+Be^{-k_2\Delta t}`""" survival_rates = np.nan_to_num([survival_func[delta_t] for delta_t in delta_t_list]) # y try: popt, pcov = curve_fit(_bi_expo, np.array(delta_t_list), np.array(survival_rates), p0=initial_guess, maxfev=100000) - n_fitted = _bi_expo(np.array(delta_t_list, dtype=np.float128), *popt) + n_fitted = _bi_expo(np.array(delta_t_list, dtype=np.longdouble), *popt) r_squared = 1 - np.sum((np.nan_to_num(n_fitted) - np.nan_to_num(survival_rates))**2)/np.sum((survival_rates - np.mean(survival_rates))**2) ks = [abs(k) for k in popt[:2]] @@ -235,7 +235,7 @@ def _curve_fitting(survival_func, delta_t_list, initial_guess): def _bi_expo(x, k1, k2, 
A, B): - """The exponential curve :math:`y=Ae^{-k_1\Delta t}+Be^{-k_2\Delta t}`""" + r"""The exponential curve :math:`y=Ae^{-k_1\Delta t}+Be^{-k_2\Delta t}`""" return A*np.exp(-k1*x) + B*np.exp(-k2*x) @@ -277,4 +277,4 @@ def _format_koff_text(properties, timeunit): tu, cv_avg[1]) text += "{:14s} = {:.4f}\n".format("$R^2$$_{{boot}}$", np.mean(properties["r_squared_boot_set"])) text += "{:18s} = {:.3f} {:2s}".format("$Res. Time$", properties["res_time"], tu) - return text \ No newline at end of file + return text diff --git a/pylipid/plot/koff.py b/pylipid/plot/koff.py index 727d8ae..d341389 100644 --- a/pylipid/plot/koff.py +++ b/pylipid/plot/koff.py @@ -77,7 +77,7 @@ def plot_koff(durations, delta_t_list, survival_rates, n_fitted, xlabel = r"Duration ($\mu s$)" if text is None: - fig = plt.figure(1, figsize=(5.5, 3.5)) + fig = plt.figure(figsize=(5.5, 3.5)) left, width = 0.13, 0.33 bottom, height = 0.17, 0.68 left_h = left + width + 0.05 @@ -86,7 +86,7 @@ def plot_koff(durations, delta_t_list, survival_rates, n_fitted, axScatter = fig.add_axes(rect_scatter) axHisty = fig.add_axes(rect_histy) else: - fig = plt.figure(1, figsize=(8.2, 3.5)) + fig = plt.figure(figsize=(8.2, 3.5)) left, width = 0.0975, 0.23 bottom, height = 0.17, 0.68 left_h = left + width + 0.0375 @@ -147,4 +147,3 @@ def plot_koff(durations, delta_t_list, survival_rates, n_fitted, plt.close() return - diff --git a/pylipid/plot/plot1d.py b/pylipid/plot/plot1d.py index 0216b33..5ccfefd 100644 --- a/pylipid/plot/plot1d.py +++ b/pylipid/plot/plot1d.py @@ -274,7 +274,7 @@ def adjacent_values(vals, q1, q3): if title is None: title = "" - color_set = _cycle(plt.get_cmap("tab10").colors) + color_set = _cycle(plt.cm.tab10.colors) plt.rcParams["font.size"] = 10 plt.rcParams["font.weight"] = "bold" @@ -346,7 +346,7 @@ def plot_surface_area(surface_area, fig_fn, timeunit=None, fig_close=False): """ from itertools import cycle as _cycle - color_set = _cycle(plt.get_cmap("tab10").colors) + color_set = 
_cycle(plt.cm.tab10.colors) plt.rcParams["font.size"] = 10 plt.rcParams["font.weight"] = "normal" @@ -360,6 +360,8 @@ def plot_surface_area(surface_area, fig_fn, timeunit=None, fig_close=False): row_set = list(set([ind[:2] for ind in surface_area.index])) row_set.sort() col_set = [col for col in surface_area.columns if col != "Time"] + if not col_set or not row_set: + return colors = [next(color_set) for dummy in col_set] fig, axes = plt.subplots(len(row_set), len(col_set), figsize=(len(col_set)*2.4, len(row_set)*1.6), sharex=True, sharey=True) diff --git a/pylipid/plot/plot2d.py b/pylipid/plot/plot2d.py index 8b791d3..c0fb1ce 100644 --- a/pylipid/plot/plot2d.py +++ b/pylipid/plot/plot2d.py @@ -101,6 +101,8 @@ def plot_corrcoef(corrcoef, residue_index, cmap="Reds", vmin=None, vmax=None, vmax = np.percentile(np.unique(np.ravel(corrcoef)), 99) if vmin is None: vmin = np.percentile(np.unique(np.ravel(corrcoef)), 1) + vmin = max(vmin, np.finfo(float).tiny) # LogNorm requires strictly positive vmin + vmax = max(vmax, vmin * 10) # ensure vmax > vmin pcm = ax.pcolormesh(x, y, corrcoef, cmap=cmap, norm=colors.LogNorm(vmax=vmax, vmin=vmin)) fig.colorbar(pcm, ax=ax) @@ -130,4 +132,3 @@ def plot_corrcoef(corrcoef, residue_index, cmap="Reds", vmin=None, vmax=None, plt.close() return - diff --git a/pylipid/util/coordinate.py b/pylipid/util/coordinate.py index 111dec1..e5f927f 100644 --- a/pylipid/util/coordinate.py +++ b/pylipid/util/coordinate.py @@ -41,7 +41,14 @@ def write_PDB(prot_obj, bfactor, pdb_fn, resi_offset=0): resid_set = table.resSeq + resi_offset atom_name_set = table.name resn_set = table.resName - chainID = [chr(65 + int(idx)) for idx in table.chainID] + # table.chainID can be empty string with some topology formats (e.g. 
GROMACS .gro) + # Fall back to chain 'A' (index 0) when the value is missing or non-integer + def _chain_letter(idx): + try: + return chr(65 + int(idx)) + except (ValueError, TypeError): + return 'A' + chainID = [_chain_letter(idx) for idx in table.chainID] atom_residue_map = {atom_idx: prot_obj.top.atom(atom_idx).residue.index for atom_idx in np.arange(prot_obj.n_atoms)} ######## write out coords ########### @@ -66,4 +73,4 @@ def write_PDB(prot_obj, bfactor, pdb_fn, resi_offset=0): f.write(row) f.write("TER") - return \ No newline at end of file + return diff --git a/pylipid/util/corrcoef.py b/pylipid/util/corrcoef.py index 46f9fec..85b4f5f 100644 --- a/pylipid/util/corrcoef.py +++ b/pylipid/util/corrcoef.py @@ -28,12 +28,21 @@ def sparse_corrcoef(A, B=None): A = sparse.vstack((A, B), format='csr') A = A.astype(np.float64) n = A.shape[1] - # Compute the covariance matrix - rowsum = A.sum(1) - centering = rowsum.dot(rowsum.T.conjugate()) / n - C = (A.dot(A.T.conjugate()) - centering) / (n - 1) + # Compute the covariance matrix. + # Use explicit dense conversion to avoid numpy.matrix deprecation and + # version-dependent sparse-dense arithmetic (scipy >= 1.14 / numpy >= 1.25). + rowsum = np.asarray(A.sum(1)).flatten() # shape (n_rows,) ndarray + centering = np.outer(rowsum, rowsum) / n # explicit outer product + # Handle both sparse matrices (from collect_residue_contacts) and + # plain ndarrays (from tests or direct calls). 
+ dot = A.dot(A.T.conjugate()) + if sparse.issparse(dot): + dot = np.asarray(dot.todense()) + else: + dot = np.asarray(dot) + C = (dot - centering) / (n - 1) # The correlation coefficients are given by # C_{i,j} / sqrt(C_{i} * C_{j}) d = np.diag(C) corrcoefs = C / np.sqrt(np.outer(d, d)) - return corrcoefs \ No newline at end of file + return corrcoefs diff --git a/pylipid/util/directory.py b/pylipid/util/directory.py index 265f4d2..0b2f9df 100644 --- a/pylipid/util/directory.py +++ b/pylipid/util/directory.py @@ -21,23 +21,20 @@ __all__ = ["check_dir"] -def check_dir(directory=None, suffix=None, print_info=True): - """Check directory +def check_dir(root, name=None, print_info=True): + """ + Create (if needed) and return a directory under 'root'. + If 'name' is provided, the directory will be 'root/name'. + Set print_info=False to suppress the creation message. + """ + import os - This function will combine the suffix with the given directory (or the current - working directory if None is given) to generate a new directory name, and create a - directory with this name if it does not exit. 
+ path = os.path.join(root, name) if name is not None else root - """ - if directory is None: - directory = os.getcwd() - else: - directory = os.path.abspath(directory) - if suffix is not None: - directory = os.path.join(directory, suffix) - if not os.path.isdir(directory): - os.makedirs(directory) + if not os.path.isdir(path): if print_info: - print("Creating new director: {}".format(directory)) - return directory + print(f"Creating new directory: {path}") + os.makedirs(path, exist_ok=True) + + return path diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..03bb08d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +requires = [ + "setuptools>=61", + "wheel" +] +build-backend = "setuptools.build_meta" + +[project] +name = "pylipid" +dynamic = ["version"] +description = "PyLipID - A Python Library For Lipid Interaction Analysis" +readme = "README.rst" +license = { file = "LICENSE.txt" } +authors = [ + { name = "Wanling Song", email = "wanling.song@hotmail.com" } +] +keywords = ["simulation tools", "network community", "binding site"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Scientific/Engineering :: Chemistry", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Physics", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +requires-python = ">=3.9" +dependencies = [ + "mdtraj>=1.9.7", + "numpy>=1.21", + "pandas>=1.3", + "matplotlib>=3.5", + "networkx>=2.6", + "scipy>=1.7", + "logomaker>=0.8", + "statsmodels>=0.13", + "scikit-learn>=1.0", + "tqdm>=4.50", + "p_tqdm>=1.3", + "kneebow>=0.1", +] + +[project.urls] +Homepage = "https://github.com/pstansfeld/PyLipID" +Documentation = 
"https://pylipid.readthedocs.io" +"Bug Tracker" = "https://github.com/pstansfeld/PyLipID/issues" + +[tool.setuptools.dynamic] +version = { attr = "pylipid._version.__version__" } + +[tool.setuptools.packages.find] +where = ["."] +include = ["pylipid*"] diff --git a/requirements.txt b/requirements.txt index 7dd8414..326be4e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,12 @@ -mdtraj -numpy -pandas -matplotlib>=3.3.3 -networkx -scipy -python-louvain -logomaker -statsmodels -scikit-learn -tqdm -p_tqdm -kneebow +mdtraj>=1.9.7 +numpy>=1.21 +pandas>=1.3 +matplotlib>=3.5 +networkx>=2.6 +scipy>=1.7 +logomaker>=0.8 +statsmodels>=0.13 +scikit-learn>=1.0 +tqdm>=4.50 +p_tqdm>=1.3 +kneebow>=0.1 diff --git a/setup.py b/setup.py deleted file mode 100644 index 5891ad1..0000000 --- a/setup.py +++ /dev/null @@ -1,61 +0,0 @@ -# setup.py -from setuptools import setup, find_packages -import pathlib - -here = pathlib.Path(__file__).parent.resolve() - -# Get the long description from the README file -long_description = (here / 'README.rst').read_text(encoding='utf-8') - -# read version info -import re -VERSIONFILE="pylipid/_version.py" -verstrline = open(VERSIONFILE, "rt").read() -VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" -mo = re.search(VSRE, verstrline, re.M) -if mo: - verstr = mo.group(1) -else: - raise RuntimeError("Unable to find version string in %s." 
% (VERSIONFILE,)) - -# setup -setup( - name='pylipid', - version=verstr, - description='PyLipID - A Python Library For Lipid Interaction Analysis', - long_description=long_description, - long_description_content_type='text/x-rst', - url='https://github.com/wlsong/PyLipID', - author='Wanling Song', - author_email='wanling.song@hotmail.com', - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Chemistry', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Physics', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.9', - ], - keywords='simulation tools, network community, binding site', - python_requires='>=3.6, <4, !=3.8.*', - packages=find_packages(), - install_requires=[ - "mdtraj", - "numpy", - "pandas", - "matplotlib>=3.3.3", - "networkx", - "scipy", - "python-louvain", - "logomaker", - "statsmodels", - "scikit-learn", - "tqdm", - "p_tqdm", - "kneebow" - ] -) diff --git a/tests/api/test_LipidInteraction.py b/tests/api/test_LipidInteraction.py index 8f8981f..4e88949 100644 --- a/tests/api/test_LipidInteraction.py +++ b/tests/api/test_LipidInteraction.py @@ -1,75 +1,157 @@ -import unittest -import os -import shutil -from pylipid.api import LipidInteraction -from pylipid.util import check_dir - -class TestLipidInteraction(unittest.TestCase): - - def test_pylipid(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 0.8] - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = check_dir(os.path.join(file_dir, "test_pylipid")) - li = LipidInteraction(trajfile_list, cutoffs=cutoffs, 
topfile_list=topfile_list, lipid=lipid, - nprot=1, save_dir=self.save_dir) - li.collect_residue_contacts() - - li.compute_residue_duration() - li.compute_residue_duration(10) - li.compute_residue_duration([2,3,4]) - - li.compute_residue_koff(plot_data=False) - li.compute_residue_koff(10) - li.compute_residue_koff([2,4,5,10]) - - li.compute_binding_nodes(threshold=4) - li.compute_binding_nodes(threshold=2, print_data=False) - - li.compute_site_koff(plot_data=False) - li.compute_site_koff(binding_site_id=[1,2,3]) - li.compute_site_koff(binding_site_id=1) - - li.compute_site_duration() - li.compute_site_duration(1) - li.compute_site_duration([0,1]) - - li.analyze_bound_poses() - li.analyze_bound_poses(binding_site_id=[1,2,3]) - li.analyze_bound_poses(binding_site_id=[1,2,3], n_clusters=2) - - li.compute_surface_area() - li.compute_surface_area(binding_site_id=[1,2,3]) - li.compute_surface_area(binding_site_id=[1, 2, 3], radii={"BB": 0.30, "SC1": 0.2}) - - li.write_site_info() - li.write_site_info(sort_residue="Duration") - - li.show_stats_per_traj() - - li.save_data(item="Dataset") - li.save_data(item="Duration") - - li.save_coordinate(item="Residence Time") - li.save_coordinate(item="Duration") - - li.save_pymol_script(pdb_file="../data/receptor.pdb") - - li.plot(item="Duration") - li.plot(item="Residence Time") - - li.plot_logo(item="Residence Time") - li.plot_logo(item="Lipid Count") - - def tearDown(self): - shutil.rmtree(self.save_dir) - -if __name__ == "__main__": - unittest.main() - - +""" +Pytest fixtures for PyLipID tests. +Generates small synthetic MDTraj trajectories in a temporary directory so that +CI can run without any committed trajectory data files. 
+Topology: + - 30 protein residues (ALA/GLY alternating, one chain) + - 20 CHOL lipid molecules (3 atoms each: C1, C2, O1) + - Cubic box, 8 nm side + - 2 trajectory files, 100 frames each +""" +import os +import numpy as np +import pytest +import mdtraj as md + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_AA_NAMES = ["ALA", "GLY", "VAL", "LEU", "ILE", "PRO", "PHE", "TRP", + "MET", "SER"] +_N_PROTEIN_RESIDUES = 30 +_N_LIPID_MOLECULES = 20 +_N_FRAMES = 100 +_BOX_NM = 8.0 # cubic box side length in nm +_N_RUNS = 2 # number of trajectory files + + +def _build_topology(): + """Build a minimal MDTraj topology: protein + CHOL lipids.""" + top = md.Topology() + + # --- protein chain --- + pchain = top.add_chain() + for i in range(_N_PROTEIN_RESIDUES): + resname = _AA_NAMES[i % len(_AA_NAMES)] + res = top.add_residue(resname, pchain) + top.add_atom("CA", md.element.carbon, res) + top.add_atom("CB", md.element.carbon, res) + top.add_atom("N", md.element.nitrogen, res) + top.add_atom("C", md.element.carbon, res) + top.add_atom("O", md.element.oxygen, res) + + # --- lipid chain --- + lchain = top.add_chain() + for _ in range(_N_LIPID_MOLECULES): + res = top.add_residue("CHOL", lchain) + top.add_atom("C1", md.element.carbon, res) + top.add_atom("C2", md.element.carbon, res) + top.add_atom("O1", md.element.oxygen, res) + + return top + + +def _build_trajectory(top, rng, n_frames=_N_FRAMES): + """ + Build a synthetic trajectory. + + Protein residues are placed in a cluster near the box centre. + Lipid molecules are placed nearby so that some will fall within + the contact cutoffs and generate non-trivial interaction data. 
+ """ + n_atoms = top.n_atoms + xyz = np.zeros((n_frames, n_atoms, 3), dtype=np.float32) + + atoms_per_prot_res = 5 # CA CB N C O + atoms_per_lipid = 3 # C1 C2 O1 + n_prot_atoms = _N_PROTEIN_RESIDUES * atoms_per_prot_res + + centre = _BOX_NM / 2.0 + + for frame in range(n_frames): + # protein — small fluctuations around centre + prot_base = rng.uniform(centre - 1.0, centre + 1.0, + size=(n_prot_atoms, 3)).astype(np.float32) + prot_noise = rng.normal(0, 0.05, size=prot_base.shape).astype(np.float32) + xyz[frame, :n_prot_atoms] = prot_base + prot_noise + + # lipids — placed close enough to protein to generate contacts + for lip_idx in range(_N_LIPID_MOLECULES): + start = n_prot_atoms + lip_idx * atoms_per_lipid + # Alternate between near (within cutoff) and far positions + if lip_idx < _N_LIPID_MOLECULES // 2: + offset = rng.uniform(0.3, 0.6, size=(atoms_per_lipid, 3)) + else: + offset = rng.uniform(1.5, 3.0, size=(atoms_per_lipid, 3)) + sign = rng.choice([-1, 1], size=(atoms_per_lipid, 3)) + xyz[frame, start:start + atoms_per_lipid] = ( + centre + sign * offset.astype(np.float32) + ) + + unitcell_lengths = np.full((n_frames, 3), _BOX_NM, dtype=np.float32) + unitcell_angles = np.full((n_frames, 3), 90.0, dtype=np.float32) + + return md.Trajectory(xyz, top, + unitcell_lengths=unitcell_lengths, + unitcell_angles=unitcell_angles) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(scope="session") +def synthetic_traj_dir(tmp_path_factory): + """ + Session-scoped fixture: builds synthetic trajectory files once per test + session and returns a dict with file paths and metadata. 
+ + Returns + ------- + dict with keys: + trajfile_list : list of .xtc paths + topfile_list : list of .gro paths + pdb_file : path to receptor .pdb + lipid : lipid residue name (str) + cutoffs : [lower, upper] cutoffs in nm + """ + rng = np.random.default_rng(42) + top = _build_topology() + base = tmp_path_factory.mktemp("traj_data") + + trajfile_list = [] + topfile_list = [] + + for run_idx in range(_N_RUNS): + run_dir = base / "run{}".format(run_idx + 1) + run_dir.mkdir() + + traj = _build_trajectory(top, rng) + + xtc_path = str(run_dir / "protein_lipids.xtc") + gro_path = str(run_dir / "protein_lipids.gro") + + traj.save_xtc(xtc_path) + traj[0].save_gro(gro_path) + + trajfile_list.append(xtc_path) + topfile_list.append(gro_path) + + # receptor PDB (protein atoms only) + prot_atom_indices = top.select("protein") + prot_traj = traj[0].atom_slice(prot_atom_indices) + pdb_path = str(base / "receptor.pdb") + prot_traj.save_pdb(pdb_path) + + return { + "trajfile_list": trajfile_list, + "topfile_list": topfile_list, + "pdb_file": pdb_path, + "lipid": "CHOL", + "cutoffs": [0.55, 0.8], + } diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4e88949 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,157 @@ +""" +Pytest fixtures for PyLipID tests. + +Generates small synthetic MDTraj trajectories in a temporary directory so that +CI can run without any committed trajectory data files. 
+ +Topology: + - 30 protein residues (ALA/GLY alternating, one chain) + - 20 CHOL lipid molecules (3 atoms each: C1, C2, O1) + - Cubic box, 8 nm side + - 2 trajectory files, 100 frames each +""" + +import os +import numpy as np +import pytest +import mdtraj as md + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_AA_NAMES = ["ALA", "GLY", "VAL", "LEU", "ILE", "PRO", "PHE", "TRP", + "MET", "SER"] +_N_PROTEIN_RESIDUES = 30 +_N_LIPID_MOLECULES = 20 +_N_FRAMES = 100 +_BOX_NM = 8.0 # cubic box side length in nm +_N_RUNS = 2 # number of trajectory files + + +def _build_topology(): + """Build a minimal MDTraj topology: protein + CHOL lipids.""" + top = md.Topology() + + # --- protein chain --- + pchain = top.add_chain() + for i in range(_N_PROTEIN_RESIDUES): + resname = _AA_NAMES[i % len(_AA_NAMES)] + res = top.add_residue(resname, pchain) + top.add_atom("CA", md.element.carbon, res) + top.add_atom("CB", md.element.carbon, res) + top.add_atom("N", md.element.nitrogen, res) + top.add_atom("C", md.element.carbon, res) + top.add_atom("O", md.element.oxygen, res) + + # --- lipid chain --- + lchain = top.add_chain() + for _ in range(_N_LIPID_MOLECULES): + res = top.add_residue("CHOL", lchain) + top.add_atom("C1", md.element.carbon, res) + top.add_atom("C2", md.element.carbon, res) + top.add_atom("O1", md.element.oxygen, res) + + return top + + +def _build_trajectory(top, rng, n_frames=_N_FRAMES): + """ + Build a synthetic trajectory. + + Protein residues are placed in a cluster near the box centre. + Lipid molecules are placed nearby so that some will fall within + the contact cutoffs and generate non-trivial interaction data. 
+ """ + n_atoms = top.n_atoms + xyz = np.zeros((n_frames, n_atoms, 3), dtype=np.float32) + + atoms_per_prot_res = 5 # CA CB N C O + atoms_per_lipid = 3 # C1 C2 O1 + n_prot_atoms = _N_PROTEIN_RESIDUES * atoms_per_prot_res + + centre = _BOX_NM / 2.0 + + for frame in range(n_frames): + # protein — small fluctuations around centre + prot_base = rng.uniform(centre - 1.0, centre + 1.0, + size=(n_prot_atoms, 3)).astype(np.float32) + prot_noise = rng.normal(0, 0.05, size=prot_base.shape).astype(np.float32) + xyz[frame, :n_prot_atoms] = prot_base + prot_noise + + # lipids — placed close enough to protein to generate contacts + for lip_idx in range(_N_LIPID_MOLECULES): + start = n_prot_atoms + lip_idx * atoms_per_lipid + # Alternate between near (within cutoff) and far positions + if lip_idx < _N_LIPID_MOLECULES // 2: + offset = rng.uniform(0.3, 0.6, size=(atoms_per_lipid, 3)) + else: + offset = rng.uniform(1.5, 3.0, size=(atoms_per_lipid, 3)) + sign = rng.choice([-1, 1], size=(atoms_per_lipid, 3)) + xyz[frame, start:start + atoms_per_lipid] = ( + centre + sign * offset.astype(np.float32) + ) + + unitcell_lengths = np.full((n_frames, 3), _BOX_NM, dtype=np.float32) + unitcell_angles = np.full((n_frames, 3), 90.0, dtype=np.float32) + + return md.Trajectory(xyz, top, + unitcell_lengths=unitcell_lengths, + unitcell_angles=unitcell_angles) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(scope="session") +def synthetic_traj_dir(tmp_path_factory): + """ + Session-scoped fixture: builds synthetic trajectory files once per test + session and returns a dict with file paths and metadata. 
+ + Returns + ------- + dict with keys: + trajfile_list : list of .xtc paths + topfile_list : list of .gro paths + pdb_file : path to receptor .pdb + lipid : lipid residue name (str) + cutoffs : [lower, upper] cutoffs in nm + """ + rng = np.random.default_rng(42) + top = _build_topology() + base = tmp_path_factory.mktemp("traj_data") + + trajfile_list = [] + topfile_list = [] + + for run_idx in range(_N_RUNS): + run_dir = base / "run{}".format(run_idx + 1) + run_dir.mkdir() + + traj = _build_trajectory(top, rng) + + xtc_path = str(run_dir / "protein_lipids.xtc") + gro_path = str(run_dir / "protein_lipids.gro") + + traj.save_xtc(xtc_path) + traj[0].save_gro(gro_path) + + trajfile_list.append(xtc_path) + topfile_list.append(gro_path) + + # receptor PDB (protein atoms only) + prot_atom_indices = top.select("protein") + prot_traj = traj[0].atom_slice(prot_atom_indices) + pdb_path = str(base / "receptor.pdb") + prot_traj.save_pdb(pdb_path) + + return { + "trajfile_list": trajfile_list, + "topfile_list": topfile_list, + "pdb_file": pdb_path, + "lipid": "CHOL", + "cutoffs": [0.55, 0.8], + } diff --git a/tests/data/.DS_Store b/tests/data/.DS_Store index 2174845..f5aae60 100644 Binary files a/tests/data/.DS_Store and b/tests/data/.DS_Store differ diff --git a/tests/funcs/test_binding_site.py b/tests/funcs/test_binding_site.py index d8398bf..6227c0e 100644 --- a/tests/funcs/test_binding_site.py +++ b/tests/funcs/test_binding_site.py @@ -1,96 +1,102 @@ -import unittest -import os -import shutil -import numpy as np -from pylipid.func import get_node_list, collect_bound_poses, vectorize_poses, calculate_scores, write_bound_poses -from pylipid.func import calculate_site_surface_area -from pylipid import LipidInteraction - - -class TestBindingSites(unittest.TestCase): - - def setUp(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" 
- cutoffs = [0.55, 0.8] - file_dir = os.path.dirname( os.path.abspath(__file__) ) - self.save_dir = os.path.join(file_dir, "test_binding_site") - self.li = LipidInteraction(trajfile_list, topfile_list, cutoffs=cutoffs, lipid=lipid, - nprot=1, save_dir=self.save_dir) - self.li.collect_residue_contacts(write_log=False, print_log=True) - - def test_get_node_list(self): - corrcoef = self.li.interaction_corrcoef - node_list = get_node_list(corrcoef) - self.assertIsInstance(node_list, list) - - def test_collect_binding_poses(self): - binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(self.li._node_list)} - contact_list = self.li.contact_residues_low - pose_pool = collect_bound_poses(binding_site_map, contact_list, self.li.trajfile_list[0], - self.li.topfile_list[0], self.li.lipid, self.li.stride, self.li.nprot) - self.assertIsInstance(pose_pool, dict) - - def tearDown(self): - shutil.rmtree(self.save_dir) - +""" +Tests for binding site detection, pose collection, scoring and surface area. +Uses the synthetic_traj_dir session fixture from conftest.py — no real data files needed. 
+""" -class TestBindingPoses(unittest.TestCase): - - def setUp(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 0.8] - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = os.path.join(file_dir, "binding_site") - self.li = LipidInteraction(trajfile_list, topfile_list, cutoffs=cutoffs, lipid=lipid, - nprot=1, save_dir=self.save_dir) - self.li.collect_residue_contacts(write_log=False, print_log=True) - node_list = self.li.compute_binding_nodes(print_data=False) - binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(node_list)} - contact_residue_dict = self.li.contact_residues_low - self.pose_pool = collect_bound_poses(binding_site_map, contact_residue_dict, self.li.trajfile_list[0], - self.li.topfile_list[0], self.li.lipid, self.li.stride, self.li.nprot) +import pytest +import numpy as np +from pylipid.func import (get_node_list, collect_bound_poses, vectorize_poses, + calculate_scores, write_bound_poses) +from pylipid.api import LipidInteraction +import shutil - def test_vectorize_poses(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, self.li._lipid_ref) - self.assertEqual(dist_matrix.shape[0], self.li._lipid_ref.n_atoms) - self.assertEqual(dist_matrix.shape[1], len(self.pose_pool[bs_id])) - self.assertEqual(dist_matrix.shape[2], len(self.li._node_list[bs_id])) - return - def test_calculate_scores(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, self.li._lipid_ref) +@pytest.fixture(scope="module") +def li(synthetic_traj_dir, tmp_path_factory): + save_dir = str(tmp_path_factory.mktemp("bs_output")) + instance = LipidInteraction( + 
synthetic_traj_dir["trajfile_list"], + cutoffs=synthetic_traj_dir["cutoffs"], + topfile_list=synthetic_traj_dir["topfile_list"], + lipid=synthetic_traj_dir["lipid"], + nprot=1, + save_dir=save_dir, + ) + instance.collect_residue_contacts() + instance.compute_residue_koff(plot_data=False) + instance.compute_binding_nodes(threshold=2, print_data=False) + yield instance + shutil.rmtree(save_dir, ignore_errors=True) + + +@pytest.fixture(scope="module") +def pose_pool(li, synthetic_traj_dir): + """Collect bound poses for all detected binding sites.""" + if not li._node_list: + pytest.skip("No binding sites detected in synthetic data") + binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(li._node_list)} + return collect_bound_poses( + binding_site_map, + li._contact_residues_low, + li.trajfile_list, + li.topfile_list, + li.lipid, + li._protein_ref, + li._lipid_ref, + stride=li.stride, + nprot=li.nprot, + ) + + +class TestBindingSites: + + def test_get_node_list(self, li): + corrcoef = li.interaction_corrcoef + node_list, modularity = get_node_list(corrcoef, threshold=2) + assert isinstance(node_list, list) + + def test_collect_binding_poses(self, pose_pool): + assert isinstance(pose_pool, dict) + + +class TestBindingPoses: + + def test_vectorize_poses(self, li, pose_pool): + for bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) == 0: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) + assert dist_matrix.shape[0] == li._lipid_ref.n_atoms + assert dist_matrix.shape[1] == len(pose_pool[bs_id]) + assert dist_matrix.shape[2] == len(nodes) + + def test_calculate_scores(self, li, pose_pool): + for bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) == 0: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) scores = calculate_scores(dist_matrix) - 
self.assertEqual(len(scores), pose_traj.n_frames) - scores = calculate_scores(dist_matrix, score_weights={"RHO": 10}) - self.assertEqual(len(scores), pose_traj.n_frames) - - - def test_write_binding_poses(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, self.li._lipid_ref) + assert len(scores) == pose_traj.n_frames + scores_weighted = calculate_scores(dist_matrix, score_weights={"RHO": 10}) + assert len(scores_weighted) == pose_traj.n_frames + + def test_write_binding_poses(self, li, pose_pool, tmp_path): + for bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) == 0: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) scores = calculate_scores(dist_matrix) - num_of_poses = min(5, pose_traj.n_frames) + if len(scores) == 0: + continue + num_of_poses = min(3, pose_traj.n_frames) pose_indices = np.argsort(scores)[::-1][:num_of_poses] - write_bound_poses(pose_traj, pose_indices, self.save_dir, pose_prefix="BSid{}_top".format(bs_id), + write_bound_poses(pose_traj, pose_indices, str(tmp_path), + pose_prefix="BSid{}_top".format(bs_id), pose_format="gro") - - def test_calculate_site_surface_area(self): - binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(self.li._node_list)} - radii_book = {"BB": 0.36, "SC1": 0.33, "SC2": 0.33, "SC3": 0.33} - surface_area= calculate_site_surface_area(binding_site_map, radii_book, self.li.trajfile_list, - self.li.topfile_list, self.li.nprot, self.li.timeunit, - self.li.stride, self.li.dt_traj) - - def tearDown(self): - shutil.rmtree(self.save_dir) - - -if __name__ == "__main__": - unittest.main() + def test_compute_surface_area(self, li): + # Use the high-level API method — calculate_surface_area_wrapper is internal + li.compute_surface_area() diff --git a/tests/funcs/test_clusterer.py b/tests/funcs/test_clusterer.py 
index ef8e62c..0ba87fe 100644 --- a/tests/funcs/test_clusterer.py +++ b/tests/funcs/test_clusterer.py @@ -1,63 +1,96 @@ -import unittest -import os -import shutil +""" +Tests for pose clustering (DBSCAN and KMeans). +Uses the synthetic_traj_dir session fixture from conftest.py — no real data files needed. +""" + +import pytest import numpy as np from sklearn.decomposition import PCA from pylipid.func import collect_bound_poses, vectorize_poses, write_bound_poses from pylipid.func import cluster_DBSCAN, cluster_KMeans -from pylipid import LipidInteraction - -class TestCluster(unittest.TestCase): - - def setUp(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 0.8] - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = os.path.join(file_dir, "binding_site") - self.li = LipidInteraction(trajfile_list, topfile_list, cutoffs=cutoffs, lipid=lipid, - nprot=1, save_dir=self.save_dir) - self.li.collect_residue_contacts(write_log=False, print_log=True) - node_list = self.li.compute_binding_nodes(print_data=False) - binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(node_list)} - contact_residue_dict = self.li.contact_residues_low - self.pose_pool = collect_bound_poses(binding_site_map, contact_residue_dict, self.li.trajfile_list[0], - self.li.topfile_list[0], self.li.lipid, self.li.stride, self.li.nprot) - - def test_cluster_DBSCAN(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, - self.li._lipid_ref) - lipid_dist_per_pose = [dist_matrix[:, pose_id, :].ravel() - for pose_id in np.arange(dist_matrix.shape[1])] - transformed_data = PCA(n_components=0.95).fit_transform(lipid_dist_per_pose) - cluster_labels = cluster_DBSCAN(transformed_data, eps=None, min_samples=None, - 
metric="euclidean") - self.assertEqual(len(cluster_labels), len(lipid_dist_per_pose)) - cluster_id_set = [label for label in np.unique(cluster_labels) if label != -1] - selected_pose_id = [np.random.choice(np.where(cluster_labels == cluster_id)[0], 1)[0] - for cluster_id in cluster_id_set] - write_bound_poses(pose_traj, selected_pose_id, self.save_dir, - pose_prefix="BSid{}_cluster_DBSCAN".format(bs_id), pose_format="gro") - - def test_cluster_KMeans(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, - self.li._lipid_ref) - lipid_dist_per_pose = [dist_matrix[:, pose_id, :].ravel() - for pose_id in np.arange(dist_matrix.shape[1])] - transformed_data = PCA(n_components=0.95).fit_transform(lipid_dist_per_pose) - cluster_labels = cluster_KMeans(transformed_data, n_clusters=5) - self.assertEqual(len(cluster_labels), len(lipid_dist_per_pose)) - cluster_id_set = [label for label in np.unique(cluster_labels) if label != -1] - selected_pose_id = [np.random.choice(np.where(cluster_labels == cluster_id)[0], 1)[0] - for cluster_id in cluster_id_set] - write_bound_poses(pose_traj, selected_pose_id, self.save_dir, - pose_prefix="BSid{}_cluster_KMeans".format(bs_id), pose_format="gro") - - def tearDown(self): - shutil.rmtree(self.save_dir) - - if __name__ == "__main__": - unittest.main() +from pylipid.api import LipidInteraction +import shutil + + +@pytest.fixture(scope="module") +def li_and_poses(synthetic_traj_dir, tmp_path_factory): + save_dir = str(tmp_path_factory.mktemp("cluster_output")) + li = LipidInteraction( + synthetic_traj_dir["trajfile_list"], + cutoffs=synthetic_traj_dir["cutoffs"], + topfile_list=synthetic_traj_dir["topfile_list"], + lipid=synthetic_traj_dir["lipid"], + nprot=1, + save_dir=save_dir, + ) + li.collect_residue_contacts() + li.compute_residue_koff(plot_data=False) + li.compute_binding_nodes(threshold=2, print_data=False) + + if not li._node_list: + 
yield li, {} + else: + binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(li._node_list)} + pose_pool = collect_bound_poses( + binding_site_map, + li._contact_residues_low, + li.trajfile_list, + li.topfile_list, + li.lipid, + li._protein_ref, + li._lipid_ref, + stride=li.stride, + nprot=li.nprot, + ) + yield li, pose_pool + + shutil.rmtree(save_dir, ignore_errors=True) + + +class TestCluster: + + def test_cluster_DBSCAN(self, li_and_poses, tmp_path): + li, pose_pool = li_and_poses + if not li._node_list: + pytest.skip("No binding sites detected in synthetic data") + for bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) < 5: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) + lipid_dist_per_pose = [dist_matrix[:, pose_id, :].ravel() + for pose_id in np.arange(dist_matrix.shape[1])] + transformed_data = PCA(n_components=min(2, len(lipid_dist_per_pose) - 1)).fit_transform(lipid_dist_per_pose) + cluster_labels = cluster_DBSCAN(transformed_data, eps=None, min_samples=None, + metric="euclidean") + assert len(cluster_labels) == len(lipid_dist_per_pose) + cluster_id_set = [label for label in np.unique(cluster_labels) if label != -1] + if cluster_id_set: + selected_pose_id = [np.random.choice(np.where(cluster_labels == cid)[0], 1)[0] + for cid in cluster_id_set] + write_bound_poses(pose_traj, selected_pose_id, str(tmp_path), + pose_prefix="BSid{}_cluster_DBSCAN".format(bs_id), + pose_format="gro") + + def test_cluster_KMeans(self, li_and_poses, tmp_path): + li, pose_pool = li_and_poses + if not li._node_list: + pytest.skip("No binding sites detected in synthetic data") + for bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) < 5: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) + lipid_dist_per_pose = [dist_matrix[:, pose_id, :].ravel() + for 
pose_id in np.arange(dist_matrix.shape[1])] + n_clusters = min(3, len(lipid_dist_per_pose)) + transformed_data = PCA(n_components=min(2, len(lipid_dist_per_pose) - 1)).fit_transform(lipid_dist_per_pose) + cluster_labels = cluster_KMeans(transformed_data, n_clusters=n_clusters) + assert len(cluster_labels) == len(lipid_dist_per_pose) + cluster_id_set = [label for label in np.unique(cluster_labels) if label != -1] + if cluster_id_set: + selected_pose_id = [np.random.choice(np.where(cluster_labels == cid)[0], 1)[0] + for cid in cluster_id_set] + write_bound_poses(pose_traj, selected_pose_id, str(tmp_path), + pose_prefix="BSid{}_cluster_KMeans".format(bs_id), + pose_format="gro") diff --git a/tests/funcs/test_kinetics.py b/tests/funcs/test_kinetics.py index 1bf2610..477582f 100644 --- a/tests/funcs/test_kinetics.py +++ b/tests/funcs/test_kinetics.py @@ -1,41 +1,48 @@ -import unittest -import os -import shutil +""" +Tests for koff and survival function calculations. +Uses the synthetic_traj_dir session fixture from conftest.py — no real data files needed. 
+""" + +import pytest import numpy as np +import shutil from pylipid.func import cal_koff, cal_survival_func -from pylipid import LipidInteraction - -class TestKinetics(unittest.TestCase): +from pylipid.api import LipidInteraction - def setUp(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 1.0] - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = os.path.join(file_dir, "test_kinetics") - self.li = LipidInteraction(trajfile_list, topfile_list, cutoffs=cutoffs, lipid=lipid, - nprot=1, save_dir=self.save_dir) - self.li.collect_residue_contacts() - self.t_total = np.max(self.li._T_total) - self.timestep = np.min(self.li._timesteps) - def test_cal_survivla_function(self): - delta_t_list = np.arange(0, self.t_total, self.timestep) - survival_func = cal_survival_func(np.concatenate(self.li.durations[25]), self.t_total, delta_t_list) - self.assertIsInstance(survival_func, dict) +@pytest.fixture(scope="module") +def li(synthetic_traj_dir, tmp_path_factory): + save_dir = str(tmp_path_factory.mktemp("kinetics_output")) + instance = LipidInteraction( + synthetic_traj_dir["trajfile_list"], + cutoffs=synthetic_traj_dir["cutoffs"], + topfile_list=synthetic_traj_dir["topfile_list"], + lipid=synthetic_traj_dir["lipid"], + nprot=1, + save_dir=save_dir, + ) + instance.collect_residue_contacts() + instance.compute_residue_duration() + yield instance + shutil.rmtree(save_dir, ignore_errors=True) - def test_cal_koff(self): - koff, restime, properties = cal_koff(np.concatenate(self.li.durations[25]), self.t_total, - self.timestep, nbootstrap=20, initial_guess=[1,1,1,1]) - print(koff) - print(restime) - print(properties) - self.assertIsInstance(properties, dict) - def tearDown(self): - shutil.rmtree(self.li.save_dir) +class TestKinetics: + def test_cal_survival_function(self, li): + # Use residue 0 
— guaranteed to exist in synthetic data + t_total = np.max(li._T_total) + timestep = np.min(li._timesteps) + durations = np.concatenate(li._duration[0]) + delta_t_list = np.arange(0, t_total, timestep) + survival_func = cal_survival_func(durations, t_total, delta_t_list) + assert isinstance(survival_func, dict) -if __name__ == "__main__": - unittest.main() + def test_cal_koff(self, li): + t_total = np.max(li._T_total) + timestep = np.min(li._timesteps) + durations = np.concatenate(li._duration[0]) + koff, restime, properties = cal_koff(durations, t_total, timestep, + nbootstrap=5, + initial_guess=[1, 1, 1, 1]) + assert isinstance(properties, dict) diff --git a/tests/plots/test_plot_1d.py b/tests/plots/test_plot_1d.py index 5769387..8184559 100644 --- a/tests/plots/test_plot_1d.py +++ b/tests/plots/test_plot_1d.py @@ -4,7 +4,7 @@ import shutil import pandas as pd from pylipid.util import check_dir -from pylipid.plot import plot_residue_data, plot_residue_data_logos +from pylipid.plot import plot_residue_data, plot_residue_data_logo from pylipid.plot import plot_binding_site_data, plot_surface_area class TestPlot1d(unittest.TestCase): @@ -23,18 +23,18 @@ def test_plot_residue_data(self): ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_1.pdf"), title="This is a test") logos = np.random.choice(letters_base, size=120) - plot_residue_data_logos(residue_index, logos, interactions, gap=100, letter_map=None, + plot_residue_data_logo(residue_index, logos, interactions, gap=100, letter_map=None, color_scheme="chemistry", ylabel="interactions", fn=os.path.join(self.save_dir, "plot_residue_data_logo_1.pdf")) # gap in-between sequence residue_index = np.concatenate([np.arange(120), np.arange(138, 183), np.arange(355, 382)]) - interactions = np.random.random(size=(120+(183-138)+(312-255))) + interactions = np.random.random(size=(120+(183-138)+(382-355))) plot_residue_data(residue_index, interactions, gap=50, ylabel=None, fn=os.path.join(self.save_dir, 
"plot_residue_data_2.pdf"), title="This is a test") logos = np.random.choice(letters_base, size=len(interactions)) - plot_residue_data_logos(residue_index, logos, interactions, gap=100, + plot_residue_data_logo(residue_index, logos, interactions, gap=100, ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_logo_2.pdf")) # two chains in sequences. @@ -44,7 +44,7 @@ def test_plot_residue_data(self): ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_3.pdf"), title="This is a test") logos = np.random.choice(letters_base, size=len(interactions)) - plot_residue_data_logos(residue_index, logos, interactions, gap=100, + plot_residue_data_logo(residue_index, logos, interactions, gap=100, ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_logo_3.pdf")) # check letter mapping @@ -52,7 +52,7 @@ def test_plot_residue_data(self): three_letter_seq = np.random.choice(list(letter_map.keys()), size=23) residue_index = np.arange(23) + 52 interactions = np.random.random(size=23) - plot_residue_data_logos(residue_index, three_letter_seq, interactions, gap=100, + plot_residue_data_logo(residue_index, three_letter_seq, interactions, gap=100, ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_logo_4.pdf"), letter_map=letter_map) @@ -82,4 +82,3 @@ def tearDown(self): if __name__ == "__main__": unittest.main() - diff --git a/tests/util/test_utils.py b/tests/util/test_utils.py index caba7d0..c0eef18 100644 --- a/tests/util/test_utils.py +++ b/tests/util/test_utils.py @@ -1,24 +1,20 @@ +""" +Tests for utility functions: write_pymol_script, rmsd, sparse_corrcoef, get_traj_info. +Uses the synthetic_traj_dir session fixture from conftest.py for trajectory-dependent tests. +write_pymol_script and get_traj_info are tested with synthetic data — no real files needed. 
+""" + import os import unittest import numpy as np import shutil +import pytest import mdtraj as md -from pylipid.util import write_pymol_script -from pylipid.util import check_dir -from pylipid.util import rmsd -from pylipid.util import sparse_corrcoef -from pylipid.util import get_traj_info - -class TestScript(unittest.TestCase): +from pylipid.util import write_pymol_script, check_dir, rmsd, sparse_corrcoef, get_traj_info - def setUp(self): - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = os.path.join(file_dir, "test_util") - check_dir(self.save_dir) - def test_write_pymol_script(self): - write_pymol_script(os.path.join(self.save_dir, "show_bs_info.py"), - "../data/receptor.pdb", "../data/Interactions_CHOL.csv", "CHOL", 10) +class TestRmsdAndCorrcoef(unittest.TestCase): + """Pure-function tests — no trajectory data needed.""" def test_rmsd(self): matrix_a = np.random.random(size=(100, 5)) @@ -26,22 +22,40 @@ def test_rmsd(self): value = rmsd(matrix_a, matrix_b) self.assertIsInstance(value, float) - def test_sparse_corrcoef(self): + def test_sparse_corrcoef_ndarray(self): + """sparse_corrcoef should accept a plain ndarray as well as a sparse matrix.""" A = np.random.normal(size=(4, 500)) corrcoefs = sparse_corrcoef(A) self.assertEqual(len(corrcoefs), len(A)) - def test_get_traj_info(self): - trajfile = "../data/run1/protein_lipids.xtc" - topfile = "../data/run1/protein_lipids.gro" - traj = md.load(trajfile, top=topfile) - traj_info, protein_ref, lipid_ref = get_traj_info(traj, "CHOL") - self.assertIsInstance(protein_ref, md.Trajectory) - self.assertIsInstance(lipid_ref, md.Trajectory) - def tearDown(self): - shutil.rmtree(self.save_dir) +# --------------------------------------------------------------------------- +# Trajectory-dependent tests — use pytest fixtures via a thin wrapper +# --------------------------------------------------------------------------- +@pytest.mark.usefixtures("synthetic_traj_dir") +class TestTrajUtils: + 
"""Tests that require a trajectory. Uses synthetic_traj_dir fixture.""" -if __name__ == "__main__": - unittest.main() + def test_get_traj_info(self, synthetic_traj_dir, tmp_path): + trajfile = synthetic_traj_dir["trajfile_list"][0] + topfile = synthetic_traj_dir["topfile_list"][0] + traj = md.load(trajfile, top=topfile) + traj_info, protein_ref, lipid_ref = get_traj_info(traj, synthetic_traj_dir["lipid"]) + assert isinstance(protein_ref, md.Trajectory) + assert isinstance(lipid_ref, md.Trajectory) + + def test_write_pymol_script(self, synthetic_traj_dir, tmp_path): + # write_pymol_script needs a PDB and a CSV file — generate minimal ones + pdb_file = synthetic_traj_dir["pdb_file"] + + # Minimal Dataset CSV that write_pymol_script can read + csv_file = str(tmp_path / "Interactions_CHOL.csv") + with open(csv_file, "w") as f: + f.write("Residue,Residue ID,Binding Site ID,Residence Time\n") + for i in range(5): + f.write(f"{i}ALA,{i},0,{float(i)}\n") + + script_fn = str(tmp_path / "show_bs_info.py") + write_pymol_script(script_fn, pdb_file, csv_file, "CHOL", 1) + assert os.path.exists(script_fn)