From 7378c3489cd5b57d119ae83306b1b0d66eaa0311 Mon Sep 17 00:00:00 2001 From: Stansfeld Date: Fri, 28 Nov 2025 17:41:28 +0000 Subject: [PATCH 01/24] int fixed --- pylipid/func/interactions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pylipid/func/interactions.py b/pylipid/func/interactions.py index 5c4237b..48d9d1a 100644 --- a/pylipid/func/interactions.py +++ b/pylipid/func/interactions.py @@ -100,7 +100,7 @@ def __init__(self, contact_low, contact_high, dt): self.contact_low = contact_low self.contact_high = contact_high self.dt = dt - self.pointer = [np.zeros_like(self.contact_high[idx], dtype=np.int) + self.pointer = [np.zeros_like(self.contact_high[idx], dtype=int) for idx in range(len(self.contact_high))] return From 697d8217513c2f160f118107b32bf89c6ef6edbd Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Tue, 10 Mar 2026 21:15:23 +0000 Subject: [PATCH 02/24] v1.6.0 --- README.rst | 7 ++--- github/workflows/ci.yml | 54 ++++++++++++++++++++++++++++++++++++ pylipid/_version.py | 2 +- pylipid/api/api.py | 11 ++++---- pylipid/func/binding_site.py | 19 ++++++------- pylipid/func/kinetics.py | 4 +-- pylipid/plot/koff.py | 5 ++-- pylipid/plot/plot1d.py | 4 +-- pylipid/plot/plot2d.py | 3 +- pylipid/util/corrcoef.py | 12 ++++---- requirements.txt | 25 ++++++++--------- setup.py | 34 +++++++++++------------ 12 files changed, 115 insertions(+), 65 deletions(-) create mode 100644 github/workflows/ci.yml diff --git a/README.rst b/README.rst index 1990885..6acc2f4 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,9 @@ - ========================================================== PyLipID - A Python Package For Lipid Interactions Analysis ========================================================== -.. image:: https://travis-ci.com/wlsong/PyLipID.svg?branch=master - :target: https://travis-ci.com/github/wlsong/PyLipID +.. 
image:: https://github.com/pstansfeld/PyLipID/actions/workflows/ci.yml/badge.svg + :target: https://github.com/pstansfeld/PyLipID/actions/workflows/ci.yml .. image:: https://img.shields.io/pypi/v/PyLipID :target: https://pypi.org/project/pylipid/ @@ -37,7 +36,7 @@ We recommend installing PyLipID using the package installer `pip`: Alternatively, PyLipID can be installed from the source code. The package is available for download on Github via: -``git clone https://github.com/wlsong/PyLipID`` +``git clone https://github.com/pstansfeld/PyLipID`` Once the source code is downloaded, enter the source code directory and install the package as follow: diff --git a/github/workflows/ci.yml b/github/workflows/ci.yml new file mode 100644 index 0000000..ee50ca2 --- /dev/null +++ b/github/workflows/ci.yml @@ -0,0 +1,54 @@ +name: CI + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + test: + name: Python ${{ matrix.python-version }} / ${{ matrix.os }} + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + python-version: ["3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest pytest-cov flake8 + + - name: Install package + run: pip install -e . 
+ + - name: Lint with flake8 + run: | + # Fail on syntax errors or undefined names; treat everything else as a warning + flake8 pylipid --count --select=E9,F63,F7,F82 --show-source --statistics + flake8 pylipid --count --exit-zero --max-line-length=120 --statistics + + - name: Run tests + run: | + pytest tests/ -v --cov=pylipid --cov-report=term-missing + + - name: Upload coverage report + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11' + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: .coverage diff --git a/pylipid/_version.py b/pylipid/_version.py index 8add312..66ea783 100644 --- a/pylipid/_version.py +++ b/pylipid/_version.py @@ -14,4 +14,4 @@ # copies or substantial portions of the Software. ############################################################################## -__version__ = '1.5.14' \ No newline at end of file +__version__ = '1.6.0' diff --git a/pylipid/api/api.py b/pylipid/api/api.py index faff1fd..d8d858e 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -561,7 +561,7 @@ def compute_residue_lipidcount(self, residue_id=None): if len(self._lipid_count[residue_id]) > 0 else 0 for residue_id in self._protein_residue_id] if len(selected_residue_id) == 1: - self._lipid_count[residue_id] + return self._lipid_count[residue_id] else: return [self._lipid_count[residue_id] for residue_id in selected_residue_id] @@ -664,7 +664,7 @@ def compute_residue_koff(self, residue_id=None, nbootstrap=10, initial_guess=[1. 
self._check_calculation("Duration", self.compute_residue_duration) if plot_data: - koff_dir = check_dir(save_dir, "Reisidue_koffs_{}".format(self._lipid)) if save_dir is not None \ + koff_dir = check_dir(save_dir, "Residue_koffs_{}".format(self._lipid)) if save_dir is not None \ else check_dir(self._save_dir, "Residue_koffs_{}".format(self._lipid)) if len(set(self._residue_list)) != len(self._residue_list): residue_name_set = ["{}_ResidueID{}".format(residue, residue_id) for residue, residue_id in @@ -1327,8 +1327,8 @@ def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="a pose_dir = check_dir(save_dir, "Bound_Poses_{}".format(self._lipid)) if save_dir is not None \ else check_dir(self._save_dir, "Bound_Poses_{}".format(self._lipid)) - if "Binding Site RMSD" in self.dataset.columns: - pose_rmsd_per_residue = self.dataset.columns["Binding Site Pose RMSD"] + if "Binding Site Pose RMSD" in self.dataset.columns: + pose_rmsd_per_residue = self.dataset["Binding Site Pose RMSD"].values else: pose_rmsd_per_residue = np.zeros(self._nresi_per_protein) if binding_site_id is not None: @@ -1645,7 +1645,7 @@ def save_pymol_script(self, pdb_file, save_dir=None): """ script_dir = check_dir(save_dir) if save_dir is not None else \ check_dir(os.path.join(self._save_dir, "Dataset_{}".format(self._lipid))) - data_fname = os.path.join(script_dir, "Dataset.csv".format(self._lipid)) + data_fname = os.path.join(script_dir, "Dataset.csv") if not os.path.isfile(data_fname): self.dataset.to_csv(data_fname, index=False, header=True) write_pymol_script(os.path.join(script_dir, "show_binding_site_info.py"), pdb_file, data_fname, @@ -2008,4 +2008,3 @@ def _check_calculation(self, item, calculation, *args, **kwargs): calculation(*args, **kwargs) print("Finished the calculation of {}.".format(item)) print("#" * 60) - diff --git a/pylipid/func/binding_site.py b/pylipid/func/binding_site.py index a28f3c7..aa8398f 100644 --- a/pylipid/func/binding_site.py +++ 
b/pylipid/func/binding_site.py @@ -19,8 +19,9 @@ import os from collections import defaultdict from itertools import product -import community import networkx as nx +from networkx.algorithms.community import louvain_communities +from networkx.algorithms.community.quality import modularity as louvain_modularity import numpy as np import pandas as pd import mdtraj as md @@ -81,19 +82,14 @@ def get_node_list(corrcoef, threshold=4): .. [2] Newman, M. E. J., Analysis of weighted networks. Physical Review E 2004, 70 (5), 056131. """ - # TODO: check negative values in corrcoef_matrix. Come up with better solutions. corrcoef[corrcoef < 0.0] = 0.0 # network edge can't take negative values. Residues with # negative correlationsare are forced to separate to different binding sites. graph = nx.Graph(corrcoef) - partition = community.best_partition(graph, weight='weight') - values = [partition.get(node) for node in graph.nodes()] - node_list = [] - for value in range(max(values)): - nodes = [k for k, v in partition.items() if v == value] - if len(nodes) >= threshold: - node_list.append(nodes) + # Use networkx's built-in Louvain (networkx >= 2.6). seed fixes randomness for reproducibility. 
+ communities = louvain_communities(graph, weight='weight', seed=42) + node_list = [sorted(c) for c in communities if len(c) >= threshold] if len(node_list) > 0: - modularity = community.modularity(partition, graph) + modularity = louvain_modularity(graph, communities, weight='weight') else: modularity = None return node_list, modularity @@ -334,6 +330,7 @@ def calculate_scores(dist_matrix, kde_bw=0.15, pca_component=0.90, score_weights except ValueError: print("Pose generation error -- possibly due to insufficient number of binding event.") + return np.array([]) def write_bound_poses(pose_traj, pose_indices, save_dir, pose_prefix="BoundPose", pose_format="pdb"): @@ -647,4 +644,4 @@ def _write_pose_info(selected_pose_info, fn, trajfile_list): f.write("LIPID ID: {}\n".format(info[2])) f.write("TIME : {:8.3f} ps\n".format(info[3])) f.write("\n") - f.write("\n") \ No newline at end of file + f.write("\n") diff --git a/pylipid/func/kinetics.py b/pylipid/func/kinetics.py index 370e6dd..9d392cc 100644 --- a/pylipid/func/kinetics.py +++ b/pylipid/func/kinetics.py @@ -220,7 +220,7 @@ def _curve_fitting(survival_func, delta_t_list, initial_guess): survival_rates = np.nan_to_num([survival_func[delta_t] for delta_t in delta_t_list]) # y try: popt, pcov = curve_fit(_bi_expo, np.array(delta_t_list), np.array(survival_rates), p0=initial_guess, maxfev=100000) - n_fitted = _bi_expo(np.array(delta_t_list, dtype=np.float128), *popt) + n_fitted = _bi_expo(np.array(delta_t_list, dtype=np.longdouble), *popt) r_squared = 1 - np.sum((np.nan_to_num(n_fitted) - np.nan_to_num(survival_rates))**2)/np.sum((survival_rates - np.mean(survival_rates))**2) ks = [abs(k) for k in popt[:2]] @@ -277,4 +277,4 @@ def _format_koff_text(properties, timeunit): tu, cv_avg[1]) text += "{:14s} = {:.4f}\n".format("$R^2$$_{{boot}}$", np.mean(properties["r_squared_boot_set"])) text += "{:18s} = {:.3f} {:2s}".format("$Res. 
Time$", properties["res_time"], tu) - return text \ No newline at end of file + return text diff --git a/pylipid/plot/koff.py b/pylipid/plot/koff.py index 727d8ae..d341389 100644 --- a/pylipid/plot/koff.py +++ b/pylipid/plot/koff.py @@ -77,7 +77,7 @@ def plot_koff(durations, delta_t_list, survival_rates, n_fitted, xlabel = r"Duration ($\mu s$)" if text is None: - fig = plt.figure(1, figsize=(5.5, 3.5)) + fig = plt.figure(figsize=(5.5, 3.5)) left, width = 0.13, 0.33 bottom, height = 0.17, 0.68 left_h = left + width + 0.05 @@ -86,7 +86,7 @@ def plot_koff(durations, delta_t_list, survival_rates, n_fitted, axScatter = fig.add_axes(rect_scatter) axHisty = fig.add_axes(rect_histy) else: - fig = plt.figure(1, figsize=(8.2, 3.5)) + fig = plt.figure(figsize=(8.2, 3.5)) left, width = 0.0975, 0.23 bottom, height = 0.17, 0.68 left_h = left + width + 0.0375 @@ -147,4 +147,3 @@ def plot_koff(durations, delta_t_list, survival_rates, n_fitted, plt.close() return - diff --git a/pylipid/plot/plot1d.py b/pylipid/plot/plot1d.py index 0216b33..0f12870 100644 --- a/pylipid/plot/plot1d.py +++ b/pylipid/plot/plot1d.py @@ -274,7 +274,7 @@ def adjacent_values(vals, q1, q3): if title is None: title = "" - color_set = _cycle(plt.get_cmap("tab10").colors) + color_set = _cycle(plt.cm.tab10.colors) plt.rcParams["font.size"] = 10 plt.rcParams["font.weight"] = "bold" @@ -346,7 +346,7 @@ def plot_surface_area(surface_area, fig_fn, timeunit=None, fig_close=False): """ from itertools import cycle as _cycle - color_set = _cycle(plt.get_cmap("tab10").colors) + color_set = _cycle(plt.cm.tab10.colors) plt.rcParams["font.size"] = 10 plt.rcParams["font.weight"] = "normal" diff --git a/pylipid/plot/plot2d.py b/pylipid/plot/plot2d.py index 8b791d3..c0fb1ce 100644 --- a/pylipid/plot/plot2d.py +++ b/pylipid/plot/plot2d.py @@ -101,6 +101,8 @@ def plot_corrcoef(corrcoef, residue_index, cmap="Reds", vmin=None, vmax=None, vmax = np.percentile(np.unique(np.ravel(corrcoef)), 99) if vmin is None: vmin = 
np.percentile(np.unique(np.ravel(corrcoef)), 1) + vmin = max(vmin, np.finfo(float).tiny) # LogNorm requires strictly positive vmin + vmax = max(vmax, vmin * 10) # ensure vmax > vmin pcm = ax.pcolormesh(x, y, corrcoef, cmap=cmap, norm=colors.LogNorm(vmax=vmax, vmin=vmin)) fig.colorbar(pcm, ax=ax) @@ -130,4 +132,3 @@ def plot_corrcoef(corrcoef, residue_index, cmap="Reds", vmin=None, vmax=None, plt.close() return - diff --git a/pylipid/util/corrcoef.py b/pylipid/util/corrcoef.py index 46f9fec..f09a05b 100644 --- a/pylipid/util/corrcoef.py +++ b/pylipid/util/corrcoef.py @@ -28,12 +28,14 @@ def sparse_corrcoef(A, B=None): A = sparse.vstack((A, B), format='csr') A = A.astype(np.float64) n = A.shape[1] - # Compute the covariance matrix - rowsum = A.sum(1) - centering = rowsum.dot(rowsum.T.conjugate()) / n - C = (A.dot(A.T.conjugate()) - centering) / (n - 1) + # Compute the covariance matrix. + # Use explicit dense conversion to avoid numpy.matrix deprecation and + # version-dependent sparse-dense arithmetic (scipy >= 1.14 / numpy >= 1.25). 
+ rowsum = np.asarray(A.sum(1)).flatten() # shape (n_rows,) ndarray + centering = np.outer(rowsum, rowsum) / n # explicit outer product + C = (np.asarray(A.dot(A.T.conjugate()).todense()) - centering) / (n - 1) # The correlation coefficients are given by # C_{i,j} / sqrt(C_{i} * C_{j}) d = np.diag(C) corrcoefs = C / np.sqrt(np.outer(d, d)) - return corrcoefs \ No newline at end of file + return corrcoefs diff --git a/requirements.txt b/requirements.txt index 7dd8414..326be4e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,12 @@ -mdtraj -numpy -pandas -matplotlib>=3.3.3 -networkx -scipy -python-louvain -logomaker -statsmodels -scikit-learn -tqdm -p_tqdm -kneebow +mdtraj>=1.9.7 +numpy>=1.21 +pandas>=1.3 +matplotlib>=3.5 +networkx>=2.6 +scipy>=1.7 +logomaker>=0.8 +statsmodels>=0.13 +scikit-learn>=1.0 +tqdm>=4.50 +p_tqdm>=1.3 +kneebow>=0.1 diff --git a/setup.py b/setup.py index 5891ad1..1982390 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ description='PyLipID - A Python Library For Lipid Interaction Analysis', long_description=long_description, long_description_content_type='text/x-rst', - url='https://github.com/wlsong/PyLipID', + url='https://github.com/pstansfeld/PyLipID', author='Wanling Song', author_email='wanling.song@hotmail.com', classifiers=[ @@ -36,26 +36,26 @@ 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Physics', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', ], keywords='simulation tools, network community, binding site', - python_requires='>=3.6, <4, !=3.8.*', + python_requires='>=3.9', packages=find_packages(), install_requires=[ - "mdtraj", - "numpy", - "pandas", - "matplotlib>=3.3.3", - "networkx", - "scipy", - "python-louvain", - 
"logomaker", - "statsmodels", - "scikit-learn", - "tqdm", - "p_tqdm", - "kneebow" + "mdtraj>=1.9.7", + "numpy>=1.21", + "pandas>=1.3", + "matplotlib>=3.5", + "networkx>=2.6", + "scipy>=1.7", + "logomaker>=0.8", + "statsmodels>=0.13", + "scikit-learn>=1.0", + "tqdm>=4.50", + "p_tqdm>=1.3", + "kneebow>=0.1", ] ) From 2f2b46b8b2f257f3ceecdee3759fa0fea2258a88 Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Tue, 10 Mar 2026 21:19:10 +0000 Subject: [PATCH 03/24] yml edits --- {github => .github}/workflows/ci.yml | 0 .travis.yml | 15 --------------- 2 files changed, 15 deletions(-) rename {github => .github}/workflows/ci.yml (100%) delete mode 100644 .travis.yml diff --git a/github/workflows/ci.yml b/.github/workflows/ci.yml similarity index 100% rename from github/workflows/ci.yml rename to .github/workflows/ci.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 352700b..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -language: python -cache: pip -python: - - "3.6" - - "3.7" - - "3.8" -before_install: - - pip install -U pip - - pip install codecov -install: - - pip install -r requirements.txt -script: - - python -m unittest discover -s tests -after_success: - - bash <(curl -s https://codecov.io/bash) From 4015f88c5d8a887c8d5a71036f1087798c93b2cb Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Tue, 10 Mar 2026 21:25:27 +0000 Subject: [PATCH 04/24] pip installable --- pyproject.toml | 54 ++++++++++++++++++++++++++++++++++++++++++++ setup.py | 61 -------------------------------------------------- 2 files changed, 54 insertions(+), 61 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3631cef --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["setuptools>=64", "setuptools-scm"] +build-backend = "setuptools.backends.legacy:build" + +[project] +name = "pylipid" +dynamic = ["version"] +description = 
"PyLipID - A Python Library For Lipid Interaction Analysis" +readme = "README.rst" +license = { file = "LICENSE.txt" } +authors = [ + { name = "Wanling Song", email = "wanling.song@hotmail.com" } +] +keywords = ["simulation tools", "network community", "binding site"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Scientific/Engineering :: Chemistry", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Physics", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +requires-python = ">=3.9" +dependencies = [ + "mdtraj>=1.9.7", + "numpy>=1.21", + "pandas>=1.3", + "matplotlib>=3.5", + "networkx>=2.6", + "scipy>=1.7", + "logomaker>=0.8", + "statsmodels>=0.13", + "scikit-learn>=1.0", + "tqdm>=4.50", + "p_tqdm>=1.3", + "kneebow>=0.1", +] + +[project.urls] +Homepage = "https://github.com/pstansfeld/PyLipID" +Documentation = "https://pylipid.readthedocs.io" +"Bug Tracker" = "https://github.com/pstansfeld/PyLipID/issues" + +[tool.setuptools.dynamic] +version = { attr = "pylipid._version.__version__" } + +[tool.setuptools.packages.find] +where = ["."] +include = ["pylipid*"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 1982390..0000000 --- a/setup.py +++ /dev/null @@ -1,61 +0,0 @@ -# setup.py -from setuptools import setup, find_packages -import pathlib - -here = pathlib.Path(__file__).parent.resolve() - -# Get the long description from the README file -long_description = (here / 'README.rst').read_text(encoding='utf-8') - -# read version info -import re -VERSIONFILE="pylipid/_version.py" -verstrline = open(VERSIONFILE, "rt").read() -VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" -mo = re.search(VSRE, verstrline, re.M) -if mo: 
- verstr = mo.group(1) -else: - raise RuntimeError("Unable to find version string in %s." % (VERSIONFILE,)) - -# setup -setup( - name='pylipid', - version=verstr, - description='PyLipID - A Python Library For Lipid Interaction Analysis', - long_description=long_description, - long_description_content_type='text/x-rst', - url='https://github.com/pstansfeld/PyLipID', - author='Wanling Song', - author_email='wanling.song@hotmail.com', - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Chemistry', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Physics', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - ], - keywords='simulation tools, network community, binding site', - python_requires='>=3.9', - packages=find_packages(), - install_requires=[ - "mdtraj>=1.9.7", - "numpy>=1.21", - "pandas>=1.3", - "matplotlib>=3.5", - "networkx>=2.6", - "scipy>=1.7", - "logomaker>=0.8", - "statsmodels>=0.13", - "scikit-learn>=1.0", - "tqdm>=4.50", - "p_tqdm>=1.3", - "kneebow>=0.1", - ] -) From af52946e1462a84faae4d21bf1891bebbe5df941 Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Tue, 10 Mar 2026 21:37:24 +0000 Subject: [PATCH 05/24] potential speed-up --- pylipid/api/api.py | 30 +++++++++++++++++++++++------- pylipid/util/coordinate.py | 11 +++++++++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/pylipid/api/api.py b/pylipid/api/api.py index d8d858e..548a25d 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -364,15 +364,31 @@ def collect_residue_contacts(self): "Trajectory {} contains {} residues whereas trajectory {} contains {} residues".format( traj_idx, len(traj_info["protein_residue_id"]), 
traj_idx - 1, len(self._protein_residue_id))
             ncol_per_protein = len(traj_info["lipid_residue_atomid_list"]) * traj.n_frames
+            # Pre-flatten all lipid atom indices and build a boundary index so we can
+            # issue ONE md.compute_distances call per residue (instead of n_lipids calls).
+            flat_lipid_atoms = np.concatenate(traj_info["lipid_residue_atomid_list"])
+            lipid_sizes = np.array([len(a) for a in traj_info["lipid_residue_atomid_list"]])
+            lipid_boundaries = np.concatenate([[0], np.cumsum(lipid_sizes)])
+            n_lipids = len(traj_info["lipid_residue_atomid_list"])
             for protein_idx in np.arange(self._nprot, dtype=int):
-                for residue_id, residue_atom_indices in enumerate(
-                        traj_info["protein_residue_atomid_list"][protein_idx]):
-                    # calculate interaction per residue
-                    dist_matrix = np.array([np.min(
-                        md.compute_distances(traj, np.array(list(product(residue_atom_indices, lipid_atom_indices))),
-                                             periodic=True, opt=True),
-                        axis=1) for lipid_atom_indices in traj_info["lipid_residue_atomid_list"]])
+                for residue_id, residue_atom_indices in enumerate(tqdm(
+                        traj_info["protein_residue_atomid_list"][protein_idx],
+                        desc=" Traj {:d} prot {:d} residues".format(traj_idx, protein_idx), leave=False)):
+                    # Single compute_distances call for all lipid atoms at once.
+ # Shape: (n_frames, n_residue_atoms * n_flat_lipid_atoms) + all_pairs = np.array(list(product(residue_atom_indices, flat_lipid_atoms))) + all_dists = md.compute_distances(traj, all_pairs, periodic=True, opt=True) + # all_dists shape: (n_frames, n_res_atoms * n_flat_lipid_atoms) + # Reduce to (n_frames, n_flat_lipid_atoms) by taking min over residue atoms + n_res_atoms = len(residue_atom_indices) + n_flat = len(flat_lipid_atoms) + # reshape: (n_frames, n_res_atoms, n_flat_lipid_atoms) → min over axis 1 + per_atom_dists = all_dists.reshape(traj.n_frames, n_res_atoms, n_flat).min(axis=1) + # (n_frames, n_flat_lipid_atoms) → per lipid min using boundaries + # np.minimum.reduceat operates along axis=1 (columns = lipid atoms) + dist_matrix = np.minimum.reduceat(per_atom_dists, lipid_boundaries[:-1], axis=1).T + # dist_matrix shape: (n_lipids, n_frames) — same as before contact_low, frame_id_set_low, lipid_id_set_low = cal_contact_residues(dist_matrix, self._cutoffs[0]) contact_high, _, _ = cal_contact_residues(dist_matrix, self._cutoffs[1]) self._contact_residues_high[residue_id].append(contact_high) diff --git a/pylipid/util/coordinate.py b/pylipid/util/coordinate.py index 111dec1..e5f927f 100644 --- a/pylipid/util/coordinate.py +++ b/pylipid/util/coordinate.py @@ -41,7 +41,14 @@ def write_PDB(prot_obj, bfactor, pdb_fn, resi_offset=0): resid_set = table.resSeq + resi_offset atom_name_set = table.name resn_set = table.resName - chainID = [chr(65 + int(idx)) for idx in table.chainID] + # table.chainID can be empty string with some topology formats (e.g. 
GROMACS .gro) + # Fall back to chain 'A' (index 0) when the value is missing or non-integer + def _chain_letter(idx): + try: + return chr(65 + int(idx)) + except (ValueError, TypeError): + return 'A' + chainID = [_chain_letter(idx) for idx in table.chainID] atom_residue_map = {atom_idx: prot_obj.top.atom(atom_idx).residue.index for atom_idx in np.arange(prot_obj.n_atoms)} ######## write out coords ########### @@ -66,4 +73,4 @@ def write_PDB(prot_obj, bfactor, pdb_fn, resi_offset=0): f.write(row) f.write("TER") - return \ No newline at end of file + return From daf14faacdc1c72c5f26a539f655a6b23e749d0e Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Tue, 10 Mar 2026 21:39:40 +0000 Subject: [PATCH 06/24] Update README.rst --- README.rst | 181 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 147 insertions(+), 34 deletions(-) diff --git a/README.rst b/README.rst index 6acc2f4..9777475 100644 --- a/README.rst +++ b/README.rst @@ -6,59 +6,172 @@ PyLipID - A Python Package For Lipid Interactions Analysis :target: https://github.com/pstansfeld/PyLipID/actions/workflows/ci.yml .. image:: https://img.shields.io/pypi/v/PyLipID :target: https://pypi.org/project/pylipid/ +.. image:: https://img.shields.io/pypi/pyversions/PyLipID + :target: https://pypi.org/project/pylipid/ .. image:: docs/static/pylipid_logo_smallsize.png :align: center -PyLipID is a python package for analyzing lipid interactions with membrane proteins from -Molecular Dynamics Simulations. PyLipID has the following main features, please check out -the tutorials for examples and the documentations for the API functionalities: +PyLipID is a Python package for analysing lipid interactions with membrane proteins from +Molecular Dynamics (MD) simulations. It supports both coarse-grained and all-atom force fields, +works with any trajectory format supported by `MDTraj `_, and is designed +to run from Jupyter notebooks or Python scripts. 
+ +Full documentation and tutorials: `pylipid.readthedocs.io `_ + + +Features +======== - * Detection of binding sites via calculating community structures in the interactions networks. - * Calculation of lipid koff and residence time for interaction with binding sites and residues. - * Analysis of lipid interactions with binding sites and residues using a couple of metrics. - * Generation of representative bound poses for binding sites. - * Analysis of bound poses for binding sites via automated clustering scheme. - * Adoption of a dual-cutoff scheme to overcome the 'rattling in cage' effect of coarse-grained simulations. - * Generation of manuscript-ready figures for analysis. +- **Dual-cutoff contact detection** — overcomes the 'rattling in cage' artefact common in + coarse-grained simulations by defining contact start and end with separate distance thresholds. +- **Binding site detection** — identifies lipid binding sites by computing community structures + in residue interaction networks using the Louvain algorithm. +- **Kinetics** — calculates lipid koff rates and residence times for both individual residues + and binding sites. +- **Interaction metrics** — computes occupancy, lipid count, duration and residence time per + residue and per binding site. +- **Bound pose analysis** — extracts and clusters representative lipid-bound poses for each + binding site. +- **Surface area** — calculates solvent-accessible surface area contributions per residue and + binding site. +- **Visualisation** — generates publication-ready figures and PyMOL scripts for structural + visualisation. -PyLipID can be used from Jupyter (former IPython, recommended), or by writing Python scripts. -The documentaion and tutorials can be found at `pylipid.readthedocs.io `_. 
Installation ============ -We recommend installing PyLipID using the package installer `pip`: +Install from PyPI (recommended):: + + pip install pylipid + +Install from source:: + + git clone https://github.com/pstansfeld/PyLipID + cd PyLipID + pip install . + +For development (editable install):: + + pip install -e . + +**Requirements:** Python >= 3.9. All dependencies are installed automatically by pip. + + +Quick Start +=========== + +.. code-block:: python + + from pylipid.api import LipidInteraction + + # Point to your trajectory and topology files + trajfile_list = ["run1/protein_lipids.xtc", "run2/protein_lipids.xtc"] + topfile_list = ["run1/protein_lipids.gro", "run2/protein_lipids.gro"] -``pip install pylipid`` + # Initialise — dual cutoffs in nm, lipid residue name as in your topology + li = LipidInteraction( + trajfile_list, + topfile_list=topfile_list, + cutoffs=[0.55, 0.8], # [lower, upper] in nm + lipid="CHOL", + nprot=1, + ) -Alternatively, PyLipID can be installed from the source code. The package is available for -download on Github via: + # Collect contacts (the main calculation step) + li.collect_residue_contacts() -``git clone https://github.com/pstansfeld/PyLipID`` + # Compute metrics + li.compute_residue_duration() + li.compute_residue_koff() + li.compute_binding_nodes(threshold=4) + li.compute_site_koff() -Once the source code is downloaded, enter the source code directory and install the package as follow: + # Plot and save + li.plot(item="Residence Time") + li.save_data(item="Dataset") + li.save_pymol_script(pdb_file="receptor.pdb") -``python setup.py install`` +For a full walkthrough including coarse-grained and all-atom examples, see the +`tutorials on readthedocs `_. 
-Citation |DOI for Citing PyEMMA| -================================ +What's New in v1.6.0 +==================== -If you use PyLipID in scientific research, please cite the following paper: :: +This release, maintained by the `Stansfeld Lab `_, +fixes a number of bugs present in v1.5.14 and modernises the package for current +Python and dependency versions. - @article{song_pylipid_2022, - author = {Song, Wanling. and Corey, Robin A. and Ansell, T. Bertie. and - Cassidy, C. Keith. and Horrell, Michael R. and Duncan, Anna L. - and Stansfeld, Phillip J. and Sansom, Mark S.P.}, - title = {PyLipID: A Python package for analysis of protein-lipid interactions from MD simulations}, - journal = {J. Chem. Theory Comput}, - year = {2022}, - url = {https://doi.org/10.1021/acs.jctc.1c00708}, - doi = {10.1021/acs.jctc.1c00708}, - urldate = {2022-02-18}, - } +**Bug fixes** -.. |DOI for Citing PyEMMA| image:: https://img.shields.io/badge/DOI-10.1021/acs.jctc.1c00708-blue +- Fixed off-by-one error in binding site detection that silently dropped the last + binding site on every run. +- Fixed ``calculate_scores`` returning ``None`` on error, causing a downstream + ``TypeError`` in pose analysis. +- Fixed correlation coefficient matrix computation (``sparse_corrcoef``) returning a + ``numpy.matrix`` object incompatible with NetworkX 3.x and newer NumPy (root cause + of the ``IndexError`` reported in issue #33). +- Fixed ``analyze_bound_poses`` referencing a non-existent column name and using + incorrect DataFrame indexing. +- Fixed ``compute_residue_lipidcount`` missing its ``return`` statement for single-residue + queries. +- Fixed ``write_PDB`` crashing with ``ValueError: invalid literal for int()`` when chain + IDs are absent in GROMACS ``.gro`` topologies (issues #29, #26). +- Fixed hardcoded figure number in ``plot_koff`` causing curves to overwrite each other + when called for multiple residues. +- Fixed deprecated ``plt.get_cmap()`` call removed in Matplotlib 3.9. 
+- Fixed ``LogNorm`` crash in ``plot_corrcoef`` when the correlation matrix contains zeros. +- Fixed ``np.float128`` (Linux-only) replaced with portable ``np.longdouble``. +- Fixed typo in output folder name ``Reisidue_koffs`` -> ``Residue_koffs``. + +**Performance** + +- ``collect_residue_contacts`` now issues a single ``md.compute_distances`` call per + residue rather than one per lipid molecule, giving a significant speedup on systems + with large lipid counts. A nested progress bar now shows per-residue progress within + each trajectory so the calculation no longer appears frozen. + +**Dependency changes** + +- Removed ``python-louvain`` dependency; binding site detection now uses the Louvain + implementation built into NetworkX (>= 2.6). Results are now reproducible across + runs (fixed random seed). +- All dependencies now carry minimum version bounds. +- Python < 3.9 (end-of-life) is no longer supported. + +**Packaging** + +- Migrated from ``setup.py`` to ``pyproject.toml``. +- CI migrated from Travis to GitHub Actions (Python 3.9--3.12, Ubuntu and macOS). + + +Citation +======== + +If you use PyLipID in scientific research, please cite: + +.. image:: https://img.shields.io/badge/DOI-10.1021/acs.jctc.1c00708-blue :target: https://doi.org/10.1021/acs.jctc.1c00708 + +:: + + @article{song_pylipid_2022, + author = {Song, Wanling and Corey, Robin A. and Ansell, T. Bertie and + Cassidy, C. Keith and Horrell, Michael R. and Duncan, Anna L. + and Stansfeld, Phillip J. and Sansom, Mark S.P.}, + title = {PyLipID: A Python package for analysis of protein-lipid + interactions from MD simulations}, + journal = {J. Chem. Theory Comput.}, + year = {2022}, + doi = {10.1021/acs.jctc.1c00708}, + url = {https://doi.org/10.1021/acs.jctc.1c00708}, + } + + +Licence +======= + +MIT licence. See `LICENSE.txt `_ for details. 
From 5aecaeac15c931a3ad087ac2044a32af896567e2 Mon Sep 17 00:00:00 2001
From: Phill Stansfeld
Date: Tue, 10 Mar 2026 22:03:16 +0000
Subject: [PATCH 07/24] Update README.rst

---
 README.rst | 148 +++++++++++++++++++----------------------------------
 1 file changed, 53 insertions(+), 95 deletions(-)

diff --git a/README.rst b/README.rst
index 9777475..9855bac 100644
--- a/README.rst
+++ b/README.rst
@@ -1,6 +1,5 @@
-==========================================================
-PyLipID - A Python Package For Lipid Interactions Analysis
-==========================================================
+PyLipID - Analysis of Protein–Lipid Interactions from Molecular Dynamics
+========================================================================
 
 .. image:: https://github.com/pstansfeld/PyLipID/actions/workflows/ci.yml/badge.svg
    :target: https://github.com/pstansfeld/PyLipID/actions/workflows/ci.yml
@@ -10,55 +9,44 @@ PyLipID - A Python Package For Lipid Interactions Analysis
 .. image:: https://img.shields.io/pypi/v/PyLipID
    :target: https://pypi.org/project/pylipid/
 
 .. image:: docs/static/pylipid_logo_smallsize.png
-   :align: center
+    :align: center
 
+PyLipID is a Python package for analysing lipid–protein interactions from
+Molecular Dynamics (MD) simulations. It supports coarse-grained and all-atom
+simulations, handles formats supported by MDTraj, and works in scripts or
+Jupyter notebooks.
 
-PyLipID is a Python package for analysing lipid interactions with membrane proteins from
-Molecular Dynamics (MD) simulations. It supports both coarse-grained and all-atom force fields,
-works with any trajectory format supported by `MDTraj `_, and is designed
-to run from Jupyter notebooks or Python scripts.
-
-Full documentation and tutorials: `pylipid.readthedocs.io `_
-
+Documentation: https://pylipid.readthedocs.io
 
 Features
 ========
 
-- **Dual-cutoff contact detection** — overcomes the 'rattling in cage' artefact common in
-  coarse-grained simulations by defining contact start and end with separate distance thresholds.
-- **Binding site detection** — identifies lipid binding sites by computing community structures - in residue interaction networks using the Louvain algorithm. -- **Kinetics** — calculates lipid koff rates and residence times for both individual residues - and binding sites. -- **Interaction metrics** — computes occupancy, lipid count, duration and residence time per - residue and per binding site. -- **Bound pose analysis** — extracts and clusters representative lipid-bound poses for each - binding site. -- **Surface area** — calculates solvent-accessible surface area contributions per residue and - binding site. -- **Visualisation** — generates publication-ready figures and PyMOL scripts for structural - visualisation. - +- Dual-cutoff contact detection to improve coarse-grained accuracy. +- Binding site detection using NetworkX Louvain community analysis. +- Kinetic parameters: koff and residence times per residue and site. +- Interaction metrics including occupancy, count, duration. +- Representative bound pose extraction and clustering. +- SASA calculations per residue and site. +- Publication-ready plots and PyMOL scripts. Installation ============ -Install from PyPI (recommended):: +From PyPI:: pip install pylipid -Install from source:: +From source:: git clone https://github.com/pstansfeld/PyLipID cd PyLipID pip install . -For development (editable install):: +Editable/development install:: pip install -e . -**Requirements:** Python >= 3.9. All dependencies are installed automatically by pip. - +Requires Python >= 3.9. 
Quick Start =========== @@ -67,92 +55,63 @@ Quick Start from pylipid.api import LipidInteraction - # Point to your trajectory and topology files trajfile_list = ["run1/protein_lipids.xtc", "run2/protein_lipids.xtc"] topfile_list = ["run1/protein_lipids.gro", "run2/protein_lipids.gro"] - # Initialise — dual cutoffs in nm, lipid residue name as in your topology li = LipidInteraction( trajfile_list, topfile_list=topfile_list, - cutoffs=[0.55, 0.8], # [lower, upper] in nm + cutoffs=[0.55, 0.8], lipid="CHOL", nprot=1, ) - # Collect contacts (the main calculation step) li.collect_residue_contacts() - - # Compute metrics li.compute_residue_duration() li.compute_residue_koff() li.compute_binding_nodes(threshold=4) li.compute_site_koff() - # Plot and save - li.plot(item="Residence Time") - li.save_data(item="Dataset") + li.plot("Residence Time") + li.save_data("Dataset") li.save_pymol_script(pdb_file="receptor.pdb") -For a full walkthrough including coarse-grained and all-atom examples, see the -`tutorials on readthedocs `_. - - What's New in v1.6.0 ==================== -This release, maintained by the `Stansfeld Lab `_, -fixes a number of bugs present in v1.5.14 and modernises the package for current -Python and dependency versions. - -**Bug fixes** - -- Fixed off-by-one error in binding site detection that silently dropped the last - binding site on every run. -- Fixed ``calculate_scores`` returning ``None`` on error, causing a downstream - ``TypeError`` in pose analysis. -- Fixed correlation coefficient matrix computation (``sparse_corrcoef``) returning a - ``numpy.matrix`` object incompatible with NetworkX 3.x and newer NumPy (root cause - of the ``IndexError`` reported in issue #33). -- Fixed ``analyze_bound_poses`` referencing a non-existent column name and using - incorrect DataFrame indexing. -- Fixed ``compute_residue_lipidcount`` missing its ``return`` statement for single-residue - queries. 
-- Fixed ``write_PDB`` crashing with ``ValueError: invalid literal for int()`` when chain - IDs are absent in GROMACS ``.gro`` topologies (issues #29, #26). -- Fixed hardcoded figure number in ``plot_koff`` causing curves to overwrite each other - when called for multiple residues. -- Fixed deprecated ``plt.get_cmap()`` call removed in Matplotlib 3.9. -- Fixed ``LogNorm`` crash in ``plot_corrcoef`` when the correlation matrix contains zeros. -- Fixed ``np.float128`` (Linux-only) replaced with portable ``np.longdouble``. -- Fixed typo in output folder name ``Reisidue_koffs`` -> ``Residue_koffs``. - -**Performance** - -- ``collect_residue_contacts`` now issues a single ``md.compute_distances`` call per - residue rather than one per lipid molecule, giving a significant speedup on systems - with large lipid counts. A nested progress bar now shows per-residue progress within - each trajectory so the calculation no longer appears frozen. - -**Dependency changes** - -- Removed ``python-louvain`` dependency; binding site detection now uses the Louvain - implementation built into NetworkX (>= 2.6). Results are now reproducible across - runs (fixed random seed). -- All dependencies now carry minimum version bounds. -- Python < 3.9 (end-of-life) is no longer supported. - -**Packaging** - -- Migrated from ``setup.py`` to ``pyproject.toml``. -- CI migrated from Travis to GitHub Actions (Python 3.9--3.12, Ubuntu and macOS). - +Bug Fixes +--------- +- Corrected binding-site detection off-by-one error. +- Fixed calculate_scores returning None. +- Fixed sparse_corrcoef matrix type issues. +- Fixed DataFrame column/index mismatches in analyze_bound_poses. +- Fixed missing return in compute_residue_lipidcount. +- Fixed write_PDB errors for .gro files lacking chain IDs. +- Fixed figure overwriting in plot_koff. +- Updated deprecated Matplotlib calls. +- Fixed LogNorm crash on zero-containing matrices. +- Replaced np.float128 with np.longdouble. +- Corrected output directory typo. 
+ +Performance +----------- +- collect_residue_contacts now calls md.compute_distances once per residue. +- Nested progress bars implemented. + +Dependencies +------------ +- Removed python-louvain; using NetworkX built-in Louvain. +- Minimum versions added. +- Dropped Python < 3.9. + +Packaging & CI +-------------- +- Migrated to pyproject.toml. +- CI moved from Travis CI to GitHub Actions. Citation ======== -If you use PyLipID in scientific research, please cite: - .. image:: https://img.shields.io/badge/DOI-10.1021/acs.jctc.1c00708-blue :target: https://doi.org/10.1021/acs.jctc.1c00708 @@ -170,8 +129,7 @@ If you use PyLipID in scientific research, please cite: url = {https://doi.org/10.1021/acs.jctc.1c00708}, } - -Licence +License ======= -MIT licence. See `LICENSE.txt `_ for details. +MIT License. See LICENSE.txt. From 05a7ed09c5b4dd637416dfc91d40ee7f4c33ce5e Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Tue, 10 Mar 2026 22:08:25 +0000 Subject: [PATCH 08/24] Update pyproject.toml --- pyproject.toml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3631cef..03bb08d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,9 @@ [build-system] -requires = ["setuptools>=64", "setuptools-scm"] -build-backend = "setuptools.backends.legacy:build" +requires = [ + "setuptools>=61", + "wheel" +] +build-backend = "setuptools.build_meta" [project] name = "pylipid" From 2e7ec8544311ceae7f95eb7acf177a6e79216f8d Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Tue, 10 Mar 2026 22:10:29 +0000 Subject: [PATCH 09/24] Update ci.yml --- .github/workflows/ci.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee50ca2..80717df 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,11 +25,15 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - cache: 'pip' + cache: pip 
+
+      # 🔧 NEW: ensure modern packaging tools before installing anything
+      - name: Upgrade pip, setuptools, and wheel
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
 
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
           pip install -r requirements.txt
           pip install pytest pytest-cov flake8

From 8e889e3a60dcac31914f8607ea444af6327a88ad Mon Sep 17 00:00:00 2001
From: Phill Stansfeld
Date: Wed, 11 Mar 2026 15:12:11 +0000
Subject: [PATCH 10/24] update

---
 pylipid/__init__.py              |   1 +
 pylipid/func/kinetics.py         |   4 +-
 tests/funcs/test_binding_site.py | 182 ++++++++++++++++---------------
 tests/funcs/test_clusterer.py    | 151 +++++++++++++++----------
 tests/funcs/test_kinetics.py     |  73 +++++++------
 tests/plots/test_plot_1d.py      |  13 +--
 6 files changed, 235 insertions(+), 189 deletions(-)

diff --git a/pylipid/__init__.py b/pylipid/__init__.py
index e23c929..0a534fd 100644
--- a/pylipid/__init__.py
+++ b/pylipid/__init__.py
@@ -15,3 +15,4 @@
 ##############################################################################
 
 from ._version import __version__
+from .api import LipidInteraction
diff --git a/pylipid/func/kinetics.py b/pylipid/func/kinetics.py
index 9d392cc..d6ce542 100644
--- a/pylipid/func/kinetics.py
+++ b/pylipid/func/kinetics.py
@@ -216,11 +216,11 @@ def cal_survival_func(durations, t_total, delta_t_list):
 
 
 def _curve_fitting(survival_func, delta_t_list, initial_guess):
-    """Fit the exponential curve :math:`y=Ae^{-k_1\Delta t}+Be^{-k_2\Delta t}`"""
+    r"""Fit the exponential curve :math:`y=Ae^{-k_1\Delta t}+Be^{-k_2\Delta t}`"""
     survival_rates = np.nan_to_num([survival_func[delta_t] for delta_t in delta_t_list]) # y
     try:
         popt, pcov = curve_fit(_bi_expo, np.array(delta_t_list), np.array(survival_rates), p0=initial_guess,
                                maxfev=100000)
         n_fitted = _bi_expo(np.array(delta_t_list, dtype=np.longdouble), *popt)
         r_squared = 1 - np.sum((np.nan_to_num(n_fitted) -
np.nan_to_num(survival_rates))**2)/np.sum((survival_rates - np.mean(survival_rates))**2) ks = [abs(k) for k in popt[:2]] @@ -235,7 +235,7 @@ def _curve_fitting(survival_func, delta_t_list, initial_guess): def _bi_expo(x, k1, k2, A, B): - """The exponential curve :math:`y=Ae^{-k_1\Delta t}+Be^{-k_2\Delta t}`""" + r"""The exponential curve :math:`y=Ae^{-k_1\Delta t}+Be^{-k_2\Delta t}`""" return A*np.exp(-k1*x) + B*np.exp(-k2*x) diff --git a/tests/funcs/test_binding_site.py b/tests/funcs/test_binding_site.py index d8398bf..6227c0e 100644 --- a/tests/funcs/test_binding_site.py +++ b/tests/funcs/test_binding_site.py @@ -1,96 +1,102 @@ -import unittest -import os -import shutil -import numpy as np -from pylipid.func import get_node_list, collect_bound_poses, vectorize_poses, calculate_scores, write_bound_poses -from pylipid.func import calculate_site_surface_area -from pylipid import LipidInteraction - - -class TestBindingSites(unittest.TestCase): - - def setUp(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 0.8] - file_dir = os.path.dirname( os.path.abspath(__file__) ) - self.save_dir = os.path.join(file_dir, "test_binding_site") - self.li = LipidInteraction(trajfile_list, topfile_list, cutoffs=cutoffs, lipid=lipid, - nprot=1, save_dir=self.save_dir) - self.li.collect_residue_contacts(write_log=False, print_log=True) - - def test_get_node_list(self): - corrcoef = self.li.interaction_corrcoef - node_list = get_node_list(corrcoef) - self.assertIsInstance(node_list, list) - - def test_collect_binding_poses(self): - binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(self.li._node_list)} - contact_list = self.li.contact_residues_low - pose_pool = collect_bound_poses(binding_site_map, contact_list, self.li.trajfile_list[0], - self.li.topfile_list[0], self.li.lipid, self.li.stride, self.li.nprot) 
- self.assertIsInstance(pose_pool, dict) - - def tearDown(self): - shutil.rmtree(self.save_dir) - +""" +Tests for binding site detection, pose collection, scoring and surface area. +Uses the synthetic_traj_dir session fixture from conftest.py — no real data files needed. +""" -class TestBindingPoses(unittest.TestCase): - - def setUp(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 0.8] - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = os.path.join(file_dir, "binding_site") - self.li = LipidInteraction(trajfile_list, topfile_list, cutoffs=cutoffs, lipid=lipid, - nprot=1, save_dir=self.save_dir) - self.li.collect_residue_contacts(write_log=False, print_log=True) - node_list = self.li.compute_binding_nodes(print_data=False) - binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(node_list)} - contact_residue_dict = self.li.contact_residues_low - self.pose_pool = collect_bound_poses(binding_site_map, contact_residue_dict, self.li.trajfile_list[0], - self.li.topfile_list[0], self.li.lipid, self.li.stride, self.li.nprot) +import pytest +import numpy as np +from pylipid.func import (get_node_list, collect_bound_poses, vectorize_poses, + calculate_scores, write_bound_poses) +from pylipid.api import LipidInteraction +import shutil - def test_vectorize_poses(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, self.li._lipid_ref) - self.assertEqual(dist_matrix.shape[0], self.li._lipid_ref.n_atoms) - self.assertEqual(dist_matrix.shape[1], len(self.pose_pool[bs_id])) - self.assertEqual(dist_matrix.shape[2], len(self.li._node_list[bs_id])) - return - def test_calculate_scores(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = 
vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, self.li._lipid_ref) +@pytest.fixture(scope="module") +def li(synthetic_traj_dir, tmp_path_factory): + save_dir = str(tmp_path_factory.mktemp("bs_output")) + instance = LipidInteraction( + synthetic_traj_dir["trajfile_list"], + cutoffs=synthetic_traj_dir["cutoffs"], + topfile_list=synthetic_traj_dir["topfile_list"], + lipid=synthetic_traj_dir["lipid"], + nprot=1, + save_dir=save_dir, + ) + instance.collect_residue_contacts() + instance.compute_residue_koff(plot_data=False) + instance.compute_binding_nodes(threshold=2, print_data=False) + yield instance + shutil.rmtree(save_dir, ignore_errors=True) + + +@pytest.fixture(scope="module") +def pose_pool(li, synthetic_traj_dir): + """Collect bound poses for all detected binding sites.""" + if not li._node_list: + pytest.skip("No binding sites detected in synthetic data") + binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(li._node_list)} + return collect_bound_poses( + binding_site_map, + li._contact_residues_low, + li.trajfile_list, + li.topfile_list, + li.lipid, + li._protein_ref, + li._lipid_ref, + stride=li.stride, + nprot=li.nprot, + ) + + +class TestBindingSites: + + def test_get_node_list(self, li): + corrcoef = li.interaction_corrcoef + node_list, modularity = get_node_list(corrcoef, threshold=2) + assert isinstance(node_list, list) + + def test_collect_binding_poses(self, pose_pool): + assert isinstance(pose_pool, dict) + + +class TestBindingPoses: + + def test_vectorize_poses(self, li, pose_pool): + for bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) == 0: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) + assert dist_matrix.shape[0] == li._lipid_ref.n_atoms + assert dist_matrix.shape[1] == len(pose_pool[bs_id]) + assert dist_matrix.shape[2] == len(nodes) + + def test_calculate_scores(self, li, pose_pool): + for bs_id, 
nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) == 0: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) scores = calculate_scores(dist_matrix) - self.assertEqual(len(scores), pose_traj.n_frames) - scores = calculate_scores(dist_matrix, score_weights={"RHO": 10}) - self.assertEqual(len(scores), pose_traj.n_frames) - - - def test_write_binding_poses(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, self.li._lipid_ref) + assert len(scores) == pose_traj.n_frames + scores_weighted = calculate_scores(dist_matrix, score_weights={"RHO": 10}) + assert len(scores_weighted) == pose_traj.n_frames + + def test_write_binding_poses(self, li, pose_pool, tmp_path): + for bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) == 0: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) scores = calculate_scores(dist_matrix) - num_of_poses = min(5, pose_traj.n_frames) + if len(scores) == 0: + continue + num_of_poses = min(3, pose_traj.n_frames) pose_indices = np.argsort(scores)[::-1][:num_of_poses] - write_bound_poses(pose_traj, pose_indices, self.save_dir, pose_prefix="BSid{}_top".format(bs_id), + write_bound_poses(pose_traj, pose_indices, str(tmp_path), + pose_prefix="BSid{}_top".format(bs_id), pose_format="gro") - - def test_calculate_site_surface_area(self): - binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(self.li._node_list)} - radii_book = {"BB": 0.36, "SC1": 0.33, "SC2": 0.33, "SC3": 0.33} - surface_area= calculate_site_surface_area(binding_site_map, radii_book, self.li.trajfile_list, - self.li.topfile_list, self.li.nprot, self.li.timeunit, - self.li.stride, self.li.dt_traj) - - def tearDown(self): - shutil.rmtree(self.save_dir) - - -if __name__ == 
"__main__": - unittest.main() + def test_compute_surface_area(self, li): + # Use the high-level API method — calculate_surface_area_wrapper is internal + li.compute_surface_area() diff --git a/tests/funcs/test_clusterer.py b/tests/funcs/test_clusterer.py index ef8e62c..0ba87fe 100644 --- a/tests/funcs/test_clusterer.py +++ b/tests/funcs/test_clusterer.py @@ -1,63 +1,96 @@ -import unittest -import os -import shutil +""" +Tests for pose clustering (DBSCAN and KMeans). +Uses the synthetic_traj_dir session fixture from conftest.py — no real data files needed. +""" + +import pytest import numpy as np from sklearn.decomposition import PCA from pylipid.func import collect_bound_poses, vectorize_poses, write_bound_poses from pylipid.func import cluster_DBSCAN, cluster_KMeans -from pylipid import LipidInteraction - -class TestCluster(unittest.TestCase): - - def setUp(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 0.8] - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = os.path.join(file_dir, "binding_site") - self.li = LipidInteraction(trajfile_list, topfile_list, cutoffs=cutoffs, lipid=lipid, - nprot=1, save_dir=self.save_dir) - self.li.collect_residue_contacts(write_log=False, print_log=True) - node_list = self.li.compute_binding_nodes(print_data=False) - binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(node_list)} - contact_residue_dict = self.li.contact_residues_low - self.pose_pool = collect_bound_poses(binding_site_map, contact_residue_dict, self.li.trajfile_list[0], - self.li.topfile_list[0], self.li.lipid, self.li.stride, self.li.nprot) - - def test_cluster_DBSCAN(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, - self.li._lipid_ref) - lipid_dist_per_pose = 
[dist_matrix[:, pose_id, :].ravel() - for pose_id in np.arange(dist_matrix.shape[1])] - transformed_data = PCA(n_components=0.95).fit_transform(lipid_dist_per_pose) - cluster_labels = cluster_DBSCAN(transformed_data, eps=None, min_samples=None, - metric="euclidean") - self.assertEqual(len(cluster_labels), len(lipid_dist_per_pose)) - cluster_id_set = [label for label in np.unique(cluster_labels) if label != -1] - selected_pose_id = [np.random.choice(np.where(cluster_labels == cluster_id)[0], 1)[0] - for cluster_id in cluster_id_set] - write_bound_poses(pose_traj, selected_pose_id, self.save_dir, - pose_prefix="BSid{}_cluster_DBSCAN".format(bs_id), pose_format="gro") - - def test_cluster_KMeans(self): - for bs_id, nodes in enumerate(self.li._node_list): - dist_matrix, pose_traj = vectorize_poses(self.pose_pool[bs_id], nodes, self.li._protein_ref, - self.li._lipid_ref) - lipid_dist_per_pose = [dist_matrix[:, pose_id, :].ravel() - for pose_id in np.arange(dist_matrix.shape[1])] - transformed_data = PCA(n_components=0.95).fit_transform(lipid_dist_per_pose) - cluster_labels = cluster_KMeans(transformed_data, n_clusters=5) - self.assertEqual(len(cluster_labels), len(lipid_dist_per_pose)) - cluster_id_set = [label for label in np.unique(cluster_labels) if label != -1] - selected_pose_id = [np.random.choice(np.where(cluster_labels == cluster_id)[0], 1)[0] - for cluster_id in cluster_id_set] - write_bound_poses(pose_traj, selected_pose_id, self.save_dir, - pose_prefix="BSid{}_cluster_KMeans".format(bs_id), pose_format="gro") - - def tearDown(self): - shutil.rmtree(self.save_dir) - - if __name__ == "__main__": - unittest.main() +from pylipid.api import LipidInteraction +import shutil + + +@pytest.fixture(scope="module") +def li_and_poses(synthetic_traj_dir, tmp_path_factory): + save_dir = str(tmp_path_factory.mktemp("cluster_output")) + li = LipidInteraction( + synthetic_traj_dir["trajfile_list"], + cutoffs=synthetic_traj_dir["cutoffs"], + 
topfile_list=synthetic_traj_dir["topfile_list"], + lipid=synthetic_traj_dir["lipid"], + nprot=1, + save_dir=save_dir, + ) + li.collect_residue_contacts() + li.compute_residue_koff(plot_data=False) + li.compute_binding_nodes(threshold=2, print_data=False) + + if not li._node_list: + yield li, {} + else: + binding_site_map = {bs_id: nodes for bs_id, nodes in enumerate(li._node_list)} + pose_pool = collect_bound_poses( + binding_site_map, + li._contact_residues_low, + li.trajfile_list, + li.topfile_list, + li.lipid, + li._protein_ref, + li._lipid_ref, + stride=li.stride, + nprot=li.nprot, + ) + yield li, pose_pool + + shutil.rmtree(save_dir, ignore_errors=True) + + +class TestCluster: + + def test_cluster_DBSCAN(self, li_and_poses, tmp_path): + li, pose_pool = li_and_poses + if not li._node_list: + pytest.skip("No binding sites detected in synthetic data") + for bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) < 5: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) + lipid_dist_per_pose = [dist_matrix[:, pose_id, :].ravel() + for pose_id in np.arange(dist_matrix.shape[1])] + transformed_data = PCA(n_components=min(2, len(lipid_dist_per_pose) - 1)).fit_transform(lipid_dist_per_pose) + cluster_labels = cluster_DBSCAN(transformed_data, eps=None, min_samples=None, + metric="euclidean") + assert len(cluster_labels) == len(lipid_dist_per_pose) + cluster_id_set = [label for label in np.unique(cluster_labels) if label != -1] + if cluster_id_set: + selected_pose_id = [np.random.choice(np.where(cluster_labels == cid)[0], 1)[0] + for cid in cluster_id_set] + write_bound_poses(pose_traj, selected_pose_id, str(tmp_path), + pose_prefix="BSid{}_cluster_DBSCAN".format(bs_id), + pose_format="gro") + + def test_cluster_KMeans(self, li_and_poses, tmp_path): + li, pose_pool = li_and_poses + if not li._node_list: + pytest.skip("No binding sites detected in synthetic data") + for 
bs_id, nodes in enumerate(li._node_list): + if bs_id not in pose_pool or len(pose_pool[bs_id]) < 5: + continue + dist_matrix, pose_traj = vectorize_poses( + pose_pool[bs_id], nodes, li._protein_ref, li._lipid_ref) + lipid_dist_per_pose = [dist_matrix[:, pose_id, :].ravel() + for pose_id in np.arange(dist_matrix.shape[1])] + n_clusters = min(3, len(lipid_dist_per_pose)) + transformed_data = PCA(n_components=min(2, len(lipid_dist_per_pose) - 1)).fit_transform(lipid_dist_per_pose) + cluster_labels = cluster_KMeans(transformed_data, n_clusters=n_clusters) + assert len(cluster_labels) == len(lipid_dist_per_pose) + cluster_id_set = [label for label in np.unique(cluster_labels) if label != -1] + if cluster_id_set: + selected_pose_id = [np.random.choice(np.where(cluster_labels == cid)[0], 1)[0] + for cid in cluster_id_set] + write_bound_poses(pose_traj, selected_pose_id, str(tmp_path), + pose_prefix="BSid{}_cluster_KMeans".format(bs_id), + pose_format="gro") diff --git a/tests/funcs/test_kinetics.py b/tests/funcs/test_kinetics.py index 1bf2610..477582f 100644 --- a/tests/funcs/test_kinetics.py +++ b/tests/funcs/test_kinetics.py @@ -1,41 +1,48 @@ -import unittest -import os -import shutil +""" +Tests for koff and survival function calculations. +Uses the synthetic_traj_dir session fixture from conftest.py — no real data files needed. 
+""" + +import pytest import numpy as np +import shutil from pylipid.func import cal_koff, cal_survival_func -from pylipid import LipidInteraction - -class TestKinetics(unittest.TestCase): +from pylipid.api import LipidInteraction - def setUp(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 1.0] - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = os.path.join(file_dir, "test_kinetics") - self.li = LipidInteraction(trajfile_list, topfile_list, cutoffs=cutoffs, lipid=lipid, - nprot=1, save_dir=self.save_dir) - self.li.collect_residue_contacts() - self.t_total = np.max(self.li._T_total) - self.timestep = np.min(self.li._timesteps) - def test_cal_survivla_function(self): - delta_t_list = np.arange(0, self.t_total, self.timestep) - survival_func = cal_survival_func(np.concatenate(self.li.durations[25]), self.t_total, delta_t_list) - self.assertIsInstance(survival_func, dict) +@pytest.fixture(scope="module") +def li(synthetic_traj_dir, tmp_path_factory): + save_dir = str(tmp_path_factory.mktemp("kinetics_output")) + instance = LipidInteraction( + synthetic_traj_dir["trajfile_list"], + cutoffs=synthetic_traj_dir["cutoffs"], + topfile_list=synthetic_traj_dir["topfile_list"], + lipid=synthetic_traj_dir["lipid"], + nprot=1, + save_dir=save_dir, + ) + instance.collect_residue_contacts() + instance.compute_residue_duration() + yield instance + shutil.rmtree(save_dir, ignore_errors=True) - def test_cal_koff(self): - koff, restime, properties = cal_koff(np.concatenate(self.li.durations[25]), self.t_total, - self.timestep, nbootstrap=20, initial_guess=[1,1,1,1]) - print(koff) - print(restime) - print(properties) - self.assertIsInstance(properties, dict) - def tearDown(self): - shutil.rmtree(self.li.save_dir) +class TestKinetics: + def test_cal_survival_function(self, li): + # Use residue 0 
— guaranteed to exist in synthetic data + t_total = np.max(li._T_total) + timestep = np.min(li._timesteps) + durations = np.concatenate(li._duration[0]) + delta_t_list = np.arange(0, t_total, timestep) + survival_func = cal_survival_func(durations, t_total, delta_t_list) + assert isinstance(survival_func, dict) -if __name__ == "__main__": - unittest.main() + def test_cal_koff(self, li): + t_total = np.max(li._T_total) + timestep = np.min(li._timesteps) + durations = np.concatenate(li._duration[0]) + koff, restime, properties = cal_koff(durations, t_total, timestep, + nbootstrap=5, + initial_guess=[1, 1, 1, 1]) + assert isinstance(properties, dict) diff --git a/tests/plots/test_plot_1d.py b/tests/plots/test_plot_1d.py index 5769387..8184559 100644 --- a/tests/plots/test_plot_1d.py +++ b/tests/plots/test_plot_1d.py @@ -4,7 +4,7 @@ import shutil import pandas as pd from pylipid.util import check_dir -from pylipid.plot import plot_residue_data, plot_residue_data_logos +from pylipid.plot import plot_residue_data, plot_residue_data_logo from pylipid.plot import plot_binding_site_data, plot_surface_area class TestPlot1d(unittest.TestCase): @@ -23,18 +23,18 @@ def test_plot_residue_data(self): ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_1.pdf"), title="This is a test") logos = np.random.choice(letters_base, size=120) - plot_residue_data_logos(residue_index, logos, interactions, gap=100, letter_map=None, + plot_residue_data_logo(residue_index, logos, interactions, gap=100, letter_map=None, color_scheme="chemistry", ylabel="interactions", fn=os.path.join(self.save_dir, "plot_residue_data_logo_1.pdf")) # gap in-between sequence residue_index = np.concatenate([np.arange(120), np.arange(138, 183), np.arange(355, 382)]) - interactions = np.random.random(size=(120+(183-138)+(312-255))) + interactions = np.random.random(size=(120+(183-138)+(382-355))) plot_residue_data(residue_index, interactions, gap=50, ylabel=None, fn=os.path.join(self.save_dir, 
"plot_residue_data_2.pdf"), title="This is a test") logos = np.random.choice(letters_base, size=len(interactions)) - plot_residue_data_logos(residue_index, logos, interactions, gap=100, + plot_residue_data_logo(residue_index, logos, interactions, gap=100, ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_logo_2.pdf")) # two chains in sequences. @@ -44,7 +44,7 @@ def test_plot_residue_data(self): ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_3.pdf"), title="This is a test") logos = np.random.choice(letters_base, size=len(interactions)) - plot_residue_data_logos(residue_index, logos, interactions, gap=100, + plot_residue_data_logo(residue_index, logos, interactions, gap=100, ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_logo_3.pdf")) # check letter mapping @@ -52,7 +52,7 @@ def test_plot_residue_data(self): three_letter_seq = np.random.choice(list(letter_map.keys()), size=23) residue_index = np.arange(23) + 52 interactions = np.random.random(size=23) - plot_residue_data_logos(residue_index, three_letter_seq, interactions, gap=100, + plot_residue_data_logo(residue_index, three_letter_seq, interactions, gap=100, ylabel=None, fn=os.path.join(self.save_dir, "plot_residue_data_logo_4.pdf"), letter_map=letter_map) @@ -82,4 +82,3 @@ def tearDown(self): if __name__ == "__main__": unittest.main() - From 6b21cc3dfcf769eafea01841f1c75f964819ee8f Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Wed, 11 Mar 2026 15:33:11 +0000 Subject: [PATCH 11/24] update --- pylipid/util/corrcoef.py | 9 +- tests/api/test_LipidInteraction.py | 226 ++++++++++++++++++++--------- tests/conftest.py | 157 ++++++++++++++++++++ tests/data/.DS_Store | Bin 6148 -> 6148 bytes tests/util/test_utils.py | 66 +++++---- 5 files changed, 359 insertions(+), 99 deletions(-) create mode 100644 tests/conftest.py diff --git a/pylipid/util/corrcoef.py b/pylipid/util/corrcoef.py index f09a05b..85b4f5f 100644 --- a/pylipid/util/corrcoef.py +++ 
b/pylipid/util/corrcoef.py @@ -33,7 +33,14 @@ def sparse_corrcoef(A, B=None): # version-dependent sparse-dense arithmetic (scipy >= 1.14 / numpy >= 1.25). rowsum = np.asarray(A.sum(1)).flatten() # shape (n_rows,) ndarray centering = np.outer(rowsum, rowsum) / n # explicit outer product - C = (np.asarray(A.dot(A.T.conjugate()).todense()) - centering) / (n - 1) + # Handle both sparse matrices (from collect_residue_contacts) and + # plain ndarrays (from tests or direct calls). + dot = A.dot(A.T.conjugate()) + if sparse.issparse(dot): + dot = np.asarray(dot.todense()) + else: + dot = np.asarray(dot) + C = (dot - centering) / (n - 1) # The correlation coefficients are given by # C_{i,j} / sqrt(C_{i} * C_{j}) d = np.diag(C) diff --git a/tests/api/test_LipidInteraction.py b/tests/api/test_LipidInteraction.py index 8f8981f..4e88949 100644 --- a/tests/api/test_LipidInteraction.py +++ b/tests/api/test_LipidInteraction.py @@ -1,75 +1,157 @@ -import unittest -import os -import shutil -from pylipid.api import LipidInteraction -from pylipid.util import check_dir - -class TestLipidInteraction(unittest.TestCase): - - def test_pylipid(self): - trajfile_list = ["../data/run1/protein_lipids.xtc", "../data/run2/protein_lipids.xtc"] - topfile_list = ["../data/run1/protein_lipids.gro", "../data/run2/protein_lipids.gro"] - lipid = "CHOL" - cutoffs = [0.55, 0.8] - file_dir = os.path.dirname(os.path.abspath(__file__)) - self.save_dir = check_dir(os.path.join(file_dir, "test_pylipid")) - li = LipidInteraction(trajfile_list, cutoffs=cutoffs, topfile_list=topfile_list, lipid=lipid, - nprot=1, save_dir=self.save_dir) - li.collect_residue_contacts() - - li.compute_residue_duration() - li.compute_residue_duration(10) - li.compute_residue_duration([2,3,4]) - - li.compute_residue_koff(plot_data=False) - li.compute_residue_koff(10) - li.compute_residue_koff([2,4,5,10]) - - li.compute_binding_nodes(threshold=4) - li.compute_binding_nodes(threshold=2, print_data=False) - - 
li.compute_site_koff(plot_data=False) - li.compute_site_koff(binding_site_id=[1,2,3]) - li.compute_site_koff(binding_site_id=1) - - li.compute_site_duration() - li.compute_site_duration(1) - li.compute_site_duration([0,1]) - - li.analyze_bound_poses() - li.analyze_bound_poses(binding_site_id=[1,2,3]) - li.analyze_bound_poses(binding_site_id=[1,2,3], n_clusters=2) - - li.compute_surface_area() - li.compute_surface_area(binding_site_id=[1,2,3]) - li.compute_surface_area(binding_site_id=[1, 2, 3], radii={"BB": 0.30, "SC1": 0.2}) - - li.write_site_info() - li.write_site_info(sort_residue="Duration") - - li.show_stats_per_traj() - - li.save_data(item="Dataset") - li.save_data(item="Duration") - - li.save_coordinate(item="Residence Time") - li.save_coordinate(item="Duration") - - li.save_pymol_script(pdb_file="../data/receptor.pdb") - - li.plot(item="Duration") - li.plot(item="Residence Time") - - li.plot_logo(item="Residence Time") - li.plot_logo(item="Lipid Count") - - def tearDown(self): - shutil.rmtree(self.save_dir) - -if __name__ == "__main__": - unittest.main() - - +""" +Pytest fixtures for PyLipID tests. +Generates small synthetic MDTraj trajectories in a temporary directory so that +CI can run without any committed trajectory data files. 
+Topology: + - 30 protein residues (ALA/GLY alternating, one chain) + - 20 CHOL lipid molecules (3 atoms each: C1, C2, O1) + - Cubic box, 8 nm side + - 2 trajectory files, 100 frames each +""" +import os +import numpy as np +import pytest +import mdtraj as md + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_AA_NAMES = ["ALA", "GLY", "VAL", "LEU", "ILE", "PRO", "PHE", "TRP", + "MET", "SER"] +_N_PROTEIN_RESIDUES = 30 +_N_LIPID_MOLECULES = 20 +_N_FRAMES = 100 +_BOX_NM = 8.0 # cubic box side length in nm +_N_RUNS = 2 # number of trajectory files + + +def _build_topology(): + """Build a minimal MDTraj topology: protein + CHOL lipids.""" + top = md.Topology() + + # --- protein chain --- + pchain = top.add_chain() + for i in range(_N_PROTEIN_RESIDUES): + resname = _AA_NAMES[i % len(_AA_NAMES)] + res = top.add_residue(resname, pchain) + top.add_atom("CA", md.element.carbon, res) + top.add_atom("CB", md.element.carbon, res) + top.add_atom("N", md.element.nitrogen, res) + top.add_atom("C", md.element.carbon, res) + top.add_atom("O", md.element.oxygen, res) + + # --- lipid chain --- + lchain = top.add_chain() + for _ in range(_N_LIPID_MOLECULES): + res = top.add_residue("CHOL", lchain) + top.add_atom("C1", md.element.carbon, res) + top.add_atom("C2", md.element.carbon, res) + top.add_atom("O1", md.element.oxygen, res) + + return top + + +def _build_trajectory(top, rng, n_frames=_N_FRAMES): + """ + Build a synthetic trajectory. + + Protein residues are placed in a cluster near the box centre. + Lipid molecules are placed nearby so that some will fall within + the contact cutoffs and generate non-trivial interaction data. 
+ """ + n_atoms = top.n_atoms + xyz = np.zeros((n_frames, n_atoms, 3), dtype=np.float32) + + atoms_per_prot_res = 5 # CA CB N C O + atoms_per_lipid = 3 # C1 C2 O1 + n_prot_atoms = _N_PROTEIN_RESIDUES * atoms_per_prot_res + + centre = _BOX_NM / 2.0 + + for frame in range(n_frames): + # protein — small fluctuations around centre + prot_base = rng.uniform(centre - 1.0, centre + 1.0, + size=(n_prot_atoms, 3)).astype(np.float32) + prot_noise = rng.normal(0, 0.05, size=prot_base.shape).astype(np.float32) + xyz[frame, :n_prot_atoms] = prot_base + prot_noise + + # lipids — placed close enough to protein to generate contacts + for lip_idx in range(_N_LIPID_MOLECULES): + start = n_prot_atoms + lip_idx * atoms_per_lipid + # Alternate between near (within cutoff) and far positions + if lip_idx < _N_LIPID_MOLECULES // 2: + offset = rng.uniform(0.3, 0.6, size=(atoms_per_lipid, 3)) + else: + offset = rng.uniform(1.5, 3.0, size=(atoms_per_lipid, 3)) + sign = rng.choice([-1, 1], size=(atoms_per_lipid, 3)) + xyz[frame, start:start + atoms_per_lipid] = ( + centre + sign * offset.astype(np.float32) + ) + + unitcell_lengths = np.full((n_frames, 3), _BOX_NM, dtype=np.float32) + unitcell_angles = np.full((n_frames, 3), 90.0, dtype=np.float32) + + return md.Trajectory(xyz, top, + unitcell_lengths=unitcell_lengths, + unitcell_angles=unitcell_angles) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(scope="session") +def synthetic_traj_dir(tmp_path_factory): + """ + Session-scoped fixture: builds synthetic trajectory files once per test + session and returns a dict with file paths and metadata. 
+ + Returns + ------- + dict with keys: + trajfile_list : list of .xtc paths + topfile_list : list of .gro paths + pdb_file : path to receptor .pdb + lipid : lipid residue name (str) + cutoffs : [lower, upper] cutoffs in nm + """ + rng = np.random.default_rng(42) + top = _build_topology() + base = tmp_path_factory.mktemp("traj_data") + + trajfile_list = [] + topfile_list = [] + + for run_idx in range(_N_RUNS): + run_dir = base / "run{}".format(run_idx + 1) + run_dir.mkdir() + + traj = _build_trajectory(top, rng) + + xtc_path = str(run_dir / "protein_lipids.xtc") + gro_path = str(run_dir / "protein_lipids.gro") + + traj.save_xtc(xtc_path) + traj[0].save_gro(gro_path) + + trajfile_list.append(xtc_path) + topfile_list.append(gro_path) + + # receptor PDB (protein atoms only) + prot_atom_indices = top.select("protein") + prot_traj = traj[0].atom_slice(prot_atom_indices) + pdb_path = str(base / "receptor.pdb") + prot_traj.save_pdb(pdb_path) + + return { + "trajfile_list": trajfile_list, + "topfile_list": topfile_list, + "pdb_file": pdb_path, + "lipid": "CHOL", + "cutoffs": [0.55, 0.8], + } diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4e88949 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,157 @@ +""" +Pytest fixtures for PyLipID tests. + +Generates small synthetic MDTraj trajectories in a temporary directory so that +CI can run without any committed trajectory data files. 
+
+Topology:
+    - 30 protein residues (cycling through 10 amino-acid types, one chain)
+    - 20 CHOL lipid molecules (3 atoms each: C1, C2, O1)
+    - Cubic box, 8 nm side
+    - 2 trajectory files, 100 frames each
+"""
+
+import os
+import numpy as np
+import pytest
+import mdtraj as md
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+_AA_NAMES = ["ALA", "GLY", "VAL", "LEU", "ILE", "PRO", "PHE", "TRP",
+             "MET", "SER"]
+_N_PROTEIN_RESIDUES = 30
+_N_LIPID_MOLECULES = 20
+_N_FRAMES = 100
+_BOX_NM = 8.0  # cubic box side length in nm
+_N_RUNS = 2  # number of trajectory files
+
+
+def _build_topology():
+    """Build a minimal MDTraj topology: protein + CHOL lipids."""
+    top = md.Topology()
+
+    # --- protein chain ---
+    pchain = top.add_chain()
+    for i in range(_N_PROTEIN_RESIDUES):
+        resname = _AA_NAMES[i % len(_AA_NAMES)]
+        res = top.add_residue(resname, pchain)
+        top.add_atom("CA", md.element.carbon, res)
+        top.add_atom("CB", md.element.carbon, res)
+        top.add_atom("N", md.element.nitrogen, res)
+        top.add_atom("C", md.element.carbon, res)
+        top.add_atom("O", md.element.oxygen, res)
+
+    # --- lipid chain ---
+    lchain = top.add_chain()
+    for _ in range(_N_LIPID_MOLECULES):
+        res = top.add_residue("CHOL", lchain)
+        top.add_atom("C1", md.element.carbon, res)
+        top.add_atom("C2", md.element.carbon, res)
+        top.add_atom("O1", md.element.oxygen, res)
+
+    return top
+
+
+def _build_trajectory(top, rng, n_frames=_N_FRAMES):
+    """
+    Build a synthetic trajectory.
+
+    Protein residues are placed in a cluster near the box centre.
+    Lipid molecules are placed nearby so that some will fall within
+    the contact cutoffs and generate non-trivial interaction data.
+ """ + n_atoms = top.n_atoms + xyz = np.zeros((n_frames, n_atoms, 3), dtype=np.float32) + + atoms_per_prot_res = 5 # CA CB N C O + atoms_per_lipid = 3 # C1 C2 O1 + n_prot_atoms = _N_PROTEIN_RESIDUES * atoms_per_prot_res + + centre = _BOX_NM / 2.0 + + for frame in range(n_frames): + # protein — small fluctuations around centre + prot_base = rng.uniform(centre - 1.0, centre + 1.0, + size=(n_prot_atoms, 3)).astype(np.float32) + prot_noise = rng.normal(0, 0.05, size=prot_base.shape).astype(np.float32) + xyz[frame, :n_prot_atoms] = prot_base + prot_noise + + # lipids — placed close enough to protein to generate contacts + for lip_idx in range(_N_LIPID_MOLECULES): + start = n_prot_atoms + lip_idx * atoms_per_lipid + # Alternate between near (within cutoff) and far positions + if lip_idx < _N_LIPID_MOLECULES // 2: + offset = rng.uniform(0.3, 0.6, size=(atoms_per_lipid, 3)) + else: + offset = rng.uniform(1.5, 3.0, size=(atoms_per_lipid, 3)) + sign = rng.choice([-1, 1], size=(atoms_per_lipid, 3)) + xyz[frame, start:start + atoms_per_lipid] = ( + centre + sign * offset.astype(np.float32) + ) + + unitcell_lengths = np.full((n_frames, 3), _BOX_NM, dtype=np.float32) + unitcell_angles = np.full((n_frames, 3), 90.0, dtype=np.float32) + + return md.Trajectory(xyz, top, + unitcell_lengths=unitcell_lengths, + unitcell_angles=unitcell_angles) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(scope="session") +def synthetic_traj_dir(tmp_path_factory): + """ + Session-scoped fixture: builds synthetic trajectory files once per test + session and returns a dict with file paths and metadata. 
+ + Returns + ------- + dict with keys: + trajfile_list : list of .xtc paths + topfile_list : list of .gro paths + pdb_file : path to receptor .pdb + lipid : lipid residue name (str) + cutoffs : [lower, upper] cutoffs in nm + """ + rng = np.random.default_rng(42) + top = _build_topology() + base = tmp_path_factory.mktemp("traj_data") + + trajfile_list = [] + topfile_list = [] + + for run_idx in range(_N_RUNS): + run_dir = base / "run{}".format(run_idx + 1) + run_dir.mkdir() + + traj = _build_trajectory(top, rng) + + xtc_path = str(run_dir / "protein_lipids.xtc") + gro_path = str(run_dir / "protein_lipids.gro") + + traj.save_xtc(xtc_path) + traj[0].save_gro(gro_path) + + trajfile_list.append(xtc_path) + topfile_list.append(gro_path) + + # receptor PDB (protein atoms only) + prot_atom_indices = top.select("protein") + prot_traj = traj[0].atom_slice(prot_atom_indices) + pdb_path = str(base / "receptor.pdb") + prot_traj.save_pdb(pdb_path) + + return { + "trajfile_list": trajfile_list, + "topfile_list": topfile_list, + "pdb_file": pdb_path, + "lipid": "CHOL", + "cutoffs": [0.55, 0.8], + } diff --git a/tests/data/.DS_Store b/tests/data/.DS_Store index 2174845ecddddc7cb04f829b887d1c16c094bab6..f5aae607f02d3060e0f335785e42eb56255d0ae0 100644 GIT binary patch delta 140 zcmZoMXfc=|&e%S&P>hv>fq{WzVxfo>6OaJ{%s|Y@z#zboQk Date: Wed, 11 Mar 2026 16:30:00 +0000 Subject: [PATCH 12/24] Update kinetics.py --- pylipid/func/kinetics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pylipid/func/kinetics.py b/pylipid/func/kinetics.py index d6ce542..d7da794 100644 --- a/pylipid/func/kinetics.py +++ b/pylipid/func/kinetics.py @@ -220,7 +220,7 @@ def _curve_fitting(survival_func, delta_t_list, initial_guess): survival_rates = np.nan_to_num([survival_func[delta_t] for delta_t in delta_t_list]) # y try: popt, pcov = curve_fit(_bi_expo, np.array(delta_t_list), np.array(survival_rates), p0=initial_guess, maxfev=100000) - n_fitted = _bi_expo(np.array(delta_t_list, 
dtype=np.float128), *popt) + n_fitted = _bi_expo(np.array(delta_t_list, dtype=np.longdouble), *popt) r_squared = 1 - np.sum((np.nan_to_num(n_fitted) - np.nan_to_num(survival_rates))**2)/np.sum((survival_rates - np.mean(survival_rates))**2) ks = [abs(k) for k in popt[:2]] From 25d1877bb9defed248210656a277a1906b0d1627 Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Wed, 11 Mar 2026 16:59:54 +0000 Subject: [PATCH 13/24] updates to tests --- pylipid/api/api.py | 3 ++- pylipid/plot/plot1d.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pylipid/api/api.py b/pylipid/api/api.py index 548a25d..64d7bb3 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -1497,7 +1497,8 @@ def compute_surface_area(self, binding_site_id=None, radii=None, plot_data=True, surface_area_per_residue[nodes] = surface_area_data["Binding Site {}".format(bs_id)].mean() self.dataset["Binding Site Surface Area"] = surface_area_per_residue # plot surface area - if plot_data: + _bs_cols = [c for c in surface_area_data.columns if c != "Time"] + if plot_data and _bs_cols: if save_dir is not None: surface_area_dir = check_dir(save_dir) else: diff --git a/pylipid/plot/plot1d.py b/pylipid/plot/plot1d.py index 0f12870..5ccfefd 100644 --- a/pylipid/plot/plot1d.py +++ b/pylipid/plot/plot1d.py @@ -360,6 +360,8 @@ def plot_surface_area(surface_area, fig_fn, timeunit=None, fig_close=False): row_set = list(set([ind[:2] for ind in surface_area.index])) row_set.sort() col_set = [col for col in surface_area.columns if col != "Time"] + if not col_set or not row_set: + return colors = [next(color_set) for dummy in col_set] fig, axes = plt.subplots(len(row_set), len(col_set), figsize=(len(col_set)*2.4, len(row_set)*1.6), sharex=True, sharey=True) From 0e099bb9dcefb1b293237b274329da7ccd54b6d0 Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Wed, 11 Mar 2026 20:52:30 +0000 Subject: [PATCH 14/24] Update api.py --- pylipid/api/api.py | 25 ++++++++++++++++++++----- 1 file changed, 20 
insertions(+), 5 deletions(-) diff --git a/pylipid/api/api.py b/pylipid/api/api.py index 64d7bb3..0121076 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -27,7 +27,8 @@ from scipy.sparse import coo_matrix import pandas as pd from tqdm import trange, tqdm -from p_tqdm import p_map +#from p_tqdm import p_map +from multiprocessing import get_context from ..func import cal_contact_residues from ..func import Duration from ..func import cal_lipidcount, cal_occupancy @@ -38,6 +39,20 @@ from ..plot import plot_residue_data, plot_corrcoef, plot_residue_data_logo from ..util import check_dir, write_PDB, write_pymol_script, sparse_corrcoef, get_traj_info +def _spawn_pmap(func, *iterables, num_cpus=None, desc=None): + """Parallel map using multiprocessing with 'spawn' (macOS-safe).""" + iterables = [list(it) for it in iterables] + if not iterables: + return [] + n = len(iterables[0]) + if any(len(it) != n for it in iterables): + raise ValueError("All iterables must have the same length") + if n == 0: + return [] + args_iterable = list(zip(*iterables)) + ctx = get_context("spawn") + with ctx.Pool(processes=num_cpus) as pool: + return list(tqdm(pool.starmap(func, args_iterable), total=n, desc=desc)) class LipidInteraction: def __init__(self, trajfile_list, cutoffs=[0.475, 0.7], lipid="CHOL", topfile_list=None, lipid_atoms=None, @@ -711,7 +726,7 @@ def compute_residue_koff(self, residue_id=None, nbootstrap=10, initial_guess=[1. 
else: fn_set = [False for dummy in selected_residue_id] - returned_values = p_map(partial(calculate_koff_wrapper, t_total=t_total, timestep=timestep, nbootstrap=nbootstrap, + returned_values = _spawn_pmap(partial(calculate_koff_wrapper, t_total=t_total, timestep=timestep, nbootstrap=nbootstrap, initial_guess=initial_guess, plot_data=plot_data, timeunit=self._timeunit, fig_close=fig_close), [np.concatenate(self._duration[residue_id]) for residue_id in selected_residue_id], @@ -1155,7 +1170,7 @@ def compute_site_koff(self, binding_site_id=None, nbootstrap=10, initial_guess=[ fn_set = [os.path.join(BS_dir, f"BS_id{bs_id}.{fig_format}") for bs_id in selected_bs_id] else: fn_set = [False for dummy in selected_bs_id] - returned_values = p_map(partial(calculate_koff_wrapper, t_total=t_total, timestep=timestep, nbootstrap=nbootstrap, + returned_values = _spawn_pmap(partial(calculate_koff_wrapper, t_total=t_total, timestep=timestep, nbootstrap=nbootstrap, initial_guess=initial_guess, plot_data=plot_data, timeunit=self._timeunit, fig_close=fig_close), [np.concatenate(self._duration_BS[bs_id]) for bs_id in selected_bs_id], @@ -1370,7 +1385,7 @@ def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="a if n_top_poses > 0: # multiprocessing wrapped under p_tqdm - rmsd_set = p_map(partial(analyze_pose_wrapper, protein_atom_indices=protein_atom_indices, + rmsd_set = _spawn_pmap(partial(analyze_pose_wrapper, protein_atom_indices=protein_atom_indices, lipid_atom_indices=lipid_atom_indices, n_top_poses=n_top_poses, pose_dir=pose_dir, atom_weights=atom_weights, kde_bw=kde_bw, pca_component=pca_component, pose_format=pose_format, n_clusters=n_clusters, @@ -1479,7 +1494,7 @@ def compute_surface_area(self, binding_site_id=None, radii=None, plot_data=True, selected_bs_id = np.atleast_1d(np.array(binding_site_id, dtype=int)) if binding_site_id is not None \ else np.arange(len(self._node_list), dtype=int) selected_bs_id_map = {bs_id: self._node_list[bs_id] for bs_id 
in selected_bs_id} - returned_values = p_map(partial(calculate_surface_area_wrapper, binding_site_map=selected_bs_id_map, + returned_values = _spawn_pmap(partial(calculate_surface_area_wrapper, binding_site_map=selected_bs_id_map, nprot=self._nprot, timeunit=self._timeunit, stride=self._stride, dt_traj=self._dt_traj, radii=radii_book), self._trajfile_list, self._topfile_list, np.arange(len(self._trajfile_list), dtype=int), From 298a929ebf0b5c08d4d046881ac44b5b4fe381d2 Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Wed, 11 Mar 2026 21:16:34 +0000 Subject: [PATCH 15/24] Update __init__.py --- pylipid/__init__.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pylipid/__init__.py b/pylipid/__init__.py index 0a534fd..c128350 100644 --- a/pylipid/__init__.py +++ b/pylipid/__init__.py @@ -14,5 +14,25 @@ # copies or substantial portions of the Software. ############################################################################## +# --- BEGIN: macOS-safe bootstrap ------------------------------------------- +# Ensures PyLipID runs safely on macOS by: +# 1) forcing multiprocessing to use 'spawn' +# 2) preventing GUI backends from loading in worker processes +import multiprocessing as _mp + +try: + # Safe on macOS — harmless on Linux/Windows + _mp.set_start_method("spawn", force=True) +except RuntimeError: + # Already set or protected — ignore + pass + +# Use a non-interactive backend so importing PyLipID never loads macOS AppKit + import matplotlib + matplotlib.use("Agg", force=True) +except Exception: + pass +# --- END: macOS-safe bootstrap --------------------------------------------- + from ._version import __version__ from .api import LipidInteraction From 93634ef24c51258c7758a4e2904422fc1a89a6ff Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Wed, 11 Mar 2026 21:17:26 +0000 Subject: [PATCH 16/24] Update __init__.py --- pylipid/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pylipid/__init__.py b/pylipid/__init__.py 
index c128350..72a0e4c 100644 --- a/pylipid/__init__.py +++ b/pylipid/__init__.py @@ -28,6 +28,7 @@ pass # Use a non-interactive backend so importing PyLipID never loads macOS AppKit +try: import matplotlib matplotlib.use("Agg", force=True) except Exception: From 8dbe28a24bb49f6f6815e4d75481482bb5454344 Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Thu, 12 Mar 2026 20:13:02 +0000 Subject: [PATCH 17/24] Update directory.py --- pylipid/util/directory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pylipid/util/directory.py b/pylipid/util/directory.py index 265f4d2..616a0ca 100644 --- a/pylipid/util/directory.py +++ b/pylipid/util/directory.py @@ -38,6 +38,6 @@ def check_dir(directory=None, suffix=None, print_info=True): if not os.path.isdir(directory): os.makedirs(directory) if print_info: - print("Creating new director: {}".format(directory)) + print(f"Creating new directory: {path}") return directory From 2d736ae0a87da3e583c2a515939f47a0c33d4b8c Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Thu, 12 Mar 2026 20:35:34 +0000 Subject: [PATCH 18/24] Update directory.py --- pylipid/util/directory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pylipid/util/directory.py b/pylipid/util/directory.py index 616a0ca..b108e3f 100644 --- a/pylipid/util/directory.py +++ b/pylipid/util/directory.py @@ -38,6 +38,6 @@ def check_dir(directory=None, suffix=None, print_info=True): if not os.path.isdir(directory): os.makedirs(directory) if print_info: - print(f"Creating new directory: {path}") + print("Creating new directory: {}".format(path)) return directory From 8e639492cfb7e2594b47611bc0403c45acc4f58f Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Thu, 12 Mar 2026 20:39:55 +0000 Subject: [PATCH 19/24] Update directory.py --- pylipid/util/directory.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/pylipid/util/directory.py b/pylipid/util/directory.py index b108e3f..114dce2 
100644 --- a/pylipid/util/directory.py +++ b/pylipid/util/directory.py @@ -21,23 +21,18 @@ __all__ = ["check_dir"] -def check_dir(directory=None, suffix=None, print_info=True): - """Check directory +def check_dir(root, name=None): + """ + Create a directory under 'root'. If 'name' is provided, the directory + will be root/name. Returns the full path. + """ + import os - This function will combine the suffix with the given directory (or the current - working directory if None is given) to generate a new directory name, and create a - directory with this name if it does not exit. + path = os.path.join(root, name) if name is not None else root - """ - if directory is None: - directory = os.getcwd() - else: - directory = os.path.abspath(directory) - if suffix is not None: - directory = os.path.join(directory, suffix) - if not os.path.isdir(directory): - os.makedirs(directory) - if print_info: - print("Creating new directory: {}".format(path)) - return directory + if not os.path.isdir(path): + print(f"Creating new directory: {path}") + os.makedirs(path, exist_ok=True) + + return path From c03906db80e3600397e9c7c39b2510f20bcc082e Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Thu, 12 Mar 2026 21:12:43 +0000 Subject: [PATCH 20/24] Update api.py --- pylipid/api/api.py | 338 +++++++++++++++++---------------------------- 1 file changed, 129 insertions(+), 209 deletions(-) diff --git a/pylipid/api/api.py b/pylipid/api/api.py index 0121076..3342fa3 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -1198,215 +1198,135 @@ def compute_site_koff(self, binding_site_id=None, nbootstrap=10, initial_guess=[ [self._res_time_BS[bs_id] for bs_id in selected_bs_id] def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="auto", pose_format="gro", - score_weights=None, kde_bw=0.15, pca_component=0.90, plot_rmsd=True, save_dir=None, - eps=None, min_samples=None, metric="euclidean", - fig_close=False, fig_format="pdf", num_cpus=None): - r"""Analyze bound 
poses for binding sites. - - This function can find representative bound poses, cluster the bound poses and calculate pose RMSD for - binding sites. - - If ``n_top_poses`` is an integer larger than 0, this method will find the representative bound poses for the specified - binding sites. To do so, it evaluates all the bound poses in a binding site using a density-based scoring function - and ranks the poses using based on the scores. The scoring function is defined as: - - .. math:: - \text { score }=\sum_{i} W_{i} \cdot \hat{f}_{i, H}(D) - - where :math:`W_{i}` is the weight given to atom i of the lipid molecule, H is the bandwidth and - :math:`\hat{f}_{i, H}(D)` is a multivariate kernel density etimation of the position of atom i in the specified - binding site. :math:`\hat{f}_{i, H}(D)` is calculated from all the bound lipid poses in that binding site. - The position of atom i is a `p`-variant vector, :math:`\left[D_{i 1}, D_{i 2}, \ldots, D_{i p}\right]` where - :math:`D_{i p}` is the minimum distance to the residue `p` of the binding site. The multivariant kernel density - is estimated by `KDEMultivariate - `_ - provided by Statsmodels. Higher weights can be given to e.g. the headgroup atoms of phospholipids, to generate - better defined binding poses, but all lipid atoms are weighted equally by default. The use of relative positions - of lipid atoms in their binding site makes the analysis independent of the conformational changes in the rest - part of the protein. - - If ``n_clusters`` is given an integer larger than 0, this method will cluster the lipid bound poses in the specified - binding site using `KMeans `_ - provided by scikit. The KMeans cluster separates the samples into `n` clusters of equal variances, via minimizing - the `inertia`, which is defined as: - - .. math:: - \sum_{i=0}^{n} \min _{u_{i} \in C}\left(\left\|x_{i}-u_{i}\right\|^{2}\right) - - where :math:`u_{i}` is the `centroid` of cluster i. 
KMeans scales well with large dataset but performs poorly - with clusters of varying sizes and density, which are often the case for lipid poses in a binding site. - - If ``n_clusters`` is set to `auto`, this method will cluster the bound poses using a density-based cluster - `DBSCAN `_ provided by scikit. - DBSCAN finds clusters of core samples of high density. A sample point is a core sample if at least `min_samples` - points are within distance :math:`\varepsilon` of it. A cluster is defined as a set of sample points that are - mutually density-connected and density-reachable, i.e. there is a path - :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` where each :math:`p_{i+1}` is within distance - :math:`\varepsilon` of :math:`p_{i}` for any two p in the two. The values of `min_samples` and :math:`\varepsilon` - determine the performance of this cluster. If None, `min_samples` takes the value of 2 * ndim. If - :math:`\varepsilon` is None, it is set as the value at the knee of the k-distance plot. - - For writing out the cluster poses, this method will randomly select one pose from each cluster in the case of - using KMeans or one from the core samples of each cluster when DBSCAN is used, and writes the selected lipid - pose with the protein conformation to which it binds using MDTraj, in the provided pose format. - - The root mean square deviation (RMSD) of a lipid bound pose in a binding site is calculated from the relative - position of the pose in the binding site compared to the average position of the bound poses. Thus, the pose - RMSD is defined as: - - .. 
math:: - RMSD=\sqrt{\frac{\sum_{i=0}^{N} \sum_{j=0}^{M}\left(D_{i j}-\bar{D}_{j}\right)^{2}}{N}} - - where :math:`D_{i j}` is the distance of atom `i` to the residue `j` in the binding site; :math:`\bar{D}_{j}` is the - average distance of atom `i` from all bound poses in the binding site to residue `j`; `N` is the number of atoms in - the lipid molecule and `M` is the number of residues in the binding site. - - Parameters - ---------- - binding_site_id : int or list of int, default=None - The binding site ID used in PyLipID. This ID is the index in the binding site node list that is - calculated by the method ``compute_binding_nodes``. The ID of the N-th binding site is (N-1). If None, - the contact duration of all binding sites are calculated. - - n_top_poses : int, default=3 - Number of representative bound poses written out for the selected binding site. - - n_clusters : int or 'auto' - Number of clusters to form for bound poses of the selected binding site. If ``n_clusters`` is set to 'auto'`, the - density-based clusterer DBSCAN will be used. If ``n_clusters`` is given a non-zero integer, KMeans is used. - - pose_format : str, default="gro" - The coordinate format the representative poses and clsutered poses are saved with. Support the formats - that are included in MDtraj.save(). - - score_weights : dict or None, default=None - The weights given to atoms in the scoring function for finding the representative bound poses. It should in - the format of a Python dictionary {atom name: weight}. The atom name should be consisten with the topology. - By default, all atoms in the lipid molecule are weighted equally. - - kde_bw : scalar, default=0.15 - The bandwidth for the Gaussian kernel. Used in the density estimation of the lipid atom coordinates in the binding - site. Used by the function - `KDEMultivariate `_ . - - pca_component : int, float or ‘mle’, default=0.90 - The PCA used to decrease the dimensions of lipid atom coordinates. 
The coordinate of a lipid atom in - the binding site is expressed as a distance vector of the minimum distances to the residues in that binding site, - i.e. :math:`[D_{i 1}, D_{i 2}, .., D_{i p}]`. This can be in high dimensions. Hence, PCA is used on this distance - vector prior to calculation of the density. This PCA is carried out by - `PCA `_ in sci-kit. - - plot_rmsd : bool, default=True - Plot the binding site RMSDs in a violinplot. - - eps : float or None, default=None - The minimum neighbour distance used by - `DBSCAN `_ - if None, the value will determined from as the elbow point of the sorted minimum neighbour distance - of all the data points. - - min_samples : int or None, default=None - The minimum number of samples to be considered as core samples used by - `DBSCAN `_ . - If None, the value will be automatically determined based on the size of data. - - metric : str, default='euclidean' - The metric used to calculated neighbour distance used by - `DBSCAN `_ . - Default is the Euclidean distance. - - fig_close : bool, default=False - This parameter control whether to close the plotted figures using plt.close(). It can save memory if - many figures are generated. - - fig_format : str, default="pdf" - Figure format. Support formats included in matplotlib.pyplot. - - num_cpus : int or None default=None - The number of CPUs used for the tasks of ranking the poses and clustering poses. Python multiprocessing deployed by - `p_tqdm `_ is used to speed up these calculations. - - save_dir : str or None, default=None - The root directory for saving the pose analysis results. If None, the root directory set at the initiation of - ``LipidInteraction`` will be used. The representative poses and clustered poses will be stored in the directory - of Bound_Poses_{lipid} under the root directory. - - Returns - ------- - pose_pool : dict - Coordinates of the bound poses for the selected binding sites stored in a python dictionary - {binding_site_id: pose coordinates}. 
The poses coordinates include lipid coordinates and those of the receptor - at the time the pose was bound. The pose coordinates are stored in a - `mdtraj.Trajectory `_ object. - - rmsd_data : pandas.DataFrame - Bound poses RMSDs are stored by columns with column name of binding site id. - - See Also - -------- - pylipid.func.collect_bound_poses - Collect bound pose coordinates from trajectories. - pylipid.func.vectorize_poses - Convert bound poses into distance vectors. - pylipid.func.calculate_scores - Score the bound poses based on the probability density function of the position of lipid atoms - pylipid.func.analyze_pose_wrapper - A wrapper function that ranks poses, clusters poses and calculates pose RMSD - - """ - self._check_calculation("Binding Site ID", self.compute_binding_nodes, print_data=False) - - pose_dir = check_dir(save_dir, "Bound_Poses_{}".format(self._lipid)) if save_dir is not None \ - else check_dir(self._save_dir, "Bound_Poses_{}".format(self._lipid)) - if "Binding Site Pose RMSD" in self.dataset.columns: - pose_rmsd_per_residue = self.dataset["Binding Site Pose RMSD"].values - else: - pose_rmsd_per_residue = np.zeros(self._nresi_per_protein) - if binding_site_id is not None: - selected_bs_id = np.atleast_1d(binding_site_id) - else: - selected_bs_id = np.arange(len(self._node_list), dtype=int) - - # store bound lipid poses - selected_bs_map = {bs_id: self._node_list[bs_id] for bs_id in selected_bs_id} - pose_traj, pose_info = collect_bound_poses(selected_bs_map, self._contact_residues_low, self._trajfile_list, - self._topfile_list, self._lipid, self._protein_ref, self._lipid_ref, - stride=self._stride, nprot=self._nprot) - protein_atom_indices = [[atom.index for atom in residue.atoms] - for residue in self._protein_ref.top.residues] - lipid_atom_indices = [self._protein_ref.n_atoms + atom_idx - for atom_idx in np.arange(self._lipid_ref.n_atoms)] - atom_weights = {atom_idx: 1 for atom_idx in np.arange(self._lipid_ref.n_atoms)} - if 
score_weights is not None: - translate = {atom_idx: score_weights[self._lipid_ref.top.atom(atom_idx).name] - for atom_idx in np.arange(self._lipid_ref.n_atoms) - if self._lipid_ref.top.atom(atom_idx).name in score_weights.keys()} - atom_weights.update(translate) - - if n_top_poses > 0: - # multiprocessing wrapped under p_tqdm - rmsd_set = _spawn_pmap(partial(analyze_pose_wrapper, protein_atom_indices=protein_atom_indices, - lipid_atom_indices=lipid_atom_indices, n_top_poses=n_top_poses, - pose_dir=pose_dir, atom_weights=atom_weights, kde_bw=kde_bw, - pca_component=pca_component, pose_format=pose_format, n_clusters=n_clusters, - eps=eps, min_samples=min_samples, metric=metric, - trajfile_list=self._trajfile_list), - selected_bs_id, [pose_traj[bs_id] for bs_id in selected_bs_id], - [self._node_list[bs_id] for bs_id in selected_bs_id], - [pose_info[bs_id] for bs_id in selected_bs_id], num_cpus=num_cpus, desc="ANALYZE BOUND POSES") - RMSD_set = {} - for bs_id, rmsd in zip(selected_bs_id, rmsd_set): - RMSD_set["Binding Site {}".format(bs_id)] = rmsd - pose_rmsd_per_residue[self._node_list[bs_id]] = np.mean(RMSD_set["Binding Site {}".format(bs_id)]) - # update dataset - self.dataset["Binding Site Pose RMSD"] = pose_rmsd_per_residue - pose_rmsd_data = pd.DataFrame( - dict([(bs_label, pd.Series(rmsd_set)) for bs_label, rmsd_set in RMSD_set.items()])) - # plot RMSD - if plot_rmsd and n_top_poses > 0: - plot_binding_site_data(pose_rmsd_data, os.path.join(pose_dir, f"Pose_RMSD_violinplot.{fig_format}"), - title="{}".format(self._lipid), ylabel="RMSD (nm)", fig_close=fig_close) - return pose_traj, pose_rmsd_data + score_weights=None, kde_bw=0.15, pca_component=0.90, plot_rmsd=True, save_dir=None, + eps=None, min_samples=None, metric="euclidean", + fig_close=False, fig_format="pdf", num_cpus=None): + r"""Analyze bound poses for binding sites. 
+ + (Docstring unchanged — omitted here for brevity) + """ + # Ensure required upstream steps are done + self._check_calculation("Binding Site ID", self.compute_binding_nodes, print_data=False) + + # Where to save pose outputs + pose_dir = check_dir(save_dir, "Bound_Poses_{}".format(self._lipid)) if save_dir is not None \ + else check_dir(self._save_dir, "Bound_Poses_{}".format(self._lipid)) + + # Initialize/extend per-residue RMSD column + if "Binding Site Pose RMSD" in self.dataset.columns: + pose_rmsd_per_residue = self.dataset["Binding Site Pose RMSD"].values + else: + pose_rmsd_per_residue = np.zeros(self._nresi_per_protein) + + # Which binding sites to process + if binding_site_id is not None: + selected_bs_id = np.atleast_1d(binding_site_id) + else: + selected_bs_id = np.arange(len(self._node_list), dtype=int) + + # Map of selected binding sites -> node lists + selected_bs_map = {bs_id: self._node_list[bs_id] for bs_id in selected_bs_id} + + # --- Heartbeat: announce pose collection -------------------------------- + print(f"[PyLipID] Gathering bound poses for analysis…") + print(f"[PyLipID] Binding sites to analyse: {len(selected_bs_id)}") + + # Collect bound poses and metadata + pose_traj, pose_info = collect_bound_poses( + selected_bs_map, + self._contact_residues_low, + self._trajfile_list, + self._topfile_list, + self._lipid, + self._protein_ref, + self._lipid_ref, + stride=self._stride, + nprot=self._nprot + ) + + # Quick per-site pose counts (helps reassure users on large jobs) + for bs_id in selected_bs_id: + try: + n_poses = len(pose_traj[bs_id]) + except Exception: + n_poses = 0 + print(f"[PyLipID] - Binding Site {bs_id}: {n_poses} poses") + + # Build atom index lists used by the analyzers + protein_atom_indices = [[atom.index for atom in residue.atoms] + for residue in self._protein_ref.top.residues] + lipid_atom_indices = [self._protein_ref.n_atoms + atom_idx + for atom_idx in np.arange(self._lipid_ref.n_atoms)] + + # Atom weights (default 1.0 
each; allow user override by name) + atom_weights = {atom_idx: 1 for atom_idx in np.arange(self._lipid_ref.n_atoms)} + if score_weights is not None: + translate = {atom_idx: score_weights[self._lipid_ref.top.atom(atom_idx).name] + for atom_idx in np.arange(self._lipid_ref.n_atoms) + if self._lipid_Ref.top.atom(atom_idx).name in score_weights.keys()} + atom_weights.update(translate) + + RMSD_set = {} + + # If n_top_poses == 0, we skip the heavy scoring/clustering entirely + if n_top_poses > 0: + # --- Heartbeat: announce the heavy parallel step -------------------- + # Number of workers actually used + import os + n_workers = num_cpus if (isinstance(num_cpus, int) and num_cpus > 0) else (os.cpu_count() or 1) + n_workers = min(n_workers, max(1, len(selected_bs_id))) + print(f"[PyLipID] Starting multiprocessing for bound pose scoring/clustering… " + f"(sites={len(selected_bs_id)}, workers={n_workers})") + print(f"[PyLipID] Tip: reduce num_cpus or set n_top_poses=0 / n_clusters=0 to speed up.") + + # Prepare the task closure + task = partial(analyze_pose_wrapper, + protein_atom_indices=protein_atom_indices, + lipid_atom_indices=lipid_atom_indices, + n_top_poses=n_top_poses, + pose_dir=pose_dir, + atom_weights=atom_weights, + kde_bw=kde_bw, + pca_component=pca_component, + pose_format=pose_format, + n_clusters=n_clusters, + eps=eps, + min_samples=min_samples, + metric=metric, + trajfile_list=self._trajfile_list) + + # Run in parallel with a visible tqdm progress bar + print("[PyLipID] Calculating binding-site koff values… this may take some time.") + rmsd_set = _spawn_pmap(task, + selected_bs_id, + [pose_traj[bs_id] for bs_id in selected_bs_id], + [self._node_list[bs_id] for bs_id in selected_bs_id], + [pose_info[bs_id] for bs_id in selected_bs_id], + num_cpus=num_cpus, + desc="ANALYZE BOUND POSES") + + # Collect RMSD output per site and update per-residue RMSD column + for bs_id, rmsd in zip(selected_bs_id, rmsd_set): + RMSD_set[f"Binding Site {bs_id}"] = rmsd + 
pose_rmsd_per_residue[self._node_list[bs_id]] = np.mean(RMSD_set[f"Binding Site {bs_id}"]) + + # Update dataset + self.dataset["Binding Site Pose RMSD"] = pose_rmsd_per_residue + pose_rmsd_data = pd.DataFrame(dict([(bs_label, pd.Series(rmsd_set)) + for bs_label, rmsd_set in RMSD_set.items()])) + + # Plot RMSD violin (lazy import to avoid GUI in workers) + if plot_rmsd and n_top_poses > 0 and not pose_rmsd_data.empty: + from ..plot import plot_binding_site_data # lazy import + plot_binding_site_data( + pose_rmsd_data, + os.path.join(pose_dir, f"Pose_RMSD_violinplot.{fig_format}"), + title=f"{self._lipid}", + ylabel="RMSD (nm)", + fig_close=fig_close + ) + + return pose_traj, pose_rmsd_data def compute_surface_area(self, binding_site_id=None, radii=None, plot_data=True, save_dir=None, fig_close=False, fig_format="pdf", num_cpus=None): From 5d6de35eecf8d5ac91743f083680b5942e5f66fc Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Thu, 12 Mar 2026 21:17:05 +0000 Subject: [PATCH 21/24] Update api.py --- pylipid/api/api.py | 249 ++++++++++++++++++++++----------------------- 1 file changed, 124 insertions(+), 125 deletions(-) diff --git a/pylipid/api/api.py b/pylipid/api/api.py index 3342fa3..ba33b44 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -1201,132 +1201,131 @@ def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="a score_weights=None, kde_bw=0.15, pca_component=0.90, plot_rmsd=True, save_dir=None, eps=None, min_samples=None, metric="euclidean", fig_close=False, fig_format="pdf", num_cpus=None): - r"""Analyze bound poses for binding sites. 
- - (Docstring unchanged — omitted here for brevity) - """ - # Ensure required upstream steps are done - self._check_calculation("Binding Site ID", self.compute_binding_nodes, print_data=False) - - # Where to save pose outputs - pose_dir = check_dir(save_dir, "Bound_Poses_{}".format(self._lipid)) if save_dir is not None \ - else check_dir(self._save_dir, "Bound_Poses_{}".format(self._lipid)) - - # Initialize/extend per-residue RMSD column - if "Binding Site Pose RMSD" in self.dataset.columns: - pose_rmsd_per_residue = self.dataset["Binding Site Pose RMSD"].values - else: - pose_rmsd_per_residue = np.zeros(self._nresi_per_protein) - - # Which binding sites to process - if binding_site_id is not None: - selected_bs_id = np.atleast_1d(binding_site_id) - else: - selected_bs_id = np.arange(len(self._node_list), dtype=int) - - # Map of selected binding sites -> node lists - selected_bs_map = {bs_id: self._node_list[bs_id] for bs_id in selected_bs_id} - - # --- Heartbeat: announce pose collection -------------------------------- - print(f"[PyLipID] Gathering bound poses for analysis…") - print(f"[PyLipID] Binding sites to analyse: {len(selected_bs_id)}") - - # Collect bound poses and metadata - pose_traj, pose_info = collect_bound_poses( - selected_bs_map, - self._contact_residues_low, - self._trajfile_list, - self._topfile_list, - self._lipid, - self._protein_ref, - self._lipid_ref, - stride=self._stride, - nprot=self._nprot - ) - - # Quick per-site pose counts (helps reassure users on large jobs) - for bs_id in selected_bs_id: - try: - n_poses = len(pose_traj[bs_id]) - except Exception: - n_poses = 0 - print(f"[PyLipID] - Binding Site {bs_id}: {n_poses} poses") - - # Build atom index lists used by the analyzers - protein_atom_indices = [[atom.index for atom in residue.atoms] - for residue in self._protein_ref.top.residues] - lipid_atom_indices = [self._protein_ref.n_atoms + atom_idx - for atom_idx in np.arange(self._lipid_ref.n_atoms)] - - # Atom weights (default 1.0 
each; allow user override by name) - atom_weights = {atom_idx: 1 for atom_idx in np.arange(self._lipid_ref.n_atoms)} - if score_weights is not None: - translate = {atom_idx: score_weights[self._lipid_ref.top.atom(atom_idx).name] - for atom_idx in np.arange(self._lipid_ref.n_atoms) - if self._lipid_Ref.top.atom(atom_idx).name in score_weights.keys()} - atom_weights.update(translate) - - RMSD_set = {} - - # If n_top_poses == 0, we skip the heavy scoring/clustering entirely - if n_top_poses > 0: - # --- Heartbeat: announce the heavy parallel step -------------------- - # Number of workers actually used - import os - n_workers = num_cpus if (isinstance(num_cpus, int) and num_cpus > 0) else (os.cpu_count() or 1) - n_workers = min(n_workers, max(1, len(selected_bs_id))) - print(f"[PyLipID] Starting multiprocessing for bound pose scoring/clustering… " - f"(sites={len(selected_bs_id)}, workers={n_workers})") - print(f"[PyLipID] Tip: reduce num_cpus or set n_top_poses=0 / n_clusters=0 to speed up.") - - # Prepare the task closure - task = partial(analyze_pose_wrapper, - protein_atom_indices=protein_atom_indices, - lipid_atom_indices=lipid_atom_indices, - n_top_poses=n_top_poses, - pose_dir=pose_dir, - atom_weights=atom_weights, - kde_bw=kde_bw, - pca_component=pca_component, - pose_format=pose_format, - n_clusters=n_clusters, - eps=eps, - min_samples=min_samples, - metric=metric, - trajfile_list=self._trajfile_list) - - # Run in parallel with a visible tqdm progress bar - print("[PyLipID] Calculating binding-site koff values… this may take some time.") - rmsd_set = _spawn_pmap(task, - selected_bs_id, - [pose_traj[bs_id] for bs_id in selected_bs_id], - [self._node_list[bs_id] for bs_id in selected_bs_id], - [pose_info[bs_id] for bs_id in selected_bs_id], - num_cpus=num_cpus, - desc="ANALYZE BOUND POSES") - - # Collect RMSD output per site and update per-residue RMSD column - for bs_id, rmsd in zip(selected_bs_id, rmsd_set): - RMSD_set[f"Binding Site {bs_id}"] = rmsd - 
pose_rmsd_per_residue[self._node_list[bs_id]] = np.mean(RMSD_set[f"Binding Site {bs_id}"]) - - # Update dataset - self.dataset["Binding Site Pose RMSD"] = pose_rmsd_per_residue - pose_rmsd_data = pd.DataFrame(dict([(bs_label, pd.Series(rmsd_set)) - for bs_label, rmsd_set in RMSD_set.items()])) - - # Plot RMSD violin (lazy import to avoid GUI in workers) - if plot_rmsd and n_top_poses > 0 and not pose_rmsd_data.empty: - from ..plot import plot_binding_site_data # lazy import - plot_binding_site_data( - pose_rmsd_data, - os.path.join(pose_dir, f"Pose_RMSD_violinplot.{fig_format}"), - title=f"{self._lipid}", - ylabel="RMSD (nm)", - fig_close=fig_close + r"""Analyze bound poses for binding sites. + + (Docstring unchanged — omitted here for brevity) + """ + # Ensure required upstream steps are done + self._check_calculation("Binding Site ID", self.compute_binding_nodes, print_data=False) + + # Where to save pose outputs + pose_dir = check_dir(save_dir, "Bound_Poses_{}".format(self._lipid)) if save_dir is not None \ + else check_dir(self._save_dir, "Bound_Poses_{}".format(self._lipid)) + + # Initialize/extend per-residue RMSD column + if "Binding Site Pose RMSD" in self.dataset.columns: + pose_rmsd_per_residue = self.dataset["Binding Site Pose RMSD"].values + else: + pose_rmsd_per_residue = np.zeros(self._nresi_per_protein) + + # Which binding sites to process + if binding_site_id is not None: + selected_bs_id = np.atleast_1d(binding_site_id) + else: + selected_bs_id = np.arange(len(self._node_list), dtype=int) + + # Map of selected binding sites -> node lists + selected_bs_map = {bs_id: self._node_list[bs_id] for bs_id in selected_bs_id} + + # --- Heartbeat: announce pose collection -------------------------------- + print(f"[PyLipID] Gathering bound poses for analysis…") + print(f"[PyLipID] Binding sites to analyse: {len(selected_bs_id)}") + + # Collect bound poses and metadata + pose_traj, pose_info = collect_bound_poses( + selected_bs_map, + 
self._contact_residues_low, + self._trajfile_list, + self._topfile_list, + self._lipid, + self._protein_ref, + self._lipid_ref, + stride=self._stride, + nprot=self._nprot ) - - return pose_traj, pose_rmsd_data + + # Quick per-site pose counts (helps reassure users on large jobs) + for bs_id in selected_bs_id: + try: + n_poses = len(pose_traj[bs_id]) + except Exception: + n_poses = 0 + print(f"[PyLipID] - Binding Site {bs_id}: {n_poses} poses") + + # Build atom index lists used by the analyzers + protein_atom_indices = [[atom.index for atom in residue.atoms] + for residue in self._protein_ref.top.residues] + lipid_atom_indices = [self._protein_ref.n_atoms + atom_idx + for atom_idx in np.arange(self._lipid_ref.n_atoms)] + + # Atom weights (default 1.0 each; allow user override by name) + atom_weights = {atom_idx: 1 for atom_idx in np.arange(self._lipid_ref.n_atoms)} + if score_weights is not None: + translate = {atom_idx: score_weights[self._lipid_ref.top.atom(atom_idx).name] + for atom_idx in np.arange(self._lipid_ref.n_atoms) + if self._lipid_Ref.top.atom(atom_idx).name in score_weights.keys()} + atom_weights.update(translate) + + RMSD_set = {} + + # If n_top_poses == 0, we skip the heavy scoring/clustering entirely + if n_top_poses > 0: + # --- Heartbeat: announce the heavy parallel step -------------------- + # Number of workers actually used + import os + n_workers = num_cpus if (isinstance(num_cpus, int) and num_cpus > 0) else (os.cpu_count() or 1) + n_workers = min(n_workers, max(1, len(selected_bs_id))) + print(f"[PyLipID] Starting multiprocessing for bound pose scoring/clustering… " + f"(sites={len(selected_bs_id)}, workers={n_workers})") + print(f"[PyLipID] Tip: reduce num_cpus or set n_top_poses=0 / n_clusters=0 to speed up.") + + # Prepare the task closure + task = partial(analyze_pose_wrapper, + protein_atom_indices=protein_atom_indices, + lipid_atom_indices=lipid_atom_indices, + n_top_poses=n_top_poses, + pose_dir=pose_dir, + atom_weights=atom_weights, 
+ kde_bw=kde_bw, + pca_component=pca_component, + pose_format=pose_format, + n_clusters=n_clusters, + eps=eps, + min_samples=min_samples, + metric=metric, + trajfile_list=self._trajfile_list) + + # Run in parallel with a visible tqdm progress bar + rmsd_set = _spawn_pmap(task, + selected_bs_id, + [pose_traj[bs_id] for bs_id in selected_bs_id], + [self._node_list[bs_id] for bs_id in selected_bs_id], + [pose_info[bs_id] for bs_id in selected_bs_id], + num_cpus=num_cpus, + desc="ANALYZE BOUND POSES") + + # Collect RMSD output per site and update per-residue RMSD column + for bs_id, rmsd in zip(selected_bs_id, rmsd_set): + RMSD_set[f"Binding Site {bs_id}"] = rmsd + pose_rmsd_per_residue[self._node_list[bs_id]] = np.mean(RMSD_set[f"Binding Site {bs_id}"]) + + # Update dataset + self.dataset["Binding Site Pose RMSD"] = pose_rmsd_per_residue + pose_rmsd_data = pd.DataFrame(dict([(bs_label, pd.Series(rmsd_set)) + for bs_label, rmsd_set in RMSD_set.items()])) + + # Plot RMSD violin (lazy import to avoid GUI in workers) + if plot_rmsd and n_top_poses > 0 and not pose_rmsd_data.empty: + from ..plot import plot_binding_site_data # lazy import + plot_binding_site_data( + pose_rmsd_data, + os.path.join(pose_dir, f"Pose_RMSD_violinplot.{fig_format}"), + title=f"{self._lipid}", + ylabel="RMSD (nm)", + fig_close=fig_close + ) + + return pose_traj, pose_rmsd_data def compute_surface_area(self, binding_site_id=None, radii=None, plot_data=True, save_dir=None, fig_close=False, fig_format="pdf", num_cpus=None): From c4ee8d833ff90502e8c3de856ccf7f347f2a7b66 Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Thu, 12 Mar 2026 21:48:39 +0000 Subject: [PATCH 22/24] Update directory.py --- pylipid/util/directory.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pylipid/util/directory.py b/pylipid/util/directory.py index 114dce2..0b2f9df 100644 --- a/pylipid/util/directory.py +++ b/pylipid/util/directory.py @@ -21,17 +21,19 @@ __all__ = ["check_dir"] -def 
check_dir(root, name=None): +def check_dir(root, name=None, print_info=True): """ - Create a directory under 'root'. If 'name' is provided, the directory - will be root/name. Returns the full path. + Create (if needed) and return a directory under 'root'. + If 'name' is provided, the directory will be 'root/name'. + Set print_info=False to suppress the creation message. """ import os path = os.path.join(root, name) if name is not None else root if not os.path.isdir(path): - print(f"Creating new directory: {path}") + if print_info: + print(f"Creating new directory: {path}") os.makedirs(path, exist_ok=True) return path From 7f10a3681241c3cb1f5324332b5cbe9b05b99495 Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Fri, 13 Mar 2026 07:04:00 +0000 Subject: [PATCH 23/24] Update api.py --- pylipid/api/api.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pylipid/api/api.py b/pylipid/api/api.py index ba33b44..1c06b08 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -1277,7 +1277,7 @@ def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="a n_workers = min(n_workers, max(1, len(selected_bs_id))) print(f"[PyLipID] Starting multiprocessing for bound pose scoring/clustering… " f"(sites={len(selected_bs_id)}, workers={n_workers})") - print(f"[PyLipID] Tip: reduce num_cpus or set n_top_poses=0 / n_clusters=0 to speed up.") + #print(f"[PyLipID] Reducing num_cpus or set n_top_poses=0 / n_clusters=0 to speed up.") # Prepare the task closure task = partial(analyze_pose_wrapper, @@ -1296,6 +1296,7 @@ def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="a trajfile_list=self._trajfile_list) # Run in parallel with a visible tqdm progress bar + print("[PyLipID] Calculating binding-site koff values… this may take some time.") rmsd_set = _spawn_pmap(task, selected_bs_id, [pose_traj[bs_id] for bs_id in selected_bs_id], @@ -1303,7 +1304,7 @@ def analyze_bound_poses(self, binding_site_id=None, 
n_top_poses=3, n_clusters="a [pose_info[bs_id] for bs_id in selected_bs_id], num_cpus=num_cpus, desc="ANALYZE BOUND POSES") - + print("[PyLipID] Finished calculating binding-site koff values.") # Collect RMSD output per site and update per-residue RMSD column for bs_id, rmsd in zip(selected_bs_id, rmsd_set): RMSD_set[f"Binding Site {bs_id}"] = rmsd From 92ad411a823326270e0bc1b7b6afac98be42bd7c Mon Sep 17 00:00:00 2001 From: Phill Stansfeld Date: Fri, 13 Mar 2026 08:08:04 +0000 Subject: [PATCH 24/24] updates to calcs --- pylipid/api/api.py | 6 ++-- pylipid/func/binding_site.py | 69 ++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/pylipid/api/api.py b/pylipid/api/api.py index 1c06b08..693c55b 100644 --- a/pylipid/api/api.py +++ b/pylipid/api/api.py @@ -1263,7 +1263,7 @@ def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="a if score_weights is not None: translate = {atom_idx: score_weights[self._lipid_ref.top.atom(atom_idx).name] for atom_idx in np.arange(self._lipid_ref.n_atoms) - if self._lipid_Ref.top.atom(atom_idx).name in score_weights.keys()} + if self._lipid_ref.top.atom(atom_idx).name in score_weights.keys()} atom_weights.update(translate) RMSD_set = {} @@ -1296,7 +1296,7 @@ def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="a trajfile_list=self._trajfile_list) # Run in parallel with a visible tqdm progress bar - print("[PyLipID] Calculating binding-site koff values… this may take some time.") + print("[PyLipID] Scoring and clustering bound poses… this may take some time.") rmsd_set = _spawn_pmap(task, selected_bs_id, [pose_traj[bs_id] for bs_id in selected_bs_id], @@ -1304,7 +1304,7 @@ def analyze_bound_poses(self, binding_site_id=None, n_top_poses=3, n_clusters="a [pose_info[bs_id] for bs_id in selected_bs_id], num_cpus=num_cpus, desc="ANALYZE BOUND POSES") - print("[PyLipID] Finished calculating binding-site koff values.") + print("[PyLipID] Finished 
clustering.") # Collect RMSD output per site and update per-residue RMSD column for bs_id, rmsd in zip(selected_bs_id, rmsd_set): RMSD_set[f"Binding Site {bs_id}"] = rmsd diff --git a/pylipid/func/binding_site.py b/pylipid/func/binding_site.py index aa8398f..e08d34c 100644 --- a/pylipid/func/binding_site.py +++ b/pylipid/func/binding_site.py @@ -201,16 +201,28 @@ def collect_bound_poses(binding_site_map, contact_residue_index, trajfile_list, contact_BS = [np.unique(np.concatenate( [contact_residue_index[node][list_to_take][frame_idx] for node in nodes])) for frame_idx in range(traj.n_frames)] - for frame_id in range(len(contact_BS)): - if len(contact_BS[frame_id]) > 0: - for lipid_id in contact_BS[frame_id]: - lipid_residue_index = lipid_residue_index_set[int(lipid_id)] - lipid_atom_indices = np.sort( - [atom.index for atom in traj.top.residue(lipid_residue_index).atoms]) - pose_pool[bs_id].append( - [np.copy(traj.xyz[frame_id, np.hstack([protein_indices, lipid_atom_indices])]), - np.copy(traj.unitcell_angles[frame_id]), np.copy(traj.unitcell_lengths[frame_id])]) - pose_info[bs_id].append((traj_idx, protein_idx, lipid_residue_index, frame_id*traj.timestep)) + # Group (frame_id, lipid_res_idx) pairs by lipid molecule so we can + # batch-extract all frames for each lipid with a single numpy slice + # instead of copying coordinates one frame at a time. 
+ lip_frames = defaultdict(list) # {lipid_res_idx: [frame_ids]} + for frame_id, lipids_at_frame in enumerate(contact_BS): + for lipid_id in lipids_at_frame: + lip_res_idx = lipid_residue_index_set[int(lipid_id)] + lip_frames[lip_res_idx].append(frame_id) + for lip_res_idx, frame_ids in lip_frames.items(): + lip_atoms = np.sort( + [atom.index for atom in traj.top.residue(lip_res_idx).atoms]) + combined = np.hstack([protein_indices, lip_atoms]) + frame_arr = np.array(frame_ids) + # Single batch slice for all frames of this lipid + batch_xyz = traj.xyz[frame_arr][:, combined] + batch_angles = traj.unitcell_angles[frame_arr] + batch_lengths = traj.unitcell_lengths[frame_arr] + for i, frame_id in enumerate(frame_ids): + pose_pool[bs_id].append( + [batch_xyz[i], batch_angles[i], batch_lengths[i]]) + pose_info[bs_id].append( + (traj_idx, protein_idx, lip_res_idx, frame_id * traj.timestep)) pose_traj = {} for bs_id in pose_pool.keys(): pose_traj[bs_id] = md.Trajectory([frame[0] for frame in pose_pool[bs_id]], joined_top, @@ -250,12 +262,30 @@ def vectorize_poses(bound_poses, binding_nodes, protein_atom_indices, lipid_atom The distance matrix of the bound poses, in the shape of [n_lipid_atoms, n_poses, n_binding_site_residues] """ - # calculate distance to binding site residues for each lipid atom - dist_per_atom = np.array( - [np.array([md.compute_distances(bound_poses, - list(product([lipid_atom_indices[idx]], protein_atom_indices[node])), - periodic=True).min(axis=1) for node in binding_nodes]).T - for idx in np.arange(len(lipid_atom_indices))]) # shape: [n_lipid_atoms, n_poses, n_BS_residues] + n_lip = len(lipid_atom_indices) + n_nodes = len(binding_nodes) + + # Build one flat pair list covering every (lipid_atom × node_residue_atom) combination. + # block_starts[i] marks where the i-th (lipid_atom, node) block begins so that + # np.minimum.reduceat can collapse each block to its per-residue minimum in one pass. 
+ # This replaces n_lip * n_nodes separate md.compute_distances calls with a single call. + all_pairs = [] + block_starts = [] # length = n_lip * n_nodes + + for lip_atom in lipid_atom_indices: + for node in binding_nodes: + block_starts.append(len(all_pairs)) + for prot_atom in protein_atom_indices[node]: + all_pairs.append((lip_atom, prot_atom)) + + # One call → shape [n_frames, n_total_pairs] + all_dists = md.compute_distances(bound_poses, all_pairs, periodic=True) + + # Per-block minimum → shape [n_frames, n_lip * n_nodes] + min_dists = np.minimum.reduceat(all_dists, block_starts, axis=1) + + # Reshape to [n_frames, n_lip, n_nodes] then transpose → [n_lip, n_frames, n_nodes] + dist_per_atom = min_dists.reshape(bound_poses.n_frames, n_lip, n_nodes).transpose(1, 0, 2) return dist_per_atom @@ -521,9 +551,10 @@ def analyze_pose_wrapper(bs_id, bound_poses, binding_nodes, pose_info, pose_dir= _write_pose_info([pose_info[int(pose_idx)] for pose_idx in selected_pose_indices], f"{pose_dir_clustered}/pose_info.txt", trajfile_list) ## calculate RMSD ## - dist_mean = np.mean(lipid_dist_per_pose, axis=0) - pose_rmsds = [rmsd(lipid_dist_per_pose[pose_id], dist_mean) - for pose_id in np.arange(len(lipid_dist_per_pose))] + # lipid_dist_per_pose is already computed above; vectorise over all poses at once + # instead of looping frame by frame. + dist_mean = lipid_dist_per_pose.mean(axis=0) + pose_rmsds = np.sqrt(((lipid_dist_per_pose - dist_mean) ** 2).mean(axis=1)).tolist() return pose_rmsds