From a46b5f7563a63d875f045b1c34ad6213ce0f5da9 Mon Sep 17 00:00:00 2001 From: Angeliki Vliora Date: Tue, 12 May 2026 11:08:28 +0200 Subject: [PATCH 1/3] Add pLDDT analysis script for FoldX5 vs FoldX5.1 classification agreement --- .../6.classification_plddt.py | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100755 tools/foldx_version_comparison/6.classification_plddt.py diff --git a/tools/foldx_version_comparison/6.classification_plddt.py b/tools/foldx_version_comparison/6.classification_plddt.py new file mode 100755 index 0000000..699edf1 --- /dev/null +++ b/tools/foldx_version_comparison/6.classification_plddt.py @@ -0,0 +1,206 @@ +# pLDDT analysis: are disagreements between FoldX5 and FoldX5.1 enriched in structurally uncertain regions? +# Each mutation is treated as a separate data point (mutations at the same position share the same pLDDT) +# Level 1: global plots across all proteins +# Level 2: per-protein summary to check if effect is consistent and not driven by large high-pLDDT proteins +# Run: python 6.classification_plddt.py -f /path/to/foldx5/dataset_tables -i /path/to/foldx51/dataset_tables -o ./plddt_results +import os +import glob +import argparse +import warnings +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib as mpl +import seaborn as sns +warnings.filterwarnings("ignore") +mpl.rcParams.update({ + "font.family": "sans-serif", + "font.size": 12, + "axes.titlesize": 13, + "axes.labelsize": 12, + "xtick.labelsize": 11, + "ytick.labelsize": 11, + "legend.fontsize": 11, + "axes.spines.top": False, + "axes.spines.right": False, +}) +color_agree = "#4A90A4" +color_disagree = "#B22222" +color_dist = "#6A9CC4" +col_ddg = "Stability (FoldX5, alphafold, kcal/mol)" +col_plddt = "AlphaFold2 model pLDDT score" +col_mut = "Mutation" +def classify_ddg(ddg): + # MAVISp classification thresholds applied to FoldX ddG only + # stabilizing: ddG <= -3, destabilizing: ddG >= 3, neutral: -2 < ddG < 2, uncertain: everything else + if ddg <= -3: + return "stabilizing" + elif ddg >= 3: + return "destabilizing" + elif -2 < ddg < 2: + return "neutral" + else: + return "uncertain" +# 1. data loading +def load_all_proteins(foldx5_dir, foldx51_dir): + # load all csvs from both folders, match by filename + # classify each mutation independently + # label each mutation as agree or disagree between the two versions + foldx5_files = {Path(f).stem: f for f in glob.glob(os.path.join(foldx5_dir, "*.csv"))} + foldx51_files = {Path(f).stem: f for f in glob.glob(os.path.join(foldx51_dir, "*.csv"))} + common_proteins = sorted(set(foldx5_files.keys()) & set(foldx51_files.keys())) + print(f"proteins in foldx5: {len(foldx5_files)}") + print(f"proteins in foldx5.1: {len(foldx51_files)}") + print(f"common proteins: {len(common_proteins)}") + all_protein_data = [] + for stem in common_proteins: + protein_name = stem.replace("-simple_mode", "") + try: + df_foldx5 = pd.read_csv(foldx5_files[stem]) + df_foldx51 = pd.read_csv(foldx51_files[stem]) + df_foldx5 = df_foldx5.rename(columns={col_ddg: "ddg_foldx5"}) + df_foldx51 = df_foldx51.rename(columns={col_ddg: "ddg_foldx51"}) + df_merged = pd.merge( + df_foldx51[[col_mut, "ddg_foldx51", col_plddt]], + df_foldx5[[col_mut, "ddg_foldx5"]], + on=col_mut + ) + if df_merged.empty: + continue + df_merged["class_foldx5"] = df_merged["ddg_foldx5"].apply(classify_ddg) + df_merged["class_foldx51"] = df_merged["ddg_foldx51"].apply(classify_ddg) + df_merged["agreement"] = df_merged.apply( + lambda row: "agree" if row["class_foldx5"] == row["class_foldx51"] else "disagree", + axis=1 + ) + df_merged["ddg_diff"] = abs(df_merged["ddg_foldx51"] - df_merged["ddg_foldx5"]) + df_merged["protein_name"] = protein_name + all_protein_data.append(df_merged) + except Exception as e: + print(f" could not process {protein_name}: {e}") + continue + if not all_protein_data: + return pd.DataFrame() + return pd.concat(all_protein_data, ignore_index=True) +# 2. level 1 - global plots +def plot_plddt_distribution(df_mutations, output_dir): + # kde distribution of plddt scores for agree vs disagree mutations across all proteins + os.makedirs(output_dir, exist_ok=True) + fig, ax = plt.subplots(figsize=(8, 6)) + for group, color in [("agree", color_agree), ("disagree", color_disagree)]: + subset = df_mutations[df_mutations["agreement"] == group][col_plddt] + if subset.empty: + print(f" no data for {group} group") + continue + sns.kdeplot(subset, ax=ax, color=color, + label=f"{group} (n={len(subset)})", + fill=True, alpha=0.3, linewidth=2) + ax.axvline(subset.median(), color=color, linestyle="--", linewidth=1.5, alpha=0.8) + ax.set_xlabel("pLDDT score") + ax.set_ylabel("density") + ax.set_title("pLDDT distribution by classification agreement\nFoldX5 vs FoldX5.1 (all proteins)") + ax.legend(frameon=False) + plt.tight_layout() + out = os.path.join(output_dir, "plddt_distribution_agree_disagree.png") + plt.savefig(out, dpi=150, bbox_inches="tight") + plt.close() + print(f"saved: {out}") +def plot_plddt_scatter(df_mutations, output_dir): + # scatter: plddt vs absolute ddg difference per mutation, colored by agree/disagree + fig, ax = plt.subplots(figsize=(8, 6)) + for group, color in [("agree", color_agree), ("disagree", color_disagree)]: + subset = df_mutations[df_mutations["agreement"] == group] + if subset.empty: + continue + ax.scatter(subset[col_plddt], subset["ddg_diff"], + color=color, alpha=0.5, s=40, + label=f"{group} (n={len(subset)})", + edgecolors="none") + ax.set_xlabel("pLDDT score") + ax.set_ylabel("|ΔΔG FoldX5.1 − ΔΔG FoldX5| (kcal/mol)") + ax.set_title("pLDDT vs prediction difference per mutation\nFoldX5 vs FoldX5.1 (all proteins)") + ax.legend(frameon=False) + plt.tight_layout() + out = os.path.join(output_dir, "plddt_vs_ddg_diff_scatter.png") + plt.savefig(out, dpi=150, bbox_inches="tight") + plt.close() + print(f"saved: {out}") + + +# 3. level 2 - per-protein summary +def plot_per_protein_delta(df_mutations, output_dir): + per_protein_results = [] + for protein_name, protein_group in df_mutations.groupby("protein_name"): + agree_plddt = protein_group[protein_group["agreement"] == "agree"][col_plddt] + disagree_plddt = protein_group[protein_group["agreement"] == "disagree"][col_plddt] + if agree_plddt.empty or disagree_plddt.empty: + continue + delta_median = disagree_plddt.median() - agree_plddt.median() + per_protein_results.append({ + "protein_name": protein_name, + "median_plddt_agree": agree_plddt.median(), + "median_plddt_disagree": disagree_plddt.median(), + "delta_median_plddt": delta_median, + "n_agree_mutations": len(agree_plddt), + "n_disagree_mutations": len(disagree_plddt), + }) + if not per_protein_results: + print(" no proteins with both agree and disagree mutations — skipping level 2 plot") + return + df_per_protein = pd.DataFrame(per_protein_results) + out_csv = os.path.join(output_dir, "per_protein_plddt_delta.csv") + df_per_protein.to_csv(out_csv, index=False) + print(f"saved: {out_csv}") + n_negative = (df_per_protein["delta_median_plddt"] < 0).sum() + n_positive = (df_per_protein["delta_median_plddt"] > 0).sum() + print(f" proteins where disagree plddt < agree plddt: {n_negative}") + print(f" proteins where disagree plddt > agree plddt: {n_positive}") + fig, ax = plt.subplots(figsize=(8, 6)) + sns.histplot(df_per_protein["delta_median_plddt"], kde=True, ax=ax, + color=color_dist, edgecolor="white", bins=20, alpha=0.8) + ax.axvline(0, color="#555555", linestyle="--", linewidth=1.5, label="no difference (0)") + ax.axvline(df_per_protein["delta_median_plddt"].median(), color=color_disagree, + linestyle="--", linewidth=1.5, + label=f"median: {df_per_protein['delta_median_plddt'].median():.1f}") + ax.set_xlabel("median pLDDT (disagree) − median pLDDT (agree)") + ax.set_ylabel("number of proteins") + ax.set_title(f"per-protein pLDDT difference: disagree vs agree mutations\n" + f"(n={len(df_per_protein)} proteins — " + f"{n_negative} negative, {n_positive} positive)") + ax.legend(frameon=False) + plt.tight_layout() + out = os.path.join(output_dir, "per_protein_plddt_delta.png") + plt.savefig(out, dpi=150, bbox_inches="tight") + plt.close() + print(f"saved: {out}") + + +# 4. main +def main(): + parser = argparse.ArgumentParser(description="plddt analysis: disagreements between foldx5 and foldx5.1") + parser.add_argument("-f", "--foldx5_dir", required=True, help="path to foldx5 dataset_tables folder") + parser.add_argument("-i", "--foldx51_dir", required=True, help="path to foldx5.1 dataset_tables folder") + parser.add_argument("-o", "--output_dir", default="./plddt_results", help="where to save results") + args = parser.parse_args() + print("plddt analysis: foldx5 vs foldx5.1 classification agreement\n") + print("[1] loading data...") + df_mutations = load_all_proteins(args.foldx5_dir, args.foldx51_dir) + if df_mutations.empty: + print("error: no data loaded. check folder paths and csv structure.") + return + print(f"\ntotal mutations: {len(df_mutations)}") + print(f"agree: {(df_mutations['agreement'] == 'agree').sum()}") + print(f"disagree: {(df_mutations['agreement'] == 'disagree').sum()}") + os.makedirs(args.output_dir, exist_ok=True) + csv_out = os.path.join(args.output_dir, "mutation_agreement_results.csv") + df_mutations.to_csv(csv_out, index=False) + print(f"\nsaved results table: {csv_out}") + print("\n[2] global plots...") + plot_plddt_distribution(df_mutations, args.output_dir) + plot_plddt_scatter(df_mutations, args.output_dir) + print("\n[3] per-protein summary...") + plot_per_protein_delta(df_mutations, args.output_dir) + print("\ndone.") +if __name__ == "__main__": + main() From 5779d83db9fa98ba730a7090f33f42b0395894f3 Mon Sep 17 00:00:00 2001 From: Angeliki Vliora Date: Wed, 13 May 2026 16:01:29 +0200 Subject: [PATCH 2/3] Add RMSD matrix script for FoldX5 vs FoldX5.1 initial structures --- .../6.classification_plddt.py | 18 +++++++++++++++++- tools/foldx_version_comparison/rmsd_matrix.py | 11 ++++++----- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/foldx_version_comparison/6.classification_plddt.py b/tools/foldx_version_comparison/6.classification_plddt.py index 699edf1..33aaae6 100755 --- a/tools/foldx_version_comparison/6.classification_plddt.py +++ b/tools/foldx_version_comparison/6.classification_plddt.py @@ -87,7 +87,9 @@ def load_all_proteins(foldx5_dir, foldx51_dir): def plot_plddt_distribution(df_mutations, output_dir): # kde distribution of plddt scores for agree vs disagree mutations across all proteins os.makedirs(output_dir, exist_ok=True) - fig, ax = plt.subplots(figsize=(8, 6)) + fig, axes = plt.subplots(1, 2, figsize=(14, 6)) + ax = axes[0] + ax_bar = axes[1] for group, color in [("agree", color_agree), ("disagree", color_disagree)]: subset = df_mutations[df_mutations["agreement"] == group][col_plddt] if subset.empty: @@ -101,6 +103,20 @@ def plot_plddt_distribution(df_mutations, output_dir): ax.set_ylabel("density") ax.set_title("pLDDT distribution by classification agreement\nFoldX5 vs FoldX5.1 (all proteins)") ax.legend(frameon=False) + # barplot with 3 pLDDT-wide bins + bin_edges = np.arange(0, 101, 3) + for group, color in [("agree", color_agree), ("disagree", color_disagree)]: + subset = df_mutations[df_mutations["agreement"] == group][col_plddt] + if subset.empty: + continue + counts, edges = np.histogram(subset, bins=bin_edges) + counts = counts / counts.sum() # normalize to fraction + ax_bar.bar(edges[:-1], counts, width=3, color=color, alpha=0.6, + label=f"{group} (n={len(subset)})", align="edge", edgecolor="white") + ax_bar.set_xlabel("pLDDT score") + ax_bar.set_ylabel("Fraction of mutations") + ax_bar.set_title("pLDDT distribution by classification agreement\n(3 Å bins)") + ax_bar.legend(frameon=False) plt.tight_layout() out = os.path.join(output_dir, "plddt_distribution_agree_disagree.png") plt.savefig(out, dpi=150, bbox_inches="tight") diff --git a/tools/foldx_version_comparison/rmsd_matrix.py b/tools/foldx_version_comparison/rmsd_matrix.py index 149429a..99f4a73 100755 --- a/tools/foldx_version_comparison/rmsd_matrix.py +++ b/tools/foldx_version_comparison/rmsd_matrix.py @@ -2,7 +2,7 @@ # For each protein/domain in both datasets, computes all-atom and CA RMSD # Matching is done by UniProt ID + protein name # RMSD is computed only on overlapping residues between the two structures -# Run: python rmsd_matrix.py --foldx5_dir /path/to/folder --foldx51_dir /path/to/folder --output_dir ./rmsd_results +# Run: python rmsd_matrix.py -f /data/user/shared_projects/mavisp_ensemble_sim_length/foldx5.1_evaluation/foldx5_initial_structures -i /data/user/shared_projects/mavisp_ensemble_sim_length/foldx5.1_evaluation/data_collection_foldx5.1 -o ./rmsd_results import os import glob import argparse @@ -212,7 +212,8 @@ def plot_rmsd_distributions(df, output_dir): fig, axes = plt.subplots(1, 2, figsize=(12, 5)) fig.suptitle("RMSD distributions: FoldX5 vs FoldX5.1 initial structures", fontsize=13, fontweight="bold", y=1.01) for ax, col, title in zip(axes, ["rmsd_all_atoms", "rmsd_ca"], ["All-atom RMSD (Å)", "Cα RMSD (Å)"]): - sns.histplot(df_clean[col], ax=ax, color=COLOR_DIST, edgecolor="white", bins=10, alpha=0.8) + bin_edges = np.arange(0, df_clean[col].max() + 1, 1) + sns.histplot(df_clean[col], ax=ax, color=COLOR_DIST, edgecolor="white", bins=bin_edges, alpha=0.8) ax.axvline(df_clean[col].median(), color="#CC0000", linestyle="--", linewidth=1.5, label=f"Median: {df_clean[col].median():.2f} Å") ax.axvline(df_clean[col].mean(), color="#555555", linestyle=":", linewidth=1.5, label=f"Mean: {df_clean[col].mean():.2f} Å") ax.set_xlabel(title) @@ -286,9 +287,9 @@ def plot_rmsd_violin(df, output_dir): def main(): parser = argparse.ArgumentParser(description="Compute RMSD between FoldX5 and FoldX5.1 initial structures") - parser.add_argument("--foldx5_dir", required=True, help="Path to FoldX5 folder (organized by protein name)") - parser.add_argument("--foldx51_dir", required=True, help="Root of FoldX5.1 data collection folder") - parser.add_argument("--output_dir", default="./rmsd_results", help="Where to save results (default: ./rmsd_results)") + parser.add_argument("-f", "--foldx5_dir", required=True, help="Path to FoldX5 folder (organized by protein name)") + parser.add_argument("-i", "--foldx51_dir", required=True, help="Root of FoldX5.1 data collection folder") + parser.add_argument("-o", "--output_dir", default="./rmsd_results", help="Where to save results (default: ./rmsd_results)") args = parser.parse_args() print("RMSD analysis: FoldX5 vs FoldX5.1 initial structures") From bd446d7adef6a6dc5402b83405083f001f607ae2 Mon Sep 17 00:00:00 2001 From: Angeliki Vliora Date: Fri, 29 May 2026 11:23:11 +0200 Subject: [PATCH 3/3] Remove scatter, barplot and violin from rmsd script - inline plotting functions in plddt script --- .../6.classification_plddt.py | 100 +++++++++--------- tools/foldx_version_comparison/rmsd_matrix.py | 66 ------------ 2 files changed, 50 insertions(+), 116 deletions(-) diff --git a/tools/foldx_version_comparison/6.classification_plddt.py b/tools/foldx_version_comparison/6.classification_plddt.py index 33aaae6..8897199 100755 --- a/tools/foldx_version_comparison/6.classification_plddt.py +++ b/tools/foldx_version_comparison/6.classification_plddt.py @@ -1,8 +1,5 @@ # pLDDT analysis: are disagreements between FoldX5 and FoldX5.1 enriched in structurally uncertain regions? -# Each mutation is treated as a separate data point (mutations at the same position share the same pLDDT) -# Level 1: global plots across all proteins -# Level 2: per-protein summary to check if effect is consistent and not driven by large high-pLDDT proteins -# Run: python 6.classification_plddt.py -f /path/to/foldx5/dataset_tables -i /path/to/foldx51/dataset_tables -o ./plddt_results +# each mutation is treated as a separate data point (mutations at the same position share the same pLDDT) import os import glob import argparse @@ -10,7 +7,6 @@ from pathlib import Path import numpy as np import pandas as pd -import matplotlib.pyplot as plt import matplotlib as mpl import seaborn as sns warnings.filterwarnings("ignore") @@ -31,6 +27,8 @@ col_ddg = "Stability (FoldX5, alphafold, kcal/mol)" col_plddt = "AlphaFold2 model pLDDT score" col_mut = "Mutation" + + def classify_ddg(ddg): # MAVISp classification thresholds applied to FoldX ddG only # stabilizing: ddG <= -3, destabilizing: ddG >= 3, neutral: -2 < ddG < 2, uncertain: everything else @@ -42,7 +40,8 @@ def classify_ddg(ddg): return "neutral" else: return "uncertain" -# 1. data loading + + def load_all_proteins(foldx5_dir, foldx51_dir): # load all csvs from both folders, match by filename # classify each mutation independently @@ -77,23 +76,49 @@ def load_all_proteins(foldx5_dir, foldx51_dir): df_merged["ddg_diff"] = abs(df_merged["ddg_foldx51"] - df_merged["ddg_foldx5"]) df_merged["protein_name"] = protein_name all_protein_data.append(df_merged) - except Exception as e: - print(f" could not process {protein_name}: {e}") + except KeyError as e: + print(f" skipping {protein_name}: missing column {e}") continue + except Exception as e: + raise RuntimeError(f"unexpected error processing {protein_name}: {e}") if not all_protein_data: return pd.DataFrame() return pd.concat(all_protein_data, ignore_index=True) -# 2. level 1 - global plots -def plot_plddt_distribution(df_mutations, output_dir): - # kde distribution of plddt scores for agree vs disagree mutations across all proteins - os.makedirs(output_dir, exist_ok=True) + + +def main(): + parser = argparse.ArgumentParser(description="plddt analysis: disagreements between foldx5 and foldx5.1") + parser.add_argument("-f", "--foldx5_dir", required=True, help="path to foldx5 dataset_tables folder") + parser.add_argument("-i", "--foldx51_dir", required=True, help="path to foldx5.1 dataset_tables folder") + parser.add_argument("-o", "--output_dir", default="./plddt_results", help="where to save results") + args = parser.parse_args() + + print("plddt analysis: foldx5 vs foldx5.1 classification agreement\n") + print("[1] loading data...") + df_mutations = load_all_proteins(args.foldx5_dir, args.foldx51_dir) + if df_mutations.empty: + print("error: no data loaded. check folder paths and csv structure.") + return + + print(f"\ntotal mutations: {len(df_mutations)}") + print(f"agree: {(df_mutations['agreement'] == 'agree').sum()}") + print(f"disagree: {(df_mutations['agreement'] == 'disagree').sum()}") + + os.makedirs(args.output_dir, exist_ok=True) + csv_out = os.path.join(args.output_dir, "mutation_agreement_results.csv") + df_mutations.to_csv(csv_out, index=False) + print(f"\nsaved results table: {csv_out}") + + print("\n[2] global plots") + + # kde + barplot of plddt scores for agree vs disagree mutations + os.makedirs(args.output_dir, exist_ok=True) fig, axes = plt.subplots(1, 2, figsize=(14, 6)) ax = axes[0] ax_bar = axes[1] for group, color in [("agree", color_agree), ("disagree", color_disagree)]: subset = df_mutations[df_mutations["agreement"] == group][col_plddt] if subset.empty: - print(f" no data for {group} group") continue sns.kdeplot(subset, ax=ax, color=color, label=f"{group} (n={len(subset)})", @@ -103,27 +128,26 @@ def plot_plddt_distribution(df_mutations, output_dir): ax.set_ylabel("density") ax.set_title("pLDDT distribution by classification agreement\nFoldX5 vs FoldX5.1 (all proteins)") ax.legend(frameon=False) - # barplot with 3 pLDDT-wide bins bin_edges = np.arange(0, 101, 3) for group, color in [("agree", color_agree), ("disagree", color_disagree)]: subset = df_mutations[df_mutations["agreement"] == group][col_plddt] if subset.empty: continue counts, edges = np.histogram(subset, bins=bin_edges) - counts = counts / counts.sum() # normalize to fraction + counts = counts / counts.sum() ax_bar.bar(edges[:-1], counts, width=3, color=color, alpha=0.6, label=f"{group} (n={len(subset)})", align="edge", edgecolor="white") ax_bar.set_xlabel("pLDDT score") - ax_bar.set_ylabel("Fraction of mutations") + ax_bar.set_ylabel("fraction of mutations") ax_bar.set_title("pLDDT distribution by classification agreement\n(3 Å bins)") ax_bar.legend(frameon=False) plt.tight_layout() - out = os.path.join(output_dir, "plddt_distribution_agree_disagree.png") + out = os.path.join(args.output_dir, "plddt_distribution_agree_disagree.png") plt.savefig(out, dpi=150, bbox_inches="tight") plt.close() print(f"saved: {out}") -def plot_plddt_scatter(df_mutations, output_dir): - # scatter: plddt vs absolute ddg difference per mutation, colored by agree/disagree + + # scatter: plddt vs absolute ddg difference per mutation fig, ax = plt.subplots(figsize=(8, 6)) for group, color in [("agree", color_agree), ("disagree", color_disagree)]: subset = df_mutations[df_mutations["agreement"] == group] @@ -138,14 +162,14 @@ def plot_plddt_scatter(df_mutations, output_dir): ax.set_title("pLDDT vs prediction difference per mutation\nFoldX5 vs FoldX5.1 (all proteins)") ax.legend(frameon=False) plt.tight_layout() - out = os.path.join(output_dir, "plddt_vs_ddg_diff_scatter.png") + out = os.path.join(args.output_dir, "plddt_vs_ddg_diff_scatter.png") plt.savefig(out, dpi=150, bbox_inches="tight") plt.close() print(f"saved: {out}") + print("\n[3] per-protein summary") -# 3. level 2 - per-protein summary -def plot_per_protein_delta(df_mutations, output_dir): + # per-protein: median plddt of disagree - median plddt of agree per_protein_results = [] for protein_name, protein_group in df_mutations.groupby("protein_name"): agree_plddt = protein_group[protein_group["agreement"] == "agree"][col_plddt] @@ -165,7 +189,7 @@ def plot_per_protein_delta(df_mutations, output_dir): print(" no proteins with both agree and disagree mutations — skipping level 2 plot") return df_per_protein = pd.DataFrame(per_protein_results) - out_csv = os.path.join(output_dir, "per_protein_plddt_delta.csv") + out_csv = os.path.join(args.output_dir, "per_protein_plddt_delta.csv") df_per_protein.to_csv(out_csv, index=False) print(f"saved: {out_csv}") n_negative = (df_per_protein["delta_median_plddt"] < 0).sum() @@ -186,37 +210,13 @@ def plot_per_protein_delta(df_mutations, output_dir): f"{n_negative} negative, {n_positive} positive)") ax.legend(frameon=False) plt.tight_layout() - out = os.path.join(output_dir, "per_protein_plddt_delta.png") + out = os.path.join(args.output_dir, "per_protein_plddt_delta.png") plt.savefig(out, dpi=150, bbox_inches="tight") plt.close() print(f"saved: {out}") - -# 4. main -def main(): - parser = argparse.ArgumentParser(description="plddt analysis: disagreements between foldx5 and foldx5.1") - parser.add_argument("-f", "--foldx5_dir", required=True, help="path to foldx5 dataset_tables folder") - parser.add_argument("-i", "--foldx51_dir", required=True, help="path to foldx5.1 dataset_tables folder") - parser.add_argument("-o", "--output_dir", default="./plddt_results", help="where to save results") - args = parser.parse_args() - print("plddt analysis: foldx5 vs foldx5.1 classification agreement\n") - print("[1] loading data...") - df_mutations = load_all_proteins(args.foldx5_dir, args.foldx51_dir) - if df_mutations.empty: - print("error: no data loaded. check folder paths and csv structure.") - return - print(f"\ntotal mutations: {len(df_mutations)}") - print(f"agree: {(df_mutations['agreement'] == 'agree').sum()}") - print(f"disagree: {(df_mutations['agreement'] == 'disagree').sum()}") - os.makedirs(args.output_dir, exist_ok=True) - csv_out = os.path.join(args.output_dir, "mutation_agreement_results.csv") - df_mutations.to_csv(csv_out, index=False) - print(f"\nsaved results table: {csv_out}") - print("\n[2] global plots...") - plot_plddt_distribution(df_mutations, args.output_dir) - plot_plddt_scatter(df_mutations, args.output_dir) - print("\n[3] per-protein summary...") - plot_per_protein_delta(df_mutations, args.output_dir) print("\ndone.") + + if __name__ == "__main__": main() diff --git a/tools/foldx_version_comparison/rmsd_matrix.py b/tools/foldx_version_comparison/rmsd_matrix.py index 99f4a73..bcab63c 100755 --- a/tools/foldx_version_comparison/rmsd_matrix.py +++ b/tools/foldx_version_comparison/rmsd_matrix.py @@ -2,7 +2,6 @@ # For each protein/domain in both datasets, computes all-atom and CA RMSD # Matching is done by UniProt ID + protein name # RMSD is computed only on overlapping residues between the two structures -# Run: python rmsd_matrix.py -f /data/user/shared_projects/mavisp_ensemble_sim_length/foldx5.1_evaluation/foldx5_initial_structures -i /data/user/shared_projects/mavisp_ensemble_sim_length/foldx5.1_evaluation/data_collection_foldx5.1 -o ./rmsd_results import os import glob import argparse @@ -18,10 +17,6 @@ warnings.filterwarnings("ignore") -COLOR_VIOLIN_ALL = "#C4A882" # pale brown -COLOR_VIOLIN_CA = "#B8A0C8" # pale purple -COLOR_BAR_ALL = "#B22222" # firebrick -COLOR_BAR_CA = "#4A90A4" # muted teal COLOR_DIST = "#6A9CC4" # soft blue mpl.rcParams.update({ @@ -201,8 +196,6 @@ def run_rmsd_analysis(pairs): def plot_rmsd_distributions(df, output_dir): # Plot 1: histogram + KDE for all-atom and CA RMSD with mean and median lines - # Plot 2: scatter of all-atom vs CA RMSD per structure - # Plot 3: bar plot of all structures sorted by CA RMSD os.makedirs(output_dir, exist_ok=True) df_clean = df.dropna(subset=["rmsd_all_atoms", "rmsd_ca"]) df_clean = df_clean.copy() @@ -225,64 +218,6 @@ def plot_rmsd_distributions(df, output_dir): plt.close() print(f"\nSaved: {out}") - # Plot 2: all-atom vs CA scatter with diagonal reference line - fig, ax = plt.subplots(figsize=(6, 6)) - ax.scatter(df_clean["rmsd_all_atoms"], df_clean["rmsd_ca"], color=COLOR_DIST, alpha=0.8, edgecolors="white", s=80) - lims = [0, max(df_clean[["rmsd_all_atoms", "rmsd_ca"]].max()) * 1.05] - ax.plot(lims, lims, color="#999999", linestyle="--", linewidth=1, label="y = x") - ax.set_xlabel("All-atom RMSD (Å)") - ax.set_ylabel("Cα RMSD (Å)") - ax.set_title("All-atom vs Cα RMSD per structure") - ax.legend(frameon=False) - out = os.path.join(output_dir, "rmsd_scatter_allatom_vs_ca.png") - plt.savefig(out, dpi=150, bbox_inches="tight") - plt.close() - print(f"Saved: {out}") - - # Plot 3: bar plot sorted by CA RMSD - df_sorted = df_clean.sort_values("rmsd_ca", ascending=False).reset_index(drop=True) - fig, ax = plt.subplots(figsize=(max(10, len(df_sorted) * 0.5), 5)) - x = np.arange(len(df_sorted)) - ax.bar(x - 0.2, df_sorted["rmsd_all_atoms"], width=0.4, label="All-atom", color=COLOR_BAR_ALL, alpha=0.85) - ax.bar(x + 0.2, df_sorted["rmsd_ca"], width=0.4, label="Cα", color=COLOR_BAR_CA, alpha=0.85) - ax.set_xticks(x) - ax.set_xticklabels(df_sorted["label_clean"], rotation=45, ha="right", fontsize=11) - ax.set_ylabel("RMSD (Å)") - ax.set_title("RMSD per structure (sorted by Cα RMSD)") - ax.legend(frameon=False) - plt.tight_layout() - out = os.path.join(output_dir, "rmsd_barplot_sorted.png") - plt.savefig(out, dpi=150, bbox_inches="tight") - plt.close() - print(f"Saved: {out}") - - -def plot_rmsd_violin(df, output_dir): - # Violin plot: distribution of all-atom and CA RMSD across all proteins - df_clean = df.dropna(subset=["rmsd_all_atoms", "rmsd_ca"]) - df_clean = df_clean.copy() - df_clean["label_clean"] = df_clean["label"].apply(lambda x: "_".join(x.split("_")[1:])) - df_melt = df_clean.melt( - id_vars="label_clean", - value_vars=["rmsd_all_atoms", "rmsd_ca"], - var_name="type", - value_name="rmsd" - ) - df_melt["type"] = df_melt["type"].replace({"rmsd_all_atoms": "All-atom", "rmsd_ca": "Cα"}) - fig, ax = plt.subplots(figsize=(6, 6)) - sns.violinplot(data=df_melt, x="type", y="rmsd", ax=ax, - palette={"All-atom": COLOR_VIOLIN_ALL, "Cα": COLOR_VIOLIN_CA}, - inner="box", cut=0) - ax.set_xlabel("") - ax.set_ylabel("RMSD (Å)") - ax.set_title("RMSD distribution across all proteins\nFoldX5 vs FoldX5.1") - plt.tight_layout() - out = os.path.join(output_dir, "rmsd_violin.png") - plt.savefig(out, dpi=150, bbox_inches="tight") - plt.close() - print(f"Saved: {out}") - - # 4. MAIN def main(): @@ -313,7 +248,6 @@ def main(): print(df[["rmsd_all_atoms", "rmsd_ca"]].describe().round(3).to_string()) print("\n[5] Generating plots...") plot_rmsd_distributions(df, args.output_dir) - plot_rmsd_violin(df, args.output_dir) print("\nDone.") if __name__ == "__main__": main()