ELELAB · angelikivliora · May 12, 2026 · May 13, 2026 · May 29, 2026
diff --git a/tools/foldx_version_comparison/6.classification_plddt.py b/tools/foldx_version_comparison/6.classification_plddt.py
@@ -0,0 +1,222 @@
+# pLDDT analysis: are disagreements between FoldX5 and FoldX5.1 enriched in structurally uncertain regions?
+# each mutation is treated as a separate data point (mutations at the same position share the same pLDDT)
+import os
+import glob
+import argparse
+import warnings
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import matplotlib as mpl
+import seaborn as sns
+warnings.filterwarnings("ignore")
+mpl.rcParams.update({
+    "font.family":       "sans-serif",
+    "font.size":         12,
+    "axes.titlesize":    13,
+    "axes.labelsize":    12,
+    "xtick.labelsize":   11,
+    "ytick.labelsize":   11,
+    "legend.fontsize":   11,
+    "axes.spines.top":   False,
+    "axes.spines.right": False,
+})
+color_agree    = "#4A90A4"
+color_disagree = "#B22222"
+color_dist     = "#6A9CC4"
+col_ddg   = "Stability (FoldX5, alphafold, kcal/mol)"
+col_plddt = "AlphaFold2 model pLDDT score"
+col_mut   = "Mutation"
+
+
+def classify_ddg(ddg):
+    # MAVISp classification thresholds applied to FoldX ddG only
+    # stabilizing: ddG <= -3, destabilizing: ddG >= 3, neutral: -2 < ddG < 2, uncertain: everything else
+    if ddg <= -3:
+        return "stabilizing"
+    elif ddg >= 3:
+        return "destabilizing"
+    elif -2 < ddg < 2:
+        return "neutral"
+    else:
+        return "uncertain"
+
+
+def load_all_proteins(foldx5_dir, foldx51_dir):
+    # load all csvs from both folders, match by filename
+    # classify each mutation independently
+    # label each mutation as agree or disagree between the two versions
+    foldx5_files  = {Path(f).stem: f for f in glob.glob(os.path.join(foldx5_dir,  "*.csv"))}
+    foldx51_files = {Path(f).stem: f for f in glob.glob(os.path.join(foldx51_dir, "*.csv"))}
+    common_proteins = sorted(set(foldx5_files.keys()) & set(foldx51_files.keys()))
+    print(f"proteins in foldx5:   {len(foldx5_files)}")
+    print(f"proteins in foldx5.1: {len(foldx51_files)}")
+    print(f"common proteins:      {len(common_proteins)}")
+    all_protein_data = []
+    for stem in common_proteins:
+        protein_name = stem.replace("-simple_mode", "")
+        try:
+            df_foldx5  = pd.read_csv(foldx5_files[stem])
+            df_foldx51 = pd.read_csv(foldx51_files[stem])
+            df_foldx5  = df_foldx5.rename(columns={col_ddg: "ddg_foldx5"})
+            df_foldx51 = df_foldx51.rename(columns={col_ddg: "ddg_foldx51"})
+            df_merged = pd.merge(
+                df_foldx51[[col_mut, "ddg_foldx51", col_plddt]],
+                df_foldx5[[col_mut, "ddg_foldx5"]],
+                on=col_mut
+            )
+            if df_merged.empty:
+                continue
+            df_merged["class_foldx5"]  = df_merged["ddg_foldx5"].apply(classify_ddg)
+            df_merged["class_foldx51"] = df_merged["ddg_foldx51"].apply(classify_ddg)
+            df_merged["agreement"]     = df_merged.apply(
+                lambda row: "agree" if row["class_foldx5"] == row["class_foldx51"] else "disagree",
+                axis=1
+            )
+            df_merged["ddg_diff"]     = abs(df_merged["ddg_foldx51"] - df_merged["ddg_foldx5"])
+            df_merged["protein_name"] = protein_name
+            all_protein_data.append(df_merged)
+        except KeyError as e:
+            print(f"  skipping {protein_name}: missing column {e}")
+            continue
+        except Exception as e:
+            raise RuntimeError(f"unexpected error processing {protein_name}: {e}")
+    if not all_protein_data:
+        return pd.DataFrame()
+    return pd.concat(all_protein_data, ignore_index=True)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="plddt analysis: disagreements between foldx5 and foldx5.1")
+    parser.add_argument("-f", "--foldx5_dir",  required=True, help="path to foldx5 dataset_tables folder")
+    parser.add_argument("-i", "--foldx51_dir", required=True, help="path to foldx5.1 dataset_tables folder")
+    parser.add_argument("-o", "--output_dir",  default="./plddt_results", help="where to save results")
+    args = parser.parse_args()
+
+    print("plddt analysis: foldx5 vs foldx5.1 classification agreement\n")
+    print("[1] loading data...")
+    df_mutations = load_all_proteins(args.foldx5_dir, args.foldx51_dir)
+    if df_mutations.empty:
+        print("error: no data loaded. check folder paths and csv structure.")
+        return
+
+    print(f"\ntotal mutations: {len(df_mutations)}")
+    print(f"agree:    {(df_mutations['agreement'] == 'agree').sum()}")
+    print(f"disagree: {(df_mutations['agreement'] == 'disagree').sum()}")
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    csv_out = os.path.join(args.output_dir, "mutation_agreement_results.csv")
+    df_mutations.to_csv(csv_out, index=False)
+    print(f"\nsaved results table: {csv_out}")
+
+    print("\n[2] global plots")
+
+    # kde + barplot of plddt scores for agree vs disagree mutations
+    os.makedirs(args.output_dir, exist_ok=True)
+    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+    ax = axes[0]
+    ax_bar = axes[1]
+    for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
+        subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
+        if subset.empty:
+            continue
+        sns.kdeplot(subset, ax=ax, color=color,
+                    label=f"{group} (n={len(subset)})",
+                    fill=True, alpha=0.3, linewidth=2)
+        ax.axvline(subset.median(), color=color, linestyle="--", linewidth=1.5, alpha=0.8)
+    ax.set_xlabel("pLDDT score")
+    ax.set_ylabel("density")
+    ax.set_title("pLDDT distribution by classification agreement\nFoldX5 vs FoldX5.1 (all proteins)")
+    ax.legend(frameon=False)
+    bin_edges = np.arange(0, 101, 3)
+    for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
+        subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
+        if subset.empty:
+            continue
+        counts, edges = np.histogram(subset, bins=bin_edges)
+        counts = counts / counts.sum()
+        ax_bar.bar(edges[:-1], counts, width=3, color=color, alpha=0.6,
+                   label=f"{group} (n={len(subset)})", align="edge", edgecolor="white")
+    ax_bar.set_xlabel("pLDDT score")
+    ax_bar.set_ylabel("fraction of mutations")
+    ax_bar.set_title("pLDDT distribution by classification agreement\n(3 Å bins)")
+    ax_bar.legend(frameon=False)
+    plt.tight_layout()
+    out = os.path.join(args.output_dir, "plddt_distribution_agree_disagree.png")
+    plt.savefig(out, dpi=150, bbox_inches="tight")
+    plt.close()
+    print(f"saved: {out}")
+
+    # scatter: plddt vs absolute ddg difference per mutation
+    fig, ax = plt.subplots(figsize=(8, 6))
+    for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
+        subset = df_mutations[df_mutations["agreement"] == group]
+        if subset.empty:
+            continue
+        ax.scatter(subset[col_plddt], subset["ddg_diff"],
+                   color=color, alpha=0.5, s=40,
+                   label=f"{group} (n={len(subset)})",
+                   edgecolors="none")
+    ax.set_xlabel("pLDDT score")
+    ax.set_ylabel("|ΔΔG FoldX5.1 − ΔΔG FoldX5| (kcal/mol)")
+    ax.set_title("pLDDT vs prediction difference per mutation\nFoldX5 vs FoldX5.1 (all proteins)")
+    ax.legend(frameon=False)
+    plt.tight_layout()
+    out = os.path.join(args.output_dir, "plddt_vs_ddg_diff_scatter.png")
+    plt.savefig(out, dpi=150, bbox_inches="tight")
+    plt.close()
+    print(f"saved: {out}")
+
+    print("\n[3] per-protein summary")
+
+    # per-protein: median plddt of disagree - median plddt of agree
+    per_protein_results = []
+    for protein_name, protein_group in df_mutations.groupby("protein_name"):
+        agree_plddt    = protein_group[protein_group["agreement"] == "agree"][col_plddt]
+        disagree_plddt = protein_group[protein_group["agreement"] == "disagree"][col_plddt]
+        if agree_plddt.empty or disagree_plddt.empty:
+            continue
+        delta_median = disagree_plddt.median() - agree_plddt.median()
+        per_protein_results.append({
+            "protein_name":          protein_name,
+            "median_plddt_agree":    agree_plddt.median(),
+            "median_plddt_disagree": disagree_plddt.median(),
+            "delta_median_plddt":    delta_median,
+            "n_agree_mutations":     len(agree_plddt),
+            "n_disagree_mutations":  len(disagree_plddt),
+        })
+    if not per_protein_results:
+        print("  no proteins with both agree and disagree mutations — skipping level 2 plot")
+        return
+    df_per_protein = pd.DataFrame(per_protein_results)
+    out_csv = os.path.join(args.output_dir, "per_protein_plddt_delta.csv")
+    df_per_protein.to_csv(out_csv, index=False)
+    print(f"saved: {out_csv}")
+    n_negative = (df_per_protein["delta_median_plddt"] < 0).sum()
+    n_positive = (df_per_protein["delta_median_plddt"] > 0).sum()
+    print(f"  proteins where disagree plddt < agree plddt: {n_negative}")
+    print(f"  proteins where disagree plddt > agree plddt: {n_positive}")
+    fig, ax = plt.subplots(figsize=(8, 6))
+    sns.histplot(df_per_protein["delta_median_plddt"], kde=True, ax=ax,
+                 color=color_dist, edgecolor="white", bins=20, alpha=0.8)
+    ax.axvline(0, color="#555555", linestyle="--", linewidth=1.5, label="no difference (0)")
+    ax.axvline(df_per_protein["delta_median_plddt"].median(), color=color_disagree,
+               linestyle="--", linewidth=1.5,
+               label=f"median: {df_per_protein['delta_median_plddt'].median():.1f}")
+    ax.set_xlabel("median pLDDT (disagree) − median pLDDT (agree)")
+    ax.set_ylabel("number of proteins")
+    ax.set_title(f"per-protein pLDDT difference: disagree vs agree mutations\n"
+                 f"(n={len(df_per_protein)} proteins — "
+                 f"{n_negative} negative, {n_positive} positive)")
+    ax.legend(frameon=False)
+    plt.tight_layout()
+    out = os.path.join(args.output_dir, "per_protein_plddt_delta.png")
+    plt.savefig(out, dpi=150, bbox_inches="tight")
+    plt.close()
+    print(f"saved: {out}")
+
+    print("\ndone.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/foldx_version_comparison/rmsd_matrix.py b/tools/foldx_version_comparison/rmsd_matrix.py
@@ -2,7 +2,6 @@
 # For each protein/domain in both datasets, computes all-atom and CA RMSD
 # Matching is done by UniProt ID + protein name
 # RMSD is computed only on overlapping residues between the two structures
-# Run: python rmsd_matrix.py --foldx5_dir /path/to/folder --foldx51_dir /path/to/folder --output_dir ./rmsd_results
 import os
 import glob
 import argparse
@@ -18,10 +17,6 @@
 warnings.filterwarnings("ignore")
 
 
-COLOR_VIOLIN_ALL  = "#C4A882"   # pale brown
-COLOR_VIOLIN_CA   = "#B8A0C8"   # pale purple
-COLOR_BAR_ALL     = "#B22222"   # firebrick
-COLOR_BAR_CA      = "#4A90A4"   # muted teal
 COLOR_DIST        = "#6A9CC4"   # soft blue
 
 mpl.rcParams.update({
@@ -201,8 +196,6 @@ def run_rmsd_analysis(pairs):
 
 def plot_rmsd_distributions(df, output_dir):
     # Plot 1: histogram + KDE for all-atom and CA RMSD with mean and median lines
-    # Plot 2: scatter of all-atom vs CA RMSD per structure
-    # Plot 3: bar plot of all structures sorted by CA RMSD
     os.makedirs(output_dir, exist_ok=True)
     df_clean = df.dropna(subset=["rmsd_all_atoms", "rmsd_ca"])
     df_clean = df_clean.copy()
@@ -212,7 +205,8 @@ def plot_rmsd_distributions(df, output_dir):
     fig, axes = plt.subplots(1, 2, figsize=(12, 5))
     fig.suptitle("RMSD distributions: FoldX5 vs FoldX5.1 initial structures", fontsize=13, fontweight="bold", y=1.01)
     for ax, col, title in zip(axes, ["rmsd_all_atoms", "rmsd_ca"], ["All-atom RMSD (Å)", "Cα RMSD (Å)"]):
-        sns.histplot(df_clean[col], ax=ax, color=COLOR_DIST, edgecolor="white", bins=10, alpha=0.8)
+        bin_edges = np.arange(0, df_clean[col].max() + 1, 1)
+        sns.histplot(df_clean[col], ax=ax, color=COLOR_DIST, edgecolor="white", bins=bin_edges, alpha=0.8)
         ax.axvline(df_clean[col].median(), color="#CC0000", linestyle="--", linewidth=1.5, label=f"Median: {df_clean[col].median():.2f} Å")
         ax.axvline(df_clean[col].mean(), color="#555555", linestyle=":", linewidth=1.5, label=f"Mean: {df_clean[col].mean():.2f} Å")
         ax.set_xlabel(title)
@@ -224,71 +218,13 @@ def plot_rmsd_distributions(df, output_dir):
     plt.close()
     print(f"\nSaved: {out}")
 
-    # Plot 2: all-atom vs CA scatter with diagonal reference line
-    fig, ax = plt.subplots(figsize=(6, 6))
-    ax.scatter(df_clean["rmsd_all_atoms"], df_clean["rmsd_ca"], color=COLOR_DIST, alpha=0.8, edgecolors="white", s=80)
-    lims = [0, max(df_clean[["rmsd_all_atoms", "rmsd_ca"]].max()) * 1.05]
-    ax.plot(lims, lims, color="#999999", linestyle="--", linewidth=1, label="y = x")
-    ax.set_xlabel("All-atom RMSD (Å)")
-    ax.set_ylabel("Cα RMSD (Å)")
-    ax.set_title("All-atom vs Cα RMSD per structure")
-    ax.legend(frameon=False)
-    out = os.path.join(output_dir, "rmsd_scatter_allatom_vs_ca.png")
-    plt.savefig(out, dpi=150, bbox_inches="tight")
-    plt.close()
-    print(f"Saved: {out}")
-
-    # Plot 3: bar plot sorted by CA RMSD
-    df_sorted = df_clean.sort_values("rmsd_ca", ascending=False).reset_index(drop=True)
-    fig, ax = plt.subplots(figsize=(max(10, len(df_sorted) * 0.5), 5))
-    x = np.arange(len(df_sorted))
-    ax.bar(x - 0.2, df_sorted["rmsd_all_atoms"], width=0.4, label="All-atom", color=COLOR_BAR_ALL, alpha=0.85)
-    ax.bar(x + 0.2, df_sorted["rmsd_ca"], width=0.4, label="Cα", color=COLOR_BAR_CA, alpha=0.85)
-    ax.set_xticks(x)
-    ax.set_xticklabels(df_sorted["label_clean"], rotation=45, ha="right", fontsize=11)
-    ax.set_ylabel("RMSD (Å)")
-    ax.set_title("RMSD per structure (sorted by Cα RMSD)")
-    ax.legend(frameon=False)
-    plt.tight_layout()
-    out = os.path.join(output_dir, "rmsd_barplot_sorted.png")
-    plt.savefig(out, dpi=150, bbox_inches="tight")
-    plt.close()
-    print(f"Saved: {out}")
-
-
-def plot_rmsd_violin(df, output_dir):
-    # Violin plot: distribution of all-atom and CA RMSD across all proteins
-    df_clean = df.dropna(subset=["rmsd_all_atoms", "rmsd_ca"])
-    df_clean = df_clean.copy()
-    df_clean["label_clean"] = df_clean["label"].apply(lambda x: "_".join(x.split("_")[1:]))
-    df_melt = df_clean.melt(
-        id_vars="label_clean",
-        value_vars=["rmsd_all_atoms", "rmsd_ca"],
-        var_name="type",
-        value_name="rmsd"
-    )
-    df_melt["type"] = df_melt["type"].replace({"rmsd_all_atoms": "All-atom", "rmsd_ca": "Cα"})
-    fig, ax = plt.subplots(figsize=(6, 6))
-    sns.violinplot(data=df_melt, x="type", y="rmsd", ax=ax,
-                   palette={"All-atom": COLOR_VIOLIN_ALL, "Cα": COLOR_VIOLIN_CA},
-                   inner="box", cut=0)
-    ax.set_xlabel("")
-    ax.set_ylabel("RMSD (Å)")
-    ax.set_title("RMSD distribution across all proteins\nFoldX5 vs FoldX5.1")
-    plt.tight_layout()
-    out = os.path.join(output_dir, "rmsd_violin.png")
-    plt.savefig(out, dpi=150, bbox_inches="tight")
-    plt.close()
-    print(f"Saved: {out}")
-
-
 # 4. MAIN
 
 def main():
     parser = argparse.ArgumentParser(description="Compute RMSD between FoldX5 and FoldX5.1 initial structures")
-    parser.add_argument("--foldx5_dir", required=True, help="Path to FoldX5 folder (organized by protein name)")
-    parser.add_argument("--foldx51_dir", required=True, help="Root of FoldX5.1 data collection folder")
-    parser.add_argument("--output_dir", default="./rmsd_results", help="Where to save results (default: ./rmsd_results)")
+    parser.add_argument("-f", "--foldx5_dir", required=True, help="Path to FoldX5 folder (organized by protein name)")
+    parser.add_argument("-i", "--foldx51_dir", required=True, help="Root of FoldX5.1 data collection folder")
+    parser.add_argument("-o", "--output_dir", default="./rmsd_results", help="Where to save results (default: ./rmsd_results)")
     args = parser.parse_args()
 
     print("RMSD analysis: FoldX5 vs FoldX5.1 initial structures")
@@ -312,7 +248,6 @@ def main():
     print(df[["rmsd_all_atoms", "rmsd_ca"]].describe().round(3).to_string())
     print("\n[5] Generating plots...")
     plot_rmsd_distributions(df, args.output_dir)
-    plot_rmsd_violin(df, args.output_dir)
     print("\nDone.")
 if __name__ == "__main__":
     main()