From a46b5f7563a63d875f045b1c34ad6213ce0f5da9 Mon Sep 17 00:00:00 2001
From: Angeliki Vliora <angelikivliora@gmail.com>
Date: Tue, 12 May 2026 11:08:28 +0200
Subject: [PATCH 1/3] Add pLDDT analysis script for FoldX5 vs FoldX5.1
 classification agreement

---
 .../6.classification_plddt.py                 | 206 ++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100755 tools/foldx_version_comparison/6.classification_plddt.py

diff --git a/tools/foldx_version_comparison/6.classification_plddt.py b/tools/foldx_version_comparison/6.classification_plddt.py
new file mode 100755
index 0000000..699edf1
--- /dev/null
+++ b/tools/foldx_version_comparison/6.classification_plddt.py
@@ -0,0 +1,206 @@
+# pLDDT analysis: are disagreements between FoldX5 and FoldX5.1 enriched in structurally uncertain regions?
+# Each mutation is treated as a separate data point (mutations at the same position share the same pLDDT)
+# Level 1: global plots across all proteins
+# Level 2: per-protein summary to check if effect is consistent and not driven by large high-pLDDT proteins
+# Run: python 6.classification_plddt.py  -f /path/to/foldx5/dataset_tables -i /path/to/foldx51/dataset_tables -o ./plddt_results
+import os
+import glob
+import argparse
+import warnings
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import seaborn as sns
+warnings.filterwarnings("ignore")
+mpl.rcParams.update({
+    "font.family":       "sans-serif",
+    "font.size":         12,
+    "axes.titlesize":    13,
+    "axes.labelsize":    12,
+    "xtick.labelsize":   11,
+    "ytick.labelsize":   11,
+    "legend.fontsize":   11,
+    "axes.spines.top":   False,
+    "axes.spines.right": False,
+})
+color_agree    = "#4A90A4"
+color_disagree = "#B22222"
+color_dist     = "#6A9CC4"
+col_ddg   = "Stability (FoldX5, alphafold, kcal/mol)"
+col_plddt = "AlphaFold2 model pLDDT score"
+col_mut   = "Mutation"
+def classify_ddg(ddg):
+    # MAVISp classification thresholds applied to FoldX ddG only
+    # stabilizing: ddG <= -3, destabilizing: ddG >= 3, neutral: -2 < ddG < 2, uncertain: everything else
+    if ddg <= -3:
+        return "stabilizing"
+    elif ddg >= 3:
+        return "destabilizing"
+    elif -2 < ddg < 2:
+        return "neutral"
+    else:
+        return "uncertain"
+# 1. data loading
+def load_all_proteins(foldx5_dir, foldx51_dir):
+    # load all csvs from both folders, match by filename
+    # classify each mutation independently
+    # label each mutation as agree or disagree between the two versions
+    foldx5_files  = {Path(f).stem: f for f in glob.glob(os.path.join(foldx5_dir,  "*.csv"))}
+    foldx51_files = {Path(f).stem: f for f in glob.glob(os.path.join(foldx51_dir, "*.csv"))}
+    common_proteins = sorted(set(foldx5_files.keys()) & set(foldx51_files.keys()))
+    print(f"proteins in foldx5:   {len(foldx5_files)}")
+    print(f"proteins in foldx5.1: {len(foldx51_files)}")
+    print(f"common proteins:      {len(common_proteins)}")
+    all_protein_data = []
+    for stem in common_proteins:
+        protein_name = stem.replace("-simple_mode", "")
+        try:
+            df_foldx5  = pd.read_csv(foldx5_files[stem])
+            df_foldx51 = pd.read_csv(foldx51_files[stem])
+            df_foldx5  = df_foldx5.rename(columns={col_ddg: "ddg_foldx5"})
+            df_foldx51 = df_foldx51.rename(columns={col_ddg: "ddg_foldx51"})
+            df_merged = pd.merge(
+                df_foldx51[[col_mut, "ddg_foldx51", col_plddt]],
+                df_foldx5[[col_mut, "ddg_foldx5"]],
+                on=col_mut
+            )
+            if df_merged.empty:
+                continue
+            df_merged["class_foldx5"]  = df_merged["ddg_foldx5"].apply(classify_ddg)
+            df_merged["class_foldx51"] = df_merged["ddg_foldx51"].apply(classify_ddg)
+            df_merged["agreement"]     = df_merged.apply(
+                lambda row: "agree" if row["class_foldx5"] == row["class_foldx51"] else "disagree",
+                axis=1
+            )
+            df_merged["ddg_diff"]     = abs(df_merged["ddg_foldx51"] - df_merged["ddg_foldx5"])
+            df_merged["protein_name"] = protein_name
+            all_protein_data.append(df_merged)
+        except Exception as e:
+            print(f"  could not process {protein_name}: {e}")
+            continue
+    if not all_protein_data:
+        return pd.DataFrame()
+    return pd.concat(all_protein_data, ignore_index=True)
+# 2. level 1 - global plots
+def plot_plddt_distribution(df_mutations, output_dir):
+    # kde distribution of plddt scores for agree vs disagree mutations across all proteins
+    os.makedirs(output_dir, exist_ok=True)
+    fig, ax = plt.subplots(figsize=(8, 6))
+    for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
+        subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
+        if subset.empty:
+            print(f"  no data for {group} group")
+            continue
+        sns.kdeplot(subset, ax=ax, color=color,
+                    label=f"{group} (n={len(subset)})",
+                    fill=True, alpha=0.3, linewidth=2)
+        ax.axvline(subset.median(), color=color, linestyle="--", linewidth=1.5, alpha=0.8)
+    ax.set_xlabel("pLDDT score")
+    ax.set_ylabel("density")
+    ax.set_title("pLDDT distribution by classification agreement\nFoldX5 vs FoldX5.1 (all proteins)")
+    ax.legend(frameon=False)
+    plt.tight_layout()
+    out = os.path.join(output_dir, "plddt_distribution_agree_disagree.png")
+    plt.savefig(out, dpi=150, bbox_inches="tight")
+    plt.close()
+    print(f"saved: {out}")
+def plot_plddt_scatter(df_mutations, output_dir):
+    # scatter: plddt vs absolute ddg difference per mutation, colored by agree/disagree
+    fig, ax = plt.subplots(figsize=(8, 6))
+    for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
+        subset = df_mutations[df_mutations["agreement"] == group]
+        if subset.empty:
+            continue
+        ax.scatter(subset[col_plddt], subset["ddg_diff"],
+                   color=color, alpha=0.5, s=40,
+                   label=f"{group} (n={len(subset)})",
+                   edgecolors="none")
+    ax.set_xlabel("pLDDT score")
+    ax.set_ylabel("|ΔΔG FoldX5.1 − ΔΔG FoldX5| (kcal/mol)")
+    ax.set_title("pLDDT vs prediction difference per mutation\nFoldX5 vs FoldX5.1 (all proteins)")
+    ax.legend(frameon=False)
+    plt.tight_layout()
+    out = os.path.join(output_dir, "plddt_vs_ddg_diff_scatter.png")
+    plt.savefig(out, dpi=150, bbox_inches="tight")
+    plt.close()
+    print(f"saved: {out}")
+
+
+# 3. level 2 - per-protein summary
+def plot_per_protein_delta(df_mutations, output_dir):
+    per_protein_results = []
+    for protein_name, protein_group in df_mutations.groupby("protein_name"):
+        agree_plddt    = protein_group[protein_group["agreement"] == "agree"][col_plddt]
+        disagree_plddt = protein_group[protein_group["agreement"] == "disagree"][col_plddt]
+        if agree_plddt.empty or disagree_plddt.empty:
+            continue
+        delta_median = disagree_plddt.median() - agree_plddt.median()
+        per_protein_results.append({
+            "protein_name":          protein_name,
+            "median_plddt_agree":    agree_plddt.median(),
+            "median_plddt_disagree": disagree_plddt.median(),
+            "delta_median_plddt":    delta_median,
+            "n_agree_mutations":     len(agree_plddt),
+            "n_disagree_mutations":  len(disagree_plddt),
+        })
+    if not per_protein_results:
+        print("  no proteins with both agree and disagree mutations — skipping level 2 plot")
+        return
+    df_per_protein = pd.DataFrame(per_protein_results)
+    out_csv = os.path.join(output_dir, "per_protein_plddt_delta.csv")
+    df_per_protein.to_csv(out_csv, index=False)
+    print(f"saved: {out_csv}")
+    n_negative = (df_per_protein["delta_median_plddt"] < 0).sum()
+    n_positive = (df_per_protein["delta_median_plddt"] > 0).sum()
+    print(f"  proteins where disagree plddt < agree plddt: {n_negative}")
+    print(f"  proteins where disagree plddt > agree plddt: {n_positive}")
+    fig, ax = plt.subplots(figsize=(8, 6))
+    sns.histplot(df_per_protein["delta_median_plddt"], kde=True, ax=ax,
+                 color=color_dist, edgecolor="white", bins=20, alpha=0.8)
+    ax.axvline(0, color="#555555", linestyle="--", linewidth=1.5, label="no difference (0)")
+    ax.axvline(df_per_protein["delta_median_plddt"].median(), color=color_disagree,
+               linestyle="--", linewidth=1.5,
+               label=f"median: {df_per_protein['delta_median_plddt'].median():.1f}")
+    ax.set_xlabel("median pLDDT (disagree) − median pLDDT (agree)")
+    ax.set_ylabel("number of proteins")
+    ax.set_title(f"per-protein pLDDT difference: disagree vs agree mutations\n"
+                 f"(n={len(df_per_protein)} proteins — "
+                 f"{n_negative} negative, {n_positive} positive)")
+    ax.legend(frameon=False)
+    plt.tight_layout()
+    out = os.path.join(output_dir, "per_protein_plddt_delta.png")
+    plt.savefig(out, dpi=150, bbox_inches="tight")
+    plt.close()
+    print(f"saved: {out}")
+
+
+# 4. main
+def main():
+    parser = argparse.ArgumentParser(description="plddt analysis: disagreements between foldx5 and foldx5.1")
+    parser.add_argument("-f", "--foldx5_dir",  required=True, help="path to foldx5 dataset_tables folder")
+    parser.add_argument("-i", "--foldx51_dir", required=True, help="path to foldx5.1 dataset_tables folder")
+    parser.add_argument("-o", "--output_dir",  default="./plddt_results", help="where to save results")
+    args = parser.parse_args()
+    print("plddt analysis: foldx5 vs foldx5.1 classification agreement\n")
+    print("[1] loading data...")
+    df_mutations = load_all_proteins(args.foldx5_dir, args.foldx51_dir)
+    if df_mutations.empty:
+        print("error: no data loaded. check folder paths and csv structure.")
+        return
+    print(f"\ntotal mutations: {len(df_mutations)}")
+    print(f"agree:    {(df_mutations['agreement'] == 'agree').sum()}")
+    print(f"disagree: {(df_mutations['agreement'] == 'disagree').sum()}")
+    os.makedirs(args.output_dir, exist_ok=True)
+    csv_out = os.path.join(args.output_dir, "mutation_agreement_results.csv")
+    df_mutations.to_csv(csv_out, index=False)
+    print(f"\nsaved results table: {csv_out}")
+    print("\n[2] global plots...")
+    plot_plddt_distribution(df_mutations, args.output_dir)
+    plot_plddt_scatter(df_mutations, args.output_dir)
+    print("\n[3] per-protein summary...")
+    plot_per_protein_delta(df_mutations, args.output_dir)
+    print("\ndone.")
+if __name__ == "__main__":
+    main()

From 5779d83db9fa98ba730a7090f33f42b0395894f3 Mon Sep 17 00:00:00 2001
From: Angeliki Vliora <angelikivliora@gmail.com>
Date: Wed, 13 May 2026 16:01:29 +0200
Subject: [PATCH 2/3] Add RMSD matrix script for FoldX5 vs FoldX5.1 initial
 structures

---
 .../6.classification_plddt.py                  | 18 +++++++++++++++++-
 tools/foldx_version_comparison/rmsd_matrix.py  | 11 ++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/tools/foldx_version_comparison/6.classification_plddt.py b/tools/foldx_version_comparison/6.classification_plddt.py
index 699edf1..33aaae6 100755
--- a/tools/foldx_version_comparison/6.classification_plddt.py
+++ b/tools/foldx_version_comparison/6.classification_plddt.py
@@ -87,7 +87,9 @@ def load_all_proteins(foldx5_dir, foldx51_dir):
 def plot_plddt_distribution(df_mutations, output_dir):
     # kde distribution of plddt scores for agree vs disagree mutations across all proteins
     os.makedirs(output_dir, exist_ok=True)
-    fig, ax = plt.subplots(figsize=(8, 6))
+    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+    ax = axes[0]
+    ax_bar = axes[1]
     for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
         subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
         if subset.empty:
@@ -101,6 +103,20 @@ def plot_plddt_distribution(df_mutations, output_dir):
     ax.set_ylabel("density")
     ax.set_title("pLDDT distribution by classification agreement\nFoldX5 vs FoldX5.1 (all proteins)")
     ax.legend(frameon=False)
+    # barplot with 3 pLDDT-wide bins
+    bin_edges = np.arange(0, 101, 3)
+    for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
+        subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
+        if subset.empty:
+            continue
+        counts, edges = np.histogram(subset, bins=bin_edges)
+        counts = counts / counts.sum()  # normalize to fraction
+        ax_bar.bar(edges[:-1], counts, width=3, color=color, alpha=0.6,
+                   label=f"{group} (n={len(subset)})", align="edge", edgecolor="white")
+    ax_bar.set_xlabel("pLDDT score")
+    ax_bar.set_ylabel("Fraction of mutations")
+    ax_bar.set_title("pLDDT distribution by classification agreement\n(3 Å bins)")
+    ax_bar.legend(frameon=False)
     plt.tight_layout()
     out = os.path.join(output_dir, "plddt_distribution_agree_disagree.png")
     plt.savefig(out, dpi=150, bbox_inches="tight")
diff --git a/tools/foldx_version_comparison/rmsd_matrix.py b/tools/foldx_version_comparison/rmsd_matrix.py
index 149429a..99f4a73 100755
--- a/tools/foldx_version_comparison/rmsd_matrix.py
+++ b/tools/foldx_version_comparison/rmsd_matrix.py
@@ -2,7 +2,7 @@
 # For each protein/domain in both datasets, computes all-atom and CA RMSD
 # Matching is done by UniProt ID + protein name
 # RMSD is computed only on overlapping residues between the two structures
-# Run: python rmsd_matrix.py --foldx5_dir /path/to/folder --foldx51_dir /path/to/folder --output_dir ./rmsd_results
+# Run: python rmsd_matrix.py -f /data/user/shared_projects/mavisp_ensemble_sim_length/foldx5.1_evaluation/foldx5_initial_structures -i /data/user/shared_projects/mavisp_ensemble_sim_length/foldx5.1_evaluation/data_collection_foldx5.1 -o ./rmsd_results
 import os
 import glob
 import argparse
@@ -212,7 +212,8 @@ def plot_rmsd_distributions(df, output_dir):
     fig, axes = plt.subplots(1, 2, figsize=(12, 5))
     fig.suptitle("RMSD distributions: FoldX5 vs FoldX5.1 initial structures", fontsize=13, fontweight="bold", y=1.01)
     for ax, col, title in zip(axes, ["rmsd_all_atoms", "rmsd_ca"], ["All-atom RMSD (Å)", "Cα RMSD (Å)"]):
-        sns.histplot(df_clean[col], ax=ax, color=COLOR_DIST, edgecolor="white", bins=10, alpha=0.8)
+        bin_edges = np.arange(0, df_clean[col].max() + 1, 1)
+        sns.histplot(df_clean[col], ax=ax, color=COLOR_DIST, edgecolor="white", bins=bin_edges, alpha=0.8)
         ax.axvline(df_clean[col].median(), color="#CC0000", linestyle="--", linewidth=1.5, label=f"Median: {df_clean[col].median():.2f} Å")
         ax.axvline(df_clean[col].mean(), color="#555555", linestyle=":", linewidth=1.5, label=f"Mean: {df_clean[col].mean():.2f} Å")
         ax.set_xlabel(title)
@@ -286,9 +287,9 @@ def plot_rmsd_violin(df, output_dir):
 
 def main():
     parser = argparse.ArgumentParser(description="Compute RMSD between FoldX5 and FoldX5.1 initial structures")
-    parser.add_argument("--foldx5_dir", required=True, help="Path to FoldX5 folder (organized by protein name)")
-    parser.add_argument("--foldx51_dir", required=True, help="Root of FoldX5.1 data collection folder")
-    parser.add_argument("--output_dir", default="./rmsd_results", help="Where to save results (default: ./rmsd_results)")
+    parser.add_argument("-f", "--foldx5_dir", required=True, help="Path to FoldX5 folder (organized by protein name)")
+    parser.add_argument("-i", "--foldx51_dir", required=True, help="Root of FoldX5.1 data collection folder")
+    parser.add_argument("-o", "--output_dir", default="./rmsd_results", help="Where to save results (default: ./rmsd_results)")
     args = parser.parse_args()
 
     print("RMSD analysis: FoldX5 vs FoldX5.1 initial structures")

From bd446d7adef6a6dc5402b83405083f001f607ae2 Mon Sep 17 00:00:00 2001
From: Angeliki Vliora <angelikivliora@gmail.com>
Date: Fri, 29 May 2026 11:23:11 +0200
Subject: [PATCH 3/3] Remove scatter, barplot and violin from rmsd script -
 inline plotting functions in plddt script

---
 .../6.classification_plddt.py                 | 100 +++++++++---------
 tools/foldx_version_comparison/rmsd_matrix.py |  66 ------------
 2 files changed, 50 insertions(+), 116 deletions(-)

diff --git a/tools/foldx_version_comparison/6.classification_plddt.py b/tools/foldx_version_comparison/6.classification_plddt.py
index 33aaae6..8897199 100755
--- a/tools/foldx_version_comparison/6.classification_plddt.py
+++ b/tools/foldx_version_comparison/6.classification_plddt.py
@@ -1,8 +1,5 @@
 # pLDDT analysis: are disagreements between FoldX5 and FoldX5.1 enriched in structurally uncertain regions?
-# Each mutation is treated as a separate data point (mutations at the same position share the same pLDDT)
-# Level 1: global plots across all proteins
-# Level 2: per-protein summary to check if effect is consistent and not driven by large high-pLDDT proteins
-# Run: python 6.classification_plddt.py  -f /path/to/foldx5/dataset_tables -i /path/to/foldx51/dataset_tables -o ./plddt_results
+# each mutation is treated as a separate data point (mutations at the same position share the same pLDDT)
 import os
 import glob
 import argparse
@@ -10,7 +7,6 @@
 from pathlib import Path
 import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
 import matplotlib as mpl
 import seaborn as sns
 warnings.filterwarnings("ignore")
@@ -31,6 +27,8 @@
 col_ddg   = "Stability (FoldX5, alphafold, kcal/mol)"
 col_plddt = "AlphaFold2 model pLDDT score"
 col_mut   = "Mutation"
+
+
 def classify_ddg(ddg):
     # MAVISp classification thresholds applied to FoldX ddG only
     # stabilizing: ddG <= -3, destabilizing: ddG >= 3, neutral: -2 < ddG < 2, uncertain: everything else
@@ -42,7 +40,8 @@ def classify_ddg(ddg):
         return "neutral"
     else:
         return "uncertain"
-# 1. data loading
+
+
 def load_all_proteins(foldx5_dir, foldx51_dir):
     # load all csvs from both folders, match by filename
     # classify each mutation independently
@@ -77,23 +76,49 @@ def load_all_proteins(foldx5_dir, foldx51_dir):
             df_merged["ddg_diff"]     = abs(df_merged["ddg_foldx51"] - df_merged["ddg_foldx5"])
             df_merged["protein_name"] = protein_name
             all_protein_data.append(df_merged)
-        except Exception as e:
-            print(f"  could not process {protein_name}: {e}")
+        except KeyError as e:
+            print(f"  skipping {protein_name}: missing column {e}")
             continue
+        except Exception as e:
+            raise RuntimeError(f"unexpected error processing {protein_name}: {e}")
     if not all_protein_data:
         return pd.DataFrame()
     return pd.concat(all_protein_data, ignore_index=True)
-# 2. level 1 - global plots
-def plot_plddt_distribution(df_mutations, output_dir):
-    # kde distribution of plddt scores for agree vs disagree mutations across all proteins
-    os.makedirs(output_dir, exist_ok=True)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="plddt analysis: disagreements between foldx5 and foldx5.1")
+    parser.add_argument("-f", "--foldx5_dir",  required=True, help="path to foldx5 dataset_tables folder")
+    parser.add_argument("-i", "--foldx51_dir", required=True, help="path to foldx5.1 dataset_tables folder")
+    parser.add_argument("-o", "--output_dir",  default="./plddt_results", help="where to save results")
+    args = parser.parse_args()
+
+    print("plddt analysis: foldx5 vs foldx5.1 classification agreement\n")
+    print("[1] loading data...")
+    df_mutations = load_all_proteins(args.foldx5_dir, args.foldx51_dir)
+    if df_mutations.empty:
+        print("error: no data loaded. check folder paths and csv structure.")
+        return
+
+    print(f"\ntotal mutations: {len(df_mutations)}")
+    print(f"agree:    {(df_mutations['agreement'] == 'agree').sum()}")
+    print(f"disagree: {(df_mutations['agreement'] == 'disagree').sum()}")
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    csv_out = os.path.join(args.output_dir, "mutation_agreement_results.csv")
+    df_mutations.to_csv(csv_out, index=False)
+    print(f"\nsaved results table: {csv_out}")
+
+    print("\n[2] global plots")
+
+    # kde + barplot of plddt scores for agree vs disagree mutations
+    os.makedirs(args.output_dir, exist_ok=True)
     fig, axes = plt.subplots(1, 2, figsize=(14, 6))
     ax = axes[0]
     ax_bar = axes[1]
     for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
         subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
         if subset.empty:
-            print(f"  no data for {group} group")
             continue
         sns.kdeplot(subset, ax=ax, color=color,
                     label=f"{group} (n={len(subset)})",
@@ -103,27 +128,26 @@ def plot_plddt_distribution(df_mutations, output_dir):
     ax.set_ylabel("density")
     ax.set_title("pLDDT distribution by classification agreement\nFoldX5 vs FoldX5.1 (all proteins)")
     ax.legend(frameon=False)
-    # barplot with 3 pLDDT-wide bins
     bin_edges = np.arange(0, 101, 3)
     for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
         subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
         if subset.empty:
             continue
         counts, edges = np.histogram(subset, bins=bin_edges)
-        counts = counts / counts.sum()  # normalize to fraction
+        counts = counts / counts.sum()
         ax_bar.bar(edges[:-1], counts, width=3, color=color, alpha=0.6,
                    label=f"{group} (n={len(subset)})", align="edge", edgecolor="white")
     ax_bar.set_xlabel("pLDDT score")
-    ax_bar.set_ylabel("Fraction of mutations")
+    ax_bar.set_ylabel("fraction of mutations")
     ax_bar.set_title("pLDDT distribution by classification agreement\n(3 Å bins)")
     ax_bar.legend(frameon=False)
     plt.tight_layout()
-    out = os.path.join(output_dir, "plddt_distribution_agree_disagree.png")
+    out = os.path.join(args.output_dir, "plddt_distribution_agree_disagree.png")
     plt.savefig(out, dpi=150, bbox_inches="tight")
     plt.close()
     print(f"saved: {out}")
-def plot_plddt_scatter(df_mutations, output_dir):
-    # scatter: plddt vs absolute ddg difference per mutation, colored by agree/disagree
+
+    # scatter: plddt vs absolute ddg difference per mutation
     fig, ax = plt.subplots(figsize=(8, 6))
     for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
         subset = df_mutations[df_mutations["agreement"] == group]
@@ -138,14 +162,14 @@ def plot_plddt_scatter(df_mutations, output_dir):
     ax.set_title("pLDDT vs prediction difference per mutation\nFoldX5 vs FoldX5.1 (all proteins)")
     ax.legend(frameon=False)
     plt.tight_layout()
-    out = os.path.join(output_dir, "plddt_vs_ddg_diff_scatter.png")
+    out = os.path.join(args.output_dir, "plddt_vs_ddg_diff_scatter.png")
     plt.savefig(out, dpi=150, bbox_inches="tight")
     plt.close()
     print(f"saved: {out}")
 
+    print("\n[3] per-protein summary")
 
-# 3. level 2 - per-protein summary
-def plot_per_protein_delta(df_mutations, output_dir):
+    # per-protein: median plddt of disagree - median plddt of agree
     per_protein_results = []
     for protein_name, protein_group in df_mutations.groupby("protein_name"):
         agree_plddt    = protein_group[protein_group["agreement"] == "agree"][col_plddt]
@@ -165,7 +189,7 @@ def plot_per_protein_delta(df_mutations, output_dir):
         print("  no proteins with both agree and disagree mutations — skipping level 2 plot")
         return
     df_per_protein = pd.DataFrame(per_protein_results)
-    out_csv = os.path.join(output_dir, "per_protein_plddt_delta.csv")
+    out_csv = os.path.join(args.output_dir, "per_protein_plddt_delta.csv")
     df_per_protein.to_csv(out_csv, index=False)
     print(f"saved: {out_csv}")
     n_negative = (df_per_protein["delta_median_plddt"] < 0).sum()
@@ -186,37 +210,13 @@ def plot_per_protein_delta(df_mutations, output_dir):
                  f"{n_negative} negative, {n_positive} positive)")
     ax.legend(frameon=False)
     plt.tight_layout()
-    out = os.path.join(output_dir, "per_protein_plddt_delta.png")
+    out = os.path.join(args.output_dir, "per_protein_plddt_delta.png")
     plt.savefig(out, dpi=150, bbox_inches="tight")
     plt.close()
     print(f"saved: {out}")
 
-
-# 4. main
-def main():
-    parser = argparse.ArgumentParser(description="plddt analysis: disagreements between foldx5 and foldx5.1")
-    parser.add_argument("-f", "--foldx5_dir",  required=True, help="path to foldx5 dataset_tables folder")
-    parser.add_argument("-i", "--foldx51_dir", required=True, help="path to foldx5.1 dataset_tables folder")
-    parser.add_argument("-o", "--output_dir",  default="./plddt_results", help="where to save results")
-    args = parser.parse_args()
-    print("plddt analysis: foldx5 vs foldx5.1 classification agreement\n")
-    print("[1] loading data...")
-    df_mutations = load_all_proteins(args.foldx5_dir, args.foldx51_dir)
-    if df_mutations.empty:
-        print("error: no data loaded. check folder paths and csv structure.")
-        return
-    print(f"\ntotal mutations: {len(df_mutations)}")
-    print(f"agree:    {(df_mutations['agreement'] == 'agree').sum()}")
-    print(f"disagree: {(df_mutations['agreement'] == 'disagree').sum()}")
-    os.makedirs(args.output_dir, exist_ok=True)
-    csv_out = os.path.join(args.output_dir, "mutation_agreement_results.csv")
-    df_mutations.to_csv(csv_out, index=False)
-    print(f"\nsaved results table: {csv_out}")
-    print("\n[2] global plots...")
-    plot_plddt_distribution(df_mutations, args.output_dir)
-    plot_plddt_scatter(df_mutations, args.output_dir)
-    print("\n[3] per-protein summary...")
-    plot_per_protein_delta(df_mutations, args.output_dir)
     print("\ndone.")
+
+
 if __name__ == "__main__":
     main()
diff --git a/tools/foldx_version_comparison/rmsd_matrix.py b/tools/foldx_version_comparison/rmsd_matrix.py
index 99f4a73..bcab63c 100755
--- a/tools/foldx_version_comparison/rmsd_matrix.py
+++ b/tools/foldx_version_comparison/rmsd_matrix.py
@@ -2,7 +2,6 @@
 # For each protein/domain in both datasets, computes all-atom and CA RMSD
 # Matching is done by UniProt ID + protein name
 # RMSD is computed only on overlapping residues between the two structures
-# Run: python rmsd_matrix.py -f /data/user/shared_projects/mavisp_ensemble_sim_length/foldx5.1_evaluation/foldx5_initial_structures -i /data/user/shared_projects/mavisp_ensemble_sim_length/foldx5.1_evaluation/data_collection_foldx5.1 -o ./rmsd_results
 import os
 import glob
 import argparse
@@ -18,10 +17,6 @@
 warnings.filterwarnings("ignore")
 
 
-COLOR_VIOLIN_ALL  = "#C4A882"   # pale brown
-COLOR_VIOLIN_CA   = "#B8A0C8"   # pale purple
-COLOR_BAR_ALL     = "#B22222"   # firebrick
-COLOR_BAR_CA      = "#4A90A4"   # muted teal
 COLOR_DIST        = "#6A9CC4"   # soft blue
 
 mpl.rcParams.update({
@@ -201,8 +196,6 @@ def run_rmsd_analysis(pairs):
 
 def plot_rmsd_distributions(df, output_dir):
     # Plot 1: histogram + KDE for all-atom and CA RMSD with mean and median lines
-    # Plot 2: scatter of all-atom vs CA RMSD per structure
-    # Plot 3: bar plot of all structures sorted by CA RMSD
     os.makedirs(output_dir, exist_ok=True)
     df_clean = df.dropna(subset=["rmsd_all_atoms", "rmsd_ca"])
     df_clean = df_clean.copy()
@@ -225,64 +218,6 @@ def plot_rmsd_distributions(df, output_dir):
     plt.close()
     print(f"\nSaved: {out}")
 
-    # Plot 2: all-atom vs CA scatter with diagonal reference line
-    fig, ax = plt.subplots(figsize=(6, 6))
-    ax.scatter(df_clean["rmsd_all_atoms"], df_clean["rmsd_ca"], color=COLOR_DIST, alpha=0.8, edgecolors="white", s=80)
-    lims = [0, max(df_clean[["rmsd_all_atoms", "rmsd_ca"]].max()) * 1.05]
-    ax.plot(lims, lims, color="#999999", linestyle="--", linewidth=1, label="y = x")
-    ax.set_xlabel("All-atom RMSD (Å)")
-    ax.set_ylabel("Cα RMSD (Å)")
-    ax.set_title("All-atom vs Cα RMSD per structure")
-    ax.legend(frameon=False)
-    out = os.path.join(output_dir, "rmsd_scatter_allatom_vs_ca.png")
-    plt.savefig(out, dpi=150, bbox_inches="tight")
-    plt.close()
-    print(f"Saved: {out}")
-
-    # Plot 3: bar plot sorted by CA RMSD
-    df_sorted = df_clean.sort_values("rmsd_ca", ascending=False).reset_index(drop=True)
-    fig, ax = plt.subplots(figsize=(max(10, len(df_sorted) * 0.5), 5))
-    x = np.arange(len(df_sorted))
-    ax.bar(x - 0.2, df_sorted["rmsd_all_atoms"], width=0.4, label="All-atom", color=COLOR_BAR_ALL, alpha=0.85)
-    ax.bar(x + 0.2, df_sorted["rmsd_ca"], width=0.4, label="Cα", color=COLOR_BAR_CA, alpha=0.85)
-    ax.set_xticks(x)
-    ax.set_xticklabels(df_sorted["label_clean"], rotation=45, ha="right", fontsize=11)
-    ax.set_ylabel("RMSD (Å)")
-    ax.set_title("RMSD per structure (sorted by Cα RMSD)")
-    ax.legend(frameon=False)
-    plt.tight_layout()
-    out = os.path.join(output_dir, "rmsd_barplot_sorted.png")
-    plt.savefig(out, dpi=150, bbox_inches="tight")
-    plt.close()
-    print(f"Saved: {out}")
-
-
-def plot_rmsd_violin(df, output_dir):
-    # Violin plot: distribution of all-atom and CA RMSD across all proteins
-    df_clean = df.dropna(subset=["rmsd_all_atoms", "rmsd_ca"])
-    df_clean = df_clean.copy()
-    df_clean["label_clean"] = df_clean["label"].apply(lambda x: "_".join(x.split("_")[1:]))
-    df_melt = df_clean.melt(
-        id_vars="label_clean",
-        value_vars=["rmsd_all_atoms", "rmsd_ca"],
-        var_name="type",
-        value_name="rmsd"
-    )
-    df_melt["type"] = df_melt["type"].replace({"rmsd_all_atoms": "All-atom", "rmsd_ca": "Cα"})
-    fig, ax = plt.subplots(figsize=(6, 6))
-    sns.violinplot(data=df_melt, x="type", y="rmsd", ax=ax,
-                   palette={"All-atom": COLOR_VIOLIN_ALL, "Cα": COLOR_VIOLIN_CA},
-                   inner="box", cut=0)
-    ax.set_xlabel("")
-    ax.set_ylabel("RMSD (Å)")
-    ax.set_title("RMSD distribution across all proteins\nFoldX5 vs FoldX5.1")
-    plt.tight_layout()
-    out = os.path.join(output_dir, "rmsd_violin.png")
-    plt.savefig(out, dpi=150, bbox_inches="tight")
-    plt.close()
-    print(f"Saved: {out}")
-
-
 # 4. MAIN
 
 def main():
@@ -313,7 +248,6 @@ def main():
     print(df[["rmsd_all_atoms", "rmsd_ca"]].describe().round(3).to_string())
     print("\n[5] Generating plots...")
     plot_rmsd_distributions(df, args.output_dir)
-    plot_rmsd_violin(df, args.output_dir)
     print("\nDone.")
 if __name__ == "__main__":
     main()