Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 222 additions & 0 deletions tools/foldx_version_comparison/6.classification_plddt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# pLDDT analysis: are disagreements between FoldX5 and FoldX5.1 enriched in structurally uncertain regions?
# each mutation is treated as a separate data point (mutations at the same position share the same pLDDT)
import os
import glob
import argparse
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
warnings.filterwarnings("ignore")
mpl.rcParams.update({
"font.family": "sans-serif",
"font.size": 12,
"axes.titlesize": 13,
"axes.labelsize": 12,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"axes.spines.top": False,
"axes.spines.right": False,
})
color_agree = "#4A90A4"
color_disagree = "#B22222"
color_dist = "#6A9CC4"
col_ddg = "Stability (FoldX5, alphafold, kcal/mol)"
col_plddt = "AlphaFold2 model pLDDT score"
col_mut = "Mutation"


def classify_ddg(ddg):
# MAVISp classification thresholds applied to FoldX ddG only
# stabilizing: ddG <= -3, destabilizing: ddG >= 3, neutral: -2 < ddG < 2, uncertain: everything else
if ddg <= -3:
return "stabilizing"
elif ddg >= 3:
return "destabilizing"
elif -2 < ddg < 2:
return "neutral"
else:
return "uncertain"


def load_all_proteins(foldx5_dir, foldx51_dir):
# load all csvs from both folders, match by filename
# classify each mutation independently
# label each mutation as agree or disagree between the two versions
foldx5_files = {Path(f).stem: f for f in glob.glob(os.path.join(foldx5_dir, "*.csv"))}
foldx51_files = {Path(f).stem: f for f in glob.glob(os.path.join(foldx51_dir, "*.csv"))}
common_proteins = sorted(set(foldx5_files.keys()) & set(foldx51_files.keys()))
print(f"proteins in foldx5: {len(foldx5_files)}")
print(f"proteins in foldx5.1: {len(foldx51_files)}")
print(f"common proteins: {len(common_proteins)}")
all_protein_data = []
for stem in common_proteins:
protein_name = stem.replace("-simple_mode", "")
try:
df_foldx5 = pd.read_csv(foldx5_files[stem])
df_foldx51 = pd.read_csv(foldx51_files[stem])
df_foldx5 = df_foldx5.rename(columns={col_ddg: "ddg_foldx5"})
df_foldx51 = df_foldx51.rename(columns={col_ddg: "ddg_foldx51"})
df_merged = pd.merge(
df_foldx51[[col_mut, "ddg_foldx51", col_plddt]],
df_foldx5[[col_mut, "ddg_foldx5"]],
on=col_mut
)
if df_merged.empty:
continue
df_merged["class_foldx5"] = df_merged["ddg_foldx5"].apply(classify_ddg)
df_merged["class_foldx51"] = df_merged["ddg_foldx51"].apply(classify_ddg)
df_merged["agreement"] = df_merged.apply(
lambda row: "agree" if row["class_foldx5"] == row["class_foldx51"] else "disagree",
axis=1
)
df_merged["ddg_diff"] = abs(df_merged["ddg_foldx51"] - df_merged["ddg_foldx5"])
df_merged["protein_name"] = protein_name
all_protein_data.append(df_merged)
except KeyError as e:
print(f" skipping {protein_name}: missing column {e}")
continue
except Exception as e:
raise RuntimeError(f"unexpected error processing {protein_name}: {e}")
if not all_protein_data:
return pd.DataFrame()
return pd.concat(all_protein_data, ignore_index=True)


def main():
parser = argparse.ArgumentParser(description="plddt analysis: disagreements between foldx5 and foldx5.1")
parser.add_argument("-f", "--foldx5_dir", required=True, help="path to foldx5 dataset_tables folder")
parser.add_argument("-i", "--foldx51_dir", required=True, help="path to foldx5.1 dataset_tables folder")
parser.add_argument("-o", "--output_dir", default="./plddt_results", help="where to save results")
args = parser.parse_args()

print("plddt analysis: foldx5 vs foldx5.1 classification agreement\n")
print("[1] loading data...")
df_mutations = load_all_proteins(args.foldx5_dir, args.foldx51_dir)
if df_mutations.empty:
print("error: no data loaded. check folder paths and csv structure.")
return

print(f"\ntotal mutations: {len(df_mutations)}")
print(f"agree: {(df_mutations['agreement'] == 'agree').sum()}")
print(f"disagree: {(df_mutations['agreement'] == 'disagree').sum()}")

os.makedirs(args.output_dir, exist_ok=True)
csv_out = os.path.join(args.output_dir, "mutation_agreement_results.csv")
df_mutations.to_csv(csv_out, index=False)
print(f"\nsaved results table: {csv_out}")

print("\n[2] global plots")

# kde + barplot of plddt scores for agree vs disagree mutations
os.makedirs(args.output_dir, exist_ok=True)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax = axes[0]
ax_bar = axes[1]
for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
if subset.empty:
continue
sns.kdeplot(subset, ax=ax, color=color,
label=f"{group} (n={len(subset)})",
fill=True, alpha=0.3, linewidth=2)
ax.axvline(subset.median(), color=color, linestyle="--", linewidth=1.5, alpha=0.8)
ax.set_xlabel("pLDDT score")
ax.set_ylabel("density")
ax.set_title("pLDDT distribution by classification agreement\nFoldX5 vs FoldX5.1 (all proteins)")
ax.legend(frameon=False)
bin_edges = np.arange(0, 101, 3)
for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
subset = df_mutations[df_mutations["agreement"] == group][col_plddt]
if subset.empty:
continue
counts, edges = np.histogram(subset, bins=bin_edges)
counts = counts / counts.sum()
ax_bar.bar(edges[:-1], counts, width=3, color=color, alpha=0.6,
label=f"{group} (n={len(subset)})", align="edge", edgecolor="white")
ax_bar.set_xlabel("pLDDT score")
ax_bar.set_ylabel("fraction of mutations")
ax_bar.set_title("pLDDT distribution by classification agreement\n(3 Å bins)")
ax_bar.legend(frameon=False)
plt.tight_layout()
out = os.path.join(args.output_dir, "plddt_distribution_agree_disagree.png")
plt.savefig(out, dpi=150, bbox_inches="tight")
plt.close()
print(f"saved: {out}")

# scatter: plddt vs absolute ddg difference per mutation
fig, ax = plt.subplots(figsize=(8, 6))
for group, color in [("agree", color_agree), ("disagree", color_disagree)]:
subset = df_mutations[df_mutations["agreement"] == group]
if subset.empty:
continue
ax.scatter(subset[col_plddt], subset["ddg_diff"],
color=color, alpha=0.5, s=40,
label=f"{group} (n={len(subset)})",
edgecolors="none")
ax.set_xlabel("pLDDT score")
ax.set_ylabel("|ΔΔG FoldX5.1 − ΔΔG FoldX5| (kcal/mol)")
ax.set_title("pLDDT vs prediction difference per mutation\nFoldX5 vs FoldX5.1 (all proteins)")
ax.legend(frameon=False)
plt.tight_layout()
out = os.path.join(args.output_dir, "plddt_vs_ddg_diff_scatter.png")
plt.savefig(out, dpi=150, bbox_inches="tight")
plt.close()
print(f"saved: {out}")

print("\n[3] per-protein summary")

# per-protein: median plddt of disagree - median plddt of agree
per_protein_results = []
for protein_name, protein_group in df_mutations.groupby("protein_name"):
agree_plddt = protein_group[protein_group["agreement"] == "agree"][col_plddt]
disagree_plddt = protein_group[protein_group["agreement"] == "disagree"][col_plddt]
if agree_plddt.empty or disagree_plddt.empty:
continue
delta_median = disagree_plddt.median() - agree_plddt.median()
per_protein_results.append({
"protein_name": protein_name,
"median_plddt_agree": agree_plddt.median(),
"median_plddt_disagree": disagree_plddt.median(),
"delta_median_plddt": delta_median,
"n_agree_mutations": len(agree_plddt),
"n_disagree_mutations": len(disagree_plddt),
})
if not per_protein_results:
print(" no proteins with both agree and disagree mutations — skipping level 2 plot")
return
df_per_protein = pd.DataFrame(per_protein_results)
out_csv = os.path.join(args.output_dir, "per_protein_plddt_delta.csv")
df_per_protein.to_csv(out_csv, index=False)
print(f"saved: {out_csv}")
n_negative = (df_per_protein["delta_median_plddt"] < 0).sum()
n_positive = (df_per_protein["delta_median_plddt"] > 0).sum()
print(f" proteins where disagree plddt < agree plddt: {n_negative}")
print(f" proteins where disagree plddt > agree plddt: {n_positive}")
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df_per_protein["delta_median_plddt"], kde=True, ax=ax,
color=color_dist, edgecolor="white", bins=20, alpha=0.8)
ax.axvline(0, color="#555555", linestyle="--", linewidth=1.5, label="no difference (0)")
ax.axvline(df_per_protein["delta_median_plddt"].median(), color=color_disagree,
linestyle="--", linewidth=1.5,
label=f"median: {df_per_protein['delta_median_plddt'].median():.1f}")
ax.set_xlabel("median pLDDT (disagree) − median pLDDT (agree)")
ax.set_ylabel("number of proteins")
ax.set_title(f"per-protein pLDDT difference: disagree vs agree mutations\n"
f"(n={len(df_per_protein)} proteins — "
f"{n_negative} negative, {n_positive} positive)")
ax.legend(frameon=False)
plt.tight_layout()
out = os.path.join(args.output_dir, "per_protein_plddt_delta.png")
plt.savefig(out, dpi=150, bbox_inches="tight")
plt.close()
print(f"saved: {out}")

print("\ndone.")


if __name__ == "__main__":
main()
75 changes: 5 additions & 70 deletions tools/foldx_version_comparison/rmsd_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# For each protein/domain in both datasets, computes all-atom and CA RMSD
# Matching is done by UniProt ID + protein name
# RMSD is computed only on overlapping residues between the two structures
# Run: python rmsd_matrix.py --foldx5_dir /path/to/folder --foldx51_dir /path/to/folder --output_dir ./rmsd_results
import os
import glob
import argparse
Expand All @@ -18,10 +17,6 @@
warnings.filterwarnings("ignore")


COLOR_VIOLIN_ALL = "#C4A882" # pale brown
COLOR_VIOLIN_CA = "#B8A0C8" # pale purple
COLOR_BAR_ALL = "#B22222" # firebrick
COLOR_BAR_CA = "#4A90A4" # muted teal
COLOR_DIST = "#6A9CC4" # soft blue

mpl.rcParams.update({
Expand Down Expand Up @@ -201,8 +196,6 @@ def run_rmsd_analysis(pairs):

def plot_rmsd_distributions(df, output_dir):
# Plot 1: histogram + KDE for all-atom and CA RMSD with mean and median lines
# Plot 2: scatter of all-atom vs CA RMSD per structure
# Plot 3: bar plot of all structures sorted by CA RMSD
os.makedirs(output_dir, exist_ok=True)
df_clean = df.dropna(subset=["rmsd_all_atoms", "rmsd_ca"])
df_clean = df_clean.copy()
Expand All @@ -212,7 +205,8 @@ def plot_rmsd_distributions(df, output_dir):
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle("RMSD distributions: FoldX5 vs FoldX5.1 initial structures", fontsize=13, fontweight="bold", y=1.01)
for ax, col, title in zip(axes, ["rmsd_all_atoms", "rmsd_ca"], ["All-atom RMSD (Å)", "Cα RMSD (Å)"]):
sns.histplot(df_clean[col], ax=ax, color=COLOR_DIST, edgecolor="white", bins=10, alpha=0.8)
bin_edges = np.arange(0, df_clean[col].max() + 1, 1)
sns.histplot(df_clean[col], ax=ax, color=COLOR_DIST, edgecolor="white", bins=bin_edges, alpha=0.8)
ax.axvline(df_clean[col].median(), color="#CC0000", linestyle="--", linewidth=1.5, label=f"Median: {df_clean[col].median():.2f} Å")
ax.axvline(df_clean[col].mean(), color="#555555", linestyle=":", linewidth=1.5, label=f"Mean: {df_clean[col].mean():.2f} Å")
ax.set_xlabel(title)
Expand All @@ -224,71 +218,13 @@ def plot_rmsd_distributions(df, output_dir):
plt.close()
print(f"\nSaved: {out}")

# Plot 2: all-atom vs CA scatter with diagonal reference line
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_clean["rmsd_all_atoms"], df_clean["rmsd_ca"], color=COLOR_DIST, alpha=0.8, edgecolors="white", s=80)
lims = [0, max(df_clean[["rmsd_all_atoms", "rmsd_ca"]].max()) * 1.05]
ax.plot(lims, lims, color="#999999", linestyle="--", linewidth=1, label="y = x")
ax.set_xlabel("All-atom RMSD (Å)")
ax.set_ylabel("Cα RMSD (Å)")
ax.set_title("All-atom vs Cα RMSD per structure")
ax.legend(frameon=False)
out = os.path.join(output_dir, "rmsd_scatter_allatom_vs_ca.png")
plt.savefig(out, dpi=150, bbox_inches="tight")
plt.close()
print(f"Saved: {out}")

# Plot 3: bar plot sorted by CA RMSD
df_sorted = df_clean.sort_values("rmsd_ca", ascending=False).reset_index(drop=True)
fig, ax = plt.subplots(figsize=(max(10, len(df_sorted) * 0.5), 5))
x = np.arange(len(df_sorted))
ax.bar(x - 0.2, df_sorted["rmsd_all_atoms"], width=0.4, label="All-atom", color=COLOR_BAR_ALL, alpha=0.85)
ax.bar(x + 0.2, df_sorted["rmsd_ca"], width=0.4, label="Cα", color=COLOR_BAR_CA, alpha=0.85)
ax.set_xticks(x)
ax.set_xticklabels(df_sorted["label_clean"], rotation=45, ha="right", fontsize=11)
ax.set_ylabel("RMSD (Å)")
ax.set_title("RMSD per structure (sorted by Cα RMSD)")
ax.legend(frameon=False)
plt.tight_layout()
out = os.path.join(output_dir, "rmsd_barplot_sorted.png")
plt.savefig(out, dpi=150, bbox_inches="tight")
plt.close()
print(f"Saved: {out}")


def plot_rmsd_violin(df, output_dir):
# Violin plot: distribution of all-atom and CA RMSD across all proteins
df_clean = df.dropna(subset=["rmsd_all_atoms", "rmsd_ca"])
df_clean = df_clean.copy()
df_clean["label_clean"] = df_clean["label"].apply(lambda x: "_".join(x.split("_")[1:]))
df_melt = df_clean.melt(
id_vars="label_clean",
value_vars=["rmsd_all_atoms", "rmsd_ca"],
var_name="type",
value_name="rmsd"
)
df_melt["type"] = df_melt["type"].replace({"rmsd_all_atoms": "All-atom", "rmsd_ca": "Cα"})
fig, ax = plt.subplots(figsize=(6, 6))
sns.violinplot(data=df_melt, x="type", y="rmsd", ax=ax,
palette={"All-atom": COLOR_VIOLIN_ALL, "Cα": COLOR_VIOLIN_CA},
inner="box", cut=0)
ax.set_xlabel("")
ax.set_ylabel("RMSD (Å)")
ax.set_title("RMSD distribution across all proteins\nFoldX5 vs FoldX5.1")
plt.tight_layout()
out = os.path.join(output_dir, "rmsd_violin.png")
plt.savefig(out, dpi=150, bbox_inches="tight")
plt.close()
print(f"Saved: {out}")


# 4. MAIN

def main():
parser = argparse.ArgumentParser(description="Compute RMSD between FoldX5 and FoldX5.1 initial structures")
parser.add_argument("--foldx5_dir", required=True, help="Path to FoldX5 folder (organized by protein name)")
parser.add_argument("--foldx51_dir", required=True, help="Root of FoldX5.1 data collection folder")
parser.add_argument("--output_dir", default="./rmsd_results", help="Where to save results (default: ./rmsd_results)")
parser.add_argument("-f", "--foldx5_dir", required=True, help="Path to FoldX5 folder (organized by protein name)")
parser.add_argument("-i", "--foldx51_dir", required=True, help="Root of FoldX5.1 data collection folder")
parser.add_argument("-o", "--output_dir", default="./rmsd_results", help="Where to save results (default: ./rmsd_results)")
args = parser.parse_args()

print("RMSD analysis: FoldX5 vs FoldX5.1 initial structures")
Expand All @@ -312,7 +248,6 @@ def main():
print(df[["rmsd_all_atoms", "rmsd_ca"]].describe().round(3).to_string())
print("\n[5] Generating plots...")
plot_rmsd_distributions(df, args.output_dir)
plot_rmsd_violin(df, args.output_dir)
print("\nDone.")
if __name__ == "__main__":
main()