From 5176a8e6b6a6e953515c45a807c9512e3bddaa0c Mon Sep 17 00:00:00 2001
From: Yifei Ding <yifeiding@protonmail.com>
Date: Mon, 9 Mar 2026 20:55:19 -0700
Subject: [PATCH] Fix df_mash_corr_dist silently overwritten with raw Mash
 distances

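In 6 places across cells 14, 20, 23, 32, and 34, the correlation
matrices were derived from the wrong source: df_mash_corr_dist and
df_mash_corr_dist_complete were sliced from df_mash_square, and
df_mash_corr_complete was sliced from df_mash_square or filtered from
df_mash_square_complete, instead of from the corresponding correlation
or correlation-distance matrix. This caused the correlation and
correlation-distance matrices to be replaced with raw Mash distances,
changing the effective fcluster thresholds by ~30x and producing
fragmented clustering results.

This patch also rewrites each touched cell's "source" from a list of
lines into a single string, changes the notebook's kernelspec and
language_info metadata (kernel name, display name, Python version),
and drops the trailing newline at end of file.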

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../2b_mash_filtration_and_clustering.ipynb   | 96 ++-----------------
 1 file changed, 9 insertions(+), 87 deletions(-)

diff --git a/examples/2b_mash_filtration_and_clustering.ipynb b/examples/2b_mash_filtration_and_clustering.ipynb
index a408128..e624b6a 100644
--- a/examples/2b_mash_filtration_and_clustering.ipynb
+++ b/examples/2b_mash_filtration_and_clustering.ipynb
@@ -197,13 +197,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# scrubbed_strains = scrubbed_species_metadata.genome_id.astype('str')\n",
-    "\n",
-    "df_mash_square = df_mash_square.loc[scrubbed_strains, scrubbed_strains]\n",
-    "df_mash_corr = df_mash_corr.loc[scrubbed_strains, scrubbed_strains]\n",
-    "df_mash_corr_dist = df_mash_square.loc[scrubbed_strains, scrubbed_strains]"
-   ]
+   "source": "# scrubbed_strains = scrubbed_species_metadata.genome_id.astype('str')\n\ndf_mash_square = df_mash_square.loc[scrubbed_strains, scrubbed_strains]\ndf_mash_corr = df_mash_corr.loc[scrubbed_strains, scrubbed_strains]\ndf_mash_corr_dist = df_mash_corr_dist.loc[scrubbed_strains, scrubbed_strains]"
   },
   {
    "cell_type": "markdown",
@@ -263,17 +257,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "for repr_strain in repr_strains:\n",
-    "    cond = df_mash_square.loc[repr_strain] < cutoff\n",
-    "    good_strains = df_mash_square.loc[repr_strain][cond].index\n",
-    "    \n",
-    "    df_mash_square = df_mash_square.loc[good_strains, good_strains]\n",
-    "    df_mash_corr = df_mash_corr.loc[good_strains, good_strains]\n",
-    "    df_mash_corr_dist = df_mash_square.loc[good_strains, good_strains]\n",
-    "    \n",
-    "df_mash_corr_dist.shape"
-   ]
+   "source": "for repr_strain in repr_strains:\n    cond = df_mash_square.loc[repr_strain] < cutoff\n    good_strains = df_mash_square.loc[repr_strain][cond].index\n    \n    df_mash_square = df_mash_square.loc[good_strains, good_strains]\n    df_mash_corr = df_mash_corr.loc[good_strains, good_strains]\n    df_mash_corr_dist = df_mash_corr_dist.loc[good_strains, good_strains]\n    \ndf_mash_corr_dist.shape"
   },
   {
    "cell_type": "code",
@@ -303,20 +287,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "cond = scrubbed_species_summary.genome_status == 'Complete'\n",
-    "complete_seqs = set(scrubbed_species_summary[cond].genome_id)\n",
-    "complete_seqs = sorted(\n",
-    "    complete_seqs.intersection(set(df_mash_square.index))\n",
-    ")\n",
-    "\n",
-    "\n",
-    "df_mash_square_complete = df_mash_square.loc[complete_seqs, complete_seqs]\n",
-    "df_mash_corr_complete = df_mash_square.loc[complete_seqs, complete_seqs]\n",
-    "df_mash_corr_dist_complete = df_mash_square.loc[complete_seqs, complete_seqs]\n",
-    "\n",
-    "df_mash_corr_dist_complete.shape"
-   ]
+   "source": "cond = scrubbed_species_summary.genome_status == 'Complete'\ncomplete_seqs = set(scrubbed_species_summary[cond].genome_id)\ncomplete_seqs = sorted(\n    complete_seqs.intersection(set(df_mash_square.index))\n)\n\n\ndf_mash_square_complete = df_mash_square.loc[complete_seqs, complete_seqs]\ndf_mash_corr_complete = df_mash_corr.loc[complete_seqs, complete_seqs]\ndf_mash_corr_dist_complete = df_mash_corr_dist.loc[complete_seqs, complete_seqs]\n\ndf_mash_corr_dist_complete.shape"
   },
   {
    "cell_type": "code",
@@ -435,19 +406,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "bad_genomes_list = []\n",
-    "\n",
-    "for genome in df_mash_square_complete.index:\n",
-    "    cluster = clst.loc[genome, 'cluster']\n",
-    "    if cluster in bad_clusters:\n",
-    "        bad_genomes_list.append(genome)\n",
-    "\n",
-    "# Update filtration\n",
-    "df_mash_square_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n",
-    "df_mash_corr_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n",
-    "df_mash_corr_dist_complete = remove_bad_strains(df_mash_corr_dist_complete, bad_genomes_list)"
-   ]
+   "source": "bad_genomes_list = []\n\nfor genome in df_mash_square_complete.index:\n    cluster = clst.loc[genome, 'cluster']\n    if cluster in bad_clusters:\n        bad_genomes_list.append(genome)\n\n# Update filtration\ndf_mash_square_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\ndf_mash_corr_complete = remove_bad_strains(df_mash_corr_complete, bad_genomes_list)\ndf_mash_corr_dist_complete = remove_bad_strains(df_mash_corr_dist_complete, bad_genomes_list)"
   },
   {
    "cell_type": "markdown",
@@ -461,44 +420,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "iteration = 1\n",
-    "prev = 0\n",
-    "curr = len(clst.cluster.unique())\n",
-    "\n",
-    "while(np.abs(prev - curr) > 0 ):\n",
-    "    print(f'iteration {iteration}...{curr}')\n",
-    "    \n",
-    "    # Cluster\n",
-    "    link, dist, clst = cluster_corr_dist(df_mash_corr_dist_complete, thresh=elbow_threshold)\n",
-    "    \n",
-    "    # Color each cluster\n",
-    "    cm = matplotlib.colormaps.get_cmap('tab20')\n",
-    "    clr = dict(zip(sorted(clst.cluster.unique()), cm.colors))\n",
-    "    clst['color'] = clst.cluster.map(clr)\n",
-    "    \n",
-    "    # Increment\n",
-    "    prev = curr\n",
-    "    curr = len(clst.cluster.unique())\n",
-    "    \n",
-    "    # Define bad clusters\n",
-    "    bad_clusters = clst.cluster.value_counts()[clst.cluster.value_counts() < small_clst_limit]\n",
-    "    \n",
-    "    # Remove bad genomes\n",
-    "    bad_genomes_list = []\n",
-    "    for genome in df_mash_square_complete.index:\n",
-    "        cluster = clst.loc[genome, 'cluster']\n",
-    "        if cluster in bad_clusters:\n",
-    "            bad_genomes_list.append(genome)\n",
-    "    \n",
-    "    # Update filtration\n",
-    "    df_mash_square_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n",
-    "    df_mash_corr_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n",
-    "    df_mash_corr_dist_complete = remove_bad_strains(df_mash_corr_dist_complete, bad_genomes_list)\n",
-    "    \n",
-    "    # Increment\n",
-    "    iteration +=1"
-   ]
+   "source": "iteration = 1\nprev = 0\ncurr = len(clst.cluster.unique())\n\nwhile(np.abs(prev - curr) > 0 ):\n    print(f'iteration {iteration}...{curr}')\n    \n    # Cluster\n    link, dist, clst = cluster_corr_dist(df_mash_corr_dist_complete, thresh=elbow_threshold)\n    \n    # Color each cluster\n    cm = matplotlib.colormaps.get_cmap('tab20')\n    clr = dict(zip(sorted(clst.cluster.unique()), cm.colors))\n    clst['color'] = clst.cluster.map(clr)\n    \n    # Increment\n    prev = curr\n    curr = len(clst.cluster.unique())\n    \n    # Define bad clusters\n    bad_clusters = clst.cluster.value_counts()[clst.cluster.value_counts() < small_clst_limit]\n    \n    # Remove bad genomes\n    bad_genomes_list = []\n    for genome in df_mash_square_complete.index:\n        cluster = clst.loc[genome, 'cluster']\n        if cluster in bad_clusters:\n            bad_genomes_list.append(genome)\n    \n    # Update filtration\n    df_mash_square_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n    df_mash_corr_complete = remove_bad_strains(df_mash_corr_complete, bad_genomes_list)\n    df_mash_corr_dist_complete = remove_bad_strains(df_mash_corr_dist_complete, bad_genomes_list)\n    \n    # Increment\n    iteration +=1"
   },
   {
    "cell_type": "code",
@@ -647,9 +569,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python pyphylon",
+   "display_name": "pangenome",
    "language": "python",
-   "name": "pyphylontesting"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -661,9 +583,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.8"
+   "version": "3.11.14"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file