From 5176a8e6b6a6e953515c45a807c9512e3bddaa0c Mon Sep 17 00:00:00 2001 From: Yifei Ding Date: Mon, 9 Mar 2026 20:55:19 -0700 Subject: [PATCH] Fix df_mash_corr_dist silently overwritten with raw Mash distances In 6 places across cells 14, 20, 23, 32, and 34, df_mash_corr_dist (and related _complete variants) were incorrectly assigned from df_mash_square instead of from themselves. This caused correlation distance matrices to be replaced with raw Mash distances, changing the effective fcluster thresholds by ~30x and producing fragmented clustering results. Co-Authored-By: Claude Opus 4.6 --- .../2b_mash_filtration_and_clustering.ipynb | 96 ++----------------- 1 file changed, 9 insertions(+), 87 deletions(-) diff --git a/examples/2b_mash_filtration_and_clustering.ipynb b/examples/2b_mash_filtration_and_clustering.ipynb index a408128..e624b6a 100644 --- a/examples/2b_mash_filtration_and_clustering.ipynb +++ b/examples/2b_mash_filtration_and_clustering.ipynb @@ -197,13 +197,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# scrubbed_strains = scrubbed_species_metadata.genome_id.astype('str')\n", - "\n", - "df_mash_square = df_mash_square.loc[scrubbed_strains, scrubbed_strains]\n", - "df_mash_corr = df_mash_corr.loc[scrubbed_strains, scrubbed_strains]\n", - "df_mash_corr_dist = df_mash_square.loc[scrubbed_strains, scrubbed_strains]" - ] + "source": "# scrubbed_strains = scrubbed_species_metadata.genome_id.astype('str')\n\ndf_mash_square = df_mash_square.loc[scrubbed_strains, scrubbed_strains]\ndf_mash_corr = df_mash_corr.loc[scrubbed_strains, scrubbed_strains]\ndf_mash_corr_dist = df_mash_corr_dist.loc[scrubbed_strains, scrubbed_strains]" }, { "cell_type": "markdown", @@ -263,17 +257,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "for repr_strain in repr_strains:\n", - " cond = df_mash_square.loc[repr_strain] < cutoff\n", - " good_strains = df_mash_square.loc[repr_strain][cond].index\n", - " \n", - " df_mash_square = df_mash_square.loc[good_strains, good_strains]\n", - " df_mash_corr = df_mash_corr.loc[good_strains, good_strains]\n", - " df_mash_corr_dist = df_mash_square.loc[good_strains, good_strains]\n", - " \n", - "df_mash_corr_dist.shape" - ] + "source": "for repr_strain in repr_strains:\n cond = df_mash_square.loc[repr_strain] < cutoff\n good_strains = df_mash_square.loc[repr_strain][cond].index\n \n df_mash_square = df_mash_square.loc[good_strains, good_strains]\n df_mash_corr = df_mash_corr.loc[good_strains, good_strains]\n df_mash_corr_dist = df_mash_corr_dist.loc[good_strains, good_strains]\n \ndf_mash_corr_dist.shape" }, { "cell_type": "code", @@ -303,20 +287,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "cond = scrubbed_species_summary.genome_status == 'Complete'\n", - "complete_seqs = set(scrubbed_species_summary[cond].genome_id)\n", - "complete_seqs = sorted(\n", - " complete_seqs.intersection(set(df_mash_square.index))\n", - ")\n", - "\n", - "\n", - "df_mash_square_complete = df_mash_square.loc[complete_seqs, complete_seqs]\n", - "df_mash_corr_complete = df_mash_square.loc[complete_seqs, complete_seqs]\n", - "df_mash_corr_dist_complete = df_mash_square.loc[complete_seqs, complete_seqs]\n", - "\n", - "df_mash_corr_dist_complete.shape" - ] + "source": "cond = scrubbed_species_summary.genome_status == 'Complete'\ncomplete_seqs = set(scrubbed_species_summary[cond].genome_id)\ncomplete_seqs = sorted(\n complete_seqs.intersection(set(df_mash_square.index))\n)\n\n\ndf_mash_square_complete = df_mash_square.loc[complete_seqs, complete_seqs]\ndf_mash_corr_complete = df_mash_corr.loc[complete_seqs, complete_seqs]\ndf_mash_corr_dist_complete = df_mash_corr_dist.loc[complete_seqs, complete_seqs]\n\ndf_mash_corr_dist_complete.shape" }, { "cell_type": "code", @@ -435,19 +406,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "bad_genomes_list = []\n", - "\n", - "for genome in df_mash_square_complete.index:\n", - " cluster = clst.loc[genome, 'cluster']\n", - " if cluster in bad_clusters:\n", - " bad_genomes_list.append(genome)\n", - "\n", - "# Update filtration\n", - "df_mash_square_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n", - "df_mash_corr_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n", - "df_mash_corr_dist_complete = remove_bad_strains(df_mash_corr_dist_complete, bad_genomes_list)" - ] + "source": "bad_genomes_list = []\n\nfor genome in df_mash_square_complete.index:\n cluster = clst.loc[genome, 'cluster']\n if cluster in bad_clusters:\n bad_genomes_list.append(genome)\n\n# Update filtration\ndf_mash_square_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\ndf_mash_corr_complete = remove_bad_strains(df_mash_corr_complete, bad_genomes_list)\ndf_mash_corr_dist_complete = remove_bad_strains(df_mash_corr_dist_complete, bad_genomes_list)" }, { "cell_type": "markdown", @@ -461,44 +420,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "iteration = 1\n", - "prev = 0\n", - "curr = len(clst.cluster.unique())\n", - "\n", - "while(np.abs(prev - curr) > 0 ):\n", - " print(f'iteration {iteration}...{curr}')\n", - " \n", - " # Cluster\n", - " link, dist, clst = cluster_corr_dist(df_mash_corr_dist_complete, thresh=elbow_threshold)\n", - " \n", - " # Color each cluster\n", - " cm = matplotlib.colormaps.get_cmap('tab20')\n", - " clr = dict(zip(sorted(clst.cluster.unique()), cm.colors))\n", - " clst['color'] = clst.cluster.map(clr)\n", - " \n", - " # Increment\n", - " prev = curr\n", - " curr = len(clst.cluster.unique())\n", - " \n", - " # Define bad clusters\n", - " bad_clusters = clst.cluster.value_counts()[clst.cluster.value_counts() < small_clst_limit]\n", - " \n", - " # Remove bad genomes\n", - " bad_genomes_list = []\n", - " for genome in df_mash_square_complete.index:\n", - " cluster = clst.loc[genome, 'cluster']\n", - " if cluster in bad_clusters:\n", - " bad_genomes_list.append(genome)\n", - " \n", - " # Update filtration\n", - " df_mash_square_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n", - " df_mash_corr_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n", - " df_mash_corr_dist_complete = remove_bad_strains(df_mash_corr_dist_complete, bad_genomes_list)\n", - " \n", - " # Increment\n", - " iteration +=1" - ] + "source": "iteration = 1\nprev = 0\ncurr = len(clst.cluster.unique())\n\nwhile(np.abs(prev - curr) > 0 ):\n print(f'iteration {iteration}...{curr}')\n \n # Cluster\n link, dist, clst = cluster_corr_dist(df_mash_corr_dist_complete, thresh=elbow_threshold)\n \n # Color each cluster\n cm = matplotlib.colormaps.get_cmap('tab20')\n clr = dict(zip(sorted(clst.cluster.unique()), cm.colors))\n clst['color'] = clst.cluster.map(clr)\n \n # Increment\n prev = curr\n curr = len(clst.cluster.unique())\n \n # Define bad clusters\n bad_clusters = clst.cluster.value_counts()[clst.cluster.value_counts() < small_clst_limit]\n \n # Remove bad genomes\n bad_genomes_list = []\n for genome in df_mash_square_complete.index:\n cluster = clst.loc[genome, 'cluster']\n if cluster in bad_clusters:\n bad_genomes_list.append(genome)\n \n # Update filtration\n df_mash_square_complete = remove_bad_strains(df_mash_square_complete, bad_genomes_list)\n df_mash_corr_complete = remove_bad_strains(df_mash_corr_complete, bad_genomes_list)\n df_mash_corr_dist_complete = remove_bad_strains(df_mash_corr_dist_complete, bad_genomes_list)\n \n # Increment\n iteration +=1" }, { "cell_type": "code", @@ -647,9 +569,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python pyphylon", + "display_name": "pangenome", "language": "python", - "name": "pyphylontesting" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -661,9 +583,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.8" + "version": "3.11.14" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file