From 0201acca206498a7076393780285168c2d90f6b2 Mon Sep 17 00:00:00 2001 From: Clay Moore Date: Mon, 18 May 2026 18:11:41 -0500 Subject: [PATCH] fix: guard against division-by-zero and uninitialized state in metrics/masking code - mlm_memmap.py: normalize codon weights only when mean > 0 to prevent NaN propagating into np.random.binomial when all token weights are zero - mlm_memmap.py: clamp conditional random-replace probability to [0, 1] and guard against mask_replace_prob == 1.0 causing ZeroDivisionError - dead_latents.py: initialize _last_avg_nonzero in __init__ so get_stats() is safe to call before the first update() - evo2_dataset.py: remove duplicate ASCII 45 ('-') entry in VALID_DNA_AND_DEGENERATE Signed-off-by: Clay Moore --- .../sae/src/sae/eval/dead_latents.py | 1 + .../codonfm_ptl_te/src/data/preprocess/mlm_memmap.py | 11 ++++++++--- .../bionemo/evo2/data/megatron/hyena/evo2_dataset.py | 1 - 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bionemo-recipes/interpretability/sparse_autoencoders/sae/src/sae/eval/dead_latents.py b/bionemo-recipes/interpretability/sparse_autoencoders/sae/src/sae/eval/dead_latents.py index 6ee425adf6..ba8655932b 100644 --- a/bionemo-recipes/interpretability/sparse_autoencoders/sae/src/sae/eval/dead_latents.py +++ b/bionemo-recipes/interpretability/sparse_autoencoders/sae/src/sae/eval/dead_latents.py @@ -67,6 +67,7 @@ def __init__( # Tracks activity since last reset self.recently_active = torch.zeros(hidden_dim, dtype=torch.bool, device=device) + self._last_avg_nonzero: float = 0.0 @torch.no_grad() def update(self, codes: torch.Tensor) -> None: diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/data/preprocess/mlm_memmap.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/data/preprocess/mlm_memmap.py index a286849fe5..ed576b8c62 100644 --- a/bionemo-recipes/recipes/codonfm_ptl_te/src/data/preprocess/mlm_memmap.py +++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/data/preprocess/mlm_memmap.py @@ -66,7 +66,9 @@ def process_item( # noqa: D417 prob_input_sequence = np.ones_like(input_sequence_toks) * mlm_probability if codon_weights is not None: pos_weight = codon_weights[input_sequence_toks] - pos_weight[1:-1] = pos_weight[1:-1] / pos_weight[1:-1].mean() + mean_weight = pos_weight[1:-1].mean() + if mean_weight > 0: + pos_weight[1:-1] = pos_weight[1:-1] / mean_weight prob_input_sequence = prob_input_sequence * pos_weight prob_input_sequence = np.clip(prob_input_sequence, 0.05, 0.4) mask_indices = np.random.binomial(1, prob_input_sequence).astype(bool) # noqa: NPY002 @@ -82,10 +84,13 @@ def process_item( # noqa: D417 masked_input_sequence_toks[indices_replaced] = tokenizer.mask_token_id if random_replace_prob > 0.0: + remaining_prob = 1.0 - mask_replace_prob + # Conditional probability of random replacement given the token was not replaced by [MASK]. + # Guard against mask_replace_prob == 1.0 (no remaining positions) or rounding above 1.0. + conditional_prob = min(random_replace_prob / remaining_prob, 1.0) if remaining_prob > 0.0 else 0.0 indices_random = np.random.binomial( # noqa: NPY002 1, - (np.ones_like(mask_indices).astype(bool) & mask_indices & ~indices_replaced) - * (random_replace_prob / (1 - mask_replace_prob)), + (np.ones_like(mask_indices).astype(bool) & mask_indices & ~indices_replaced) * conditional_prob, ).astype(bool) valid_tokens = np.setdiff1d( np.arange(tokenizer.vocab_size), diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/data/megatron/hyena/evo2_dataset.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/data/megatron/hyena/evo2_dataset.py index 5073059990..ec7adf0470 100644 --- a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/data/megatron/hyena/evo2_dataset.py +++ b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/data/megatron/hyena/evo2_dataset.py @@ -38,7 +38,6 @@ class Evo2Dataset(GPTDataset): MAX_TAG_LEN = 2048 VALID_DNA_AND_DEGENERATE: ClassVar[set[int]] = { - 45, 45, 65, 66,