From 0201acca206498a7076393780285168c2d90f6b2 Mon Sep 17 00:00:00 2001
From: Clay Moore <claytonwaynemoore@gmail.com>
Date: Mon, 18 May 2026 18:11:41 -0500
Subject: [PATCH] fix: guard against division-by-zero and uninitialized state
 in metrics/masking code

- mlm_memmap.py: normalize codon weights only when mean > 0 to prevent NaN
  propagating into np.random.binomial when all token weights are zero
- mlm_memmap.py: clamp conditional random-replace probability to [0, 1] and
  guard against mask_replace_prob == 1.0 causing ZeroDivisionError
- dead_latents.py: initialize _last_avg_nonzero in __init__ so get_stats()
  is safe to call before the first update()
- evo2_dataset.py: remove duplicate ASCII 45 ('-') entry in VALID_DNA_AND_DEGENERATE

Signed-off-by: Clay Moore <claytonwaynemoore@gmail.com>
---
 .../sae/src/sae/eval/dead_latents.py                  |  1 +
 .../codonfm_ptl_te/src/data/preprocess/mlm_memmap.py  | 11 ++++++++---
 .../bionemo/evo2/data/megatron/hyena/evo2_dataset.py  |  1 -
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/bionemo-recipes/interpretability/sparse_autoencoders/sae/src/sae/eval/dead_latents.py b/bionemo-recipes/interpretability/sparse_autoencoders/sae/src/sae/eval/dead_latents.py
index 6ee425adf6..ba8655932b 100644
--- a/bionemo-recipes/interpretability/sparse_autoencoders/sae/src/sae/eval/dead_latents.py
+++ b/bionemo-recipes/interpretability/sparse_autoencoders/sae/src/sae/eval/dead_latents.py
@@ -67,6 +67,7 @@ def __init__(
 
         # Tracks activity since last reset
         self.recently_active = torch.zeros(hidden_dim, dtype=torch.bool, device=device)
+        self._last_avg_nonzero: float = 0.0
 
     @torch.no_grad()
     def update(self, codes: torch.Tensor) -> None:
diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/src/data/preprocess/mlm_memmap.py b/bionemo-recipes/recipes/codonfm_ptl_te/src/data/preprocess/mlm_memmap.py
index a286849fe5..ed576b8c62 100644
--- a/bionemo-recipes/recipes/codonfm_ptl_te/src/data/preprocess/mlm_memmap.py
+++ b/bionemo-recipes/recipes/codonfm_ptl_te/src/data/preprocess/mlm_memmap.py
@@ -66,7 +66,9 @@ def process_item(  # noqa: D417
         prob_input_sequence = np.ones_like(input_sequence_toks) * mlm_probability
         if codon_weights is not None:
             pos_weight = codon_weights[input_sequence_toks]
-            pos_weight[1:-1] = pos_weight[1:-1] / pos_weight[1:-1].mean()
+            mean_weight = pos_weight[1:-1].mean()
+            if mean_weight > 0:
+                pos_weight[1:-1] = pos_weight[1:-1] / mean_weight
             prob_input_sequence = prob_input_sequence * pos_weight
             prob_input_sequence = np.clip(prob_input_sequence, 0.05, 0.4)
         mask_indices = np.random.binomial(1, prob_input_sequence).astype(bool)  # noqa: NPY002
@@ -82,10 +84,13 @@ def process_item(  # noqa: D417
         masked_input_sequence_toks[indices_replaced] = tokenizer.mask_token_id
 
     if random_replace_prob > 0.0:
+        remaining_prob = 1.0 - mask_replace_prob
+        # Conditional probability of random replacement given the token was not replaced by [MASK].
+        # Use 0.0 when mask_replace_prob >= 1.0 (no remaining positions); cap at 1.0 since rounding can exceed it.
+        conditional_prob = min(random_replace_prob / remaining_prob, 1.0) if remaining_prob > 0.0 else 0.0
         indices_random = np.random.binomial(  # noqa: NPY002
             1,
-            (np.ones_like(mask_indices).astype(bool) & mask_indices & ~indices_replaced)
-            * (random_replace_prob / (1 - mask_replace_prob)),
+            (np.ones_like(mask_indices).astype(bool) & mask_indices & ~indices_replaced) * conditional_prob,
         ).astype(bool)
         valid_tokens = np.setdiff1d(
             np.arange(tokenizer.vocab_size),
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/data/megatron/hyena/evo2_dataset.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/data/megatron/hyena/evo2_dataset.py
index 5073059990..ec7adf0470 100644
--- a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/data/megatron/hyena/evo2_dataset.py
+++ b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/data/megatron/hyena/evo2_dataset.py
@@ -38,7 +38,6 @@ class Evo2Dataset(GPTDataset):
     MAX_TAG_LEN = 2048
 
     VALID_DNA_AND_DEGENERATE: ClassVar[set[int]] = {
-        45,
         45,
         65,
         66,