From 1a1286934df561ebd66492ef235274c4c01f0d62 Mon Sep 17 00:00:00 2001
From: Dhruvil <dhruvilparikh79@gmail.com>
Date: Sat, 16 May 2026 19:43:06 -0400
Subject: [PATCH] Add FORBIDDEN_TOKENS to Gemma4Tokenizer covering image +
 audio placeholders

`Gemma4Tokenizer` did not define `FORBIDDEN_TOKENS`, so it inherited
the base class default of `()`. The sampler at
`gemma/gm/text/_sampler.py:501` adds
`self.tokenizer.FORBIDDEN_TOKENS` to the per-call forbidden set; for
Gemma 4 that meant nothing was added, and text-only sampling could
emit raw multimodal placeholder tokens (`<|image|>`,
`<start_of_image>`, `<image|>`, `<|audio|>`, `<start_of_audio>`,
`<audio|>`), producing corrupted text-only output.

Mirror the existing `Gemma3Tokenizer` / `Gemma3nTokenizer` pattern
on the new class, but list all six ids. Gemma 3 reuses
`IMAGE_PLACEHOLDER == START_OF_IMAGE == 255999`, so listing both
there is redundant; in Gemma 4 each placeholder/start/end token has
its own id per `_Gemma4SpecialTokens`, so all six must be forbidden.

Regression test added in `_tokenizer_test.py` asserts that every one
of the six multimodal ids appears in
`Gemma4Tokenizer.FORBIDDEN_TOKENS`.

Fixes #613.
---
 gemma/gm/text/_tokenizer.py      | 16 ++++++++++++++++
 gemma/gm/text/_tokenizer_test.py | 24 ++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/gemma/gm/text/_tokenizer.py b/gemma/gm/text/_tokenizer.py
index bf351456..0f0c7381 100644
--- a/gemma/gm/text/_tokenizer.py
+++ b/gemma/gm/text/_tokenizer.py
@@ -483,6 +483,22 @@ class Gemma4Tokenizer(Tokenizer):
 
   special_tokens = _Gemma4SpecialTokens
 
+  # Token ids the sampler must never generate. Without this override the
+  # class falls back to the base `Tokenizer` default of `()`, so text-only
+  # sampling could emit raw multimodal placeholder tokens. Follows the
+  # `Gemma3Tokenizer` / `Gemma3nTokenizer` pattern, but lists the
+  # placeholder, start and end ids for both image and audio: in Gemma 4 all
+  # six ids are distinct (Gemma 3 reuses `IMAGE_PLACEHOLDER ==
+  # START_OF_IMAGE == 255999`, so listing both there is redundant). See #613.
+  FORBIDDEN_TOKENS = (
+      special_tokens.IMAGE_PLACEHOLDER,
+      special_tokens.START_OF_IMAGE,
+      special_tokens.END_OF_IMAGE,
+      special_tokens.AUDIO_PLACEHOLDER,
+      special_tokens.START_OF_AUDIO,
+      special_tokens.END_OF_AUDIO,
+  )
+
   VERSION = 4
   FORMAT: ClassVar[dialog.Format] = dialog.Format.GEMMA4
 
diff --git a/gemma/gm/text/_tokenizer_test.py b/gemma/gm/text/_tokenizer_test.py
index 39958fb7..38a5800e 100644
--- a/gemma/gm/text/_tokenizer_test.py
+++ b/gemma/gm/text/_tokenizer_test.py
@@ -25,3 +25,27 @@ def test_pickle():
   tokenizer.encode('Hello world!')  # Trigger the lazy-loading of the tokenizer.
 
   pickle.dumps(tokenizer)
+
+
+def test_gemma4_tokenizer_forbids_multimodal_placeholder_tokens():
+  """Regression test for https://github.com/google-deepmind/gemma/issues/613.
+
+  Gemma 4 uses distinct token ids for the placeholder, start and end tokens
+  of both image and audio inputs. All six ids must be listed in the
+  class-level `FORBIDDEN_TOKENS` so the sampler cannot generate raw
+  placeholder tokens during text-only inference (which would corrupt the
+  output). Only class attributes are read; no tokenizer is instantiated.
+  """
+  forbidden = gm.text.Gemma4Tokenizer.FORBIDDEN_TOKENS
+  st = gm.text.Gemma4Tokenizer.special_tokens
+  for token in (
+      st.IMAGE_PLACEHOLDER,
+      st.START_OF_IMAGE,
+      st.END_OF_IMAGE,
+      st.AUDIO_PLACEHOLDER,
+      st.START_OF_AUDIO,
+      st.END_OF_AUDIO,
+  ):
+    assert token in forbidden, (
+        f'Token {token!r} ({st(token).name}) must be in Gemma4Tokenizer'
+        f'.FORBIDDEN_TOKENS, but FORBIDDEN_TOKENS is {forbidden!r}'
+    )