From 1a1286934df561ebd66492ef235274c4c01f0d62 Mon Sep 17 00:00:00 2001 From: Dhruvil Date: Sat, 16 May 2026 19:43:06 -0400 Subject: [PATCH] Add FORBIDDEN_TOKENS to Gemma4Tokenizer covering image + audio placeholders `Gemma4Tokenizer` did not define `FORBIDDEN_TOKENS`, so it inherited the base class default of `()`. The sampler at `gemma/gm/text/_sampler.py:501` adds `self.tokenizer.FORBIDDEN_TOKENS` to the per-call forbidden set; for Gemma 4 that meant nothing was added, and text-only sampling could emit raw multimodal placeholder tokens (`<|image|>`, `<|audio|>`, and the `START_OF_IMAGE`, `END_OF_IMAGE`, `START_OF_AUDIO`, and `END_OF_AUDIO` special tokens), producing corrupted text-only output. Mirror the existing `Gemma3Tokenizer` / `Gemma3nTokenizer` pattern on the new class, but include all six distinct ids because Gemma 4 assigns different token ids to each placeholder (Gemma 3 reuses `IMAGE_PLACEHOLDER == START_OF_IMAGE == 255999`, so listing both there is redundant; in Gemma 4 all six are distinct per `_Gemma4SpecialTokens` and all six must be forbidden). Regression test added in `_tokenizer_test.py` asserts that every one of the six multimodal ids appears in `Gemma4Tokenizer.FORBIDDEN_TOKENS`. Fixes #613. --- gemma/gm/text/_tokenizer.py | 16 ++++++++++++++++ gemma/gm/text/_tokenizer_test.py | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/gemma/gm/text/_tokenizer.py b/gemma/gm/text/_tokenizer.py index bf351456..0f0c7381 100644 --- a/gemma/gm/text/_tokenizer.py +++ b/gemma/gm/text/_tokenizer.py @@ -483,6 +483,22 @@ class Gemma4Tokenizer(Tokenizer): special_tokens = _Gemma4SpecialTokens + # Tokens which are forbidden to be generated in the sampler. 
Mirrors + # `Gemma3Tokenizer` / `Gemma3nTokenizer` but covers both the image and audio + # multimodal placeholders, since Gemma 4 uses distinct ids for each + # placeholder/start/end token (Gemma 3 reuses `IMAGE_PLACEHOLDER == + # START_OF_IMAGE == 255999`, so listing both there is redundant; in Gemma 4 + # all six ids are distinct, so all six must be forbidden to keep raw + # multimodal placeholders out of text-only generation). See #613. + FORBIDDEN_TOKENS = ( + special_tokens.IMAGE_PLACEHOLDER, + special_tokens.START_OF_IMAGE, + special_tokens.END_OF_IMAGE, + special_tokens.AUDIO_PLACEHOLDER, + special_tokens.START_OF_AUDIO, + special_tokens.END_OF_AUDIO, + ) + VERSION = 4 FORMAT: ClassVar[dialog.Format] = dialog.Format.GEMMA4 diff --git a/gemma/gm/text/_tokenizer_test.py b/gemma/gm/text/_tokenizer_test.py index 39958fb7..38a5800e 100644 --- a/gemma/gm/text/_tokenizer_test.py +++ b/gemma/gm/text/_tokenizer_test.py @@ -25,3 +25,27 @@ def test_pickle(): tokenizer.encode('Hello world!') # Trigger the lazy-loading of the tokenizer. pickle.dumps(tokenizer) + + +def test_gemma4_tokenizer_forbids_multimodal_placeholder_tokens(): + """Regression test for https://github.com/google-deepmind/gemma/issues/613. + + Gemma 4 introduced distinct token ids for image and audio multimodal + placeholders. The tokenizer must mark all six as forbidden so the sampler + cannot generate raw placeholder tokens during text-only inference (which + would corrupt the output). + """ + forbidden = gm.text.Gemma4Tokenizer.FORBIDDEN_TOKENS + st = gm.text.Gemma4Tokenizer.special_tokens + for token in ( + st.IMAGE_PLACEHOLDER, + st.START_OF_IMAGE, + st.END_OF_IMAGE, + st.AUDIO_PLACEHOLDER, + st.START_OF_AUDIO, + st.END_OF_AUDIO, + ): + assert token in forbidden, ( + f'Token {token!r} ({st(token).name}) must be in Gemma4Tokenizer' + f'.FORBIDDEN_TOKENS, but FORBIDDEN_TOKENS is {forbidden!r}' + )