diff --git a/gemma/gm/text/_tokenizer.py b/gemma/gm/text/_tokenizer.py
index bf351456..0f0c7381 100644
--- a/gemma/gm/text/_tokenizer.py
+++ b/gemma/gm/text/_tokenizer.py
@@ -483,6 +483,22 @@ class Gemma4Tokenizer(Tokenizer):
 
   special_tokens = _Gemma4SpecialTokens
 
+  # Token ids which the sampler is forbidden to generate. Mirrors
+  # `Gemma3Tokenizer` / `Gemma3nTokenizer`, but lists the placeholder, start and
+  # end token of both the image and the audio modality: Gemma 4 uses a distinct
+  # id for each of the six, whereas Gemma 3 reuses one id (`IMAGE_PLACEHOLDER ==
+  # START_OF_IMAGE == 255999`), which makes listing both redundant there. All
+  # six must be forbidden to keep raw multimodal placeholders out of text-only
+  # generation. See https://github.com/google-deepmind/gemma/issues/613.
+  FORBIDDEN_TOKENS = (
+      special_tokens.IMAGE_PLACEHOLDER,
+      special_tokens.START_OF_IMAGE,
+      special_tokens.END_OF_IMAGE,
+      special_tokens.AUDIO_PLACEHOLDER,
+      special_tokens.START_OF_AUDIO,
+      special_tokens.END_OF_AUDIO,
+  )
+
   VERSION = 4
 
   FORMAT: ClassVar[dialog.Format] = dialog.Format.GEMMA4
diff --git a/gemma/gm/text/_tokenizer_test.py b/gemma/gm/text/_tokenizer_test.py
index 39958fb7..38a5800e 100644
--- a/gemma/gm/text/_tokenizer_test.py
+++ b/gemma/gm/text/_tokenizer_test.py
@@ -25,3 +25,27 @@ def test_pickle():
   tokenizer.encode('Hello world!')  # Trigger the lazy-loading of the tokenizer.
 
   pickle.dumps(tokenizer)
+
+
+def test_gemma4_tokenizer_forbids_multimodal_placeholder_tokens():
+  """Regression test for https://github.com/google-deepmind/gemma/issues/613.
+
+  Gemma 4 uses a distinct token id for each image/audio placeholder, start and
+  end token. All six must be listed in `Gemma4Tokenizer.FORBIDDEN_TOKENS` so
+  that the sampler cannot generate raw multimodal tokens during text-only
+  inference (which would corrupt the output).
+  """
+  forbidden = gm.text.Gemma4Tokenizer.FORBIDDEN_TOKENS
+  st = gm.text.Gemma4Tokenizer.special_tokens
+  for token in (
+      st.IMAGE_PLACEHOLDER,
+      st.START_OF_IMAGE,
+      st.END_OF_IMAGE,
+      st.AUDIO_PLACEHOLDER,
+      st.START_OF_AUDIO,
+      st.END_OF_AUDIO,
+  ):
+    assert token in forbidden, (
+        f'Token {token!r} ({st(token).name}) must be in Gemma4Tokenizer'
+        f'.FORBIDDEN_TOKENS, but FORBIDDEN_TOKENS is {forbidden!r}'
+    )