From fee107ada1f67572fd2849cfeabb3d3b5b333289 Mon Sep 17 00:00:00 2001
From: aIbrahiim <abdoibrahim1017@gmail.com>
Date: Mon, 16 Mar 2026 14:22:11 +0200
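Subject: [PATCH] Fix vLLM Gemma benchmark by pinning transformers version

Pin transformers to 4.57.1 and add sentencepiece 0.2.1 in
vllm_tests_requirements.txt, and pass that file to the Dataflow
VLLM Gemma batch benchmark via --requirements_file.

Also call the BERT tokenizer directly instead of going through
encode_plus() in pytorch_language_modeling.py, and drop a stray blank
line in vllm_gemma_batch.py.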

---
 ...m_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt | 1 +
 .../examples/inference/pytorch_language_modeling.py            | 3 +--
 sdks/python/apache_beam/examples/inference/vllm_gemma_batch.py | 1 -
 .../apache_beam/ml/inference/vllm_tests_requirements.txt       | 3 ++-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
index 23af8197d8d4..fd2101afa3f1 100644
--- a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
+++ b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
@@ -32,5 +32,6 @@
 --metrics_table=gemma_vllm_batch
 --influx_measurement=gemma_vllm_batch
 --model_gcs_path=gs://apache-beam-ml/models/gemma-2b-it
+--requirements_file=apache_beam/ml/inference/vllm_tests_requirements.txt
 --dataflow_service_options=worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver
 --experiments=use_runner_v2
diff --git a/sdks/python/apache_beam/examples/inference/pytorch_language_modeling.py b/sdks/python/apache_beam/examples/inference/pytorch_language_modeling.py
index 946c4fadd113..d995df09a159 100644
--- a/sdks/python/apache_beam/examples/inference/pytorch_language_modeling.py
+++ b/sdks/python/apache_beam/examples/inference/pytorch_language_modeling.py
@@ -52,8 +52,7 @@ def tokenize_sentence(
     text_and_mask: tuple[str, str],
     bert_tokenizer: BertTokenizer) -> tuple[str, dict[str, torch.Tensor]]:
   text, masked_text = text_and_mask
-  tokenized_sentence = bert_tokenizer.encode_plus(
-      masked_text, return_tensors="pt")
+  tokenized_sentence = bert_tokenizer(masked_text, return_tensors="pt")
 
   # Workaround to manually remove batch dim until we have the feature to
   # add optional batching flag.
diff --git a/sdks/python/apache_beam/examples/inference/vllm_gemma_batch.py b/sdks/python/apache_beam/examples/inference/vllm_gemma_batch.py
index f6e33e5be786..c30e8991d665 100644
--- a/sdks/python/apache_beam/examples/inference/vllm_gemma_batch.py
+++ b/sdks/python/apache_beam/examples/inference/vllm_gemma_batch.py
@@ -103,7 +103,6 @@ def run(argv=None, save_main_session=True, test_pipeline=None):
 
   gem = opts.view_as(GemmaVLLMOptions)
   opts.view_as(SetupOptions).save_main_session = save_main_session
-
   logging.info("Pipeline starting with model path: %s", gem.model_gcs_path)
   handler = GcsVLLMCompletionsModelHandler(
       model_name=gem.model_gcs_path,
diff --git a/sdks/python/apache_beam/ml/inference/vllm_tests_requirements.txt b/sdks/python/apache_beam/ml/inference/vllm_tests_requirements.txt
index 0f8c6a6a673d..ad5877edbec5 100644
--- a/sdks/python/apache_beam/ml/inference/vllm_tests_requirements.txt
+++ b/sdks/python/apache_beam/ml/inference/vllm_tests_requirements.txt
@@ -17,6 +17,7 @@
 torch>=1.7.1
 torchvision>=0.8.2
 pillow>=8.0.0
-transformers>=4.18.0
+transformers==4.57.1
+sentencepiece==0.2.1
 google-cloud-monitoring>=2.27.0
 openai>=1.52.2