From da2ecd2edf8bb5fea9821001769aa2084c5515b6 Mon Sep 17 00:00:00 2001
From: Danny Mccormick <dannymccormick@google.com>
Date: Thu, 2 Apr 2026 12:13:57 -0400
Subject: [PATCH] Fix vLLM lint: wrap overlong comment and docstring lines

---
 .../examples/inference/vllm_text_completion.py           | 9 +++++----
 sdks/python/apache_beam/ml/inference/vllm_inference.py   | 7 ++++---
 2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
index 00fe3c319dd2..a7468f521ebb 100644
--- a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
+++ b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
@@ -38,9 +38,10 @@
 from apache_beam.options.pipeline_options import SetupOptions
 from apache_beam.runners.runner import PipelineResult
 
-# Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the engine
-# warms the sampler with many dummy sequences unless max_num_seqs is reduced, and
-# the default gpu_memory_utilization can leave no free VRAM for that step.
+# Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the
+# engine warms the sampler with many dummy sequences unless max_num_seqs is
+# reduced, and the default gpu_memory_utilization can leave no free VRAM for
+# that step.
 _DEFAULT_VLLM_MAX_NUM_SEQS = 32
 _DEFAULT_VLLM_GPU_MEMORY_UTILIZATION = 0.72
 
@@ -141,7 +142,7 @@ def parse_known_args(argv):
 
 
 def build_vllm_server_kwargs(known_args) -> dict[str, str]:
-  """Returns CLI flags for ``VLLMCompletionsModelHandler(..., vllm_server_kwargs=...)``."""
+  """Returns vllm_server_kwargs for ``VLLMCompletionsModelHandler``."""
   return {
       'max-num-seqs': str(known_args.vllm_max_num_seqs),
       'gpu-memory-utilization': str(known_args.vllm_gpu_memory_utilization),
diff --git a/sdks/python/apache_beam/ml/inference/vllm_inference.py b/sdks/python/apache_beam/ml/inference/vllm_inference.py
index 6c0435dc951d..38283f1efd42 100644
--- a/sdks/python/apache_beam/ml/inference/vllm_inference.py
+++ b/sdks/python/apache_beam/ml/inference/vllm_inference.py
@@ -201,9 +201,10 @@ def __init__(
         `python -m vllm.entrypoints.openai.api_serverv <beam provided args>
         <vllm_server_kwargs>`. For example, you could pass
         `{'echo': 'true'}` to prepend new messages with the previous message.
-        On ~16GB GPUs, pass lower ``max-num-seqs`` and ``gpu-memory-utilization``
-        values (see ``apache_beam.examples.inference.vllm_text_completion``).
-        For a list of possible kwargs, see
+        On ~16GB GPUs, pass lower ``max-num-seqs`` and
+        ``gpu-memory-utilization`` values (see
+        ``apache_beam.examples.inference.vllm_text_completion``). For a list of
+        possible kwargs, see
         https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api
       min_batch_size: optional. the minimum batch size to use when batching
         inputs.