From da2ecd2edf8bb5fea9821001769aa2084c5515b6 Mon Sep 17 00:00:00 2001 From: Danny Mccormick Date: Thu, 2 Apr 2026 12:13:57 -0400 Subject: [PATCH] fix vllm lint --- .../examples/inference/vllm_text_completion.py | 9 +++++---- sdks/python/apache_beam/ml/inference/vllm_inference.py | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py index 00fe3c319dd2..a7468f521ebb 100644 --- a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py +++ b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py @@ -38,9 +38,10 @@ from apache_beam.options.pipeline_options import SetupOptions from apache_beam.runners.runner import PipelineResult -# Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the engine -# warms the sampler with many dummy sequences unless max_num_seqs is reduced, and -# the default gpu_memory_utilization can leave no free VRAM for that step. +# Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the +# engine warms the sampler with many dummy sequences unless max_num_seqs is +# reduced, and the default gpu_memory_utilization can leave no free VRAM for +# that step. _DEFAULT_VLLM_MAX_NUM_SEQS = 32 _DEFAULT_VLLM_GPU_MEMORY_UTILIZATION = 0.72 @@ -141,7 +142,7 @@ def parse_known_args(argv): def build_vllm_server_kwargs(known_args) -> dict[str, str]: - """Returns CLI flags for ``VLLMCompletionsModelHandler(..., vllm_server_kwargs=...)``.""" + """Returns vllm_server_kwargs for ``VLLMCompletionsModelHandler``.""" return { 'max-num-seqs': str(known_args.vllm_max_num_seqs), 'gpu-memory-utilization': str(known_args.vllm_gpu_memory_utilization), diff --git a/sdks/python/apache_beam/ml/inference/vllm_inference.py b/sdks/python/apache_beam/ml/inference/vllm_inference.py index 6c0435dc951d..38283f1efd42 100644 --- a/sdks/python/apache_beam/ml/inference/vllm_inference.py +++ b/sdks/python/apache_beam/ml/inference/vllm_inference.py @@ -201,9 +201,10 @@ def __init__( `python -m vllm.entrypoints.openai.api_serverv `. For example, you could pass `{'echo': 'true'}` to prepend new messages with the previous message. - On ~16GB GPUs, pass lower ``max-num-seqs`` and ``gpu-memory-utilization`` - values (see ``apache_beam.examples.inference.vllm_text_completion``). - For a list of possible kwargs, see + On ~16GB GPUs, pass lower ``max-num-seqs`` and + ``gpu-memory-utilization`` values (see + ``apache_beam.examples.inference.vllm_text_completion``). For a list of + possible kwargs, see https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api min_batch_size: optional. the minimum batch size to use when batching inputs.