From 53a7adf28f1e1de5a224e85f71acd18101d57195 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 11:40:28 +0800 Subject: [PATCH 01/14] split _build_model method for TorchLlm and TrtLlm Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 335 ++++++++++++++++++++------------ tensorrt_llm/llmapi/llm_args.py | 19 +- 2 files changed, 211 insertions(+), 143 deletions(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 5635d4016f66..e85224e83fe7 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -12,7 +12,6 @@ from tqdm import tqdm from transformers import PreTrainedTokenizerBase -from tensorrt_llm.builder import BuildConfig from tensorrt_llm.inputs.data import TextPrompt from tensorrt_llm.inputs.registry import DefaultInputProcessor @@ -592,134 +591,6 @@ def _build_model(self): llm_build_stats=weakref.proxy( self.llm_build_stats)) self._engine_dir, self._hf_model_dir = model_loader() - # update the model_dir to a local dir for the runtime, such as tokenizer loading. - if self._engine_dir is not None: - self.args.model = self._engine_dir - - # Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub. - # It should also be before bindings ExecutorConfig, which may depend on tokenizer info. - self._tokenizer = self._try_load_tokenizer() - - # Multimodal special handling: - # 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor - # 2. May need to modify model weights for MM (e.g., resize vocab embedding). We must do such operation via input processor's __init__ - self.input_processor = create_input_processor(self._hf_model_dir, - self.tokenizer) - self.tokenizer = self.input_processor.tokenizer - - max_batch_size = self.args.max_batch_size - max_num_tokens = self.args.max_num_tokens - max_seq_len = self.args.max_seq_len - - build_config = self.args.build_config if self._on_trt_backend else BuildConfig( - ) - - max_batch_size = max_batch_size or build_config.max_batch_size - max_num_tokens = max_num_tokens or build_config.max_num_tokens - max_seq_len = max_seq_len or build_config.max_seq_len - - self._executor_config = tllm.ExecutorConfig( - max_beam_width=self.args.max_beam_width, - scheduler_config=PybindMirror.maybe_to_pybind( - self.args.scheduler_config), - batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type) - or tllm.BatchingType.INFLIGHT, - max_batch_size=max_batch_size, - max_num_tokens=max_num_tokens, - gather_generation_logits=self.args.gather_generation_logits) - if self.args.backend is None: - # also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens - if max_seq_len is not None: - self._executor_config.max_seq_len = max_seq_len - else: - engine_config = EngineConfig.from_json_file(self._engine_dir / - "config.json") - self._executor_config.max_seq_len = engine_config.build_config.max_seq_len - if self.args.kv_cache_config is not None: - self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( - self.args.kv_cache_config) - if os.getenv("FORCE_DETERMINISTIC", "0") == "1": - # Disable KV cache reuse for deterministic mode - self._executor_config.kv_cache_config.enable_block_reuse = False - self._executor_config.kv_cache_config.enable_partial_reuse = False - if self.args.peft_cache_config is not None: - self._executor_config.peft_cache_config = 
PybindMirror.maybe_to_pybind( - self.args.peft_cache_config) - elif self._on_trt_backend and self.args.build_config.plugin_config.lora_plugin: - engine_config = EngineConfig.from_json_file(self._engine_dir / - "config.json") - lora_config = engine_config.build_config.lora_config - max_lora_rank = lora_config.max_lora_rank - num_lora_modules = engine_config.pretrained_config.num_hidden_layers * \ - len(lora_config.lora_target_modules + lora_config.missing_qkv_modules) - self._executor_config.peft_cache_config = tllm.PeftCacheConfig( - num_device_module_layer=max_lora_rank * num_lora_modules * - self.args.max_loras, - num_host_module_layer=max_lora_rank * num_lora_modules * - self.args.max_cpu_loras, - ) - if self.args.decoding_config is not None: - self._executor_config.decoding_config = self.args.decoding_config - if self.args.guided_decoding_backend == 'xgrammar': - self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( - backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. - XGRAMMAR, - **_xgrammar_tokenizer_info(self.tokenizer)) - elif self.args.guided_decoding_backend == 'llguidance': - self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( - backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. - LLGUIDANCE, - **_llguidance_tokenizer_info(self.tokenizer)) - elif self.args.guided_decoding_backend is not None: - raise ValueError( - f"Unrecognized guided decoding backend {self.args.guided_decoding_backend}" - ) - - self._executor_config.normalize_log_probs = self.args.normalize_log_probs - self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill - self._executor_config.max_beam_width = self.args.max_beam_width or self.args.build_config.max_beam_width - if self._on_trt_backend and self.args.extended_runtime_perf_knob_config is not None: - self._executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind( - self.args.extended_runtime_perf_knob_config) - if self.args.cache_transceiver_config is not None: - self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( - self.args.cache_transceiver_config) - from tensorrt_llm._torch.pyexecutor.config import update_executor_config - update_executor_config( - self._executor_config, - backend=self.args.backend, - pytorch_backend_config=self.args.get_pytorch_backend_config() - if self.args.backend in ["pytorch", "_autodeploy"] else None, - mapping=self.args.parallel_config.to_mapping(), - build_config=self.args.build_config - if self._on_trt_backend else None, - speculative_config=self.args.speculative_config, - hf_model_dir=self._hf_model_dir, - trt_engine_dir=self._engine_dir, - max_input_len=self.args.max_input_len, - max_seq_len=max_seq_len) - self._executor_config.llm_parallel_config = self.args.parallel_config - return_logits = (self.args.gather_generation_logits - or (self.args.build_config - and self.args.build_config.gather_context_logits)) - - self._executor = self._executor_cls.create( - self._engine_dir, - executor_config=self._executor_config, - batched_logits_processor=self.args.batched_logits_processor, - model_world_size=self.args.parallel_config.world_size, - mpi_session=self.mpi_session, - reuse_mpi_comm=external_mpi_comm_available( - self.args.parallel_config.world_size), - return_logits=return_logits, - postproc_worker_config=PostprocWorkerConfig( - num_postprocess_workers=self.args.num_postprocess_workers, - postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, - ), - is_llm_executor=True, - 
lora_config=self.args.lora_config, - garbage_collection_gen0_threshold=self.args. - garbage_collection_gen0_threshold) @property def _on_trt_backend(self) -> bool: @@ -860,6 +731,116 @@ def save(self, engine_dir: str) -> None: f"Copying {file} to {target_engine_dir / file.name}\n") shutil.copy(file, target_engine_dir / file.name) + def _build_model(self): + super()._build_model() + # update the model_dir to a local dir for the runtime, such as tokenizer loading. + if self._engine_dir is not None: + self.args.model = self._engine_dir + + # Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub. + # It should also be before bindings ExecutorConfig, which may depend on tokenizer info. + self._tokenizer = self._try_load_tokenizer() + + # Multimodal special handling: + # 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor + # 2. May need to modify model weights for MM (e.g., resize vocab embedding). We must do such operation via input processor's __init__ + self.input_processor = create_input_processor(self._hf_model_dir, + self.tokenizer) + self.tokenizer = self.input_processor.tokenizer + + max_batch_size = self.args.max_batch_size + max_num_tokens = self.args.max_num_tokens + max_seq_len = self.args.max_seq_len + + build_config = self.args.build_config + + max_batch_size = max_batch_size or build_config.max_batch_size + max_num_tokens = max_num_tokens or build_config.max_num_tokens + max_seq_len = max_seq_len or build_config.max_seq_len + + self._executor_config = tllm.ExecutorConfig( + max_beam_width=self.args.max_beam_width, + scheduler_config=PybindMirror.maybe_to_pybind( + self.args.scheduler_config), + batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type) + or tllm.BatchingType.INFLIGHT, + max_batch_size=max_batch_size, + max_num_tokens=max_num_tokens, + gather_generation_logits=self.args.gather_generation_logits) + + # also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens + if max_seq_len is not None: + self._executor_config.max_seq_len = max_seq_len + else: + engine_config = EngineConfig.from_json_file(self._engine_dir / + "config.json") + self._executor_config.max_seq_len = engine_config.build_config.max_seq_len + + if self.args.kv_cache_config is not None: + self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( + self.args.kv_cache_config) + if os.getenv("FORCE_DETERMINISTIC", "0") == "1": + # Disable KV cache reuse for deterministic mode + self._executor_config.kv_cache_config.enable_block_reuse = False + self._executor_config.kv_cache_config.enable_partial_reuse = False + if self.args.peft_cache_config is not None: + self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind( + self.args.peft_cache_config) + elif self.args.build_config.plugin_config.lora_plugin: + engine_config = EngineConfig.from_json_file(self._engine_dir / + "config.json") + lora_config = engine_config.build_config.lora_config + max_lora_rank = lora_config.max_lora_rank + num_lora_modules = engine_config.pretrained_config.num_hidden_layers * \ + len(lora_config.lora_target_modules + lora_config.missing_qkv_modules) + self._executor_config.peft_cache_config = tllm.PeftCacheConfig( + num_device_module_layer=max_lora_rank * num_lora_modules * + self.args.max_loras, + num_host_module_layer=max_lora_rank * num_lora_modules * + self.args.max_cpu_loras, + ) + if self.args.decoding_config is not None: + 
self._executor_config.decoding_config = self.args.decoding_config + if self.args.guided_decoding_backend == 'xgrammar': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. + XGRAMMAR, + **_xgrammar_tokenizer_info(self.tokenizer)) + else: + raise ValueError( + f"Unsupported guided decoding backend {self.args.guided_decoding_backend}" + ) + + self._executor_config.normalize_log_probs = self.args.normalize_log_probs + self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill + self._executor_config.max_beam_width = self.args.max_beam_width or self.args.build_config.max_beam_width + if self.args.extended_runtime_perf_knob_config is not None: + self._executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind( + self.args.extended_runtime_perf_knob_config) + if self.args.cache_transceiver_config is not None: + self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( + self.args.cache_transceiver_config) + self._executor_config.llm_parallel_config = self.args.parallel_config + return_logits = (self.args.gather_generation_logits + or (self.args.build_config + and self.args.build_config.gather_context_logits)) + + self._executor = self._executor_cls.create( + self._engine_dir, + executor_config=self._executor_config, + batched_logits_processor=self.args.batched_logits_processor, + model_world_size=self.args.parallel_config.world_size, + mpi_session=self.mpi_session, + reuse_mpi_comm=external_mpi_comm_available( + self.args.parallel_config.world_size), + return_logits=return_logits, + postproc_worker_config=PostprocWorkerConfig( + num_postprocess_workers=self.args.num_postprocess_workers, + postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, + ), + is_llm_executor=True, + lora_config=self.args.lora_config) + @append_docstring(TORCH_LLM_DOCSTRING) class _TorchLLM(BaseLLM): @@ -899,6 +880,102 @@ def __init__(self, backend='pytorch', **kwargs) + def _build_model(self): + super()._build_model() + assert self._engine_dir is None + + # Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub. + # It should also be before bindings ExecutorConfig, which may depend on tokenizer info. + self._tokenizer = self._try_load_tokenizer() + + # Multimodal special handling: + # 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor + # 2. May need to modify model weights for MM (e.g., resize vocab embedding). 
We must do such operation via input processor's __init__ + self.input_processor = create_input_processor(self._hf_model_dir, + self.tokenizer) + self.tokenizer = self.input_processor.tokenizer + + max_batch_size = self.args.max_batch_size + max_num_tokens = self.args.max_num_tokens + max_seq_len = self.args.max_seq_len + + self._executor_config = tllm.ExecutorConfig( + max_beam_width=self.args.max_beam_width, + scheduler_config=PybindMirror.maybe_to_pybind( + self.args.scheduler_config), + batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type) + or tllm.BatchingType.INFLIGHT, + max_batch_size=max_batch_size, + max_num_tokens=max_num_tokens, + gather_generation_logits=self.args.gather_generation_logits) + + if self.args.kv_cache_config is not None: + self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( + self.args.kv_cache_config) + if os.getenv("FORCE_DETERMINISTIC", "0") == "1": + # Disable KV cache reuse for deterministic mode + self._executor_config.kv_cache_config.enable_block_reuse = False + self._executor_config.kv_cache_config.enable_partial_reuse = False + if self.args.peft_cache_config is not None: + self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind( + self.args.peft_cache_config) + if self.args.decoding_config is not None: + self._executor_config.decoding_config = self.args.decoding_config + if self.args.guided_decoding_backend == 'xgrammar': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. + XGRAMMAR, + **_xgrammar_tokenizer_info(self.tokenizer)) + elif self.args.guided_decoding_backend == 'llguidance': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. 
+ LLGUIDANCE, + **_llguidance_tokenizer_info(self.tokenizer)) + elif self.args.guided_decoding_backend is not None: + raise ValueError( + f"Unsupported guided decoding backend {self.args.guided_decoding_backend}" + ) + + self._executor_config.normalize_log_probs = self.args.normalize_log_probs + self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill + self._executor_config.max_beam_width = self.args.max_beam_width + if self.args.cache_transceiver_config is not None: + self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( + self.args.cache_transceiver_config) + from tensorrt_llm._torch.pyexecutor.config import update_executor_config + update_executor_config( + self._executor_config, + backend=self.args.backend, + pytorch_backend_config=self.args.get_pytorch_backend_config() + if self.args.backend in ["pytorch", "_autodeploy"] else None, + mapping=self.args.parallel_config.to_mapping(), + speculative_config=self.args.speculative_config, + hf_model_dir=self._hf_model_dir, + trt_engine_dir=self._engine_dir, + max_input_len=self.args.max_input_len, + max_seq_len=max_seq_len) + + # TODO: revisit gather_context_logits + return_logits = self.args.gather_generation_logits + + self._executor = self._executor_cls.create( + self._engine_dir, + executor_config=self._executor_config, + batched_logits_processor=self.args.batched_logits_processor, + model_world_size=self.args.parallel_config.world_size, + mpi_session=self.mpi_session, + reuse_mpi_comm=external_mpi_comm_available( + self.args.parallel_config.world_size), + return_logits=return_logits, + postproc_worker_config=PostprocWorkerConfig( + num_postprocess_workers=self.args.num_postprocess_workers, + postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, + ), + is_llm_executor=True, + lora_config=self.args.lora_config, + garbage_collection_gen0_threshold=self.args. + garbage_collection_gen0_threshold) + def _validate_args_for_torch_backend(self, kwargs: dict) -> None: """Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend. """ diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 636740d5998b..7dec4f304128 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -954,12 +954,6 @@ class BaseLlmArgs(BaseModel): default=None, description="The parser to separate reasoning content from output.") - garbage_collection_gen0_threshold: int = Field( - default=20000, - description= - "Threshold for Python garbage collection of generation 0 objects." - "Lower values trigger more frequent garbage collection.") - # TODO[Superjomn]: To deprecate this config. decoding_config: Optional[object] = Field( default=None, @@ -1622,14 +1616,11 @@ class TorchCompileConfig(BaseModel): class TorchLlmArgs(BaseLlmArgs): - # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs - build_config: Optional[object] = Field( - default=None, - description="Build config.", - exclude_from_json=True, - json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"}) - - # PyTorch backend specific configurations + garbage_collection_gen0_threshold: int = Field( + default=20000, + description= + "Threshold for Python garbage collection of generation 0 objects." 
+ "Lower values trigger more frequent garbage collection.") use_cuda_graph: bool = Field( default=False, From 8fafd59befcf4bc92945731573ea0b0116eab8e5 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 14:32:44 +0800 Subject: [PATCH 02/14] fix ci Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm_args.py | 48 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 7dec4f304128..bd922320132b 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1154,30 +1154,6 @@ def validate_model_format_misc(self): self._model_format = model_format return self - @model_validator(mode="after") - def init_build_config(self): - """ - Creating a default BuildConfig if none is provided - """ - if self.build_config is None: - kwargs = {} - if self.max_batch_size: - kwargs["max_batch_size"] = self.max_batch_size - if self.max_num_tokens: - kwargs["max_num_tokens"] = self.max_num_tokens - if self.max_seq_len: - kwargs["max_seq_len"] = self.max_seq_len - if self.max_beam_width: - kwargs["max_beam_width"] = self.max_beam_width - if self.max_input_len: - kwargs["max_input_len"] = self.max_input_len - self.build_config = BuildConfig(**kwargs) - - assert isinstance( - self.build_config, BuildConfig - ), f"build_config is not initialized: {self.build_config}" - return self - @model_validator(mode="after") def set_runtime_knobs_from_build_config(self): # TODO: remove this after PyT become default to adapt PyT with build_config as input @@ -1542,6 +1518,30 @@ def init_calib_config(cls, v): return CalibConfig() return v + @model_validator(mode="after") + def init_build_config(self): + """ + Creating a default BuildConfig if none is provided + """ + if self.build_config is None: + kwargs = {} + if self.max_batch_size: + kwargs["max_batch_size"] = self.max_batch_size + if self.max_num_tokens: + kwargs["max_num_tokens"] = self.max_num_tokens + if self.max_seq_len: + kwargs["max_seq_len"] = self.max_seq_len + if self.max_beam_width: + kwargs["max_beam_width"] = self.max_beam_width + if self.max_input_len: + kwargs["max_input_len"] = self.max_input_len + self.build_config = BuildConfig(**kwargs) + + assert isinstance( + self.build_config, BuildConfig + ), f"build_config is not initialized: {self.build_config}" + return self + @model_validator(mode="after") def setup_embedding_parallel_mode(self): if self.embedding_parallel_mode == 'NONE': From 03506eb4011bbd2123319f89e8dd64401069eee8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 14:58:36 +0800 Subject: [PATCH 03/14] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm_args.py | 56 +++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index bd922320132b..0fa78ad5701a 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1154,6 +1154,30 @@ def validate_model_format_misc(self): self._model_format = model_format return self + @model_validator(mode="after") + def init_build_config(self): + """ + Creating a default BuildConfig if none is provided + """ + if self.build_config is None: + kwargs = {} + if self.max_batch_size: + kwargs["max_batch_size"] = self.max_batch_size + if self.max_num_tokens: + 
kwargs["max_num_tokens"] = self.max_num_tokens + if self.max_seq_len: + kwargs["max_seq_len"] = self.max_seq_len + if self.max_beam_width: + kwargs["max_beam_width"] = self.max_beam_width + if self.max_input_len: + kwargs["max_input_len"] = self.max_input_len + self.build_config = BuildConfig(**kwargs) + + assert isinstance( + self.build_config, BuildConfig + ), f"build_config is not initialized: {self.build_config}" + return self + @model_validator(mode="after") def set_runtime_knobs_from_build_config(self): # TODO: remove this after PyT become default to adapt PyT with build_config as input @@ -1518,30 +1542,6 @@ def init_calib_config(cls, v): return CalibConfig() return v - @model_validator(mode="after") - def init_build_config(self): - """ - Creating a default BuildConfig if none is provided - """ - if self.build_config is None: - kwargs = {} - if self.max_batch_size: - kwargs["max_batch_size"] = self.max_batch_size - if self.max_num_tokens: - kwargs["max_num_tokens"] = self.max_num_tokens - if self.max_seq_len: - kwargs["max_seq_len"] = self.max_seq_len - if self.max_beam_width: - kwargs["max_beam_width"] = self.max_beam_width - if self.max_input_len: - kwargs["max_input_len"] = self.max_input_len - self.build_config = BuildConfig(**kwargs) - - assert isinstance( - self.build_config, BuildConfig - ), f"build_config is not initialized: {self.build_config}" - return self - @model_validator(mode="after") def setup_embedding_parallel_mode(self): if self.embedding_parallel_mode == 'NONE': @@ -1615,6 +1615,14 @@ class TorchCompileConfig(BaseModel): class TorchLlmArgs(BaseLlmArgs): + # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs + build_config: Optional[object] = Field( + default=None, + description="Build config.", + exclude_from_json=True, + json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"}) + + # PyTorch backend specific configurations garbage_collection_gen0_threshold: int = Field( default=20000, From 7d638750bfbdf5b407db8134cfe758317d4d9739 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 15:27:59 +0800 Subject: [PATCH 04/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index e85224e83fe7..0257e2f4a07f 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -806,7 +806,7 @@ def _build_model(self): backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. 
XGRAMMAR, **_xgrammar_tokenizer_info(self.tokenizer)) - else: + elif self.args.guided_decoding_backend is not None: raise ValueError( f"Unsupported guided decoding backend {self.args.guided_decoding_backend}" ) From b0df829aa877e69607245a2c960444825b979343 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 16:53:05 +0800 Subject: [PATCH 05/14] fix ci Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tests/unittest/llmapi/test_llm.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 33a458a12345..83d9e62fed99 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -2141,13 +2141,17 @@ def run_llm_with_postprocess_parallel_and_result_handler( kwargs = {} if backend not in ["pytorch", "autodeploy"]: kwargs["fast_build"] = True - llm = LLM(model=llama_model_path, - backend=backend, - kv_cache_config=global_kvcache_config, - tensor_parallel_size=tp_size, - num_postprocess_workers=2, - postprocess_tokenizer_dir=llama_model_path, - **kwargs) + LLM_CLASS = LLM + else: + LLM_CLASS = LLM_torch + + llm = LLM_CLASS(model=llama_model_path, + backend=backend, + kv_cache_config=global_kvcache_config, + tensor_parallel_size=tp_size, + num_postprocess_workers=2, + postprocess_tokenizer_dir=llama_model_path, + **kwargs) golden_result = "DEFGHI" for i, output in enumerate( llm.generate_async(prompts[0], From a7b2e5d047b2771bbfb232e74624e31c6acb0885 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 09:42:28 +0800 Subject: [PATCH 06/14] refactor _build_model method of TorchLlm Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 129 ++++++++++++++++++++++++-------- tensorrt_llm/llmapi/llm.py | 70 ++--------------- 2 files changed, 104 insertions(+), 95 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index dfe95ff2b773..6d2d08f91a8e 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -18,8 +18,10 @@ mpi_comm, mpi_rank, nvtx_range_debug) from ..bindings import executor as tllm from ..builder import ConfigEncoder, Engine, EngineConfig -from ..llmapi.llm_args import PybindMirror +from ..llmapi.llm_args import PybindMirror, TorchLlmArgs from ..llmapi.mpi_session import set_mpi_session_cpp +from ..llmapi.tokenizer import (_llguidance_tokenizer_info, + _xgrammar_tokenizer_info) from ..llmapi.tracer import VizTracer, global_tracer, set_global_tracer from ..llmapi.utils import (AsyncQueue, ManagedThread, _SyncQueue, clear_sched_affinity, print_colored_debug, @@ -59,6 +61,8 @@ def __init__( is_llm_executor: Optional[bool] = None, lora_config: Optional[LoraConfig] = None, garbage_collection_gen0_threshold: Optional[int] = None, + hf_model_dir: Optional[Path] = None, + llm_args: Optional[TorchLlmArgs] = None, ) -> None: postproc_config = postproc_worker_config or PostprocWorkerConfig() super().__init__( @@ -79,8 +83,7 @@ def __init__( self._await_response_helper = AwaitResponseHelper( self) # TODO: make it weakref self._executor_config = executor_config - self._is_pytorch_backend = getattr(self._executor_config, "backend", - None) == "pytorch" + self._is_pytorch_backend = llm_args is not None and llm_args.backend == "pytorch" if global_mpi_size() > 1: logger.set_rank(self.global_rank) @@ -88,13 +91,98 @@ def __init__( if 
isinstance(engine, list): engine = engine[self.rank] - if executor_config is None: - executor_config = tllm.ExecutorConfig(1) + def _create_py_executor(): + device_id = self.global_rank % torch.cuda.device_count() + torch.cuda.set_device(device_id) - executor_config.logits_post_processor_config = tllm.LogitsPostProcessorConfig( - processor_batched=batched_logits_processor, replicate=False) + max_batch_size = llm_args.max_batch_size + max_num_tokens = llm_args.max_num_tokens + max_seq_len = llm_args.max_seq_len + + self._executor_config = tllm.ExecutorConfig( + max_beam_width=llm_args.max_beam_width, + scheduler_config=PybindMirror.maybe_to_pybind( + llm_args.scheduler_config), + batching_type=PybindMirror.maybe_to_pybind( + llm_args.batching_type) or tllm.BatchingType.INFLIGHT, + max_batch_size=max_batch_size, + max_num_tokens=max_num_tokens, + gather_generation_logits=llm_args.gather_generation_logits) + + if llm_args.kv_cache_config is not None: + self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( + llm_args.kv_cache_config) + if os.getenv("FORCE_DETERMINISTIC", "0") == "1": + # Disable KV cache reuse for deterministic mode + self._executor_config.kv_cache_config.enable_block_reuse = False + self._executor_config.kv_cache_config.enable_partial_reuse = False + if llm_args.peft_cache_config is not None: + self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind( + llm_args.peft_cache_config) + if llm_args.decoding_config is not None: + self._executor_config.decoding_config = llm_args.decoding_config + if llm_args.guided_decoding_backend == 'xgrammar': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. + XGRAMMAR, + **_xgrammar_tokenizer_info(self.tokenizer)) + elif llm_args.guided_decoding_backend == 'llguidance': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. 
+ LLGUIDANCE, + **_llguidance_tokenizer_info(self.tokenizer)) + elif llm_args.guided_decoding_backend is not None: + raise ValueError( + f"Unsupported guided decoding backend {llm_args.guided_decoding_backend}" + ) + + self._executor_config.normalize_log_probs = llm_args.normalize_log_probs + self._executor_config.enable_chunked_context = llm_args.enable_chunked_prefill + self._executor_config.max_beam_width = llm_args.max_beam_width + if llm_args.cache_transceiver_config is not None: + self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( + llm_args.cache_transceiver_config) + from tensorrt_llm._torch.pyexecutor.config import \ + update_executor_config + update_executor_config( + self._executor_config, + backend=llm_args.backend, + pytorch_backend_config=llm_args.get_pytorch_backend_config() + if llm_args.backend in ["pytorch", "_autodeploy"] else None, + mapping=llm_args.parallel_config.to_mapping(), + speculative_config=llm_args.speculative_config, + hf_model_dir=hf_model_dir, + max_input_len=llm_args.max_input_len, + max_seq_len=max_seq_len) + + self._executor_config.logits_post_processor_config = tllm.LogitsPostProcessorConfig( + processor_batched=batched_logits_processor, replicate=False) + args = { + "executor_config": self._executor_config, + "checkpoint_dir": hf_model_dir, + } + if llm_args.backend == "pytorch": + from tensorrt_llm._torch.pyexecutor.py_executor_creator import \ + create_py_executor + create_executor = create_py_executor + args["lora_config"] = lora_config + args[ + "garbage_collection_gen0_threshold"] = garbage_collection_gen0_threshold + elif executor_config.backend == "_autodeploy": + from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \ + create_autodeploy_executor + create_executor = create_autodeploy_executor + else: + raise ValueError( + f"Unsupported backend config: {executor_config.backend}") + return create_executor(**args) def _create_engine(): + if executor_config is None: + executor_config = tllm.ExecutorConfig(1) + + executor_config.logits_post_processor_config = tllm.LogitsPostProcessorConfig( + processor_batched=batched_logits_processor, replicate=False) device_id = self.global_rank % torch.cuda.device_count() torch.cuda.set_device(device_id) @@ -113,30 +201,11 @@ def _create_engine(): executor_config=executor_config, managed_weights=engine.managed_weights) - if not hasattr(executor_config, "backend"): - return tllm.Executor(engine, tllm.ModelType.DECODER_ONLY, - executor_config) - args = { - "executor_config": executor_config, - "checkpoint_dir": executor_config.hf_model_dir, - } - if executor_config.backend == "pytorch": - from tensorrt_llm._torch.pyexecutor.py_executor_creator import \ - create_py_executor - create_executor = create_py_executor - args["lora_config"] = lora_config - args[ - "garbage_collection_gen0_threshold"] = garbage_collection_gen0_threshold - elif executor_config.backend == "_autodeploy": - from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \ - create_autodeploy_executor - create_executor = create_autodeploy_executor - else: - raise ValueError( - f"Unsupported backend config: {executor_config.backend}") - return create_executor(**args) + return tllm.Executor(engine, tllm.ModelType.DECODER_ONLY, + executor_config) - self.engine = _create_engine() + self.engine = _create_py_executor if llm_args is not None else _create_engine( + ) self._lora_manager: Optional[LoraManager] = None self._prompt_adapter_manager: Optional[PromptAdapterManager] = None diff --git a/tensorrt_llm/llmapi/llm.py 
b/tensorrt_llm/llmapi/llm.py index 758ee3bbe3c8..b5dca350a567 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -35,8 +35,7 @@ from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig, LlmBuildStats, ModelLoader, _ModelRuntimeContext) from .mpi_session import MpiPoolSession, external_mpi_comm_available -from .tokenizer import (TokenizerBase, _llguidance_tokenizer_info, - _xgrammar_tokenizer_info) +from .tokenizer import TokenizerBase, _xgrammar_tokenizer_info # TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import from .utils import (append_docstring, exception_handler, get_device_count, print_colored_debug) @@ -894,71 +893,12 @@ def _build_model(self): self.tokenizer) self.tokenizer = self.input_processor.tokenizer - max_batch_size = self.args.max_batch_size - max_num_tokens = self.args.max_num_tokens - max_seq_len = self.args.max_seq_len - - self._executor_config = tllm.ExecutorConfig( - max_beam_width=self.args.max_beam_width, - scheduler_config=PybindMirror.maybe_to_pybind( - self.args.scheduler_config), - batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type) - or tllm.BatchingType.INFLIGHT, - max_batch_size=max_batch_size, - max_num_tokens=max_num_tokens, - gather_generation_logits=self.args.gather_generation_logits) - - if self.args.kv_cache_config is not None: - self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( - self.args.kv_cache_config) - if os.getenv("FORCE_DETERMINISTIC", "0") == "1": - # Disable KV cache reuse for deterministic mode - self._executor_config.kv_cache_config.enable_block_reuse = False - self._executor_config.kv_cache_config.enable_partial_reuse = False - if self.args.peft_cache_config is not None: - self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind( - self.args.peft_cache_config) - if self.args.decoding_config is not None: - self._executor_config.decoding_config = self.args.decoding_config - if self.args.guided_decoding_backend == 'xgrammar': - self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( - backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. - XGRAMMAR, - **_xgrammar_tokenizer_info(self.tokenizer)) - elif self.args.guided_decoding_backend == 'llguidance': - self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( - backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. 
- LLGUIDANCE, - **_llguidance_tokenizer_info(self.tokenizer)) - elif self.args.guided_decoding_backend is not None: - raise ValueError( - f"Unsupported guided decoding backend {self.args.guided_decoding_backend}" - ) - - self._executor_config.normalize_log_probs = self.args.normalize_log_probs - self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill - self._executor_config.max_beam_width = self.args.max_beam_width - if self.args.cache_transceiver_config is not None: - self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( - self.args.cache_transceiver_config) - from tensorrt_llm._torch.pyexecutor.config import update_executor_config - update_executor_config( - self._executor_config, - backend=self.args.backend, - pytorch_backend_config=self.args.get_pytorch_backend_config() - if self.args.backend in ["pytorch", "_autodeploy"] else None, - mapping=self.args.parallel_config.to_mapping(), - speculative_config=self.args.speculative_config, - hf_model_dir=self._hf_model_dir, - max_input_len=self.args.max_input_len, - max_seq_len=max_seq_len) - # TODO: revisit gather_context_logits return_logits = self.args.gather_generation_logits self._executor = self._executor_cls.create( - self._engine_dir, - executor_config=self._executor_config, + engine=None, + executor_config=None, batched_logits_processor=self.args.batched_logits_processor, model_world_size=self.args.parallel_config.world_size, mpi_session=self.mpi_session, @@ -971,8 +911,8 @@ def _build_model(self): ), is_llm_executor=True, lora_config=self.args.lora_config, - garbage_collection_gen0_threshold=self.args. - garbage_collection_gen0_threshold) + hf_model_dir=self._hf_model_dir, + llm_args=self.args) def _validate_args_for_torch_backend(self, kwargs: dict) -> None: """Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend. 
From a39a09eb550221dcefb904d76dda4a142e523cb7 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:02:32 +0800 Subject: [PATCH 07/14] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index b5dca350a567..bf958e2c5730 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -897,8 +897,6 @@ def _build_model(self): return_logits = self.args.gather_generation_logits self._executor = self._executor_cls.create( - engine=None, - executor_config=None, batched_logits_processor=self.args.batched_logits_processor, model_world_size=self.args.parallel_config.world_size, mpi_session=self.mpi_session, From 1e453d3bf251be2ba8eb15fbffe1f8f4d5a63985 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:20:57 +0800 Subject: [PATCH 08/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/executor.py | 14 +++++++------- tensorrt_llm/executor/worker.py | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py index 9d6dd966058b..fe91e1f65173 100644 --- a/tensorrt_llm/executor/executor.py +++ b/tensorrt_llm/executor/executor.py @@ -21,6 +21,7 @@ from ..bindings import executor as tllm from ..builder import Engine from ..disaggregated_params import DisaggregatedParams +from ..llmapi import TorchLlmArgs from ..llmapi.llm_utils import KvCacheRetentionConfig from ..llmapi.mpi_session import (MpiSession, external_mpi_comm_available, need_spawn_mpi_workers) @@ -350,7 +351,8 @@ def create( postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, lora_config: Optional[LoraConfig] = None, - garbage_collection_gen0_threshold: Optional[int] = None, + hf_model_dir: Optional[Path] = None, + llm_args: Optional[TorchLlmArgs] = None, ) -> Union["GenerationExecutorProxy", "GenerationExecutorWorker"]: # local imports to avoid cyclic importing from .proxy import GenerationExecutorProxy @@ -377,6 +379,8 @@ def create( "engine": engine, "executor_config": executor_config, "batched_logits_processor": batched_logits_processor, + "hf_model_dir": hf_model_dir, + "llm_args": llm_args, } if lora_config: @@ -394,9 +398,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - garbage_collection_gen0_threshold= - garbage_collection_gen0_threshold) + is_llm_executor=is_llm_executor) # WAR: For the performance of gathering logits, we use single process worker # for TP1 to avoid the large overhead of IPC. @@ -407,9 +409,7 @@ def create( "Using single process worker for TP1, this may hurt streaming generation performance." ) return GenerationExecutorWorker(**worker_kwargs, - is_llm_executor=is_llm_executor, - garbage_collection_gen0_threshold= - garbage_collection_gen0_threshold) + is_llm_executor=is_llm_executor) # For single-gpu case: # Partition the workload to multiple process for streaming performance. 
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 6d2d08f91a8e..c39d09307db4 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -60,7 +60,6 @@ def __init__( postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, lora_config: Optional[LoraConfig] = None, - garbage_collection_gen0_threshold: Optional[int] = None, hf_model_dir: Optional[Path] = None, llm_args: Optional[TorchLlmArgs] = None, ) -> None: @@ -167,7 +166,7 @@ def _create_py_executor(): create_executor = create_py_executor args["lora_config"] = lora_config args[ - "garbage_collection_gen0_threshold"] = garbage_collection_gen0_threshold + "garbage_collection_gen0_threshold"] = llm_args.garbage_collection_gen0_threshold elif executor_config.backend == "_autodeploy": from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \ create_autodeploy_executor From cb753c9205b28759907f1482ca9e473568ef2432 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:29:11 +0800 Subject: [PATCH 09/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index b47f444006a4..3895cf1172b9 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -45,7 +45,6 @@ def __init__( worker_cls: type = GenerationExecutorWorker, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - garbage_collection_gen0_threshold: Optional[int] = None, ) -> None: postproc_worker_config = postproc_worker_config or PostprocWorkerConfig( ) @@ -88,14 +87,14 @@ def __init__( self.model_world_size = model_world_size - self.garbage_collection_gen0_threshold = garbage_collection_gen0_threshold + self.garbage_collection_gen0_threshold = worker_kwargs[ + "llm_args"].garbage_collection_gen0_threshold if worker_kwargs.get( + "llm_args", None) is not None else None worker_kwargs = dict(**worker_kwargs, worker_queues=self._setup_queues(), postproc_worker_config=postproc_worker_config, - is_llm_executor=False, - garbage_collection_gen0_threshold=self. 
- garbage_collection_gen0_threshold) + is_llm_executor=False) if "log_level" not in worker_kwargs: worker_kwargs["log_level"] = logger.level From 7f67e935de9602078c1487624ec4fd24b91ca6b0 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:36:53 +0800 Subject: [PATCH 10/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/executor.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py index fe91e1f65173..6b924e933890 100644 --- a/tensorrt_llm/executor/executor.py +++ b/tensorrt_llm/executor/executor.py @@ -21,7 +21,6 @@ from ..bindings import executor as tllm from ..builder import Engine from ..disaggregated_params import DisaggregatedParams -from ..llmapi import TorchLlmArgs from ..llmapi.llm_utils import KvCacheRetentionConfig from ..llmapi.mpi_session import (MpiSession, external_mpi_comm_available, need_spawn_mpi_workers) @@ -352,7 +351,7 @@ def create( is_llm_executor: Optional[bool] = None, lora_config: Optional[LoraConfig] = None, hf_model_dir: Optional[Path] = None, - llm_args: Optional[TorchLlmArgs] = None, + llm_args=None, ) -> Union["GenerationExecutorProxy", "GenerationExecutorWorker"]: # local imports to avoid cyclic importing from .proxy import GenerationExecutorProxy @@ -421,9 +420,7 @@ def create( model_world_size=model_world_size, mpi_session=None, # use mpi4py postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - garbage_collection_gen0_threshold= - garbage_collection_gen0_threshold) + is_llm_executor=is_llm_executor) else: ctx = multiprocessing.get_context("spawn") # The ProcessPoolExecutorSession is used to support Windows, as mpi4py cannot. 
@@ -434,9 +431,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - garbage_collection_gen0_threshold= - garbage_collection_gen0_threshold) + is_llm_executor=is_llm_executor) def wait_first_completed( self, futures: List[GenerationResult] From 593e5d901569d2f363be7a0c82f1d6dbd3789b6e Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:40:37 +0800 Subject: [PATCH 11/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index bf958e2c5730..b5dca350a567 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -897,6 +897,8 @@ def _build_model(self): return_logits = self.args.gather_generation_logits self._executor = self._executor_cls.create( + engine=None, + executor_config=None, batched_logits_processor=self.args.batched_logits_processor, model_world_size=self.args.parallel_config.world_size, mpi_session=self.mpi_session, From b5e300bb59ce325065f7f8021a0e6c2cb6c6c304 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:52:48 +0800 Subject: [PATCH 12/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index c39d09307db4..b31b05a50d8e 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -664,7 +664,8 @@ def worker_main( is_llm_executor: Optional[ bool] = True, # whether it's the main executor instance lora_config: Optional[LoraConfig] = None, - garbage_collection_gen0_threshold: Optional[int] = None, + hf_model_dir: Optional[Path] = None, + llm_args: Optional[TorchLlmArgs] = None, ) -> None: mpi_comm().barrier() print_colored_debug(f"Worker {mpi_rank()} entering worker_main...\n", @@ -791,7 +792,8 @@ def notify_proxy_threads_to_quit(): postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, lora_config=lora_config, - garbage_collection_gen0_threshold=garbage_collection_gen0_threshold) + hf_model_dir=hf_model_dir, + llm_args=llm_args) except Exception as e: logger.error(f"Failed to initialize executor on rank {mpi_rank()}: {e}") logger.error(traceback.format_exc()) From 42a131d6e7ac3b2581bcb371cf92dea9e8e6bf5f Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 11:02:43 +0800 Subject: [PATCH 13/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index b31b05a50d8e..02aa307071fe 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -203,8 +203,8 @@ def _create_engine(): return tllm.Executor(engine, tllm.ModelType.DECODER_ONLY, executor_config) - self.engine = _create_py_executor if llm_args is not None else _create_engine( - ) + self.engine = _create_py_executor( + ) if llm_args is not None else _create_engine() self._lora_manager: Optional[LoraManager] = None self._prompt_adapter_manager: Optional[PromptAdapterManager] = None From 34ea621208fb88c865b09841420868eafe826011 Mon Sep 17 
00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Thu, 26 Jun 2025 08:52:36 +0800 Subject: [PATCH 14/14] fix Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 02aa307071fe..45b716210513 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -167,13 +167,13 @@ def _create_py_executor(): args["lora_config"] = lora_config args[ "garbage_collection_gen0_threshold"] = llm_args.garbage_collection_gen0_threshold - elif executor_config.backend == "_autodeploy": + elif llm_args.backend == "_autodeploy": from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \ create_autodeploy_executor create_executor = create_autodeploy_executor else: raise ValueError( - f"Unsupported backend config: {executor_config.backend}") + f"Unsupported backend config: {llm_args.backend}") return create_executor(**args) def _create_engine():
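Taken together, patches 06-14 move the PyTorch-backend ExecutorConfig construction out of _TorchLLM._build_model and into GenerationExecutorWorker: _build_model now passes llm_args and hf_model_dir through GenerationExecutor.create and worker_main, and the worker chooses between _create_py_executor and _create_engine based on whether llm_args was supplied. The sketch below paraphrases that worker-side dispatch as it stands after patch 14; it is illustrative only (the class plumbing is omitted and FakeTorchLlmArgs is a hypothetical stand-in for TorchLlmArgs), not code taken from the repository.

    from typing import Optional


    def select_engine_factory(llm_args: Optional[object]) -> str:
        """Mirror the worker-side dispatch introduced by patches 06-14 (sketch only)."""
        if llm_args is None:
            # TRT flow: the caller supplies an engine / ExecutorConfig, so the
            # worker builds a tllm.Executor via _create_engine().
            return "_create_engine"
        backend = getattr(llm_args, "backend", None)
        if backend == "pytorch":
            # PyTorch flow: the worker assembles ExecutorConfig from llm_args inside
            # _create_py_executor() and reads garbage_collection_gen0_threshold from
            # llm_args as well.
            return "_create_py_executor -> create_py_executor"
        if backend == "_autodeploy":
            return "_create_py_executor -> create_autodeploy_executor"
        raise ValueError(f"Unsupported backend config: {backend}")


    if __name__ == "__main__":
        class FakeTorchLlmArgs:  # hypothetical stand-in for TorchLlmArgs
            backend = "pytorch"

        print(select_engine_factory(None))                # _create_engine
        print(select_engine_factory(FakeTorchLlmArgs()))  # _create_py_executor -> create_py_executor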