From 53a7adf28f1e1de5a224e85f71acd18101d57195 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 11:40:28 +0800 Subject: [PATCH 01/14] split _build_model method for TorchLlm and TrtLlm Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 335 ++++++++++++++++++++------------ tensorrt_llm/llmapi/llm_args.py | 19 +- 2 files changed, 211 insertions(+), 143 deletions(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 5635d4016f66..e85224e83fe7 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -12,7 +12,6 @@ from tqdm import tqdm from transformers import PreTrainedTokenizerBase -from tensorrt_llm.builder import BuildConfig from tensorrt_llm.inputs.data import TextPrompt from tensorrt_llm.inputs.registry import DefaultInputProcessor @@ -592,134 +591,6 @@ def _build_model(self): llm_build_stats=weakref.proxy( self.llm_build_stats)) self._engine_dir, self._hf_model_dir = model_loader() - # update the model_dir to a local dir for the runtime, such as tokenizer loading. - if self._engine_dir is not None: - self.args.model = self._engine_dir - - # Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub. - # It should also be before bindings ExecutorConfig, which may depend on tokenizer info. - self._tokenizer = self._try_load_tokenizer() - - # Multimodal special handling: - # 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor - # 2. May need to modify model weights for MM (e.g., resize vocab embedding). We must do such operation via input processor's __init__ - self.input_processor = create_input_processor(self._hf_model_dir, - self.tokenizer) - self.tokenizer = self.input_processor.tokenizer - - max_batch_size = self.args.max_batch_size - max_num_tokens = self.args.max_num_tokens - max_seq_len = self.args.max_seq_len - - build_config = self.args.build_config if self._on_trt_backend else BuildConfig( - ) - - max_batch_size = max_batch_size or build_config.max_batch_size - max_num_tokens = max_num_tokens or build_config.max_num_tokens - max_seq_len = max_seq_len or build_config.max_seq_len - - self._executor_config = tllm.ExecutorConfig( - max_beam_width=self.args.max_beam_width, - scheduler_config=PybindMirror.maybe_to_pybind( - self.args.scheduler_config), - batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type) - or tllm.BatchingType.INFLIGHT, - max_batch_size=max_batch_size, - max_num_tokens=max_num_tokens, - gather_generation_logits=self.args.gather_generation_logits) - if self.args.backend is None: - # also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens - if max_seq_len is not None: - self._executor_config.max_seq_len = max_seq_len - else: - engine_config = EngineConfig.from_json_file(self._engine_dir / - "config.json") - self._executor_config.max_seq_len = engine_config.build_config.max_seq_len - if self.args.kv_cache_config is not None: - self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( - self.args.kv_cache_config) - if os.getenv("FORCE_DETERMINISTIC", "0") == "1": - # Disable KV cache reuse for deterministic mode - self._executor_config.kv_cache_config.enable_block_reuse = False - self._executor_config.kv_cache_config.enable_partial_reuse = False - if self.args.peft_cache_config is not None: - self._executor_config.peft_cache_config = 
PybindMirror.maybe_to_pybind( - self.args.peft_cache_config) - elif self._on_trt_backend and self.args.build_config.plugin_config.lora_plugin: - engine_config = EngineConfig.from_json_file(self._engine_dir / - "config.json") - lora_config = engine_config.build_config.lora_config - max_lora_rank = lora_config.max_lora_rank - num_lora_modules = engine_config.pretrained_config.num_hidden_layers * \ - len(lora_config.lora_target_modules + lora_config.missing_qkv_modules) - self._executor_config.peft_cache_config = tllm.PeftCacheConfig( - num_device_module_layer=max_lora_rank * num_lora_modules * - self.args.max_loras, - num_host_module_layer=max_lora_rank * num_lora_modules * - self.args.max_cpu_loras, - ) - if self.args.decoding_config is not None: - self._executor_config.decoding_config = self.args.decoding_config - if self.args.guided_decoding_backend == 'xgrammar': - self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( - backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. - XGRAMMAR, - **_xgrammar_tokenizer_info(self.tokenizer)) - elif self.args.guided_decoding_backend == 'llguidance': - self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( - backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. - LLGUIDANCE, - **_llguidance_tokenizer_info(self.tokenizer)) - elif self.args.guided_decoding_backend is not None: - raise ValueError( - f"Unrecognized guided decoding backend {self.args.guided_decoding_backend}" - ) - - self._executor_config.normalize_log_probs = self.args.normalize_log_probs - self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill - self._executor_config.max_beam_width = self.args.max_beam_width or self.args.build_config.max_beam_width - if self._on_trt_backend and self.args.extended_runtime_perf_knob_config is not None: - self._executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind( - self.args.extended_runtime_perf_knob_config) - if self.args.cache_transceiver_config is not None: - self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( - self.args.cache_transceiver_config) - from tensorrt_llm._torch.pyexecutor.config import update_executor_config - update_executor_config( - self._executor_config, - backend=self.args.backend, - pytorch_backend_config=self.args.get_pytorch_backend_config() - if self.args.backend in ["pytorch", "_autodeploy"] else None, - mapping=self.args.parallel_config.to_mapping(), - build_config=self.args.build_config - if self._on_trt_backend else None, - speculative_config=self.args.speculative_config, - hf_model_dir=self._hf_model_dir, - trt_engine_dir=self._engine_dir, - max_input_len=self.args.max_input_len, - max_seq_len=max_seq_len) - self._executor_config.llm_parallel_config = self.args.parallel_config - return_logits = (self.args.gather_generation_logits - or (self.args.build_config - and self.args.build_config.gather_context_logits)) - - self._executor = self._executor_cls.create( - self._engine_dir, - executor_config=self._executor_config, - batched_logits_processor=self.args.batched_logits_processor, - model_world_size=self.args.parallel_config.world_size, - mpi_session=self.mpi_session, - reuse_mpi_comm=external_mpi_comm_available( - self.args.parallel_config.world_size), - return_logits=return_logits, - postproc_worker_config=PostprocWorkerConfig( - num_postprocess_workers=self.args.num_postprocess_workers, - postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, - ), - is_llm_executor=True, - 
lora_config=self.args.lora_config, - garbage_collection_gen0_threshold=self.args. - garbage_collection_gen0_threshold) @property def _on_trt_backend(self) -> bool: @@ -860,6 +731,116 @@ def save(self, engine_dir: str) -> None: f"Copying {file} to {target_engine_dir / file.name}\n") shutil.copy(file, target_engine_dir / file.name) + def _build_model(self): + super()._build_model() + # update the model_dir to a local dir for the runtime, such as tokenizer loading. + if self._engine_dir is not None: + self.args.model = self._engine_dir + + # Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub. + # It should also be before bindings ExecutorConfig, which may depend on tokenizer info. + self._tokenizer = self._try_load_tokenizer() + + # Multimodal special handling: + # 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor + # 2. May need to modify model weights for MM (e.g., resize vocab embedding). We must do such operation via input processor's __init__ + self.input_processor = create_input_processor(self._hf_model_dir, + self.tokenizer) + self.tokenizer = self.input_processor.tokenizer + + max_batch_size = self.args.max_batch_size + max_num_tokens = self.args.max_num_tokens + max_seq_len = self.args.max_seq_len + + build_config = self.args.build_config + + max_batch_size = max_batch_size or build_config.max_batch_size + max_num_tokens = max_num_tokens or build_config.max_num_tokens + max_seq_len = max_seq_len or build_config.max_seq_len + + self._executor_config = tllm.ExecutorConfig( + max_beam_width=self.args.max_beam_width, + scheduler_config=PybindMirror.maybe_to_pybind( + self.args.scheduler_config), + batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type) + or tllm.BatchingType.INFLIGHT, + max_batch_size=max_batch_size, + max_num_tokens=max_num_tokens, + gather_generation_logits=self.args.gather_generation_logits) + + # also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens + if max_seq_len is not None: + self._executor_config.max_seq_len = max_seq_len + else: + engine_config = EngineConfig.from_json_file(self._engine_dir / + "config.json") + self._executor_config.max_seq_len = engine_config.build_config.max_seq_len + + if self.args.kv_cache_config is not None: + self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( + self.args.kv_cache_config) + if os.getenv("FORCE_DETERMINISTIC", "0") == "1": + # Disable KV cache reuse for deterministic mode + self._executor_config.kv_cache_config.enable_block_reuse = False + self._executor_config.kv_cache_config.enable_partial_reuse = False + if self.args.peft_cache_config is not None: + self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind( + self.args.peft_cache_config) + elif self.args.build_config.plugin_config.lora_plugin: + engine_config = EngineConfig.from_json_file(self._engine_dir / + "config.json") + lora_config = engine_config.build_config.lora_config + max_lora_rank = lora_config.max_lora_rank + num_lora_modules = engine_config.pretrained_config.num_hidden_layers * \ + len(lora_config.lora_target_modules + lora_config.missing_qkv_modules) + self._executor_config.peft_cache_config = tllm.PeftCacheConfig( + num_device_module_layer=max_lora_rank * num_lora_modules * + self.args.max_loras, + num_host_module_layer=max_lora_rank * num_lora_modules * + self.args.max_cpu_loras, + ) + if self.args.decoding_config is not None: + 
self._executor_config.decoding_config = self.args.decoding_config + if self.args.guided_decoding_backend == 'xgrammar': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. + XGRAMMAR, + **_xgrammar_tokenizer_info(self.tokenizer)) + else: + raise ValueError( + f"Unsupported guided decoding backend {self.args.guided_decoding_backend}" + ) + + self._executor_config.normalize_log_probs = self.args.normalize_log_probs + self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill + self._executor_config.max_beam_width = self.args.max_beam_width or self.args.build_config.max_beam_width + if self.args.extended_runtime_perf_knob_config is not None: + self._executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind( + self.args.extended_runtime_perf_knob_config) + if self.args.cache_transceiver_config is not None: + self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( + self.args.cache_transceiver_config) + self._executor_config.llm_parallel_config = self.args.parallel_config + return_logits = (self.args.gather_generation_logits + or (self.args.build_config + and self.args.build_config.gather_context_logits)) + + self._executor = self._executor_cls.create( + self._engine_dir, + executor_config=self._executor_config, + batched_logits_processor=self.args.batched_logits_processor, + model_world_size=self.args.parallel_config.world_size, + mpi_session=self.mpi_session, + reuse_mpi_comm=external_mpi_comm_available( + self.args.parallel_config.world_size), + return_logits=return_logits, + postproc_worker_config=PostprocWorkerConfig( + num_postprocess_workers=self.args.num_postprocess_workers, + postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, + ), + is_llm_executor=True, + lora_config=self.args.lora_config) + @append_docstring(TORCH_LLM_DOCSTRING) class _TorchLLM(BaseLLM): @@ -899,6 +880,102 @@ def __init__(self, backend='pytorch', **kwargs) + def _build_model(self): + super()._build_model() + assert self._engine_dir is None + + # Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub. + # It should also be before bindings ExecutorConfig, which may depend on tokenizer info. + self._tokenizer = self._try_load_tokenizer() + + # Multimodal special handling: + # 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor + # 2. May need to modify model weights for MM (e.g., resize vocab embedding). 
We must do such operation via input processor's __init__ + self.input_processor = create_input_processor(self._hf_model_dir, + self.tokenizer) + self.tokenizer = self.input_processor.tokenizer + + max_batch_size = self.args.max_batch_size + max_num_tokens = self.args.max_num_tokens + max_seq_len = self.args.max_seq_len + + self._executor_config = tllm.ExecutorConfig( + max_beam_width=self.args.max_beam_width, + scheduler_config=PybindMirror.maybe_to_pybind( + self.args.scheduler_config), + batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type) + or tllm.BatchingType.INFLIGHT, + max_batch_size=max_batch_size, + max_num_tokens=max_num_tokens, + gather_generation_logits=self.args.gather_generation_logits) + + if self.args.kv_cache_config is not None: + self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( + self.args.kv_cache_config) + if os.getenv("FORCE_DETERMINISTIC", "0") == "1": + # Disable KV cache reuse for deterministic mode + self._executor_config.kv_cache_config.enable_block_reuse = False + self._executor_config.kv_cache_config.enable_partial_reuse = False + if self.args.peft_cache_config is not None: + self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind( + self.args.peft_cache_config) + if self.args.decoding_config is not None: + self._executor_config.decoding_config = self.args.decoding_config + if self.args.guided_decoding_backend == 'xgrammar': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. + XGRAMMAR, + **_xgrammar_tokenizer_info(self.tokenizer)) + elif self.args.guided_decoding_backend == 'llguidance': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. 
+ LLGUIDANCE, + **_llguidance_tokenizer_info(self.tokenizer)) + elif self.args.guided_decoding_backend is not None: + raise ValueError( + f"Unsupported guided decoding backend {self.args.guided_decoding_backend}" + ) + + self._executor_config.normalize_log_probs = self.args.normalize_log_probs + self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill + self._executor_config.max_beam_width = self.args.max_beam_width + if self.args.cache_transceiver_config is not None: + self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( + self.args.cache_transceiver_config) + from tensorrt_llm._torch.pyexecutor.config import update_executor_config + update_executor_config( + self._executor_config, + backend=self.args.backend, + pytorch_backend_config=self.args.get_pytorch_backend_config() + if self.args.backend in ["pytorch", "_autodeploy"] else None, + mapping=self.args.parallel_config.to_mapping(), + speculative_config=self.args.speculative_config, + hf_model_dir=self._hf_model_dir, + trt_engine_dir=self._engine_dir, + max_input_len=self.args.max_input_len, + max_seq_len=max_seq_len) + + # TODO: revisit gather_context_logits + return_logits = self.args.gather_generation_logits + + self._executor = self._executor_cls.create( + self._engine_dir, + executor_config=self._executor_config, + batched_logits_processor=self.args.batched_logits_processor, + model_world_size=self.args.parallel_config.world_size, + mpi_session=self.mpi_session, + reuse_mpi_comm=external_mpi_comm_available( + self.args.parallel_config.world_size), + return_logits=return_logits, + postproc_worker_config=PostprocWorkerConfig( + num_postprocess_workers=self.args.num_postprocess_workers, + postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, + ), + is_llm_executor=True, + lora_config=self.args.lora_config, + garbage_collection_gen0_threshold=self.args. + garbage_collection_gen0_threshold) + def _validate_args_for_torch_backend(self, kwargs: dict) -> None: """Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend. """ diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 636740d5998b..7dec4f304128 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -954,12 +954,6 @@ class BaseLlmArgs(BaseModel): default=None, description="The parser to separate reasoning content from output.") - garbage_collection_gen0_threshold: int = Field( - default=20000, - description= - "Threshold for Python garbage collection of generation 0 objects." - "Lower values trigger more frequent garbage collection.") - # TODO[Superjomn]: To deprecate this config. decoding_config: Optional[object] = Field( default=None, @@ -1622,14 +1616,11 @@ class TorchCompileConfig(BaseModel): class TorchLlmArgs(BaseLlmArgs): - # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs - build_config: Optional[object] = Field( - default=None, - description="Build config.", - exclude_from_json=True, - json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"}) - - # PyTorch backend specific configurations + garbage_collection_gen0_threshold: int = Field( + default=20000, + description= + "Threshold for Python garbage collection of generation 0 objects." 
+ "Lower values trigger more frequent garbage collection.") use_cuda_graph: bool = Field( default=False, From 8fafd59befcf4bc92945731573ea0b0116eab8e5 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 14:32:44 +0800 Subject: [PATCH 02/14] fix ci Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm_args.py | 48 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 7dec4f304128..bd922320132b 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1154,30 +1154,6 @@ def validate_model_format_misc(self): self._model_format = model_format return self - @model_validator(mode="after") - def init_build_config(self): - """ - Creating a default BuildConfig if none is provided - """ - if self.build_config is None: - kwargs = {} - if self.max_batch_size: - kwargs["max_batch_size"] = self.max_batch_size - if self.max_num_tokens: - kwargs["max_num_tokens"] = self.max_num_tokens - if self.max_seq_len: - kwargs["max_seq_len"] = self.max_seq_len - if self.max_beam_width: - kwargs["max_beam_width"] = self.max_beam_width - if self.max_input_len: - kwargs["max_input_len"] = self.max_input_len - self.build_config = BuildConfig(**kwargs) - - assert isinstance( - self.build_config, BuildConfig - ), f"build_config is not initialized: {self.build_config}" - return self - @model_validator(mode="after") def set_runtime_knobs_from_build_config(self): # TODO: remove this after PyT become default to adapt PyT with build_config as input @@ -1542,6 +1518,30 @@ def init_calib_config(cls, v): return CalibConfig() return v + @model_validator(mode="after") + def init_build_config(self): + """ + Creating a default BuildConfig if none is provided + """ + if self.build_config is None: + kwargs = {} + if self.max_batch_size: + kwargs["max_batch_size"] = self.max_batch_size + if self.max_num_tokens: + kwargs["max_num_tokens"] = self.max_num_tokens + if self.max_seq_len: + kwargs["max_seq_len"] = self.max_seq_len + if self.max_beam_width: + kwargs["max_beam_width"] = self.max_beam_width + if self.max_input_len: + kwargs["max_input_len"] = self.max_input_len + self.build_config = BuildConfig(**kwargs) + + assert isinstance( + self.build_config, BuildConfig + ), f"build_config is not initialized: {self.build_config}" + return self + @model_validator(mode="after") def setup_embedding_parallel_mode(self): if self.embedding_parallel_mode == 'NONE': From 03506eb4011bbd2123319f89e8dd64401069eee8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 14:58:36 +0800 Subject: [PATCH 03/14] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm_args.py | 56 +++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index bd922320132b..0fa78ad5701a 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1154,6 +1154,30 @@ def validate_model_format_misc(self): self._model_format = model_format return self + @model_validator(mode="after") + def init_build_config(self): + """ + Creating a default BuildConfig if none is provided + """ + if self.build_config is None: + kwargs = {} + if self.max_batch_size: + kwargs["max_batch_size"] = self.max_batch_size + if self.max_num_tokens: + 
kwargs["max_num_tokens"] = self.max_num_tokens + if self.max_seq_len: + kwargs["max_seq_len"] = self.max_seq_len + if self.max_beam_width: + kwargs["max_beam_width"] = self.max_beam_width + if self.max_input_len: + kwargs["max_input_len"] = self.max_input_len + self.build_config = BuildConfig(**kwargs) + + assert isinstance( + self.build_config, BuildConfig + ), f"build_config is not initialized: {self.build_config}" + return self + @model_validator(mode="after") def set_runtime_knobs_from_build_config(self): # TODO: remove this after PyT become default to adapt PyT with build_config as input @@ -1518,30 +1542,6 @@ def init_calib_config(cls, v): return CalibConfig() return v - @model_validator(mode="after") - def init_build_config(self): - """ - Creating a default BuildConfig if none is provided - """ - if self.build_config is None: - kwargs = {} - if self.max_batch_size: - kwargs["max_batch_size"] = self.max_batch_size - if self.max_num_tokens: - kwargs["max_num_tokens"] = self.max_num_tokens - if self.max_seq_len: - kwargs["max_seq_len"] = self.max_seq_len - if self.max_beam_width: - kwargs["max_beam_width"] = self.max_beam_width - if self.max_input_len: - kwargs["max_input_len"] = self.max_input_len - self.build_config = BuildConfig(**kwargs) - - assert isinstance( - self.build_config, BuildConfig - ), f"build_config is not initialized: {self.build_config}" - return self - @model_validator(mode="after") def setup_embedding_parallel_mode(self): if self.embedding_parallel_mode == 'NONE': @@ -1615,6 +1615,14 @@ class TorchCompileConfig(BaseModel): class TorchLlmArgs(BaseLlmArgs): + # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs + build_config: Optional[object] = Field( + default=None, + description="Build config.", + exclude_from_json=True, + json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"}) + + # PyTorch backend specific configurations garbage_collection_gen0_threshold: int = Field( default=20000, From 7d638750bfbdf5b407db8134cfe758317d4d9739 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 15:27:59 +0800 Subject: [PATCH 04/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index e85224e83fe7..0257e2f4a07f 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -806,7 +806,7 @@ def _build_model(self): backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. 
XGRAMMAR, **_xgrammar_tokenizer_info(self.tokenizer)) - else: + elif self.args.guided_decoding_backend is not None: raise ValueError( f"Unsupported guided decoding backend {self.args.guided_decoding_backend}" ) From b0df829aa877e69607245a2c960444825b979343 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 24 Jun 2025 16:53:05 +0800 Subject: [PATCH 05/14] fix ci Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tests/unittest/llmapi/test_llm.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 33a458a12345..83d9e62fed99 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -2141,13 +2141,17 @@ def run_llm_with_postprocess_parallel_and_result_handler( kwargs = {} if backend not in ["pytorch", "autodeploy"]: kwargs["fast_build"] = True - llm = LLM(model=llama_model_path, - backend=backend, - kv_cache_config=global_kvcache_config, - tensor_parallel_size=tp_size, - num_postprocess_workers=2, - postprocess_tokenizer_dir=llama_model_path, - **kwargs) + LLM_CLASS = LLM + else: + LLM_CLASS = LLM_torch + + llm = LLM_CLASS(model=llama_model_path, + backend=backend, + kv_cache_config=global_kvcache_config, + tensor_parallel_size=tp_size, + num_postprocess_workers=2, + postprocess_tokenizer_dir=llama_model_path, + **kwargs) golden_result = "DEFGHI" for i, output in enumerate( llm.generate_async(prompts[0], From a7b2e5d047b2771bbfb232e74624e31c6acb0885 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 09:42:28 +0800 Subject: [PATCH 06/14] refactor _build_model method of TorchLlm Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 129 ++++++++++++++++++++++++-------- tensorrt_llm/llmapi/llm.py | 70 ++--------------- 2 files changed, 104 insertions(+), 95 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index dfe95ff2b773..6d2d08f91a8e 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -18,8 +18,10 @@ mpi_comm, mpi_rank, nvtx_range_debug) from ..bindings import executor as tllm from ..builder import ConfigEncoder, Engine, EngineConfig -from ..llmapi.llm_args import PybindMirror +from ..llmapi.llm_args import PybindMirror, TorchLlmArgs from ..llmapi.mpi_session import set_mpi_session_cpp +from ..llmapi.tokenizer import (_llguidance_tokenizer_info, + _xgrammar_tokenizer_info) from ..llmapi.tracer import VizTracer, global_tracer, set_global_tracer from ..llmapi.utils import (AsyncQueue, ManagedThread, _SyncQueue, clear_sched_affinity, print_colored_debug, @@ -59,6 +61,8 @@ def __init__( is_llm_executor: Optional[bool] = None, lora_config: Optional[LoraConfig] = None, garbage_collection_gen0_threshold: Optional[int] = None, + hf_model_dir: Optional[Path] = None, + llm_args: Optional[TorchLlmArgs] = None, ) -> None: postproc_config = postproc_worker_config or PostprocWorkerConfig() super().__init__( @@ -79,8 +83,7 @@ def __init__( self._await_response_helper = AwaitResponseHelper( self) # TODO: make it weakref self._executor_config = executor_config - self._is_pytorch_backend = getattr(self._executor_config, "backend", - None) == "pytorch" + self._is_pytorch_backend = llm_args is not None and llm_args.backend == "pytorch" if global_mpi_size() > 1: logger.set_rank(self.global_rank) @@ -88,13 +91,98 @@ def __init__( if 
isinstance(engine, list): engine = engine[self.rank] - if executor_config is None: - executor_config = tllm.ExecutorConfig(1) + def _create_py_executor(): + device_id = self.global_rank % torch.cuda.device_count() + torch.cuda.set_device(device_id) - executor_config.logits_post_processor_config = tllm.LogitsPostProcessorConfig( - processor_batched=batched_logits_processor, replicate=False) + max_batch_size = llm_args.max_batch_size + max_num_tokens = llm_args.max_num_tokens + max_seq_len = llm_args.max_seq_len + + self._executor_config = tllm.ExecutorConfig( + max_beam_width=llm_args.max_beam_width, + scheduler_config=PybindMirror.maybe_to_pybind( + llm_args.scheduler_config), + batching_type=PybindMirror.maybe_to_pybind( + llm_args.batching_type) or tllm.BatchingType.INFLIGHT, + max_batch_size=max_batch_size, + max_num_tokens=max_num_tokens, + gather_generation_logits=llm_args.gather_generation_logits) + + if llm_args.kv_cache_config is not None: + self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( + llm_args.kv_cache_config) + if os.getenv("FORCE_DETERMINISTIC", "0") == "1": + # Disable KV cache reuse for deterministic mode + self._executor_config.kv_cache_config.enable_block_reuse = False + self._executor_config.kv_cache_config.enable_partial_reuse = False + if llm_args.peft_cache_config is not None: + self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind( + llm_args.peft_cache_config) + if llm_args.decoding_config is not None: + self._executor_config.decoding_config = llm_args.decoding_config + if llm_args.guided_decoding_backend == 'xgrammar': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. + XGRAMMAR, + **_xgrammar_tokenizer_info(self.tokenizer)) + elif llm_args.guided_decoding_backend == 'llguidance': + self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( + backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. 
+ LLGUIDANCE, + **_llguidance_tokenizer_info(self.tokenizer)) + elif llm_args.guided_decoding_backend is not None: + raise ValueError( + f"Unsupported guided decoding backend {llm_args.guided_decoding_backend}" + ) + + self._executor_config.normalize_log_probs = llm_args.normalize_log_probs + self._executor_config.enable_chunked_context = llm_args.enable_chunked_prefill + self._executor_config.max_beam_width = llm_args.max_beam_width + if llm_args.cache_transceiver_config is not None: + self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( + llm_args.cache_transceiver_config) + from tensorrt_llm._torch.pyexecutor.config import \ + update_executor_config + update_executor_config( + self._executor_config, + backend=llm_args.backend, + pytorch_backend_config=llm_args.get_pytorch_backend_config() + if llm_args.backend in ["pytorch", "_autodeploy"] else None, + mapping=llm_args.parallel_config.to_mapping(), + speculative_config=llm_args.speculative_config, + hf_model_dir=hf_model_dir, + max_input_len=llm_args.max_input_len, + max_seq_len=max_seq_len) + + self._executor_config.logits_post_processor_config = tllm.LogitsPostProcessorConfig( + processor_batched=batched_logits_processor, replicate=False) + args = { + "executor_config": self._executor_config, + "checkpoint_dir": hf_model_dir, + } + if llm_args.backend == "pytorch": + from tensorrt_llm._torch.pyexecutor.py_executor_creator import \ + create_py_executor + create_executor = create_py_executor + args["lora_config"] = lora_config + args[ + "garbage_collection_gen0_threshold"] = garbage_collection_gen0_threshold + elif executor_config.backend == "_autodeploy": + from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \ + create_autodeploy_executor + create_executor = create_autodeploy_executor + else: + raise ValueError( + f"Unsupported backend config: {executor_config.backend}") + return create_executor(**args) def _create_engine(): + if executor_config is None: + executor_config = tllm.ExecutorConfig(1) + + executor_config.logits_post_processor_config = tllm.LogitsPostProcessorConfig( + processor_batched=batched_logits_processor, replicate=False) device_id = self.global_rank % torch.cuda.device_count() torch.cuda.set_device(device_id) @@ -113,30 +201,11 @@ def _create_engine(): executor_config=executor_config, managed_weights=engine.managed_weights) - if not hasattr(executor_config, "backend"): - return tllm.Executor(engine, tllm.ModelType.DECODER_ONLY, - executor_config) - args = { - "executor_config": executor_config, - "checkpoint_dir": executor_config.hf_model_dir, - } - if executor_config.backend == "pytorch": - from tensorrt_llm._torch.pyexecutor.py_executor_creator import \ - create_py_executor - create_executor = create_py_executor - args["lora_config"] = lora_config - args[ - "garbage_collection_gen0_threshold"] = garbage_collection_gen0_threshold - elif executor_config.backend == "_autodeploy": - from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \ - create_autodeploy_executor - create_executor = create_autodeploy_executor - else: - raise ValueError( - f"Unsupported backend config: {executor_config.backend}") - return create_executor(**args) + return tllm.Executor(engine, tllm.ModelType.DECODER_ONLY, + executor_config) - self.engine = _create_engine() + self.engine = _create_py_executor if llm_args is not None else _create_engine( + ) self._lora_manager: Optional[LoraManager] = None self._prompt_adapter_manager: Optional[PromptAdapterManager] = None diff --git a/tensorrt_llm/llmapi/llm.py 
b/tensorrt_llm/llmapi/llm.py index 758ee3bbe3c8..b5dca350a567 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -35,8 +35,7 @@ from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig, LlmBuildStats, ModelLoader, _ModelRuntimeContext) from .mpi_session import MpiPoolSession, external_mpi_comm_available -from .tokenizer import (TokenizerBase, _llguidance_tokenizer_info, - _xgrammar_tokenizer_info) +from .tokenizer import TokenizerBase, _xgrammar_tokenizer_info # TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import from .utils import (append_docstring, exception_handler, get_device_count, print_colored_debug) @@ -894,71 +893,12 @@ def _build_model(self): self.tokenizer) self.tokenizer = self.input_processor.tokenizer - max_batch_size = self.args.max_batch_size - max_num_tokens = self.args.max_num_tokens - max_seq_len = self.args.max_seq_len - - self._executor_config = tllm.ExecutorConfig( - max_beam_width=self.args.max_beam_width, - scheduler_config=PybindMirror.maybe_to_pybind( - self.args.scheduler_config), - batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type) - or tllm.BatchingType.INFLIGHT, - max_batch_size=max_batch_size, - max_num_tokens=max_num_tokens, - gather_generation_logits=self.args.gather_generation_logits) - - if self.args.kv_cache_config is not None: - self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind( - self.args.kv_cache_config) - if os.getenv("FORCE_DETERMINISTIC", "0") == "1": - # Disable KV cache reuse for deterministic mode - self._executor_config.kv_cache_config.enable_block_reuse = False - self._executor_config.kv_cache_config.enable_partial_reuse = False - if self.args.peft_cache_config is not None: - self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind( - self.args.peft_cache_config) - if self.args.decoding_config is not None: - self._executor_config.decoding_config = self.args.decoding_config - if self.args.guided_decoding_backend == 'xgrammar': - self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( - backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. - XGRAMMAR, - **_xgrammar_tokenizer_info(self.tokenizer)) - elif self.args.guided_decoding_backend == 'llguidance': - self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig( - backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend. 
- LLGUIDANCE, - **_llguidance_tokenizer_info(self.tokenizer)) - elif self.args.guided_decoding_backend is not None: - raise ValueError( - f"Unsupported guided decoding backend {self.args.guided_decoding_backend}" - ) - - self._executor_config.normalize_log_probs = self.args.normalize_log_probs - self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill - self._executor_config.max_beam_width = self.args.max_beam_width - if self.args.cache_transceiver_config is not None: - self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind( - self.args.cache_transceiver_config) - from tensorrt_llm._torch.pyexecutor.config import update_executor_config - update_executor_config( - self._executor_config, - backend=self.args.backend, - pytorch_backend_config=self.args.get_pytorch_backend_config() - if self.args.backend in ["pytorch", "_autodeploy"] else None, - mapping=self.args.parallel_config.to_mapping(), - speculative_config=self.args.speculative_config, - hf_model_dir=self._hf_model_dir, - max_input_len=self.args.max_input_len, - max_seq_len=max_seq_len) - # TODO: revisit gather_context_logits return_logits = self.args.gather_generation_logits self._executor = self._executor_cls.create( - self._engine_dir, - executor_config=self._executor_config, + engine=None, + executor_config=None, batched_logits_processor=self.args.batched_logits_processor, model_world_size=self.args.parallel_config.world_size, mpi_session=self.mpi_session, @@ -971,8 +911,8 @@ def _build_model(self): ), is_llm_executor=True, lora_config=self.args.lora_config, - garbage_collection_gen0_threshold=self.args. - garbage_collection_gen0_threshold) + hf_model_dir=self._hf_model_dir, + llm_args=self.args) def _validate_args_for_torch_backend(self, kwargs: dict) -> None: """Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend. 
From a39a09eb550221dcefb904d76dda4a142e523cb7 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:02:32 +0800 Subject: [PATCH 07/14] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index b5dca350a567..bf958e2c5730 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -897,8 +897,6 @@ def _build_model(self): return_logits = self.args.gather_generation_logits self._executor = self._executor_cls.create( - engine=None, - executor_config=None, batched_logits_processor=self.args.batched_logits_processor, model_world_size=self.args.parallel_config.world_size, mpi_session=self.mpi_session, From 1e453d3bf251be2ba8eb15fbffe1f8f4d5a63985 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:20:57 +0800 Subject: [PATCH 08/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/executor.py | 14 +++++++------- tensorrt_llm/executor/worker.py | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py index 9d6dd966058b..fe91e1f65173 100644 --- a/tensorrt_llm/executor/executor.py +++ b/tensorrt_llm/executor/executor.py @@ -21,6 +21,7 @@ from ..bindings import executor as tllm from ..builder import Engine from ..disaggregated_params import DisaggregatedParams +from ..llmapi import TorchLlmArgs from ..llmapi.llm_utils import KvCacheRetentionConfig from ..llmapi.mpi_session import (MpiSession, external_mpi_comm_available, need_spawn_mpi_workers) @@ -350,7 +351,8 @@ def create( postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, lora_config: Optional[LoraConfig] = None, - garbage_collection_gen0_threshold: Optional[int] = None, + hf_model_dir: Optional[Path] = None, + llm_args: Optional[TorchLlmArgs] = None, ) -> Union["GenerationExecutorProxy", "GenerationExecutorWorker"]: # local imports to avoid cyclic importing from .proxy import GenerationExecutorProxy @@ -377,6 +379,8 @@ def create( "engine": engine, "executor_config": executor_config, "batched_logits_processor": batched_logits_processor, + "hf_model_dir": hf_model_dir, + "llm_args": llm_args, } if lora_config: @@ -394,9 +398,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - garbage_collection_gen0_threshold= - garbage_collection_gen0_threshold) + is_llm_executor=is_llm_executor) # WAR: For the performance of gathering logits, we use single process worker # for TP1 to avoid the large overhead of IPC. @@ -407,9 +409,7 @@ def create( "Using single process worker for TP1, this may hurt streaming generation performance." ) return GenerationExecutorWorker(**worker_kwargs, - is_llm_executor=is_llm_executor, - garbage_collection_gen0_threshold= - garbage_collection_gen0_threshold) + is_llm_executor=is_llm_executor) # For single-gpu case: # Partition the workload to multiple process for streaming performance. 
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 6d2d08f91a8e..c39d09307db4 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -60,7 +60,6 @@ def __init__( postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, lora_config: Optional[LoraConfig] = None, - garbage_collection_gen0_threshold: Optional[int] = None, hf_model_dir: Optional[Path] = None, llm_args: Optional[TorchLlmArgs] = None, ) -> None: @@ -167,7 +166,7 @@ def _create_py_executor(): create_executor = create_py_executor args["lora_config"] = lora_config args[ - "garbage_collection_gen0_threshold"] = garbage_collection_gen0_threshold + "garbage_collection_gen0_threshold"] = llm_args.garbage_collection_gen0_threshold elif executor_config.backend == "_autodeploy": from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \ create_autodeploy_executor From cb753c9205b28759907f1482ca9e473568ef2432 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:29:11 +0800 Subject: [PATCH 09/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index b47f444006a4..3895cf1172b9 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -45,7 +45,6 @@ def __init__( worker_cls: type = GenerationExecutorWorker, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - garbage_collection_gen0_threshold: Optional[int] = None, ) -> None: postproc_worker_config = postproc_worker_config or PostprocWorkerConfig( ) @@ -88,14 +87,14 @@ def __init__( self.model_world_size = model_world_size - self.garbage_collection_gen0_threshold = garbage_collection_gen0_threshold + self.garbage_collection_gen0_threshold = worker_kwargs[ + "llm_args"].garbage_collection_gen0_threshold if worker_kwargs.get( + "llm_args", None) is not None else None worker_kwargs = dict(**worker_kwargs, worker_queues=self._setup_queues(), postproc_worker_config=postproc_worker_config, - is_llm_executor=False, - garbage_collection_gen0_threshold=self. 
- garbage_collection_gen0_threshold) + is_llm_executor=False) if "log_level" not in worker_kwargs: worker_kwargs["log_level"] = logger.level From 7f67e935de9602078c1487624ec4fd24b91ca6b0 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:36:53 +0800 Subject: [PATCH 10/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/executor.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py index fe91e1f65173..6b924e933890 100644 --- a/tensorrt_llm/executor/executor.py +++ b/tensorrt_llm/executor/executor.py @@ -21,7 +21,6 @@ from ..bindings import executor as tllm from ..builder import Engine from ..disaggregated_params import DisaggregatedParams -from ..llmapi import TorchLlmArgs from ..llmapi.llm_utils import KvCacheRetentionConfig from ..llmapi.mpi_session import (MpiSession, external_mpi_comm_available, need_spawn_mpi_workers) @@ -352,7 +351,7 @@ def create( is_llm_executor: Optional[bool] = None, lora_config: Optional[LoraConfig] = None, hf_model_dir: Optional[Path] = None, - llm_args: Optional[TorchLlmArgs] = None, + llm_args=None, ) -> Union["GenerationExecutorProxy", "GenerationExecutorWorker"]: # local imports to avoid cyclic importing from .proxy import GenerationExecutorProxy @@ -421,9 +420,7 @@ def create( model_world_size=model_world_size, mpi_session=None, # use mpi4py postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - garbage_collection_gen0_threshold= - garbage_collection_gen0_threshold) + is_llm_executor=is_llm_executor) else: ctx = multiprocessing.get_context("spawn") # The ProcessPoolExecutorSession is used to support Windows, as mpi4py cannot. 
@@ -434,9 +431,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - garbage_collection_gen0_threshold= - garbage_collection_gen0_threshold) + is_llm_executor=is_llm_executor) def wait_first_completed( self, futures: List[GenerationResult] From 593e5d901569d2f363be7a0c82f1d6dbd3789b6e Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:40:37 +0800 Subject: [PATCH 11/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index bf958e2c5730..b5dca350a567 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -897,6 +897,8 @@ def _build_model(self): return_logits = self.args.gather_generation_logits self._executor = self._executor_cls.create( + engine=None, + executor_config=None, batched_logits_processor=self.args.batched_logits_processor, model_world_size=self.args.parallel_config.world_size, mpi_session=self.mpi_session, From b5e300bb59ce325065f7f8021a0e6c2cb6c6c304 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:52:48 +0800 Subject: [PATCH 12/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index c39d09307db4..b31b05a50d8e 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -664,7 +664,8 @@ def worker_main( is_llm_executor: Optional[ bool] = True, # whether it's the main executor instance lora_config: Optional[LoraConfig] = None, - garbage_collection_gen0_threshold: Optional[int] = None, + hf_model_dir: Optional[Path] = None, + llm_args: Optional[TorchLlmArgs] = None, ) -> None: mpi_comm().barrier() print_colored_debug(f"Worker {mpi_rank()} entering worker_main...\n", @@ -791,7 +792,8 @@ def notify_proxy_threads_to_quit(): postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, lora_config=lora_config, - garbage_collection_gen0_threshold=garbage_collection_gen0_threshold) + hf_model_dir=hf_model_dir, + llm_args=llm_args) except Exception as e: logger.error(f"Failed to initialize executor on rank {mpi_rank()}: {e}") logger.error(traceback.format_exc()) From 42a131d6e7ac3b2581bcb371cf92dea9e8e6bf5f Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 25 Jun 2025 11:02:43 +0800 Subject: [PATCH 13/14] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index b31b05a50d8e..02aa307071fe 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -203,8 +203,8 @@ def _create_engine(): return tllm.Executor(engine, tllm.ModelType.DECODER_ONLY, executor_config) - self.engine = _create_py_executor if llm_args is not None else _create_engine( - ) + self.engine = _create_py_executor( + ) if llm_args is not None else _create_engine() self._lora_manager: Optional[LoraManager] = None self._prompt_adapter_manager: Optional[PromptAdapterManager] = None From 34ea621208fb88c865b09841420868eafe826011 Mon Sep 17 
00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Thu, 26 Jun 2025 08:52:36 +0800 Subject: [PATCH 14/14] fix Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 02aa307071fe..45b716210513 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -167,13 +167,13 @@ def _create_py_executor(): args["lora_config"] = lora_config args[ "garbage_collection_gen0_threshold"] = llm_args.garbage_collection_gen0_threshold - elif executor_config.backend == "_autodeploy": + elif llm_args.backend == "_autodeploy": from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \ create_autodeploy_executor create_executor = create_autodeploy_executor else: raise ValueError( - f"Unsupported backend config: {executor_config.backend}") + f"Unsupported backend config: {llm_args.backend}") return create_executor(**args) def _create_engine():
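Taken together, patches 06-14 move the PyTorch-backend ExecutorConfig construction out of _TorchLLM._build_model and into GenerationExecutorWorker: _build_model now passes llm_args and hf_model_dir through GenerationExecutor.create and worker_main, and the worker chooses between _create_py_executor and _create_engine based on whether llm_args was supplied. The sketch below paraphrases that worker-side dispatch as it stands after patch 14; it is illustrative only (the class plumbing is omitted and FakeTorchLlmArgs is a hypothetical stand-in for TorchLlmArgs), not code taken from the repository.

    from typing import Optional


    def select_engine_factory(llm_args: Optional[object]) -> str:
        """Mirror the worker-side dispatch introduced by patches 06-14 (sketch only)."""
        if llm_args is None:
            # TRT flow: the caller supplies an engine / ExecutorConfig, so the
            # worker builds a tllm.Executor via _create_engine().
            return "_create_engine"
        backend = getattr(llm_args, "backend", None)
        if backend == "pytorch":
            # PyTorch flow: the worker assembles ExecutorConfig from llm_args inside
            # _create_py_executor() and reads garbage_collection_gen0_threshold from
            # llm_args as well.
            return "_create_py_executor -> create_py_executor"
        if backend == "_autodeploy":
            return "_create_py_executor -> create_autodeploy_executor"
        raise ValueError(f"Unsupported backend config: {backend}")


    if __name__ == "__main__":
        class FakeTorchLlmArgs:  # hypothetical stand-in for TorchLlmArgs
            backend = "pytorch"

        print(select_engine_factory(None))                # _create_engine
        print(select_engine_factory(FakeTorchLlmArgs()))  # _create_py_executor -> create_py_executor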